In [4]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
from explore.funcs import load_datasets, get_docs

In [5]:
import os
# Set TOKENIZERS_PARALLELISM to "false" in this process's environment.
# NOTE(review): presumably this silences the Hugging Face tokenizers warning
# about forking after parallelism was used, since worker processes are started
# further down — confirm.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

In [6]:
# Load the "BAAI/bge-m3" checkpoint as a SentenceTransformer. `model` is the
# encoder the later cells use to embed document texts, titles and query
# descriptions.
model = SentenceTransformer("BAAI/bge-m3")

In [7]:
# Language codes of the collections to load; the result is a mapping keyed by
# these codes.
language_codes = ["ru", "zh", "fa"]
datasets = load_datasets(language_codes)

# Echo the mapping so the loaded dataset ids are visible in the output.
datasets

{'ru': Dataset(id='neuclir/1/ru/hc4-filtered', provides=['docs', 'queries', 'qrels']),
 'zh': Dataset(id='neuclir/1/zh/hc4-filtered', provides=['docs', 'queries', 'qrels']),
 'fa': Dataset(id='neuclir/1/fa/hc4-filtered', provides=['docs', 'queries', 'qrels'])}

In [8]:
# Materialise every record yielded by the Russian dataset's document iterator
# into one DataFrame (one row per document).
ru_docs_iter = datasets["ru"].docs_iter()
documents = pd.DataFrame(ru_docs_iter)

# Non-null count per column — a quick completeness check.
documents.count()

doc_id     964719
title      964719
text       964719
url        964719
time       215421
cc_file    964719
dtype: int64

In [9]:

# Collect the Russian dataset's queries (topics) into a DataFrame, one row per
# query.
ru_queries_iter = datasets["ru"].queries_iter()
queries = pd.DataFrame(ru_queries_iter)

# Display the query table.
queries

Unnamed: 0,query_id,title,description,ht_title,ht_description,mt_title,mt_description,narrative_by_relevance,report,report_url,report_date,translation_lang
0,3,British royal news impacts,What political and economic impacts does news ...,Влияние британских королевских новостей,Какое политическое и экономическое влияние нов...,Британские королевские новости влияют,Какие политические и экономические последствия...,{'very_valuable': 'Information regarding econo...,Announcement of engagement\nPrincess Eugenie o...,https://en.wikipedia.org/w/index.php?title=Wed...,2018-05-13,ru
1,6,Gibraltar's Sovereignty After Brexit,How will Gibraltar's sovereignty be impacted b...,Суверенитет Гибралтара после Брексита,Как на суверенитет Гибралтара повлияет на пере...,Суверенитет Гибралтара после Брексита,Как повлияют на суверенитет Гибралтара перегов...,{'very_valuable': 'Gibraltar will remain a ter...,Sovereignty\nSee also: Disputed status of Gibr...,https://en.wikipedia.org/w/index.php?title=Eff...,2018-11-26,ru
2,13,US-South Korea Trade Agreement,How will South Korea benefit from or be harmed...,США-Южнокорейское торговое соглашение,Как Южная Корея выиграет или пострадает от пер...,Торговое соглашение между США и Южной Кореей,Каким образом Южная Корея извлечет выгоду из с...,"{'very_valuable': 'N/A', 'somewhat_valuable': ...",South Korea reactions\n\t\nThis section is in ...,https://en.wikipedia.org/w/index.php?title=Uni...,2018-02-20,ru
3,14,North Korean Earthquakes and Nuclear Testing,Are earthquakes in North Korea being caused by...,Северокорейские землетрясения и ядерное испытание,Являются ли землетрясения в Северной Корее сле...,Северокорейские землетрясения и ядерные испытания,Являются ли землетрясения в Северной Корее при...,"{'very_valuable': 'N/A', 'somewhat_valuable': ...","On 3 September, at 3:31 AM UTC, the United Sta...",https://en.wikipedia.org/w/index.php?title=201...,2017-09-22,ru
4,101,Shipwrecks and Historical European Trade,What information about trade and shipping has ...,Кораблекрушения и историческая Европейская тор...,Какая информация о торговле и судоходстве была...,Кораблекрушения и историческая европейская тор...,Какая информация о торговле и судоходстве была...,{'very_valuable': 'Shipwrecks that belonged to...,Portuguese discoveries (Portuguese: Descobrime...,https://en.wikipedia.org/w/index.php?title=Por...,2018-05-16,ru
5,103,African Extremist School Girls Kidnapping,What is known about instances of extremist or ...,Похищение школьниц из числа африканских экстре...,"Что известно о случаях, когда экстремистские и...",Африканские Экстремистские Школьные Девушки По...,Что известно о случаях похищения девочками из ...,{'very_valuable': 'The number and instances of...,"On the night of 14–15 April 2014, 276 female s...",https://en.wikipedia.org/w/index.php?title=Chi...,2018-02-11,ru
6,105,Mount Sinabung Eruptions,How long will Mount Sinabung's eruptions last ...,Извержения горы Синабунг,Как долго продлится извержение горы Синабунг и...,Разрушения на горе Синабунг,Как долго будут продолжаться извержения горы С...,"{'very_valuable': 'None found', 'somewhat_valu...",Mount Sinabung erupted after a centuries-long ...,https://en.wikipedia.org/w/index.php?title=Mou...,2018-02-19,ru
7,107,UN Climate Change Economic Analysis,What is the economic impact of climate change ...,ООН Изменение Климата Экономический Анализ,Какие экономические последствия изменения клим...,Экономический анализ изменения климата ООН,Каково экономическое воздействие изменения кли...,{'very_valuable': 'Articles that described the...,The United Nations Framework Convention on Cli...,https://en.wikipedia.org/w/index.php?title=Uni...,2018-10-06,ru
8,108,Saudi Arabic movie theater restrictions,What restrictions did Saudi Arabia put on movi...,Ограничения кинотеатра в Саудовской Аравии,Какие ограничения Саудовская Аравия наложила н...,Ограничения саудовского арабского кинотеатра,Какие ограничения были введены Саудовской Арав...,{'very_valuable': 'Described the restrictions ...,Keif al-Hal? triggered a debate on the country...,https://en.wikipedia.org/w/index.php?title=Cin...,2018-04-16,ru
9,111,Chinese regulation of Fentanyl,Has the addition of fentanyl to China's list o...,Китайское регулирование Фентанила,Помогло ли добавление фентанила в список регул...,Китайское регулирование Фентанила,Успешно ли было включение фентанила в перечень...,{'very_valuable': 'One article focused on the ...,Several large quantities of illicitly produced...,https://en.wikipedia.org/w/index.php?title=Fen...,2019-03-30,ru


In [30]:
# Build a relevance-balanced sample of (query, document) judgments and narrow
# the document and query tables down to the rows that sample references.
qrels = pd.DataFrame(datasets["ru"].qrels_iter())

# Number of judgments available for each relevance grade in qrels.
relevance_counts = qrels["relevance"].value_counts()

# Draw the same number of rows from grades 0, 1 and 3: at most 50 each, capped
# by the rarest grade. Without the cap, `.sample(n=...)` raises ValueError as
# soon as one grade has fewer rows than requested.
max_per_grade = 50
min_sample_count = min(
    max_per_grade,
    *(int(relevance_counts.get(grade, 0)) for grade in (0, 1, 3)),
)

# One fixed-seed sample per grade so the selection is reproducible.
sample_0, sample_1, sample_3 = (
    qrels[qrels["relevance"] == grade].sample(n=min_sample_count, random_state=42)
    for grade in (0, 1, 3)
)

# Concatenate the per-grade samples (grade 0 first, then 1, then 3).
sample = pd.concat([sample_0, sample_1, sample_3]).reset_index(drop=True)

doc_ids = sample["doc_id"].values
query_ids = sample["query_id"].values

docs = documents[documents["doc_id"].isin(doc_ids)]

# Filter a freshly loaded query table rather than the current `queries`
# binding: rebinding `queries` to a filtered copy of itself would make this
# cell depend on how many times (and with which sample) it has been run.
all_queries = pd.DataFrame(datasets["ru"].queries_iter())
queries = all_queries[all_queries["query_id"].isin(query_ids)]

# How many distinct documents / queries the sample actually resolved to.
print(docs["doc_id"].count())
print(queries["query_id"].count())

sample

32
30


Unnamed: 0,query_id,doc_id,relevance,iteration
0,192,00b8dacd-f427-4674-8d4c-9d55349f09c6,0,0
1,158,98b8d139-bb50-41b3-bcc5-9947e60fbea0,0,0
2,146,cebdec40-1b78-4b6d-aea4-2bec47d27db8,0,0
3,161,63e50279-4f81-4ea3-a6d1-62945eeb6f11,0,0
4,251,aafcdfe6-7775-486e-af55-70e5f007cc75,0,0
...,...,...,...,...
145,232,b8786279-cc7c-46aa-a4d4-443f9ce7bc81,3,0
146,103,2cba8cef-f155-4688-b842-dc6962a12ada,3,0
147,254,5e792c8d-d25b-44fd-90f6-8dcb3e3adbec,3,0
148,142,8f55f285-5f73-42f2-b54d-0558278d7d9e,3,0


In [35]:
# Encode document texts, document titles and query descriptions using a pool
# of four CPU worker processes.
pool = model.start_multi_process_pool(["cpu", "cpu", "cpu", "cpu"])
try:
    doc_embeddings = model.encode_multi_process(docs["text"].values, pool=pool, batch_size=10, show_progress_bar=True)
    title_embeddings = model.encode_multi_process(docs["title"].values, pool=pool, batch_size=10, show_progress_bar=True)
    query_embeddings = model.encode_multi_process(queries["description"].values, batch_size=10, show_progress_bar=True, pool=pool)
finally:
    # Always shut the workers down, even if an encode call fails or the cell is
    # interrupted; otherwise the spawned processes are left running.
    model.stop_multi_process_pool(pool)


Chunks:   0%|          | 0/32 [00:00<?, ?it/s]

Chunks:   0%|          | 0/32 [00:00<?, ?it/s]

Chunks:   0%|          | 0/30 [00:00<?, ?it/s]

In [37]:
# Lookup tables from id to embedding. Each embedding array is paired
# positionally with the id column of the frame it was encoded from, so
# docs/queries must not have been reordered since encoding.
doc_embedding_pairs = dict(zip(docs["doc_id"], doc_embeddings))
title_embedding_pairs = dict(zip(docs["doc_id"], title_embeddings))
query_embedding_pairs = dict(zip(queries["query_id"], query_embeddings))

In [41]:
# Score every sampled judgment: dot product of the query embedding with the
# document-text embedding and with the document-title embedding.
# Relies on `sample` and the three *_embedding_pairs lookups built above.
results = []

judgments = zip(sample["query_id"], sample["doc_id"], sample["relevance"])
for query_id, doc_id, relevance in judgments:
    # Judgments whose query or document has no embedding are skipped silently.
    if query_id not in query_embedding_pairs or doc_id not in doc_embedding_pairs:
        continue

    query_vec = query_embedding_pairs[query_id]
    results.append(
        [
            query_id,
            doc_id,
            np.dot(query_vec, doc_embedding_pairs[doc_id]),
            np.dot(query_vec, title_embedding_pairs[doc_id]),
            relevance,
        ]
    )

# One row per scored judgment.
# NOTE(review): "similiarity_title" is misspelled; left unchanged here because
# later cells may already reference the column under this name — confirm
# before renaming.
df = pd.DataFrame(results, columns=["query_id", "doc_id", "similarity_doc", "similiarity_title", "relevance"])

# Display the resulting table.
df


Unnamed: 0,query_id,doc_id,similarity_doc,similiarity_title,relevance
0,232,eec368bf-876e-4e86-83ff-56cc967b75b4,0.663229,0.323594,0
1,13,b93df41a-7240-49d6-909c-e38814e07770,0.498686,0.439179,0
2,6,8e7fa312-7615-48d0-80e7-e5e1d7458f6e,0.480728,0.269829,0
3,134,ac7adb86-53b4-4371-b2ee-48b7d785a7f1,0.426618,0.496366,0
4,255,a23fb9a5-dccf-4247-aa22-6f46b72be7ae,0.565778,0.440132,0
5,13,61a806d1-0ebc-4c6c-8d5f-605bf02d293e,0.486842,0.508709,0
6,248,9558b60f-2ec5-480a-8abb-7db03a2afc88,0.640485,0.555054,0
7,172,069b4d10-2121-4755-a325-151dc140602b,0.366741,0.429471,0
8,150,1df3b02e-5dc7-47ee-a2ef-8ba6d22e0d43,0.541192,0.612064,1
9,6,ad60088c-f295-45d6-ad9b-4f498b7de791,0.618871,0.598164,1
