In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from funcs import load_datasets, get_docs
from ir_measures import measures, calc_aggregate

In [2]:
datasets = load_datasets(["ru", "zh", "fa"])

In [3]:
# Load the Qrels and Queries
qrels = pd.DataFrame(datasets["ru"].qrels_iter())  # ground truth
queries = pd.DataFrame(datasets["ru"].queries_iter())  # queries
documents = pd.DataFrame(datasets["ru"].docs_iter())  # documents

In [4]:
documents.head

<bound method NDFrame.head of                                       doc_id  \
0       ecd810c8-4b67-4a53-a0bb-20e0214becde   
1       bdcf1b07-7d19-41a8-923d-55d08957a8d6   
2       b148f67a-8605-48d9-b032-f32a2280f1f0   
3       fcd39864-6cf5-4193-8903-9a101b6863ba   
4       2a0acf64-5fd4-43af-acbf-3f728d65ca2a   
...                                      ...   
964714  ec1ab8fa-faa5-4d34-a150-873a482725c8   
964715  eec74ba1-0566-4122-9660-6a5ca9a94564   
964716  22750309-0e4c-4fe6-9c86-c705008bdeb8   
964717  8804aa1b-3a89-4811-8503-7e3b400bb411   
964718  c89674e1-fbaf-48f9-9ce2-2517968dc020   

                                                    title  \
0       Рафаэль Надаль – в четвертьфинале Открытого че...   
1       Житель Октябрьского района, обналичив чужую ка...   
2       Воспоминания участников войны в Афганистане из...   
3         Глава спецслужбы ФРГ Масен отправлен в отставку   
4       Европейские индексы - 02-04-18 | Новости Армен...   
...                        

In [5]:
common_query_ids = set(qrels["query_id"]).intersection(queries["query_id"])
filtered_queries = queries[queries["query_id"].isin(common_query_ids)]

In [6]:
print(f"Filtered Queries: {filtered_queries.head}")

Filtered Queries: <bound method NDFrame.head of    query_id                                              title  \
0         3                         British royal news impacts   
1         6               Gibraltar's Sovereignty After Brexit   
2        13                     US-South Korea Trade Agreement   
3        14       North Korean Earthquakes and Nuclear Testing   
4       101           Shipwrecks and Historical European Trade   
5       103          African Extremist School Girls Kidnapping   
6       105                           Mount Sinabung Eruptions   
7       107                UN Climate Change Economic Analysis   
8       108            Saudi Arabic movie theater restrictions   
9       111                     Chinese regulation of Fentanyl   
10      113                        Boeing 737 Flight 610 Crash   
11      114         Causes of Forest Fires in central Portugal   
12      116          Russian reaction to Maduro-Guaidó dispute   
13      126                 

In [7]:
#tokenize and normalize Russian

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

#nltk.download('punkt')
#nltk.download('stopwords')

def preprocess_ru(text):
    # Tokenize
    tokens = word_tokenize(text)
    # Lowercase and remove punctuation
    tokens = [word.lower() for word in tokens if word.isalpha()]
    # Remove stop words
    stop_words = set(stopwords.words('russian'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)  # Join tokens back into a single string

In [8]:
# Apply preprocessing to the document text
documents['processed_text'] = documents['text'].apply(preprocess_ru)

In [9]:
# Convert the processed documents to a list
processed_documents = documents["processed_text"].tolist()

# Verify a few processed documents
print(processed_documents[:5])

['двое друзей встретились парке гуляя собаками предложил зайти позавтракать ближайшее кафе пустят туда собаками возразил второй первый решительно направился кафе своей немецкой овчаркой хозяин остановил словами сэр нам заходить животными слепой это хозяин извинился проводил собакой столику друг подождал улице пять минут попробовал сделать самое ваш поводырь чихуахуа скептически осведомился хозяин чихуахуа удивился мужчина подсунули анекдот', 'нашли ошибку текст который выделяем смотрим выделили слишком максимальное количество символов попробуйте снова спасибо сообщение отправлено скоро исправим', 'бежит мышка кота прыгает стола попадает бутылку недопитым вином стоящую полу барахтается говорит коту вытащи дай умереть кошмарной смертью убежишь честное слово вытащил первым делом норку шасть сидит кот обижается мышка выходи сказала убежишь мало сказать мужчине женщина нетрезвом виде анекдот', 'председатель федерального ведомства охране конституции германии масен maaßen отправлен отставку о

In [10]:
# Apply preprocessing to the 'description' column

filtered_queries["processed_query"] = filtered_queries["mt_description"].apply(preprocess_ru) # change description to title to use the title column

# Convert the processed queries to a list
processed_queries = filtered_queries["processed_query"].tolist()

# Verify the processed queries
print(processed_queries[:5])

['какие политические экономические последствия имеют новости британской королевской семье внутри страны рубежом', 'повлияют суверенитет гибралтара переговоры брексите испанией великобританией', 'каким образом южная корея извлечет выгоду соглашения свободной торговле соединенными штатами пострадает', 'являются землетрясения северной корее причиной ядерных испытаний', 'информация торговле судоходстве обнаружена исследованием исторических кораблекрушений европейских судов']


In [11]:
#Vectorize Queries and Documents with TF-IDF 
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform on the combined data for consistent vocabulary
tfidf_documents = vectorizer.fit_transform(processed_documents)
tfidf_queries = vectorizer.transform(processed_queries)

In [12]:
# Calculate similarity

from sklearn.metrics.pairwise import cosine_similarity 
import numpy as np

# Define a similarity threshold (adjust as needed)
threshold = 0.2  # Only retrieve documents with similarity > 0.2

# Compute cosine similarity between each query and all documents
similarity_matrix = cosine_similarity(tfidf_queries, tfidf_documents)

# Number of top documents to retrieve
top_k = 5

# For each query, get the indices of the top-k most similar documents above the threshold
retrieved_docs = []
for i, similarities in enumerate(similarity_matrix):
    # Get the indices of documents where similarity > threshold
    valid_indices = np.where(similarities > threshold)[0]
    
    # If there are valid documents, get the top-k most similar ones from the valid ones
    if len(valid_indices) > 0:
        top_docs_indices = valid_indices[np.argsort(similarities[valid_indices])[-top_k:][::-1]]
    else:
        top_docs_indices = []  # If no documents exceed the threshold, return an empty list

    top_docs = documents.iloc[top_docs_indices]  # Select documents based on the filtered indices
    retrieved_docs.append(top_docs)

# Display the top documents for the first 3 queries
for i, query in enumerate(filtered_queries["processed_query"][:3]):
    print(f"Query: {query}")
    print("Top Documents:")
    print(retrieved_docs[i][["doc_id", "title", "text"]])  # Display doc_id, title, and text
    print("\n" + "="*50 + "\n")


Query: какие политические экономические последствия имеют новости британской королевской семье внутри страны рубежом
Top Documents:
                                      doc_id  \
765495  5e516636-c2cf-467c-a6bb-837249f55861   

                                                    title  \
765495  Принц Ганноверский нарушил традиции, выбирая и...   

                                                     text  
765495  Новости шоу бизнеса: Имя новорожденному сыну п...  


Query: повлияют суверенитет гибралтара переговоры брексите испанией великобританией
Top Documents:
                                      doc_id  \
635155  e9079c95-c8a6-43a0-b472-b2a5019dc51b   
389143  bdc4e621-76ab-435c-8822-9c4050df5a41   
636030  ad60088c-f295-45d6-ad9b-4f498b7de791   
695881  b85eb8ac-d3f1-4b11-a61c-cfba5e58dd0d   

                                                    title  \
635155  Испания грозит наложить вето на сделку по «бре...   
389143  Пивоварский надеется подписать соглашение с ЕС...   
636

In [13]:
# Extract Results in the Required Format 
retrieval_results = []

# Iterate through each query and its retrieved documents
for query_index, top_docs in enumerate(retrieved_docs):
    query_id = filtered_queries.iloc[query_index]["query_id"]  # Get the query ID
    
    # Extract doc_id and similarity scores
    for doc_index in top_docs.index:
        doc_id = top_docs.loc[doc_index, "doc_id"]
        score = similarity_matrix[query_index, doc_index]  # Get similarity score
        retrieval_results.append({"query_id": query_id, "doc_id": doc_id, "score": score})

# Create a DataFrame for the results
your_run = pd.DataFrame(retrieval_results)


In [14]:
# preview the result: 
print(your_run.head())

  query_id                                doc_id     score
0        3  5e516636-c2cf-467c-a6bb-837249f55861  0.205593
1        6  e9079c95-c8a6-43a0-b472-b2a5019dc51b  0.250075
2        6  bdc4e621-76ab-435c-8822-9c4050df5a41  0.224652
3        6  ad60088c-f295-45d6-ad9b-4f498b7de791  0.206008
4        6  b85eb8ac-d3f1-4b11-a61c-cfba5e58dd0d  0.203501


In [15]:
# Calculate the average score across all rows
average_score = your_run["score"].mean()

# Print the result
print(f"The average score across all query-document pairs is: {average_score:.4f}")


The average score across all query-document pairs is: 0.3021


Using the query ht_title column: The average score across all query-document pairs is:  0.3057

Evaluation

In [16]:
import ir_measures
from ir_measures import nDCG, P, Judged, RBP, AP, RR, R

evaluation_metrics = ir_measures.calc_aggregate(
    [
        nDCG@20,  # Normalized Discounted Cumulative Gain @20
        P@5,  # Precision @5
        P(rel=1)@5,  # Precision for relevance level >=1 @5
        Judged@10,  # Judged documents @10
        R@100,  # Recall @100
        R@1000,  # Recall @1000
        AP,  # Average Precision
        RR@10,  # Reciprocal Rank @10
    ],
    qrels,
    your_run
)
print("Results for TF-IDF using the query description",evaluation_metrics)


Results for TF-IDF using the query description {R@1000: 0.025448692740359404, P@5: 0.05555555555555555, RR@10: 0.13271604938271603, Judged@10: 0.1728395061728395, R@100: 0.025448692740359404, AP: 0.013580276705276706, nDCG@20: 0.038614626531733746}


# Using the query title

| Metric       | Value                          | Explanation                                  |
|--------------|--------------------------------|----------------------------------------------|
| **Judged@10** | 0.1321                         | 13.21% of the top 10 retrieved documents were judged for relevance |
| **R@100**     | 0.0212                         | 2.12% of all relevant documents were retrieved in the top 100 results |
| **AP**        | 0.0068                         | Average Precision is relatively low, showing room for improving ranking and relevance among retrieved documents. |
| **R@1000**    | 0.0212                         | Recall at 1000 is the same as at 100, showing that relevant documents are more sparsely distributed in the top 1000 results. |
| **P@5**       | 0.0481                         | Precision at 5 is 4.81%, indicating that relevant documents appear at the top of the ranking but still need further improvement. |
| **nDCG@20**   | 0.0237                         | Normalized Discounted Cumulative Gain at 20 shows that the ranking has room for improvement, especially for highly relevant documents appearing in the top 20 results. |
| **RR@10**     | 0.0806                         | Reciprocal Rank is relatively low, suggesting that the first relevant document is not being retrieved in the top ranks frequently. |



# Using the description

| Metric       | Value                          | Explanation                                  |
|--------------|--------------------------------|----------------------------------------------|
| **Judged@10** | 0.1728                         | 17.28% of the top 10 retrieved documents were judged for relevance, showing significant improvement. |
| **R@100**     | 0.0254                         | 2.5% of all relevant documents were retrieved in the top 100 results, indicating modest recall improvement. |
| **AP**        | 0.0136                         | Average Precision has improved, reflecting better ranking and precision for relevant documents. |
| **R@1000**    | 0.0254                         | The recall across the top 1000 results matches that in the top 100, showing that relevant documents are more spread out. |
| **P@5**       | 0.0556                         | About 5.6% of the top 5 results are relevant, showing a solid increase in precision at higher ranks. |
| **nDCG@20**   | 0.0386                         | Normalized Discounted Cumulative Gain at 20 shows better ranking performance with more relevant documents in higher ranks. |
| **RR@10**     | 0.1327                         | Reciprocal Rank indicates a significant improvement in finding the first relevant document, typically appearing earlier (around rank 8). |



In [17]:
# Query mapping

translated_queries = pd.read_excel("translated_queries_2.xlsx")

# Convert 'query_id' in both DataFrames to string type (or you can use int if you prefer)
translated_queries['query_id'] = translated_queries['query_id'].astype(str)
queries['query_id'] = queries['query_id'].astype(str)

# Merge the dataframes on the query_id column to map the translated query_id to the original query_id
query_mapping = pd.merge(translated_queries[['query_id']], queries[['query_id']], on='query_id', how='inner')

# This will give you a DataFrame with the query_id matching for both translated and original queries
print(query_mapping)

   query_id
0         3
1         6
2        13
3        14
4       101
5       103
6       105
7       107
8       108
9       111
10      113
11      114
12      116
13      126
14      127
15      128
16      133
17      134
18      135
19      136
20      137
21      138
22      142
23      146
24      150
25      151
26      157
27      158
28      161
29      164
30      172
31      179
32      185
33      192
34      199
35      208
36      229
37      230
38      231
39      232
40      233
41      234
42      245
43      246
44      247
45      248
46      249
47      250
48      251
49      252
50      253
51      254
52      255
53      256
