# LMIR

In [7]:
#!pip install ir-measures

import pandas as pd
import nltk
#nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np
import ir_measures
from ir_measures import nDCG, P, Judged
#import nltk
#nltk.download('punkt')
#nltk.download('punkt_tab') 
#import ir_datasets

from funcs import load_datasets, get_docs

In [8]:
datasets = load_datasets(["ru", "zh", "fa"])

documents = pd.DataFrame(datasets["ru"].docs_iter())

documents.count()
print(documents.head())
print(documents.columns)
queries = pd.DataFrame(datasets["ru"].queries_iter())

queries.head()

queries.to_excel("translated_queries.xlsx", index=False)

# Load translated queries from the Excel file
translated_queries = pd.read_excel("translated_queries.xlsx")# Load the new dataset




                                 doc_id  \
0  ecd810c8-4b67-4a53-a0bb-20e0214becde   
1  bdcf1b07-7d19-41a8-923d-55d08957a8d6   
2  b148f67a-8605-48d9-b032-f32a2280f1f0   
3  fcd39864-6cf5-4193-8903-9a101b6863ba   
4  2a0acf64-5fd4-43af-acbf-3f728d65ca2a   

                                               title  \
0  Рафаэль Надаль – в четвертьфинале Открытого че...   
1  Житель Октябрьского района, обналичив чужую ка...   
2  Воспоминания участников войны в Афганистане из...   
3    Глава спецслужбы ФРГ Масен отправлен в отставку   
4  Европейские индексы - 02-04-18 | Новости Армен...   

                                                text  \
0  Двое друзей встретились в парке, гуляя с собак...   
1  Вы нашли ошибку\n\nКакой то текст который мы в...   
2  Бежит мышка от кота, прыгает со стола и попада...   
3  Председатель Федерального ведомства по охране ...   
4  МОСКВА, 3 АПРЕЛЯ, АРМЕНПРЕСС. Результаты торго...   

                                                 url  \
0  https://

In [9]:
#tokenize and normalize

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

#nltk.download('punkt')
#nltk.download('stopwords')

def preprocess(text):
    # Tokenize
    tokens = word_tokenize(text)
    # Lowercase and remove punctuation
    tokens = [word.lower() for word in tokens if word.isalpha()]
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)  # Join tokens back into a single string

In [10]:
#tokenize and normalize Russian

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

#nltk.download('punkt')
#nltk.download('stopwords')

def preprocess_ru(text):
    # Tokenize
    tokens = word_tokenize(text)
    # Lowercase and remove punctuation
    tokens = [word.lower() for word in tokens if word.isalpha()]
    # Remove stop words
    stop_words = set(stopwords.words('russian'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)  # Join tokens back into a single string

In [11]:
# Apply preprocessing to the document text
documents['processed_text'] = documents['text'].apply(preprocess_ru)

In [12]:
# Convert the processed documents to a list
processed_documents = documents["processed_text"].tolist()

# Verify a few processed documents
print(processed_documents[:5])

['двое друзей встретились парке гуляя собаками предложил зайти позавтракать ближайшее кафе пустят туда собаками возразил второй первый решительно направился кафе своей немецкой овчаркой хозяин остановил словами сэр нам заходить животными слепой это хозяин извинился проводил собакой столику друг подождал улице пять минут попробовал сделать самое ваш поводырь чихуахуа скептически осведомился хозяин чихуахуа удивился мужчина подсунули анекдот', 'нашли ошибку текст который выделяем смотрим выделили слишком максимальное количество символов попробуйте снова спасибо сообщение отправлено скоро исправим', 'бежит мышка кота прыгает стола попадает бутылку недопитым вином стоящую полу барахтается говорит коту вытащи дай умереть кошмарной смертью убежишь честное слово вытащил первым делом норку шасть сидит кот обижается мышка выходи сказала убежишь мало сказать мужчине женщина нетрезвом виде анекдот', 'председатель федерального ведомства охране конституции германии масен maaßen отправлен отставку о

In [13]:
# Apply preprocessing to the 'title' column
translated_queries["processed_query"] = translated_queries["title"].apply(preprocess)

# Convert the processed queries to a list
processed_queries = translated_queries["processed_query"].tolist()

# Verify the processed queries
print(processed_queries[:5])

['british royal news impacts', 'gibraltar sovereignty brexit', 'korea trade agreement', 'north korean earthquakes nuclear testing', 'shipwrecks historical european trade']


In [12]:
import pandas as pd
from collections import Counter
import numpy as np

# Preprocessed documents and queries are assumed to be stored in 'processed_documents' and 'processed_queries'
vocabulary = set(word for doc in processed_documents for word in doc.split())

alpha = 0.1  # Smoothing parameter, adjust as necessary

def build_lm(tokens):
    count = Counter(tokens)
    total_tokens = sum(count.values()) or 1  # Avoid division by zero
    # Only store non-zero probabilities
    return {word: (count[word] + alpha) / (total_tokens + alpha * len(vocabulary)) for word in count}

# Rebuild language models using the revised function
doc_models = {idx: build_lm(doc.split()) for idx, doc in enumerate(processed_documents)}

# Same for queries
query_models = {idx: build_lm(query.split()) for idx, query in enumerate(processed_queries)}

# Revised KL divergence calculation using sparse models
def kl_divergence(lm_query, lm_document):
    kl_div = 0
    for word in lm_query:
        prob_q = lm_query[word]
        prob_d = lm_document.get(word, alpha / (alpha * len(vocabulary)))
        kl_div += prob_q * np.log(prob_q / prob_d)
    return kl_div

# Calculate similarities with sparse models
similarities = {q_id: {doc_id: kl_divergence(qm, dm) for doc_id, dm in doc_models.items()} for q_id, qm in query_models.items()}


top_n = 10
similar_documents = {
    q_id: sorted(doc_sims.items(), key=lambda item: item[1])[:top_n] for q_id, doc_sims in similarities.items()
}

# Display results for the first three queries
for q_id in sorted(similar_documents.keys())[:3]:
    print(f"Query {q_id}: Top Documents: {similar_documents[q_id]}")

# Collect results
results = []
for q_id, docs in similar_documents.items():
    for doc_id, score in docs:
        results.append({"query_id": q_id, "doc_id": doc_id, "score": score})

# Display results for the first three queries
for q_id in sorted(similar_documents.keys())[:3]:
    print(f"Query {q_id}: Top Documents: {similar_documents[q_id]}")

Query 101: Top Documents: ['ecd810c8-4b67-4a53-a0bb-20e0214becde', 'bdcf1b07-7d19-41a8-923d-55d08957a8d6', 'b148f67a-8605-48d9-b032-f32a22801f10', 'fcd39864-6cf5-4193-8903-9a101b6863ba', '2a0acf64-5fd4-43af-acbf-3f728d65c2aa']
Query 103: Top Documents: ['ecd810c8-4b67-4a53-a0bb-20e0214becde', 'bdcf1b07-7d19-41a8-923d-55d08957a8d6', 'b148f67a-8605-48d9-b032-f32a22801f10', 'fcd39864-6cf5-4193-8903-9a101b6863ba', '2a0acf64-5fd4-43af-acbf-3f728d65c2aa']
Query 105: Top Documents: ['ecd810c8-4b67-4a53-a0bb-20e0214becde', 'bdcf1b07-7d19-41a8-923d-55d08957a8d6', 'b148f67a-8605-48d9-b032-f32a22801f10', 'fcd39864-6cf5-4193-8903-9a101b6863ba', '2a0acf64-5fd4-43af-acbf-3f728d65c2aa']


In [5]:

# Initialize a list to hold all results
results = []

# Iterate through each query and its retrieved documents
for q_id, docs in similar_documents.items():
    for doc_id in docs:
        # Retrieve the similarity score for each document
        similarity_score = similarities[q_id][doc_id]
        # Append results including the query ID, document ID, and similarity score
        results.append({
            "query_id": q_id,
            "doc_id": doc_id,
            "similarity_score": similarity_score
        })

# Create a DataFrame from the results list
results_df = pd.DataFrame(results)
# Ensure the column is correctly named 'score'
results_df.rename(columns={'similarity_score': 'score'}, inplace=True)
# Save the DataFrame to a CSV file
csv_file_path = 'query_document_similarity_scores.csv'
results_df.to_csv(csv_file_path, index=False)

# Load the saved CSV to confirm it's saved correctly
loaded_df = pd.read_csv(csv_file_path)

# Display the loaded DataFrame
print(loaded_df.head())

      query_id  doc_id                                  score     
                                                                    
0     3         ecd810c8-4b67-4a53-a0bb-20e0214becde    1.676250  
1     3         bdcf1b07-7d19-41a8-923d-55d08957a8d6    1.445455  
2     3         b148f67a-8605-48d9-b032-f32a2280f1f0    1.650720  
3     3         fcd39864-6cf5-4193-8903-9a101b6863ba    0.616157  
4     3         2a0acf64-5fd4-43af-acbf-3f728d65ca2a    1.343578  


# Evaluation

In [60]:
qrels = pd.DataFrame(datasets["ru"].qrels_iter())
#The qrels dataset contains relevance judgments that link queries to documents with relevance scores.

In [61]:
# Check the count of each relevance score in qrels
relevance_counts = qrels["relevance"].value_counts()

In [62]:
# Determine the minimum count among relevance scores to ensure balanced sampling
min_sample_count = 10

In [63]:
# Sample equally from each relevance score
sample_0 = qrels[qrels["relevance"] == 0].sample(n=min_sample_count, random_state=42)
sample_1 = qrels[qrels["relevance"] == 1].sample(n=min_sample_count, random_state=42)
sample_3 = qrels[qrels["relevance"] == 3].sample(n=min_sample_count, random_state=42)

In [64]:
sample = pd.concat([sample_0, sample_1, sample_3]).reset_index(drop=True)

In [24]:
sample = qrels

# Print a confirmation message
print("Using the full HC4 RU dataset for evaluation.")


Using the full HC4 RU dataset for evaluation.


In [66]:
doc_ids = sample["doc_id"].values
query_ids = sample["query_id"].values

docs = documents[documents["doc_id"].isin(doc_ids)]
queries = queries[queries["query_id"].isin(query_ids)]



In [31]:
# Find overlapping query_ids
common_query_ids = set(qrels["query_id"]).intersection(results_df["query_id"])

# Filter qrels and your_run to keep only common query_ids
qrels_filtered = qrels[qrels["query_id"].isin(common_query_ids)]
results_df_filtered = results_df[results_df["query_id"].isin(common_query_ids)]

# Ensure document IDs also overlap
common_doc_ids = set(qrels_filtered["doc_id"]).intersection(results_df_filtered["doc_id"])
qrels_filtered = qrels_filtered[qrels_filtered["doc_id"].isin(common_doc_ids)]
results_df_filtered = results_df_filtered[results_df_filtered["doc_id"].isin(common_doc_ids)]


# Check overlap in query_ids
print("Query IDs in qrels:\n", set(qrels["query_id"]))
print("Query IDs in your_run\n:", set(results_df["query_id"]))
print("Common Query IDs:\n", set(qrels["query_id"]).intersection(results_df["query_id"]))

# Check overlap in doc_ids
#print("Doc IDs in qrels:\n", set(qrels["doc_id"]))
print("Doc IDs in your_run:\n", set(results_df["doc_id"]))
print("Common Doc IDs:\n", set(qrels["doc_id"]).intersection(results_df["doc_id"]))

Query IDs in qrels:
{'6', '105', '250', '252', '150', '185', '157', '138', '230', '128', '249', '135', '246', '101', '137', '13', '111', '199', '253', '126', '133', '103', '234', '114', '179', '245', '146', '161', '108', '116', '134', '255', '3', '248', '231', '192', '208', '127', '107', '247', '158', '151', '233', '136', '172', '256', '229', '164', '232', '254', '14', '251', '142', '113'}

Query IDs in your_run:
{'6', '105', '250', '252', '150', '185', '157', '138', '230', '128', '249', '135', '246', '101', '137', '13', '111', '199', '253', '126', '133', '103', '234', '114', '179', '245', '146', '161', '108', '116', '134', '255', '3', '248', '231', '192', '208', '127', '107', '247', '158', '151', '233', '136', '172', '256', '229', '164', '232', '254', '14', '251', '142', '113'}

Common Query IDs:
{'6', '105', '250', '252', '150', '185', '157', '128', '230', '138', '249', '135', '137', '101', '246', '13', '111', '199', '253', '126', '133', '103', '234', '114', '179', '245', '146', '161

In [33]:
import ir_measures
from ir_measures import nDCG, P, Judged

evaluation_metrics = ir_measures.calc_aggregate(
    [nDCG@10, P@5, P(rel=2)@5, Judged@10],
    qrels_filtered,
    results_df_filtered
)
print(evaluation_metrics)

{nDCG@10: 0.007, P(rel=2)@5: 0.005, P@5: 0.008, Judged@10: 0.03}
