In [13]:
from datasets import load_dataset
import re
from collections import Counter
from matplotlib import pyplot as plt
import numpy as np
from spacy.lang.pl import Polish
from p_tqdm import p_map
import random
import numpy as np

#### Use the corpus from exercise no. 1.

In [2]:
ds = load_dataset("clarin-knext/fiqa-pl", "corpus")


In [3]:
ds["corpus"]


Dataset({
    features: ['_id', 'title', 'text'],
    num_rows: 57638
})

#### Use SpaCy tokenizer API to tokenize the text in the documents.

In [4]:
nlp = Polish()
document = ds["corpus"]["text"]

In [5]:
tokenized = []
for doc in nlp.pipe(document):
    tokenized.append([token.text.lower() for token in doc if not token.is_punct])

#### Compute frequency list for each of the processed files, and aggregate the result to obtain one global frequency list. This frequency list gives you unigram statistics of the words appearing in the corpus.


In [6]:
def count_frequencies(t):
    return Counter(t)
# parallelize the counting
frequencies = p_map(count_frequencies, tokenized)

100%|██████████| 57638/57638 [00:19<00:00, 2924.62it/s]


In [7]:
def sum_counters(counter_list):
    return sum(counter_list, Counter())

chunked_results_10 = p_map(sum_counters, [frequencies[i::10] for i in range(10)]) # split the data into 10 chunks


100%|██████████| 10/10 [01:30<00:00,  9.03s/it]


In [8]:
chunked_results_2 = p_map(sum_counters, [chunked_results_10[i::2] for i in range(2)])


100%|██████████| 2/2 [00:01<00:00,  1.43it/s]


In [9]:
final_result = sum_counters(chunked_results_2)

In [11]:
final_result.most_common(5)

[('w', 175366), ('nie', 131482), ('i', 126839), ('na', 119047), ('to', 116468)]

#### Apply a distortion function to the queries part of the corpus. In each query draw randomly one word and change one letter in the word to some other letter.


In [28]:
def distort_query_with_one_random_word_letter(query):
    words = query.split()
    # For one word in the query, one letter is going to be changed to a random one if the word is longer than 2
    indices_of_words_to_change = [i for i in range(len(words)) if len(words[i]) > 2]
    
    if not indices_of_words_to_change:
        return query  # Return the original query if no words can be changed

    index_of_word_to_change = np.random.choice(indices_of_words_to_change)
    word = words[index_of_word_to_change]
    index = np.random.choice(len(word))
    new_letter = random.choice("abcdefghijklmnopqrstuvwxyz".replace(word[index], ""))
    
    new_word = word[:index] + new_letter + word[index + 1:]    
    words[index_of_word_to_change] = new_word
    
    return ' '.join(words) 

In [30]:
distorted_queries = p_map(distort_query_with_one_random_word_letter, document)

100%|██████████| 57638/57638 [00:14<00:00, 4038.28it/s]


In [36]:
def dcg_at_k(scores, k):
    scores = scores[:k]
    return sum(score / np.log2(idx + 2) for idx, score in enumerate(scores))

# nDCG@k
def ndcg_at_k(relevance_scores, k=10):
    ideal_scores = sorted(relevance_scores, reverse=True)
    dcg = dcg_at_k(relevance_scores, k)
    idcg = dcg_at_k(ideal_scores, k)
    return dcg / idcg if idcg > 0 else 0

In [39]:
relevance_scores = [random.choices([0, 1], k=len(query)) for query in distorted_queries]
ndcg_scores = p_map(ndcg_at_k, relevance_scores)

# Average nDCG@10 
avg_ndcg_10 = np.mean(ndcg_scores)
print(f"Average nDCG@10 for distorted queries: {avg_ndcg_10}")


100%|██████████| 57638/57638 [00:40<00:00, 1440.24it/s]

Average nDCG@10 for distorted queries: 0.4999274433971518



