In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dis2023-project1-data/queries.jsonl
/kaggle/input/dis2023-project1-data/corpus.jsonl
/kaggle/input/tokenized-docs/tokenized_corpus.jsonl
/kaggle/input/dis-project-1-text-retrieval/sample_submission.csv
/kaggle/input/dis-project-1-text-retrieval/task2_test.tsv
/kaggle/input/dis-project-1-text-retrieval/task1_train.tsv
/kaggle/input/dis-project-1-text-retrieval/task1_test.tsv
/kaggle/input/dis-project-1-text-retrieval/task2_train.tsv
/kaggle/input/queries-task1-e/test_queries_t1_expanded_100.jsonl
/kaggle/input/queries-t2-e/test_queries_t2_expanded_100.jsonl


In [2]:
import pandas as pd
import json
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import string
from nltk.corpus import stopwords
import math
from operator import itemgetter
from collections import Counter
import multiprocessing


In [3]:
def load_jsonl_data(data_path: str, key_name: str, value_name: str):
    """Read a JSON Lines file and collect one field as ids and another as values.

    Args:
        data_path: Path of the JSONL file; every non-blank line must hold one
            JSON object.
        key_name: Name of the field used as the record identifier.
        value_name: Name of the field used as the record payload.

    Returns:
        A tuple ``(ids, values, mapping)``: the identifiers in file order, the
        payloads in file order, and a dict from identifier to payload. If an
        identifier occurs more than once, the dict keeps the last payload
        while both lists keep every occurrence.

    Raises:
        KeyError: If a record lacks ``key_name`` or ``value_name``.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    ids = []
    texts = []
    dict_ = {}
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's locale default encoding.
    with open(data_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if not line.strip():
                continue
            data = json.loads(line)
            key = data[key_name]
            value = data[value_name]
            ids.append(key)
            texts.append(value)
            dict_[key] = value
    return ids, texts, dict_

In [4]:
# Each call returns (ids in file order, values in file order, id -> value dict).
CORPUS_PATH = "/kaggle/input/dis2023-project1-data/corpus.jsonl"
QUERIES_PATH = "/kaggle/input/dis2023-project1-data/queries.jsonl"
TOKENIZED_CORPUS_PATH = "/kaggle/input/tokenized-docs/tokenized_corpus.jsonl"

# Raw document texts.
document_ids, documents, docs_dict = load_jsonl_data(CORPUS_PATH, "_id", "text")
# Raw query texts.
query_ids, queries, queries_dict = load_jsonl_data(QUERIES_PATH, "_id", "text")
# Token lists stored under the "tokens" field of the tokenized corpus file.
tokenized_document_ids, tokenized_documents, tok_docs_dict = load_jsonl_data(
    TOKENIZED_CORPUS_PATH, "_id", "tokens"
)

In [5]:
# Expanded test queries, keyed by "query_id" with their token lists under "tokens".
T1_EXPANDED_PATH = "/kaggle/input/queries-task1-e/test_queries_t1_expanded_100.jsonl"
T2_EXPANDED_PATH = "/kaggle/input/queries-t2-e/test_queries_t2_expanded_100.jsonl"

# Task 1: the id -> tokens dict is not needed, so it is discarded.
queries_t1_ids, queries_t1_tokens, _ = load_jsonl_data(T1_EXPANDED_PATH, "query_id", "tokens")
# Task 2: keep the dict for lookups by query id.
queries_t2_ids, queries_t2_tokens, t2_dict = load_jsonl_data(T2_EXPANDED_PATH, "query_id", "tokens")

In [6]:
import math
from six import iteritems
from six.moves import range
import numpy as np
import heapq
from collections.abc import Iterable
from collections import defaultdict, Counter



class bm25(object):
    """BM25-style ranker over a pre-tokenised corpus.

    ``fit`` precomputes, for every term, the score that term contributes to
    each document containing it (``document_score[term][doc_index]``), so a
    query is answered by summing precomputed contributions.

    The weight of a term with frequency ``tf`` in a text of length ``dl`` is
    ``idf * (tf * (k1 + 1) / (tf + k1 * (1 - b + b * dl / avgdl)) + 1)``.
    """

    def __init__(self, corpus_ids, corpus, k1=1.2, b=0.75, epsilon=0.25):
        """Store the corpus and hyper-parameters; no statistics are computed here.

        Args:
            corpus_ids: Identifiers, positionally aligned with ``corpus``.
            corpus: Indexable sequence of token lists, one per document.
            k1: Term-frequency saturation parameter.
            b: Length-normalisation parameter.
            epsilon: Multiplier applied to the average IDF to obtain the
                replacement value for terms whose raw IDF is negative.
        """
        self.k1 = k1
        self.b = b
        self.epsilon = epsilon
        self.corpus_size = 0
        self.avg_doc_length = 0
        self.doc_frequencies = []
        self.idf = {}
        self.doc_lengths = []
        self.corpus = corpus
        self.corpus_ids = corpus_ids

    def _term_weight(self, tf, length):
        """Return the IDF-free part of the score for a term seen ``tf`` times in a text of ``length`` tokens."""
        length_norm = 1 - self.b + self.b * length / self.avg_doc_length
        return tf * (self.k1 + 1) / (tf + self.k1 * length_norm) + 1

    def fit(self):
        """Compute corpus statistics, IDF values and the per-term score index.

        Calling ``fit`` again recomputes everything from scratch, so re-running
        the fitting cell does not double-count documents.

        Raises:
            ZeroDivisionError: If the corpus has no documents or no terms.
        """
        # Reset accumulated state; without this a second call would keep
        # incrementing corpus_size and appending to the per-document lists.
        self.corpus_size = 0
        self.doc_lengths = []
        self.doc_frequencies = []
        self.idf = {}

        term_to_freq = defaultdict(int)
        total_length = 0

        for document in self.corpus:
            self.corpus_size += 1
            doc_length = len(document)
            total_length += doc_length
            self.doc_lengths.append(doc_length)

            frequencies = Counter(document)
            self.doc_frequencies.append(frequencies)

            # Document frequency: each distinct term counts once per document.
            for term in frequencies:
                term_to_freq[term] += 1

        self.avg_doc_length = float(total_length) / self.corpus_size
        self.nd = term_to_freq

        idf_sum = 0
        negative_idfs = []
        for word, freq in term_to_freq.items():
            idf = math.log((self.corpus_size - freq + 0.5) / (freq + 0.5))
            self.idf[word] = idf
            idf_sum += idf
            if idf < 0:
                negative_idfs.append(word)

        self.average_idf = idf_sum / len(term_to_freq)
        # Terms with a negative raw IDF get epsilon * average IDF instead.
        eps = self.epsilon * self.average_idf
        self.idf.update({word: eps for word in negative_idfs})

        # Iterate over distinct terms only: repeated tokens would recompute
        # and rewrite the identical (term, document) entry.
        document_score = {}
        for i, doc_freqs in enumerate(self.doc_frequencies):
            doc_length = self.doc_lengths[i]
            for word, tf in doc_freqs.items():
                score = self.idf[word] * self._term_weight(tf, doc_length)
                document_score.setdefault(word, {})[i] = round(score, 2)
        self.document_score = document_score

    def compute_similarity(self, query, doc):
        """Score two token lists against each other with the fitted statistics.

        Note on argument roles: term frequencies and the length used for
        normalisation are taken from ``query``, while the summation runs over
        the tokens of ``doc``. The re-ranking cells of this notebook call this
        as ``compute_similarity(document_tokens, query_tokens)``.

        Args:
            query: Token list supplying term frequencies and length.
            doc: Token list whose tokens are summed over; a token repeated
                here contributes once per occurrence.

        Returns:
            The summed score; ``0`` when the two lists share no token.
        """
        score = 0
        doc_freqs = Counter(query)
        # IDF assumed for terms absent from the fitted vocabulary
        # (equivalent to a document frequency of 1).
        freq = 1
        default_idf = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)
        text_length = len(query)
        for word in doc:
            if word not in doc_freqs:
                continue
            score += self.idf.get(word, default_idf) * self._term_weight(doc_freqs[word], text_length)
        return score

    def get_top_k_documents(self, document, k=1):
        """Return the ``k`` best-scoring corpus documents for a token list.

        Args:
            document: Query tokens; a token repeated here adds its
                contribution once per occurrence.
            k: Maximum number of results.

        Returns:
            A list of ``(score, corpus_id, corpus_tokens)`` tuples ordered by
            decreasing score. Documents sharing no token with the query are
            never returned, so the list may hold fewer than ``k`` entries.
        """
        score_overall = {}
        for word in document:
            postings = self.document_score.get(word)
            if postings is None:
                continue
            for key, value in postings.items():
                score_overall[key] = score_overall.get(key, 0) + value

        k_keys_sorted = heapq.nlargest(k, score_overall, key=score_overall.get)
        return [(score_overall.get(item, None), self.corpus_ids[item], self.corpus[item]) for item in k_keys_sorted]

BM25L variant — this cell redefines the `bm25` class, so it replaces the implementation above.

In [32]:
import math
from six import iteritems
from six.moves import range
import numpy as np
import heapq
from collections.abc import Iterable
from collections import defaultdict, Counter



class bm25(object):
    """BM25L-style ranker over a pre-tokenised corpus.

    NOTE: this class shares the name ``bm25`` with the class defined in an
    earlier cell of this notebook; whichever definition cell ran last is the
    one that ``bm25(...)`` instantiates.

    With ``c = tf / (1 - b + b * dl / avgdl)`` the weight of a term is
    ``idf * ((k1 + 1) * (c + 0.5) / (k1 + (c + 0.5)))`` where
    ``idf = log((N + 1) / (df + 0.5))``.
    ``fit`` precomputes these weights per (term, document) pair in
    ``document_score[term][doc_index]``.
    """

    def __init__(self, corpus_ids, corpus, k1=1.2, b=0.75, epsilon=0.25):
        """Store the corpus and hyper-parameters; no statistics are computed here.

        Args:
            corpus_ids: Identifiers, positionally aligned with ``corpus``.
            corpus: Indexable sequence of token lists, one per document.
            k1: Term-frequency saturation parameter.
            b: Length-normalisation parameter.
            epsilon: Multiplier applied to the average IDF to obtain the
                replacement value for terms whose raw IDF is negative.
        """
        self.k1 = k1
        self.b = b
        self.epsilon = epsilon
        self.corpus_size = 0
        self.avg_doc_length = 0
        self.doc_frequencies = []
        self.idf = {}
        self.doc_lengths = []
        self.corpus = corpus
        self.corpus_ids = corpus_ids

    def _term_weight(self, tf, length):
        """Return the IDF-free part of the score for a term seen ``tf`` times in a text of ``length`` tokens."""
        c = tf / (1 - self.b + self.b * length / self.avg_doc_length)
        return (self.k1 + 1) * (c + 0.5) / (self.k1 + (c + 0.5))

    def fit(self):
        """Compute corpus statistics, IDF values and the per-term score index.

        Every call starts from a clean state, so running the fitting cell
        more than once yields the same model as running it once.

        Raises:
            ZeroDivisionError: If the corpus has no documents or no terms.
        """
        # Start from scratch: the counters and lists below are accumulated,
        # so stale values from a previous call must not survive.
        self.corpus_size = 0
        self.doc_lengths = []
        self.doc_frequencies = []
        self.idf = {}

        term_to_freq = defaultdict(int)
        total_length = 0

        for document in self.corpus:
            self.corpus_size += 1
            doc_length = len(document)
            total_length += doc_length
            self.doc_lengths.append(doc_length)

            frequencies = Counter(document)
            self.doc_frequencies.append(frequencies)

            # Document frequency: each distinct term counts once per document.
            for term in frequencies:
                term_to_freq[term] += 1

        self.avg_doc_length = float(total_length) / self.corpus_size
        self.nd = term_to_freq

        idf_sum = 0
        negative_idfs = []
        for word, freq in term_to_freq.items():
            idf = math.log((self.corpus_size + 1) / (freq + 0.5))
            self.idf[word] = idf
            idf_sum += idf
            if idf < 0:
                negative_idfs.append(word)

        self.average_idf = idf_sum / len(term_to_freq)
        # Any term with a negative raw IDF gets epsilon * average IDF instead.
        eps = self.epsilon * self.average_idf
        self.idf.update({word: eps for word in negative_idfs})

        # One pass per distinct term; repeated tokens would only rewrite the
        # same (term, document) entry with the same value.
        document_score = {}
        for i, doc_freqs in enumerate(self.doc_frequencies):
            doc_length = self.doc_lengths[i]
            for word, tf in doc_freqs.items():
                score = self.idf[word] * self._term_weight(tf, doc_length)
                document_score.setdefault(word, {})[i] = round(score, 2)
        self.document_score = document_score

    def compute_similarity(self, query, doc):
        """Score two token lists against each other with the fitted statistics.

        Note on argument roles: term frequencies and the length used for
        normalisation are taken from ``query``, while the summation runs over
        the tokens of ``doc``. The re-ranking cells of this notebook call this
        as ``compute_similarity(document_tokens, query_tokens)``.

        Args:
            query: Token list supplying term frequencies and length.
            doc: Token list whose tokens are summed over; a token repeated
                here contributes once per occurrence.

        Returns:
            The summed score; ``0`` when the two lists share no token.
        """
        score = 0
        doc_freqs = Counter(query)
        # IDF assumed for terms absent from the fitted vocabulary
        # (equivalent to a document frequency of 1).
        freq = 1
        default_idf = math.log(self.corpus_size + 1) - math.log(freq + 0.5)
        text_length = len(query)
        for word in doc:
            if word not in doc_freqs:
                continue
            score += self.idf.get(word, default_idf) * self._term_weight(doc_freqs[word], text_length)

        return score

    def get_top_k_documents(self, document, k=1):
        """Return the ``k`` best-scoring corpus documents for a token list.

        Args:
            document: Query tokens; a token repeated here adds its
                contribution once per occurrence.
            k: Maximum number of results.

        Returns:
            A list of ``(score, corpus_id, corpus_tokens)`` tuples ordered by
            decreasing score. Documents sharing no token with the query are
            never returned, so the list may hold fewer than ``k`` entries.
        """
        score_overall = {}
        for word in document:
            postings = self.document_score.get(word)
            if postings is None:
                continue
            for key, value in postings.items():
                score_overall[key] = score_overall.get(key, 0) + value

        k_keys_sorted = heapq.nlargest(k, score_overall, key=score_overall.get)
        return [(score_overall.get(item, None), self.corpus_ids[item], self.corpus[item]) for item in k_keys_sorted]

In [15]:
import pandas as pd
import numpy as np

import json
import random
import string
import Levenshtein

import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

from tqdm import tqdm

# Fetch the WordNet corpus that Expander.get_synonym queries through nltk.corpus.wordnet.
nltk.download("wordnet")

# Seed Python's global RNG, which Expander.expand draws from via random.random().
random.seed(0)


class Expander:
    """Query expander that appends WordNet synonyms to a list of words."""

    def __init__(self, add_synonym_prob=0.5, levenshtein_th = 2):
        """Configure the expander.

        Args:
            add_synonym_prob: Probability of considering a synonym for each word.
            levenshtein_th: A synonym is added only when its edit distance to
                the original word is strictly greater than this threshold.
        """
        self.add_synonym_prob = add_synonym_prob
        self.levenshtein_th = levenshtein_th

    def expand(self, word_list):
        """Return ``word_list`` followed by the synonyms selected for its words.

        For each word one synonym is looked up; it is appended (lower-cased)
        when a random draw falls below ``add_synonym_prob`` and the synonym is
        more than ``levenshtein_th`` edits away from the word. Each addition
        is reported with ``print``. The input list is not modified.
        """
        expanded_word_list = []
        for word in word_list:
            synonym = self.get_synonym(word)
            synonym_lower = synonym.lower()
            # The random draw comes first so the RNG is consumed exactly once
            # per word regardless of the distance check.
            if (
                random.random() < self.add_synonym_prob
                and Levenshtein.distance(synonym_lower, word) > self.levenshtein_th
            ):
                expanded_word_list.append(synonym_lower)
                print("Synonym added: {} -> {}".format(word, synonym))

        return word_list + expanded_word_list

    def get_synonym(self, word):
        """Return one WordNet synonym of ``word``, or ``word`` itself if none is found.

        Only the first purely alphabetic lemma of each synset is a candidate.
        The first candidate (in the order WordNet returns the synsets) that
        differs from ``word`` is returned. Selecting by position rather than
        taking an arbitrary element of a set keeps the result identical from
        one interpreter session to the next.
        """
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                name = lemma.name()
                if name.isalpha():
                    if name != word:
                        return name
                    # This synset's candidate is the word itself; try the next synset.
                    break

        return word


[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [17]:
import pandas as pd
import numpy as np
import json
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

from tqdm import tqdm

# Stop-word lists read by Preprocessor.__init__ through stopwords.words("english").
nltk.download("stopwords")
# NOTE(review): presumably the tokenizer data that word_tokenize relies on — confirm against the NLTK docs.
nltk.download("punkt")

class Preprocessor:
    """Text pipeline: lower-case, strip punctuation, tokenise, drop stop words, stem.

    Queries can optionally be expanded (between stop-word removal and
    stemming) by an object exposing ``expand(list_of_tokens)``.
    """

    def __init__(self, expander=None):
        """Set up the stemmer, the English stop-word set and the punctuation set.

        Args:
            expander: Optional object with an ``expand(tokens)`` method; when
                ``None``, query expansion is a no-op.
        """
        self.stemmer = PorterStemmer()
        self.stopwords = set(stopwords.words("english"))
        self.punctuation = set(string.punctuation)
        self.expander = expander

    def preprocess(self, documents):
        """Preprocess a collection of raw texts.

        Args:
            documents: Either a list of texts or a dict mapping ids to texts.

        Returns:
            A list of token lists for list input, or a dict from id to token
            list for dict input.

        Raises:
            TypeError: If ``documents`` is neither a list nor a dict.
        """
        if isinstance(documents, list):
            return self.preprocess_document_list(documents)
        if isinstance(documents, dict):
            return self.preprocess_document_dict(documents)
        raise TypeError("Documents must be either a list or a dictionary")

    def _clean_tokens(self, text):
        """Lower-case ``text``, remove punctuation, tokenise and drop stop words (no stemming)."""
        text = self.remove_punctuation(self.tolowercase(text))
        return self.remove_stopwords(self.tokenize(text))

    def preprocess_query(self, query, expand=False):
        """Preprocess one query string, optionally expanding it before stemming."""
        query_tokens = self._clean_tokens(query)

        if expand:
            query_tokens = self.expand(query_tokens)

        return self.stem(query_tokens)

    def preprocess_document_list(self, document_list):
        """Preprocess each text of a list, showing a tqdm progress bar."""
        return [self.preprocess_doc(document) for document in tqdm(document_list)]

    def preprocess_document_dict(self, document_dict):
        """Preprocess each value of a dict, keeping its keys, with a tqdm progress bar."""
        return {
            doc_id: self.preprocess_doc(document_dict[doc_id])
            for doc_id in tqdm(document_dict.keys())
        }

    def preprocess_doc(self, document):
        """Preprocess a single text and return its stemmed tokens (no progress bar)."""
        return self.stem(self._clean_tokens(document))

    def tolowercase(self, document):
        """Return ``document`` in lower case."""
        return document.lower()

    def remove_punctuation(self, document):
        """Return ``document`` without the characters of ``string.punctuation``."""
        return "".join([char for char in document if char not in self.punctuation])

    def tokenize(self, document):
        """Split ``document`` into tokens with NLTK's ``word_tokenize``."""
        return word_tokenize(document)

    def remove_stopwords(self, tokens):
        """Return ``tokens`` without those present in the stop-word set."""
        return [token for token in tokens if token not in self.stopwords]

    def stem(self, tokens):
        """Return the stem of every token."""
        return [self.stemmer.stem(token) for token in tokens]

    def save_docs(self, docs, path):
        """Write a dict of id -> tokens to ``path`` as JSON Lines with ``_id``/``tokens`` fields."""
        # Explicit UTF-8 keeps the file readable regardless of the locale it was written under.
        with open(path, 'w', encoding='utf-8') as jsonl_file:
            for docID in docs:
                doc_data = {"_id": str(docID), "tokens": docs[docID]}
                json_line = json.dumps(doc_data)
                jsonl_file.write(json_line + '\n')

    def load_docs(self, path):
        """Read a JSON Lines file with ``_id``/``tokens`` fields into a dict of id -> tokens."""
        raw_queries = {}
        with open(path, "r", encoding="utf-8") as file:
            for line in file:
                data = json.loads(line)
                raw_queries[data["_id"]] = data["tokens"]

        return raw_queries

    def expand(self, terms):
        """Return ``terms`` expanded by the configured expander, or unchanged when there is none."""
        # Without this guard, expand=True on a Preprocessor built with the
        # default expander=None fails with an AttributeError on None.
        if self.expander is None:
            return terms
        return self.expander.expand(terms)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
# Ensure the WordNet corpus is present (the Expander cell issues the same download).
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [33]:
import json
import sys
import random
import gc

# Seed both Python's and NumPy's global random number generators.
random.seed(0)
np.random.seed(0)

# Text pipeline with the default settings (no expander is passed).
preprocessor = Preprocessor()

In [33]:
# Build the ranker over the pre-tokenised corpus. `bm25` is defined in two cells of this
# notebook, so this uses whichever definition cell was executed most recently.
model = bm25(tokenized_document_ids, tokenized_documents)

In [34]:
# Compute the corpus statistics and the per-term score index read by get_top_k_documents.
model.fit()

In [36]:
import time

query = "what is brewhaha"
# preprocess_doc is the single-text pipeline that preprocess() applies to each list item;
# calling it directly yields the same tokens without a one-item tqdm progress bar.
tokenized_query = preprocessor.preprocess_doc(query)
if len(tokenized_query) <= 2:
    # Too few tokens survived preprocessing: fall back to the raw whitespace tokens
    # (unstemmed, stop words included).
    tokenized_query = query.split()
print(tokenized_query)
# Alternative pre-tokenised query for experiments:
#tokenized_query=["consid", "father", "modern", "medicin", "advanc"]

# perf_counter is the clock intended for measuring short intervals.
start_time = time.perf_counter()
result = model.get_top_k_documents(tokenized_query, k=10)
end_time = time.perf_counter()

elapsed_time = end_time - start_time

print(f"Time taken: {elapsed_time:.6f} seconds")
for r in result:
    print(r)
    print()


100%|██████████| 1/1 [00:00<00:00, 2737.80it/s]

['what', 'is', 'brewhaha']
Time taken: 0.002791 seconds
(22.92, '7704833', ['brewhaha', 'â\x80\x93', 'celebr', 'beer', 'year', 'brewhaha', 'saturday', 'march', '5', 'two', 'session', '14pm', '69pm', 'brewhaha', 'celebr', 'beer', 'itâ\x80\x99', 'opportun', 'tast', 'sampl', 'best', 'brewer', 'chicagoland', 'area'])

(17.87, '2370932', ['follow', 'reason', 'studi', 'human', 'particular', 'focu', 'art', 'reason', 'balanc', 'util', 'idealist', 'qualiti', 'life', 'issu', 'thu', 'want', 'stress', 'is', 'does', 'human', 'fact', 'version', 'horatian', 'credo', 'delight', 'instruct'])

(17.34, '7704831', ['brewhaha', 'beer', 'lover', 'level', 'whether', 'consid', 'connoisseur', 'novic', 'event', 'educ', 'expand', 'beer', 'knowledg', 'tast', 'palat', 'feel', 'best', 'beer', 'one', 'pop', 'star', 'commerci', 'smallest', 'amount', 'calori'])

(13.28, '3058085', ['what', 'what', 'what', 'what', 'favourit', 'what', 'what', 'what', 'what', 'favourit', 'what', 'what', 'what', 'favorit', 'what', 'what',




In [15]:
# One row per expanded task-1 query: (row id, ids of the 10 best documents, -1).
# NOTE(review): the row id is the query's position in queries_t1_tokens; queries_t1_ids is
# loaded but never used — confirm positions match the `id` column expected in the submission.
# Using a comprehension avoids rebinding the builtin `id` at notebook scope.
data = [
    (
        query_position,
        [int(hit[1]) for hit in model.get_top_k_documents(tokens, k=10)],
        -1,
    )
    for query_position, tokens in enumerate(queries_t1_tokens)
]

In [20]:
# Tab-separated list of task-1 test rows.
TASK1_TEST_PATH = '/kaggle/input/dis-project-1-text-retrieval/task1_test.tsv'
t1_test_queries_df = pd.read_csv(TASK1_TEST_PATH, delimiter='\t')

In [43]:
preprocessor = Preprocessor()

# One row per task-1 test entry: (row id, ids of the 10 best documents, -1).
# Iterating the two needed columns directly avoids building a Series per row and
# avoids rebinding the builtin `id` at notebook scope.
data = []
for row_id, query_id in zip(t1_test_queries_df['id'], t1_test_queries_df['query-id']):
    query = queries_dict[str(query_id)]
    # preprocess_doc gives the same tokens as preprocess([query])[0] but does not
    # print a tqdm progress bar for every single query.
    tokenized_query = preprocessor.preprocess_doc(query)
    # (A fallback to stemmed whitespace tokens for queries of <= 2 tokens was tried
    # here and is currently disabled.)
    result = model.get_top_k_documents(tokenized_query, k=10)
    corpus_ids = [int(hit[1]) for hit in result]
    data.append((row_id, corpus_ids, -1))

100%|██████████| 1/1 [00:00<00:00, 1321.46it/s]
100%|██████████| 1/1 [00:00<00:00, 2700.78it/s]
100%|██████████| 1/1 [00:00<00:00, 2291.97it/s]
100%|██████████| 1/1 [00:00<00:00, 1448.31it/s]
100%|██████████| 1/1 [00:00<00:00, 1296.54it/s]
100%|██████████| 1/1 [00:00<00:00, 1544.86it/s]
100%|██████████| 1/1 [00:00<00:00, 1449.31it/s]
100%|██████████| 1/1 [00:00<00:00, 1620.67it/s]
100%|██████████| 1/1 [00:00<00:00, 2103.46it/s]
100%|██████████| 1/1 [00:00<00:00, 1394.85it/s]
100%|██████████| 1/1 [00:00<00:00, 1805.55it/s]
100%|██████████| 1/1 [00:00<00:00, 1970.08it/s]
100%|██████████| 1/1 [00:00<00:00, 1061.04it/s]
100%|██████████| 1/1 [00:00<00:00, 1724.63it/s]
100%|██████████| 1/1 [00:00<00:00, 2071.26it/s]
100%|██████████| 1/1 [00:00<00:00, 2047.00it/s]
100%|██████████| 1/1 [00:00<00:00, 1565.62it/s]
100%|██████████| 1/1 [00:00<00:00, 2380.42it/s]
100%|██████████| 1/1 [00:00<00:00, 2255.00it/s]
100%|██████████| 1/1 [00:00<00:00, 1142.55it/s]
100%|██████████| 1/1 [00:00<00:00, 2242.

In [44]:
import csv
csv_file = "output_task1.csv"

# Header first, then one line per entry of `data`.
submission_rows = [["id", "corpus-id", "score"], *data]
with open(csv_file, mode="w", newline="") as file:
    csv.writer(file).writerows(submission_rows)

print(f"CSV file '{csv_file}' has been created.")

CSV file 'output_task1.csv' has been created.


Task 2

In [24]:
# Tab-separated list of task-2 test rows.
TASK2_TEST_PATH = '/kaggle/input/dis-project-1-text-retrieval/task2_test.tsv'
t2_test_queries_df = pd.read_csv(TASK2_TEST_PATH, delimiter='\t')

In [None]:
# One row per expanded task-2 query: (row id, ids of the 10 best documents, -1).
# NOTE(review): the row id is the query's position in queries_t2_tokens, not a value from
# queries_t2_ids — confirm this is the id the submission expects.
# Using a comprehension avoids rebinding the builtin `id` at notebook scope.
data = [
    (
        query_position,
        [int(hit[1]) for hit in model.get_top_k_documents(tokens, k=10)],
        -1,
    )
    for query_position, tokens in enumerate(queries_t2_tokens)
]

In [25]:
# One row per task-2 test entry: (row id, -1, scores of the listed candidate documents),
# using the expanded query tokens looked up in t2_dict.
# Iterating the needed columns directly avoids building a Series per row and
# avoids rebinding the builtin `id` at notebook scope.
data = []
for row_id, query_id, corpus_id_field in zip(
    t2_test_queries_df['id'],
    t2_test_queries_df['query-id'],
    t2_test_queries_df['corpus-id'],
):
    tokenized_query = t2_dict[int(query_id)]
    # The candidate list is stored as text: drop the first and last character,
    # then split on ', '.
    candidate_ids = corpus_id_field[1:-1].split(', ')
    # compute_similarity is called with the document tokens first and the query tokens second.
    scores = [
        model.compute_similarity(tok_docs_dict[str(candidate_id)], tokenized_query)
        for candidate_id in candidate_ids
    ]
    data.append((row_id, -1, scores))

In [44]:
# One row per task-2 test entry: (row id, -1, scores of the listed candidate documents),
# using the original query text run through the preprocessor.
# Iterating the needed columns directly avoids building a Series per row and
# avoids rebinding the builtin `id` at notebook scope.
data = []
for row_id, query_id, corpus_id_field in zip(
    t2_test_queries_df['id'],
    t2_test_queries_df['query-id'],
    t2_test_queries_df['corpus-id'],
):
    query = queries_dict[str(query_id)]
    # preprocess_doc gives the same tokens as preprocess([query])[0] but does not
    # print a tqdm progress bar for every row.
    tokenized_query = preprocessor.preprocess_doc(query)
    # The candidate list is stored as text: drop the first and last character,
    # then split on ', '.
    candidate_ids = corpus_id_field[1:-1].split(', ')
    # compute_similarity is called with the document tokens first and the query tokens second.
    scores = [
        model.compute_similarity(tok_docs_dict[str(candidate_id)], tokenized_query)
        for candidate_id in candidate_ids
    ]
    data.append((row_id, -1, scores))

100%|██████████| 1/1 [00:00<00:00, 1529.65it/s]
100%|██████████| 1/1 [00:00<00:00, 1084.92it/s]
100%|██████████| 1/1 [00:00<00:00, 2166.48it/s]
100%|██████████| 1/1 [00:00<00:00, 1940.91it/s]
100%|██████████| 1/1 [00:00<00:00, 1764.54it/s]
100%|██████████| 1/1 [00:00<00:00, 2353.71it/s]
100%|██████████| 1/1 [00:00<00:00, 1484.71it/s]
100%|██████████| 1/1 [00:00<00:00, 1557.48it/s]
100%|██████████| 1/1 [00:00<00:00, 1632.66it/s]
100%|██████████| 1/1 [00:00<00:00, 1864.96it/s]
100%|██████████| 1/1 [00:00<00:00, 2215.69it/s]
100%|██████████| 1/1 [00:00<00:00, 1741.10it/s]
100%|██████████| 1/1 [00:00<00:00, 2175.47it/s]
100%|██████████| 1/1 [00:00<00:00, 3024.01it/s]
100%|██████████| 1/1 [00:00<00:00, 1852.61it/s]
100%|██████████| 1/1 [00:00<00:00, 1509.29it/s]
100%|██████████| 1/1 [00:00<00:00, 1808.67it/s]
100%|██████████| 1/1 [00:00<00:00, 2570.04it/s]
100%|██████████| 1/1 [00:00<00:00, 1860.00it/s]
100%|██████████| 1/1 [00:00<00:00, 1151.02it/s]
100%|██████████| 1/1 [00:00<00:00, 1985.

In [26]:
import csv
csv_file = "output2.csv"

# Header first, then one line per entry of `data`.
submission_rows = [["id", "corpus-id", "score"], *data]
with open(csv_file, mode="w", newline="") as file:
    csv.writer(file).writerows(submission_rows)

print(f"CSV file '{csv_file}' has been created.")

CSV file 'output2.csv' has been created.


In [6]:
from gensim.models import Word2Vec

# Toy Word2Vec example: train on two sentences, save, reload, and query neighbours.
sentences = [["this", "is", "a", "sample", "sentence"], ["another", "example", "sentence"]]
# Use a dedicated name: `model` is the fitted BM25 ranker that the retrieval cells rely on,
# and rebinding it here would break those cells when they are re-run.
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=0)
w2v_model.save("my_word2vec_model.model")
w2v_model = Word2Vec.load("my_word2vec_model.model")
similar_words = w2v_model.wv.most_similar("sentence", topn=5)

In [7]:
# Display the (word, similarity) pairs returned by most_similar for "sentence".
similar_words

[('this', 0.016134681180119514),
 ('example', -0.01083916611969471),
 ('a', -0.02775035798549652),
 ('another', -0.05234673246741295),
 ('is', -0.059876296669244766)]