In [1]:
import pandas as pd
import json
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import string
from nltk.corpus import stopwords
import math
from operator import itemgetter
from collections import Counter
import numpy as np
import concurrent.futures

import multiprocessing


stemmer = PorterStemmer()


In [2]:
def load_jsonl_data_dict(data_path: str):
    raw_dict = {}
    with open(data_path, 'r') as file:
        for line in file:
            data = json.loads(line)
            raw_dict[data['_id']] = data['text']
    return raw_dict

In [3]:
raw_corpus = load_jsonl_data_dict("data/dataset/corpus.jsonl")
raw_queries = load_jsonl_data_dict("data/dataset/queries.jsonl")

In [4]:
stopwords = set(['for', 'a', 'of', 'the', 'and', 'to', 'in'])

In [None]:
corpus = raw_corpus.values()

texts = [
    [word for word in document.lower().split() if word not in stopwords]
    for document in corpus
]

word_count_dict = {}
for text in texts:
    for token in text:
        word_count = word_count_dict.get(token, 0) + 1
        word_count_dict[token] = word_count

texts = [[token for token in text if word_count_dict[token] > 1] for text in texts]
texts

In [None]:
def score_document(query, index, bm25_instance):
    return bm25_instance._score(query, index)

In [None]:
class BM25:
    def __init__(self, k1=1.5, b=0.75):
        self.b = b
        self.k1 = k1

    def fit(self, corpus):
        tf = []
        df = {}
        idf = {}
        doc_len = []
        corpus_size = 0
        for document in corpus:
            corpus_size += 1
            doc_len.append(len(document))

            # compute tf (term frequency) per document
            frequencies = {}
            for term in document:
                term_count = frequencies.get(term, 0) + 1
                frequencies[term] = term_count

            tf.append(frequencies)

            # compute df (document frequency) per term
            for term, _ in frequencies.items():
                df_count = df.get(term, 0) + 1
                df[term] = df_count

        for term, freq in df.items():
            idf[term] = math.log(1 + (corpus_size - freq + 0.5) / (freq + 0.5))

        self.tf_ = tf
        self.df_ = df
        self.idf_ = idf
        self.doc_len_ = doc_len
        self.corpus_ = corpus
        self.corpus_size_ = corpus_size
        self.avg_doc_len_ = sum(doc_len) / corpus_size
        return self

    def search(self, query):
        scores = np.array([self._score(query, index) for index in range(self.corpus_size_)])
        return scores

    def _score(self, query, index):
        score = 0.0

        doc_len = self.doc_len_[index]
        frequencies = self.tf_[index]
        for term in query:
            if term not in frequencies:
                continue

            freq = frequencies[term]
            numerator = self.idf_[term] * freq * (self.k1 + 1)
            denominator = freq + self.k1 * (1 - self.b + self.b * doc_len / self.avg_doc_len_)
            score += (numerator / denominator)

        return score


    def search_fast(self, query):
        with multiprocessing.Pool() as pool:
            scores = pool.starmap(score_document, [(query, index, self) for index in range(self.corpus_size_)])
        return scores


In [None]:
corpus_ids = list(raw_corpus.keys())

In [None]:
bm25 = BM25()
bm25.fit(texts)

In [None]:
import time

start_time = time.time()
query = 'what are rhetorical topics'
query = [word for word in query.lower().split() if word not in stopwords]

scores = bm25.search(query)

top_scores_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:10]

for i in top_scores_indices:
    score = round(scores[i], 3)
    print(str(score) + '\t' + list(corpus)[i])
    print(corpus_ids[i])
    print()

elapsed_time = time.time() - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

In [None]:
test_queries_df = pd.read_csv('data/task1_test.tsv', delimiter='\t')
test_queries_df.head()

In [13]:
res = {}
query_ids = list(test_queries_df['query-id'])

start_time = time.time()

for query_id in query_ids[:100]:
    query = raw_queries[str(query_id)]
    query = [word for word in query.lower().split() if word not in stopwords]
    scores = bm25.search(query)

    top_scores_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:10]

    res[query_id] = [corpus_ids[id] for id in top_scores_indices]

elapsed_time = time.time() - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

Time taken: 225.71 seconds


In [16]:
res

{300674: ['2495755',
  '7067034',
  '1266300',
  '7067032',
  '4107182',
  '4501973',
  '4501978',
  '4665897',
  '3538966',
  '4190260'],
 125705: ['752079',
  '8257656',
  '8102334',
  '4729700',
  '4913387',
  '4742931',
  '7887415',
  '1321604',
  '8110420',
  '4946343'],
 94798: ['1263956',
  '5593857',
  '7067180',
  '7106962',
  '2747827',
  '7361019',
  '1440563',
  '7106959',
  '729925',
  '1263957'],
 9083: ['7277875',
  '1176428',
  '1195004',
  '7067274',
  '3485264',
  '3935658',
  '881408',
  '3023447',
  '4120264',
  '5110891'],
 174249: ['1819312',
  '756490',
  '6875071',
  '138586',
  '5804537',
  '8040506',
  '711200',
  '5163089',
  '5452822',
  '5551912'],
 320792: ['1801860',
  '1959158',
  '6175036',
  '2794135',
  '4719403',
  '6444012',
  '5638820',
  '6150530',
  '4066424',
  '507999'],
 1090270: ['7067796',
  '3022772',
  '5933622',
  '3022768',
  '3930922',
  '3192174',
  '4990154',
  '2985362',
  '4194433',
  '6183393'],
 1101279: ['3692936',
  '5190801',
 

## Using cosine similarity

In [31]:
documents = texts

In [8]:
import numpy as np

def cosine_similarity(v1, v2):
    dot_product = np.dot(v1, v2)
    norm_v1 = np.linalg.norm(v1)
    norm_v2 = np.linalg.norm(v2)

    # Guard against divide-by-zero errors
    if norm_v1 * norm_v2 == 0:
        return 0

    return dot_product / (norm_v1 * norm_v2)

In [10]:
# BM25 parameters
k1 = 1.5
b = 0.75

In [35]:
# create the vocabulary
vocabulary = list(set([item for sublist in documents for item in sublist]))
vocabulary.sort()

In [45]:
class BM25_2:
    def __init__(self, k1=1.5, b=0.75):
        self.b = b
        self.k1 = k1

    def fit(self, corpus):
        tf = []
        df = {}
        idf = {}
        doc_len = []
        corpus_size = 0
        for document in corpus:
            corpus_size += 1
            doc_len.append(len(document))

            # compute tf (term frequency) per document
            frequencies = {}
            for term in document:
                term_count = frequencies.get(term, 0) + 1
                frequencies[term] = term_count

            tf.append(frequencies)

            # compute df (document frequency) per term
            for term, _ in frequencies.items():
                df_count = df.get(term, 0) + 1
                df[term] = df_count

        for term, freq in df.items():
            idf[term] = math.log(1 + (corpus_size - freq + 0.5) / (freq + 0.5))

        self.tf_ = tf
        self.df_ = df
        self.idf_ = idf
        self.doc_len_ = doc_len
        self.corpus_ = corpus
        self.corpus_size_ = corpus_size
        self.avg_doc_len_ = sum(doc_len) / corpus_size
        return self

    def search(self, query):
        scores = np.array([self._score(query, index) for index in range(self.corpus_size_)])
        return scores

    def _score(self, query, index):
        score = 0.0

        doc_len = self.doc_len_[index]
        frequencies = self.tf_[index]
        for term in query:
            if term not in frequencies:
                continue

            freq = frequencies[term]
            numerator = self.idf_[term] * freq * (self.k1 + 1)
            denominator = freq + self.k1 * (1 - self.b + self.b * doc_len / self.avg_doc_len_)
            score += (numerator / denominator)

        return score

    def _score_term(self, term, index):
        doc_len = self.doc_len_[index]
        frequencies = self.tf_[index]

        if term not in frequencies:
            return 0.0

        freq = frequencies[term]
        numerator = self.idf_[term] * freq * (self.k1 + 1)
        denominator = freq + self.k1 * (1 - self.b + self.b * doc_len / self.avg_doc_len_)
        score = (numerator / denominator)

        return score

In [46]:
bm25_2 = BM25_2()
bm25_2.fit(texts)

<__main__.BM25_2 at 0x7fd63ea6f4c0>

In [48]:
# Function to generate the BM25 vector for a document
def bm25_vectorize(document, vocabulary, idf, avg_dl, i):
    return [bm25_2._score_term(term, i) for term in vocabulary]

# Compute BM25 vectors
document_bm25_vectors = [bm25_vectorize(d, vocabulary, bm25_2.idf_, bm25_2.avg_doc_len_, i) for i ,d in enumerate(documents)]


KeyboardInterrupt: 

In [35]:
# Adjusting the search function for BM25
def bm25_search_vec(query, topk=10):
    q = query.split()
    q = [stemmer.stem(w) for w in q]
    query_vector = bm25_vectorize(q, vocabulary, idf, avg_dl)  # Use the BM25 vector function
    scores = [[cosine_similarity(query_vector, document_bm25_vectors[d]), d] for d in range(len(documents))]
    scores.sort(key=lambda x: -x[0])
    doc_ids = []
    for i in range(topk):
        doc_ids.append(scores[i][1])
    return doc_ids


In [36]:
tf_doc_ids = bm25_search_vec('can you use a calculator on the compass test')
for i, v in enumerate(tf_doc_ids):
    print(documents[v])

TypeError: 'dict_values' object is not subscriptable

In [38]:
for i, v in enumerate(tf_doc_ids):
    print(list(documents)[v])
    print("-------")

in italian the name oria means golden other origins for the name oria include italian dominican the name oria is most often used as a girl name or female name italian name meaning golden
-------
english to spanish translation spanish to english translation french to english translation english to german translation
-------
it stands for intermediate computer science and. incident command system. it stands for intermediate computer science and. incident command system.
-------
icmp stands for internet control message protocol and is one of many utilities that are part of the internet protocol suite
-------
a meta analysis is a statistical analysis that combines the results of multiple scientific studies
-------
that human touch two years ago the commission on dental accreditation coda sent a message to dental schools around the country don t forget the human element creating a culture of respect has emerged as a national priority in dental education
-------
inpatient coding mantra for s