In [1]:
import pandas as pd
import json
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import string
from nltk.corpus import stopwords
import math
from operator import itemgetter
from collections import Counter
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /Users/hind/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
def load_jsonl_data_dict(data_path: str) -> dict:
    """Load a JSON Lines file into a ``{_id: text}`` dictionary.

    Each non-blank line of the file must be a JSON object with at least the
    keys ``_id`` and ``text``. If the same ``_id`` occurs more than once, the
    last occurrence wins.

    Args:
        data_path: Path to the ``.jsonl`` file.

    Returns:
        A dict mapping each record's ``_id`` to its ``text``, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``_id`` or ``text``.
    """
    raw_dict = {}
    # Explicit UTF-8: the default encoding of open() is platform-dependent and
    # the data contains non-ASCII characters.
    with open(data_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which json.loads would otherwise reject.
            if not line.strip():
                continue
            data = json.loads(line)
            raw_dict[data['_id']] = data['text']
    return raw_dict

def get_vocab(documents):
    """Return the sorted list of distinct items found across ``documents``.

    ``documents`` is an iterable of iterables; every inner item is collected
    once. Note that a ``str`` is itself an iterable of characters, so passing
    raw strings yields a character vocabulary rather than a word vocabulary.
    """
    unique_items = set()
    for document in documents:
        unique_items.update(document)
    return sorted(unique_items)

In [5]:
# Load the corpus and the queries from their JSONL files into {_id: text}
# dicts (see load_jsonl_data_dict) and report how many entries each one holds.
raw_documents = load_jsonl_data_dict("data/dataset/corpus.jsonl")
print("Number of documents: {}".format(len(raw_documents)))

raw_queries = load_jsonl_data_dict("data/dataset/queries.jsonl")
print("Number of queries:   {}".format(len(raw_queries)))

Number of documents: 1471406
Number of queries:   509962


In [29]:
# The values passed here are raw query strings, and get_vocab iterates each
# element item by item, so the result is the sorted set of *characters* used
# in the queries, not a word vocabulary.
get_vocab(list(raw_queries.values()))

[' ',
 '!',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '{',
 '|',
 '}',
 '~',
 '\xa0',
 '£',
 '§',
 '®',
 '°',
 '²',
 'µ',
 'Â',
 'É',
 'ß',
 'á',
 'ã',
 'ä',
 'å',
 'ç',
 'è',
 'é',
 'ê',
 'ë',
 'í',
 'î',
 'ñ',
 'ó',
 'ö',
 'ø',
 'ú',
 'ü',
 'ă',
 'ō',
 'β',
 'ت',
 'ع',
 'م',
 'ن',
 '\u200b',
 '‘',
 '’',
 '“',
 '”',
 '•',
 '◦',
 '\ufeff']

In [35]:
# Create a TfidfVectorizer object: word-level unigrams, English stop words
# removed, and min_df = 1 (no minimum document-frequency cut-off).
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,1), min_df = 1, stop_words = 'english')

# Fit and transform the documents to compute TF-IDF scores.
# The rows of tfidf_matrix follow the iteration order of raw_documents, which
# later cells rely on to map a row index back to a document id.
tfidf_matrix = vectorizer.fit_transform(raw_documents.values())

In [36]:
from sklearn.metrics.pairwise import cosine_similarity

query = ")what was the immediate impact of the success of the manhattan project?"

# Vectorize the query using the same TF-IDF vectorizer
query_vector = vectorizer.transform([query])

# Similarity of the query against every document, then document positions
# ordered from most to least similar.
cosine_similarities = cosine_similarity(query_vector, tfidf_matrix)
document_indices_sorted_by_similarity = cosine_similarities[0].argsort()[::-1]

# Get the corresponding similarity scores
similarity_scores_sorted = cosine_similarities[0][document_indices_sorted_by_similarity]
num_top_documents = 10

# Retrieve the top N documents and their similarity scores.
# The id list is materialized once: rebuilding list(raw_documents.keys()) /
# list(raw_documents.values()) for every hit copies the whole corpus index
# each time. Row i of tfidf_matrix corresponds to the i-th key of
# raw_documents, so a positional lookup gives the id and the dict gives the text.
document_ids = list(raw_documents.keys())
top_documents_ids = [document_ids[i] for i in document_indices_sorted_by_similarity[:num_top_documents]]
top_documents = [raw_documents[doc_id] for doc_id in top_documents_ids]

top_similarity_scores = similarity_scores_sorted[:num_top_documents]

# Present the results
for i, (document, similarity, id_) in enumerate(zip(top_documents, top_similarity_scores, top_documents_ids), start=1):
    print(f"Rank {i}: Similarity Score = {similarity:.4f}")
    print(document)
    print(id_)
    print()

Rank 1: Similarity Score = 0.4853
Manhattan Project. 1  The Manhattan Project was a secret military project created in 1942 to produce the first US nuclear weapon. Fears that Nazi Germany would build and use a nuclear weapon during World War II triggered the start of the Manhattan Project, which was originally based in Manhattan, New York.
3607205

Rank 2: Similarity Score = 0.4781
The project was given its name due to the fact that at least 10 of the sites used for the research were located in Manhattan. Following is a timeline of the key events related to the development of the atomic bomb and the Manhattan Project. Manhattan Project Timeline
7243450

Rank 3: Similarity Score = 0.4629
Manhattan Project. The Manhattan Project was a secret military project created in 1942 to produce the first US nuclear weapon. Fears that Nazi Germany would build and use a nuclear weapon during World War II triggered the start of the Manhattan Project, which was originally based in Manhattan, New York.

In [37]:
# Read the task-1 training file; it is tab-separated.
train_df = pd.read_csv('data/task1_train.tsv', sep='\t')

# Display the first few rows of the DataFrame
print(train_df.head())

   query-id  corpus-id  score
0   1185869          0      1
1   1185868         16      1
2    597651         49      1
3    403613         60      1
4   1183785        389      1


In [38]:
# One entry per row of train_df, in row order, so a query id that occurs in
# several rows appears several times in this list.
query_ids = list(train_df['query-id'])
query_ids

[1185869,
 1185868,
 597651,
 403613,
 1183785,
 312651,
 80385,
 645590,
 645337,
 186154,
 457407,
 441383,
 683408,
 1164799,
 484187,
 460668,
 666321,
 182487,
 564233,
 455279,
 208108,
 733739,
 1164798,
 402608,
 443797,
 662502,
 1184679,
 14562,
 602162,
 545059,
 708236,
 310130,
 693161,
 186617,
 573027,
 1173772,
 541973,
 273090,
 441269,
 642237,
 503515,
 637443,
 1164796,
 749988,
 749988,
 135841,
 295446,
 653051,
 691147,
 410621,
 410621,
 1164795,
 598443,
 596451,
 651441,
 452286,
 308543,
 202126,
 114820,
 501778,
 531029,
 651110,
 594127,
 1164794,
 396032,
 705580,
 658203,
 387734,
 655102,
 224712,
 411732,
 1164793,
 605902,
 581014,
 559240,
 608711,
 535936,
 130335,
 147535,
 1164792,
 595576,
 569308,
 753706,
 627871,
 673608,
 510071,
 113839,
 1164791,
 460953,
 685235,
 650643,
 1183784,
 1164790,
 96740,
 26666,
 490046,
 485823,
 635632,
 534505,
 498612,
 85453,
 493122,
 512836,
 605764,
 748182,
 725274,
 401461,
 499565,
 641186,
 468434,


In [39]:
def get_top_10(query, query_id, top_k=10):
    """Return the ids of the documents most similar to ``query`` under TF-IDF.

    The query is vectorized with the module-level ``vectorizer`` and compared
    by cosine similarity against every row of the module-level ``tfidf_matrix``.

    Args:
        query: Raw query text.
        query_id: Identifier of the query. Not used by the computation; kept
            so existing callers keep working.
        top_k: Number of document ids to return (default 10).

    Returns:
        A list of at most ``top_k`` keys of ``raw_documents``, ordered from
        most to least similar.
    """
    query_vector = vectorizer.transform([query])
    similarities = cosine_similarity(query_vector, tfidf_matrix)[0]

    # Positions of the documents in descending order of similarity.
    ranked_indices = similarities.argsort()[::-1][:max(top_k, 0)]

    # Row i of tfidf_matrix was built from the i-th entry of raw_documents, so
    # a position maps to an id by indexing the key list. Build that list once
    # per call instead of once per returned hit.
    document_ids = list(raw_documents.keys())
    return [document_ids[i] for i in ranked_indices]

In [40]:
# Collect the top-10 retrieved document ids for the first 20 training queries.
# The ids taken from train_df are converted with str() before looking up the
# query text in raw_queries.
res = {}
for query_id in query_ids[:20]:
    query = raw_queries[str(query_id)]
    res[query_id] = get_top_10(query, query_id)

res

{1185869: ['3607205',
  '7243450',
  '2036644',
  '3870080',
  '3870082',
  '2395246',
  '462457',
  '4138462',
  '5117689',
  '2148554'],
 1185868: ['4204615',
  '1641650',
  '3020376',
  '6821177',
  '16',
  '4219366',
  '6900825',
  '4285040',
  '8339717',
  '1525277'],
 597651: ['217868',
  '6398884',
  '6836989',
  '4334506',
  '6770277',
  '2172327',
  '2617786',
  '7653670',
  '162760',
  '6264745'],
 403613: ['8161983',
  '261443',
  '3824713',
  '1041491',
  '1945651',
  '3797201',
  '1924201',
  '6999540',
  '6242980',
  '8381473'],
 1183785: ['389',
  '8730384',
  '7087848',
  '7051451',
  '7142224',
  '7684506',
  '3938573',
  '2257752',
  '8718244',
  '246254'],
 312651: ['4229988',
  '6734984',
  '1250194',
  '4123611',
  '1437908',
  '610',
  '2464811',
  '7336587',
  '2408004',
  '2384542'],
 80385: ['2494972',
  '2867801',
  '2161672',
  '4233993',
  '2161665',
  '6310939',
  '194251',
  '3839570',
  '7755089',
  '2332611'],
 645590: ['89161',
  '3899722',
  '944',
  '

In [20]:
from gensim.summarization.bm25 import BM25


In [30]:
# Tokenize the documents (you can use your own tokenization method)
# Here: plain whitespace splitting only -- no lowercasing, punctuation
# stripping or stop-word removal is applied.
tokenized_documents = [doc.split() for doc in raw_documents.values()]

In [32]:
# Build the BM25 index over the whitespace-tokenized corpus.
bm25 = BM25(tokenized_documents)

# Score every document against one query, tokenized the same way as the corpus.
query = raw_queries['597651']
tokenized_query = query.split()
bm25_scores = bm25.get_scores(tokenized_query)

# Order (position, score) pairs by descending score and keep the best ten.
sorted_documents = sorted(
    enumerate(bm25_scores),
    key=itemgetter(1),
    reverse=True,
)
top_10_documents = sorted_documents[:10]

# doc_idx is the position of the document in tokenized_documents, not its id.
for rank, (doc_idx, score) in enumerate(top_10_documents):
    print(f"Rank {rank + 1}: Document {doc_idx}, BM25 Score = {score:.4f}")

Rank 1: Document 1403702, BM25 Score = 33.0991
Rank 2: Document 981590, BM25 Score = 32.9258
Rank 3: Document 989139, BM25 Score = 31.6476
Rank 4: Document 504882, BM25 Score = 31.2029
Rank 5: Document 1006512, BM25 Score = 30.5902
Rank 6: Document 1101667, BM25 Score = 29.1159
Rank 7: Document 573022, BM25 Score = 29.0387
Rank 8: Document 307611, BM25 Score = 28.9447
Rank 9: Document 1028360, BM25 Score = 28.2745
Rank 10: Document 329684, BM25 Score = 28.1823


In [33]:
# Translate the positional indices returned by the BM25 ranking into corpus ids.
# The key list is built once up front; building it inside the loop would copy
# every key of raw_documents again for each printed rank.
document_ids = list(raw_documents.keys())
for rank, (doc_idx, score) in enumerate(top_10_documents):
    print(f"Rank {rank + 1}: Document {document_ids[doc_idx]}, BM25 Score = {score:.4f}")

Rank 1: Document 6398884, BM25 Score = 33.0991
Rank 2: Document 5946033, BM25 Score = 32.9258
Rank 3: Document 162760, BM25 Score = 31.6476
Rank 4: Document 783336, BM25 Score = 31.2029
Rank 5: Document 217868, BM25 Score = 30.5902
Rank 6: Document 1794262, BM25 Score = 29.1159
Rank 7: Document 266682, BM25 Score = 29.0387
Rank 8: Document 6836989, BM25 Score = 28.9447
Rank 9: Document 584377, BM25 Score = 28.2745
Rank 10: Document 5717066, BM25 Score = 28.1823


In [None]:
def compute_recall_at_k(predict, gt, k):
    """Compute recall@k: the share of ground-truth ids found in the top ``k`` predictions.

    Args:
        predict: Ranked sequence of predicted ids, best first.
        gt: Collection of ground-truth (relevant) ids.
        k: Number of leading predictions to consider.

    Returns:
        ``len(set(predict[:k]) & set(gt)) / len(gt)`` as a float, or ``0.0``
        when ``gt`` is empty (recall is undefined there; returning 0.0 avoids
        a ZeroDivisionError in evaluation loops).
    """
    relevant = set(gt)
    if not relevant:
        return 0.0
    hits = relevant.intersection(predict[:k])
    return len(hits) / len(gt)

In [34]:
# Retrieval oracle
# Fit a TF-IDF model on the corpus; `features` is the resulting document-term
# matrix, one row per entry of raw_documents.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,1), min_df = 1, stop_words = 'english')
features = tf.fit_transform(raw_documents.values())
# NOTE: the former `npm_tfidf = features.todense()` has been removed. A dense
# copy stores one value per (document, term) pair, which is not feasible for a
# corpus of this size, and search_vec_sklearn below works on `features` directly.

# Return all document ids that have cosine similarity with the query larger than a threshold
def search_vec_sklearn(query, features, threshold=0.1):
    """Return the positions of all documents scoring at least ``threshold``.

    The query is vectorized with the module-level ``tf`` vectorizer and scored
    against ``features`` with ``linear_kernel``.

    Args:
        query: Raw query text.
        features: Document-term matrix to search, one row per document.
        threshold: Minimum score a document needs to be returned.

    Returns:
        A list of row indices into ``features`` (plain ints), ordered by
        descending score; ties keep ascending index order. Empty when no
        document reaches the threshold or when ``features`` has no rows.
    """
    new_features = tf.transform([query])
    cosine_similarities = linear_kernel(new_features, features).flatten()

    # A stable argsort on the negated scores gives descending order with ties
    # in ascending index order -- the same ordering as a stable descending
    # sort of (index, score) pairs, without building millions of Python tuples.
    order = (-cosine_similarities).argsort(kind="stable")

    # Scores are in descending order, so the kept entries form a prefix.
    keep = cosine_similarities[order] >= threshold
    return order[keep].tolist()

KeyboardInterrupt: 

In [None]:
# computing the search result
def search_vec(query, topk=10):
    """Return the indices of the ``topk`` highest-scoring documents for ``query``.

    The query is split on whitespace, vectorized with the module-level
    ``vectorize``/``vocabulary``/``idf``, and scored against every entry of
    ``document_vectors`` with ``cosine_similarity``.

    Args:
        query: Raw query text.
        topk: Maximum number of document indices to return.

    Returns:
        A list of at most ``topk`` document indices, best first. When fewer
        than ``topk`` documents exist, all of them are returned instead of
        raising IndexError.
    """
    q = query.split()
    query_vector = vectorize(q, vocabulary, idf)
    scores = [[cosine_similarity(query_vector, document_vectors[d]), d] for d in range(len(documents))]
    # Stable sort on the negated score: best first, ties keep ascending index.
    scores.sort(key=lambda x: -x[0])
    # Slicing (rather than indexing 0..topk-1) tolerates short corpora;
    # max() keeps a negative topk from being read as "all but the last".
    return [doc_id for _, doc_id in scores[:max(topk, 0)]]