# HW01 Gensim Corpus and Tfidf Model

## Import Data

In [1]:
# Import required libraries for execution
from IPython.display import clear_output
from gensim import corpora, models, similarities
import numpy as np
import pandas as pd
import csv

In [2]:
# Gensim artefacts: the token dictionary plus the document and query corpora
# (Matrix Market files).
vocabulary = corpora.Dictionary.load('resources/vocab.dict')
doc_corpus, query_corpus = (
    corpora.MmCorpus(mm_path)
    for mm_path in ("resources/doc_corpus.mm", "resources/query_corpus.mm")
)

# Relevance judgments: a headerless, tab-separated file whose two columns are
# labelled 'query' and 'doc' right after loading.
df = pd.read_csv(
    './data/relevance-judgments.tsv',
    sep='\t',
    header=None,
)
df.columns = ['query', 'doc']

In [3]:
# Glimpse at vocabulary: printing the Dictionary shows its number of unique
# tokens followed by a short sample of them.
print(vocabulary)
# Optional inspection aid, left disabled; uncomment to print the dictionary's
# token2id attribute (presumably the full token -> id mapping, which is long).
#print(vocabulary.token2id)

Dictionary(17365 unique tokens: ['1', '1785', '1812', '1819', '1820']...)


In [4]:
# Glimpse at the document and query corpora: one summary line per corpus
# (documents, features, non-zero entries).
print(doc_corpus, query_corpus, sep='\n')

MmCorpus(331 documents, 17365 features, 81038 non-zero entries)
MmCorpus(35 documents, 16373 features, 110 non-zero entries)


## Gensim Tfidf Model

In [5]:
# Create tfidf model for document corpus (term statistics come from the
# documents only; queries are merely transformed with it).
tfidf = models.TfidfModel(doc_corpus)

# Model transformation: show the first query as (term_id, tfidf_weight) pairs
print('Query 1 (tfidf form): ')
print(tfidf[query_corpus][0])

# Similarity Matrix over the tfidf-weighted documents.
# num_features is passed explicitly: without it MatrixSimilarity has to make an
# extra full pass over the transformed corpus just to infer the vector width.
# The corpus' declared feature count is the natural, stable value for it.
index = similarities.MatrixSimilarity(tfidf[doc_corpus], num_features=doc_corpus.num_terms)
# Persist the index so the querying step can load it from disk.
index.save('resources/similarity_matrix.index')

Query 1 (tfidf form): 
[(693, 0.4785475819504571), (1228, 0.7316664619034943), (2283, 0.48544453888677447)]


## Querying 

In [6]:
""" Runs GENSIM Tfidf model for each query

Args:
    query_corpus (gensim.corpora.mmcorpus.MmCorpus): corpus with the queries. 
    df (pandas.core.frame.DataFrame): Dataframe with read queries.
    
Returns:
    df (pandas.core.frame.DataFrame): Dataframe with new column with resulting documents for each query.
"""

# Load index matrix
index = similarities.MatrixSimilarity.load('resources/similarity_matrix.index')

# Querying
print('Gensim Tfidf Model Query results:')
print('')

# Array to save results: one comma-separated string of document tags per query
df_Gensim_results = []

# Record layout of a (document id, similarity score) pair; it does not depend
# on the query, so it is defined once outside the loop.
dtype = [('doc_id', int), ('score', float)]

for q in query_corpus:
    # Similarity between all docs and query q, paired with each doc's position
    sims = list(enumerate(index[tfidf[q]]))
    doc_sims = np.array(sims, dtype=dtype)
    # Adjust docs ID: shift the 0-based positions so that ids start at 1
    doc_sims['doc_id'] = doc_sims['doc_id'] + 1

    # Sort docs by similarity, highest score first (ascending sort, then flip)
    doc_sims_sorted = np.flip(np.sort(doc_sims, order='score'))

    # Retrieve only documents with non zero score: the first k entries of the
    # descending ranking (this relies on scores never being negative).
    k = np.count_nonzero(doc_sims['score'])
    relevant_docs = doc_sims_sorted[0:k]

    # Print only top 5 docs
    print(relevant_docs['doc_id'][0:5])

    # Save results in desired format: 'dNNN' tags (zero-padded to 3 digits)
    # joined by commas; an empty ranking yields an empty string.
    df_Gensim_results.append(','.join(f'd{doc:03}' for doc in relevant_docs['doc_id']))

# Attach the results to df; this is positional, so df must hold exactly one row
# per query, in the same order as query_corpus.
df['results'] = df_Gensim_results

# Write results into a .tsv file (query -> results, no header).
# The export works on a derived frame instead of rebinding df: re-indexing df
# itself would remove the 'query' column and make this cell fail on a re-run.
df.set_index('query').drop('doc', axis=1).to_csv('./results/GENSIM-queries_results.tsv', sep='\t', header=False)

Gensim Tfidf Model Query results:

[ 16 259 254 186  85]
[147 149 283 134   2]
[291 283 152]
[ 19 270  49 275 310]
[297 329  26  29 233]
[146 289   4 266  34]
[251 292 180 106 110]
[199 198 217 223 177]
[ 60  36 231 100  52]
[258 277 239 250 176]
[272 219  49 276  56]
[145  41 121   2  91]
[184 132 250 277 176]
[172 271 146  91 121]
[192 194 201 230 111]
[179 323   4 102 273]
[258 239  11  49 277]
[219 276 323  26   7]
[199 220 223  61 198]
[166  20 328 156 128]
[152 291  95 143 171]
[103 316  51  54  56]
[316  56 136 291  94]
[314 113   1 120 133]
[ 90  25  67 139  92]
[224 221 312  75 113]
[257 321  23 277 265]
[169 116  62 327 141]
[239 263 294  56  36]
[307 283  42 250 228]
[174 128 150 268  36]
[298 218 314 125 174]
[ 29 105 239 185 164]
[105  29 185  85 126]
[145 321  13 133 122]


The evaluation of these results is done in the HW01_7.ipynb notebook.