In [0]:
!pip install rank_bm25
from google.colab import drive
import pandas as pd
import pyarrow.parquet as pq
import numpy as np
from rank_bm25 import BM25Okapi
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
import heapq

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
# Mount Google Drive at /content/drive; the data files read by the later cells live under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load Document Judgments

In [0]:
# Relevance judgments table. Later cells filter it on `topic_id` and
# `document_id` and read the `judgment` column.
filepath =  "/content/drive/My Drive/processed/document_judgments.csv"
judgements = pd.read_csv(filepath)

# Load Queries

In [0]:
# One row per topic: the original query (`title`) plus two expanded variants
# (`in_out_q`, `in_in_q`), as shown by the printed column list.
queries_file =  "/content/drive/My Drive/2018CommonCoreTrack/expanded_queries_final.csv"
queries_df = pd.read_csv(queries_file)
print(queries_df.columns)

Index(['Unnamed: 0', 'topic_id', 'title', 'in_out_q', 'in_in_q'], dtype='object')


In [0]:
stop_words = set(stopwords.words('english'))


def prepros_query(query):
    """Tokenize a query string into lower-cased, stop-word-free tokens.

    Args:
        query: Raw query text.

    Returns:
        A list of lower-cased tokens with English stop words, bare commas and
        the literal token ``' '`` removed.
    """
    # Tokens dropped regardless of case: a bare comma and the literal `' '`.
    ignored_tokens = {',', "' '"}

    tokenized_query = []
    for raw_token in nltk.word_tokenize(query):
        token = raw_token.lower()
        # The stop-word list is lower-case, so compare the lower-cased token;
        # otherwise capitalised stop words such as "The" slip through.
        if raw_token in ignored_tokens or token in stop_words:
            continue
        tokenized_query.append(token)
    return tokenized_query

# Normalized Discounted Cumulative Gain (nDCG)

In [0]:
def gain(rank, topic_id, judgments_df=None):
    """Return the normalised discounted cumulative gain (nDCG) of a ranking.

    The gain at position 1 is undiscounted; the gain at position ``i > 1`` is
    divided by ``log2(i)``. The ideal DCG is computed from the topic's
    judgments sorted in descending order and truncated to ``len(rank)``.

    Args:
        rank: Sequence of document ids, best first.
        topic_id: Topic whose judgments are used to score ``rank``.
        judgments_df: Optional DataFrame with ``topic_id``, ``document_id`` and
            ``judgment`` columns. Defaults to the notebook-level ``judgements``
            table.

    Returns:
        nDCG as a float. Returns 0.0 when the ideal DCG is zero (empty ranking
        or a topic without positive judgments) instead of dividing by zero.
    """
    if judgments_df is None:
        # Fall back to the judgment table loaded earlier in the notebook.
        judgments_df = judgements

    topic_judgments = judgments_df.loc[judgments_df.topic_id == topic_id]

    # Build the document -> relevance lookup once rather than filtering the
    # whole table for every ranked document. `setdefault` keeps the first
    # judgment if a document is listed more than once.
    relevance_by_doc = {}
    for doc_id, rel in zip(topic_judgments["document_id"], topic_judgments["judgment"]):
        relevance_by_doc.setdefault(doc_id, rel)

    def _dcg(relevances):
        """Discounted cumulative gain of an ordered list of relevance values."""
        total = 0.0
        for position, rel in enumerate(relevances, start=1):
            total += rel if position == 1 else rel / np.log2(position)
        return total

    # Documents without a judgment for this topic count as non-relevant.
    ranked_relevances = [relevance_by_doc.get(doc_id, 0) for doc_id in rank]
    # The slice includes every judged document up to the cut-off, so the last
    # ideal document is no longer dropped.
    ideal_relevances = sorted(topic_judgments["judgment"], reverse=True)[:len(rank)]

    ideal_dcg = _dcg(ideal_relevances)
    if ideal_dcg == 0:
        return 0.0
    return float(_dcg(ranked_relevances) / ideal_dcg)

# Evaluate on non-Stemmed Corpus

## Load Tokenized Corpus

In [0]:
# Pre-tokenized (non-stemmed) corpus. Later cells read the column selected by `[1]`
# as the document id and the column selected by `[2]` as the token array.
pathfile = "/content/drive/My Drive/processed/tokenized_data.parquet"
tokenized_df = pd.read_parquet(pathfile)

Convert the token column to a list of token lists, the input format `BM25Okapi` expects.

In [0]:
# Read the third column in one positional pass instead of building a Series
# per row. `.tolist()` converts each stored array into a plain Python list.
# NOTE(review): assumes the third column is the one the per-row `[2]` lookup
# selected; confirm against the parquet schema.
tokenized_c = [tokens.tolist() for tokens in tokenized_df.iloc[:, 2]]

In [0]:
# Sanity check: number of documents in the corpus.
len(tokenized_c)

592341

## Evaluate

In [0]:
# Build the BM25 index over the non-stemmed corpus.
bm25 = BM25Okapi(tokenized_c)

In [0]:
# Rank cut-offs at which nDCG is reported for every query variant.
cutoffs = [1, 3, 10, 1000]

topic_ids = queries_df.topic_id
# topic_bm25at[t][q][k]: nDCG for topic t, query variant q
# (title, in_out_q, in_in_q) and cut-off cutoffs[k].
topic_bm25at = []
for topic_id in topic_ids:
    print(topic_id)

    # Compute ideal judgement (kept as a notebook-level name so that any
    # helper reading `ideal` keeps working):
    ideal = judgements.loc[(judgements.topic_id == topic_id)]
    ideal = ideal.sort_values(by=['judgment'], ascending=False)

    # Look the topic up once and read all three query variants from that row.
    topic_row = queries_df.loc[queries_df['topic_id'] == topic_id].iloc[0]
    queries = [topic_row["title"], topic_row["in_out_q"], topic_row["in_in_q"]]

    topic_bm25 = []
    for query_text in queries:
        tokenized_query = prepros_query(query_text)
        bm25_scores = bm25.get_scores(tokenized_query)

        # Select the top documents once for the largest cut-off. The smaller
        # cut-offs are prefixes of this list, so the full score array is
        # scanned once per query instead of once per cut-off.
        top_doc = np.asarray(
            heapq.nlargest(max(cutoffs), range(len(bm25_scores)), bm25_scores.take)
        )
        top_doc_id = list(tokenized_df.iloc[top_doc][1])

        bm25at = [gain(top_doc_id[:at], topic_id) for at in cutoffs]
        topic_bm25.append(bm25at)
    topic_bm25at.append(topic_bm25)

321
336
341
347
350
362
363
367
375
378
393
397
400
408
414
422
426
427
433
439


In [0]:
# Results for the first topic: one row per query variant (title, in_out_q,
# in_in_q), one value per cut-off (1, 3, 10, 1000).
topic_bm25at[0]

[[0.0, 0.0, 0.11269280912425929, 0.2316099304108914],
 [0.0, 0.0, 0.04910158613120102, 0.23480543843234036],
 [0.0, 0.0, 0.04910158613120102, 0.10772106671898116]]

In [0]:
# save file
# One row per topic (in `queries_df` order); each column holds the list of
# nDCG scores at cut-offs [1, 3, 10, 1000] for one query variant.
df_output = pd.DataFrame(topic_bm25at, columns=['baseline', 'in_out_q', "in_in_q"])
# Absolute path, consistent with the input paths, so the write does not depend
# on the notebook's current working directory.
output_file = "/content/drive/My Drive/processed/output.csv"
df_output.to_csv(output_file)

# Evaluate on Stemmed Corpus

In [0]:
# English Snowball stemmer, applied to query tokens in the stemmed evaluation below.
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

## Load Stemmed Corpus

In [0]:
# Stemmed corpus. NOTE: this rebinds `pathfile`, which earlier pointed at the
# non-stemmed parquet file.
pathfile = "/content/drive/My Drive/processed/stemmed_data.parquet"
stemmed_df = pd.read_parquet(pathfile)

Convert the stemmed token column to a list of token lists, the input format `BM25Okapi` expects.

In [0]:
# Read the third column in one positional pass instead of building a Series
# per row. `.tolist()` converts each stored array into a plain Python list.
# NOTE(review): assumes the third column is the one the per-row `[2]` lookup
# selected; confirm against the parquet schema.
stemmed_c = [tokens.tolist() for tokens in stemmed_df.iloc[:, 2]]

## Evaluate

In [0]:
# Build the BM25 index over the stemmed corpus. NOTE: this rebinds `bm25`,
# replacing the non-stemmed index created earlier in the notebook.
bm25 = BM25Okapi(stemmed_c)

In [0]:
# Rank cut-offs at which nDCG is reported for every query variant.
cutoffs = [1, 3, 10, 1000]

topic_ids = queries_df.topic_id
# topic_bm25at[t][q][k]: nDCG for topic t, query variant q
# (title, in_out_q, in_in_q) and cut-off cutoffs[k].
topic_bm25at = []
for topic_id in topic_ids:
    print(topic_id)

    # Compute ideal judgement (kept as a notebook-level name so that any
    # helper reading `ideal` keeps working):
    ideal = judgements.loc[(judgements.topic_id == topic_id)]
    ideal = ideal.sort_values(by=['judgment'], ascending=False)

    # Look the topic up once and read all three query variants from that row.
    topic_row = queries_df.loc[queries_df['topic_id'] == topic_id].iloc[0]
    queries = [topic_row["title"], topic_row["in_out_q"], topic_row["in_in_q"]]

    topic_bm25 = []
    for query_text in queries:
        tokenized_query = prepros_query(query_text)
        # The index was built from stemmed tokens, so stem the query as well.
        stemmed_query = [stemmer.stem(w) for w in tokenized_query]
        print(stemmed_query)
        bm25_scores = bm25.get_scores(stemmed_query)

        # Select the top documents once for the largest cut-off. The smaller
        # cut-offs are prefixes of this list, so the full score array is
        # scanned once per query instead of once per cut-off.
        top_doc = np.asarray(
            heapq.nlargest(max(cutoffs), range(len(bm25_scores)), bm25_scores.take)
        )
        # Map row positions to document ids through `stemmed_df`, the frame
        # this BM25 index was built from, not the non-stemmed `tokenized_df`.
        # NOTE(review): assumes `stemmed_df` keeps the document id in the same
        # column (`[1]`) as `tokenized_df`; confirm against the parquet schema.
        top_doc_id = list(stemmed_df.iloc[top_doc][1])

        bm25at = [gain(top_doc_id[:at], topic_id) for at in cutoffs]
        topic_bm25.append(bm25at)
    topic_bm25at.append(topic_bm25)

321
['women', 'parliament']
['women', 'parliament', 'european', 'elect']
['women', 'parliament', 'men', 'femal']
336
['black', 'bear', 'attack']
['black', 'bear', 'attack', 'panther', 'grizzli', 'moos']
['black', 'bear', 'attack', 'white', 'red', 'big']
341
['airport', 'secur']
['airport', 'secur', 'lax', 'dfw']
['airport', 'secur', 'bwi', 'transport']
347
['wildlif', 'extinct']
['wildlif', 'extinct', 'geograph', 'speci']
['wildlif', 'extinct', 'wetland', 'conserv']
350
['health', 'comput', 'termin']
['health', 'comput', 'termin', 'integr', 'connect', 'allianc']
['health', 'comput', 'termin', 'system', 'technolog', 'peripher']
362
['human', 'smuggl']
['human', 'smuggl', 'traffick', 'convict']
['human', 'smuggl', 'traffick', 'illicit']
363
['transport', 'tunnel', 'disast']
['transport', 'tunnel', 'disast', 'railroad', 'rail', 'termin']
['transport', 'tunnel', 'disast', 'pollut', 'rout', 'highway']
367
['piraci']
['piraci', 'anti']
['piraci', 'cybercrim']
375
['hydrogen', 'energi']
['hyd

In [0]:
# save file
df_output = pd.DataFrame(topic_bm25at, columns = ['baseline', 'in_out_q',"in_in_q"]) 
output_file =  "drive/My Drive/processed/output_stemmed.csv"
df_output.to_csv(output_file)