In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
import re, string, unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
import numpy as np

In [4]:
f = open("dataset/passage_collection_new.txt", 'r', encoding = 'utf-8') 
document = f.read()

In [5]:
header_list = ["qid", "pid", "query", "passage"]
candidate_passages_top1000 = pd.read_csv("dataset/candidate_passages_top1000.tsv", sep='\t', names=header_list)

In [6]:
header_list = ["qid", "query"]
test_queries = pd.read_csv("dataset/test-queries.tsv", sep='\t', names=header_list)

In [7]:
train_data = pd.read_csv("part2/train_data.tsv", sep='\t')
validation_data = pd.read_csv("part2/validation_data.tsv", sep='\t')


In [8]:
train_data.head(10)

Unnamed: 0,qid,pid,queries,passage,relevancy
0,188714,1000052,foods and supplements to lower blood sugar,Watch portion sizes: ■ Even healthy foods will...,0.0
1,995526,1000094,where is the federal penitentiary in ind,It takes THOUSANDS of Macy's associates to bri...,0.0
2,660957,1000115,what foods are good if you have gout?,The good news is that you will discover what g...,0.0
3,837202,1000252,what is the nutritional value of oatmeal,"Oats make an easy, balanced breakfast. One cup...",0.0
4,130825,1000268,definition for daring,Such a requirement would have three desirable ...,0.0
5,408149,1000288,is dhgate a scam,If you think you ve been targeted by a counter...,0.0
6,1019649,1000419,what study for mets to brain,Sorry he's having so much pain. The reason tha...,0.0
7,1099065,1000436,how far deep to plant beet early wonder,"The simplest way, and my preference, is to roa...",0.0
8,1084910,1000466,what disease do roof rats cause,1 A cage trap baited with peanut butter or a s...,0.0
9,959083,1000479,when was niagara falls created,"Bulbar Onset – ALS. ALS is like Niagara Falls,...",0.0


In [9]:
validation_data.head(10)

Unnamed: 0,qid,pid,queries,passage,relevancy
0,1082792,1000084,what does the golgi apparatus do to the protei...,"Start studying Bonding, Carbs, Proteins, Lipid...",0.0
1,995825,1000492,where is the graphic card located in the cpu,"For example, a “PC Expansion Card” maybe the j...",0.0
2,995825,1000494,where is the graphic card located in the cpu,The Common Cards & Buses. The most common type...,0.0
3,1091246,1000522,property premises meaning,The occurrence of since tells us that the firs...,0.0
4,1047854,1000585,what is printing mechanism,Windows desktop applications Develop Desktop t...,0.0
5,991832,1000599,who discovered the element carbon,1. 1 a nonmetallic element existing in the th...,0.0
6,185299,1000647,fastest cell phone processor,Tips for calling a cell phone in Greece: To ca...,0.0
7,574730,1000663,what are the three monetary policy tools of th...,"Federal Reserve updates including rates, news ...",0.0
8,1085008,1000675,what did maria theresa do for the serfs,"In this feudal system, the king awarded land g...",0.0
9,609628,1000771,what county is mitchell south dakota in,South Dakota: According to our research of Sou...,0.0


In [10]:
train_data.shape
validation_data.shape

(4364339, 5)

(1103039, 5)

# Preprocessing

In [11]:
def remove_punctuation(tokens):
    new_tokens = []
    for token in tokens:
        new_token = re.sub(r'[^\w\s]', '', token)
        if new_token != '':
            new_tokens.append(new_token)
    return new_tokens

def remove_stopwords(tokens):
    new_tokens = []
    stopword_set = set(stopwords.words('english'))
    for token in tokens:
        if token not in stopword_set:
            new_tokens.append(token)
    return new_tokens

def lemmatize_verbs(tokens):
    lemmatizer = WordNetLemmatizer()
    root_words = []
    for token in tokens:
        root_word = lemmatizer.lemmatize(token, pos='v')
#         root_word = lemmatizer.lemmatize(token, pos='n')
#         root_word = lemmatizer.lemmatize(token, pos='a')
        root_words.append(root_word)
    return root_words

def remove_numbers(tokens):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    new_tokens = []
    for token in tokens:
        if token.isdigit():
            pass
        else:
            new_tokens.append(token)
    return new_tokens

def preprocessing(passage):
    passage = passage.lower()
    tokens = nltk.word_tokenize(passage)
    tokens = remove_punctuation(tokens)
    tokens = remove_stopwords(tokens)
    tokens = lemmatize_verbs(tokens)
    tokens = remove_numbers(tokens)
    return tokens

# Subtask 1

## BM25

In [31]:
validation_data_no_dup_passages = validation_data.drop_duplicates(subset=['pid'], inplace=False)


In [34]:
# validation_data_no_dup_passages.head(20)
validation_data_no_dup_passages.shape

(955211, 5)

In [38]:
def get_passage_average_length_and_total_word_occurences_corpus():
    number_of_passages = len(validation_data_no_dup_passages)
    count_total_length = 0
    for idx, row in validation_data_no_dup_passages.iterrows():
        count_total_length += len(preprocessing(row['passage']))
    return count_total_length, count_total_length/number_of_passages 


In [39]:
total_word_occurences, avdl = get_passage_average_length_and_total_word_occurences_corpus()

In [41]:
from math import log

k1 = 1.2
k2 = 100
b = 0.75
R = 0
r = 0
N = len(validation_data_no_dup_passages)

def K_cal(dl):
    return k1 * ((1-b) + b * (float(dl)/float(avdl)) )


def BM25_cal(query, passage):
    query_tokens = preprocessing(query)
    passage_tokens = preprocessing(passage)
    query_length = len(query_tokens)
    query_token_freq_dict = nltk.FreqDist(query_tokens)
    passage_token_freq_dict = nltk.FreqDist(passage_tokens)
    dl = len(passage_tokens)
    K = K_cal(dl)
    score = 0
    for token in query_tokens:
        try:
            n = len(inverted_index[token])
        except:
            n = 0
        f = passage_token_freq_dict[token]
        qf = query_token_freq_dict[token]
        first_term = log( ( (r + 0.5) / (R - r + 0.5) ) / ( (n - r + 0.5) / (N - n - R + r + 0.5)) )
        second_term = ((k1 + 1) * f) / (K + f)
        third_term = ((k2+1) * qf) / (k2 + qf)
        score += first_term * second_term * third_term
    return score
    

In [43]:
bm25_rankings = []
for idx, row in validation_data.iterrows():
#     print('count:', idx+1)
    query = row['queries']
    passage = row['passage']
    bm25_rankings.append(BM25_cal(query, passage))

    

In [50]:
ranking_k = 100
results_bm25 = np.array(bm25_rankings).argsort()[-ranking_k:][::-1]
results_bm25

array([ 218534, 1076853,   95986,   95987,  831065,  725920,  504442,
        654647,  341262,  281695,  950916,  586050,  871938,  793356,
        944762,  714508,  900185,  206973,  629358,  701682,  248618,
        259324,  702792,  286327,  205094,   98221,  519566,  242694,
        705667,  922032,  197484,  431189,  419681,  237993,  502543,
        793493,  646679,  559150,  303774,  638714,  778563,  155707,
         45281,  132857,  213755,  299424,  252540,    9338, 1075869,
        139233,  733889,  401291,  373812,  784890, 1008311,  911301,
        909449,  558879,  857483,  602553,  171827,  217514,  857646,
        539660,  195886, 1078616,  554316,  872234,  346989,   28250,
       1026344,  330458, 1026153,  612051,  395691,  530538,  313678,
        693646,  175037,  751525,  216400,  663797,   99840,  928996,
          5558,  428329,  546185,   46827,  112665,  118094,  752555,
       1017110,  455444,  910607,  648719,  793681, 1063650,  212436,
        415429,  654

In [52]:
ranking_list_df = validation_data.loc[results_bm25]
ranking_list_df

Unnamed: 0,qid,pid,queries,passage,relevancy
218534,1007691,7251254,"when allocating service department costs, the ...",Direct method allocates each service departmen...,1.0
1076853,1007691,7251259,"when allocating service department costs, the ...",The direct method is the most widely-used meth...,0.0
95986,1007691,7251251,"when allocating service department costs, the ...",service department provides a large amount of ...,0.0
95987,1007691,7251253,"when allocating service department costs, the ...","The rows sum to 100%, so that all services pro...",0.0
831065,1007691,7251255,"when allocating service department costs, the ...",The most defensible sequence is to start with ...,0.0
...,...,...,...,...,...
793681,1007691,4814576,"when allocating service department costs, the ...",Service Members | Veterans | Both. Military On...,0.0
1063650,1007691,6395207,"when allocating service department costs, the ...",that hospital emergency department services ar...,0.0
212436,1007691,6872353,"when allocating service department costs, the ...",Yelp Customer Service customer service phone n...,0.0
415429,1007691,4114248,"when allocating service department costs, the ...","A service fee, service charge, or surcharge is...",0.0


In [74]:
validation_data[(validation_data['relevancy'] < 1.0) & (validation_data['relevancy'] > 0)]


Unnamed: 0,qid,pid,queries,passage,relevancy


In [68]:
def average_precision_cal(ranking_list_df):
    ranking_list_df = ranking_list_df.reset_index(drop=True, inplace=False)
    total_relevant_retrieved = 0
    precision_sum = 0
    for idx, row in ranking_list_df.iterrows():
        relevancy = row['relevancy']
        if (relevancy):
            isRelevant = True
            total_relevant_retrieved += 1
        precision = total_relevant_retrieved / (idx + 1)
        precision_sum += precision
    result = precision_sum / len(ranking_list_df)
    return result

In [69]:
average_precision_cal(ranking_list_df)

0.07445786781310981

# Subtask 2

NameError: name 'validation_data' is not defined