In [1]:
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
import re
import string
from operator import itemgetter
import csv
from collections import Counter
import time

Task 1

In [2]:
def loadtext(filepath):
    with open(filepath, 'r') as f:
        txt = []
        for line in f.readlines():
            txt.append(line.strip().lower())
        
    return txt

def preprocessing(text, remove):
    #initial setting
    lemmatizer = WordNetLemmatizer()
    stemmer = SnowballStemmer('english')
    stop_words = set(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r'\s+', gaps = True)
    processed_txt = []

    for line in text:
        sentence = []
        #remove the punctuation
        table = str.maketrans(dict.fromkeys(string.punctuation))
        line = line.translate(table)

        line = re.sub(r'[^a-zA-Z\s]', u' ', line, flags = re.UNICODE)

        line_token = tokenizer.tokenize(line)

        #remove stop words
        if (remove == True):
            line_token = [w for w in line_token if not w in stop_words]
        

        for word in line_token:
            word = lemmatizer.lemmatize(word)
            word = stemmer.stem(word)
            sentence.append(word)

        processed_txt.append(sentence)
    
    return processed_txt

def inverted_index(pid_data, processed_data):

    dic = {}
    
    for i in range(len(pid_data)):
        processed_passage = processed_data[i]

        for token in processed_passage:
            token_num = processed_passage.count(token)

            if token not in dic.keys():
                dic[token] = {pid_data[i]: token_num}
            else:
                add_num = {pid_data[i]: token_num}
                dic[token].update(add_num)

    return dic

In [3]:
validation_data = pd.read_csv('validation_data.tsv', sep = '\t', header = 0)
validation_data

Unnamed: 0,qid,pid,queries,passage,relevancy
0,1082792,1000084,what does the golgi apparatus do to the protei...,"Start studying Bonding, Carbs, Proteins, Lipid...",0.0
1,995825,1000492,where is the graphic card located in the cpu,"For example, a “PC Expansion Card” maybe the j...",0.0
2,995825,1000494,where is the graphic card located in the cpu,The Common Cards & Buses. The most common type...,0.0
3,1091246,1000522,property premises meaning,The occurrence of since tells us that the firs...,0.0
4,1047854,1000585,what is printing mechanism,Windows desktop applications Develop Desktop t...,0.0
...,...,...,...,...,...
1103034,176994,999706,dynamic link library meaning,Internet Public Library – The Internet Public ...,0.0
1103035,1089177,999765,united home life insurance phone number,Geico Customer Service Phone Number: 1-800-861...,0.0
1103036,42555,999824,average salary for primary care sports medicin...,The current average NBA salary of $6.2 million...,0.0
1103037,1044249,999824,what is the average salary of an nba player,The current average NBA salary of $6.2 million...,0.0


In [6]:
validation_queries_list = preprocessing(validation_data['queries'], True)
validation_passage_list = preprocessing(validation_data['passage'], True)

validation_qid_list = list(validation_data['qid'])
validation_pid_list = list(validation_data['pid'])

validation_qid_dict = dict(zip(validation_data['qid'], validation_queries_list))
validation_pid_dict = dict(zip(validation_data['pid'], validation_passage_list))

np.save('validation_qid_dict.npy', validation_qid_dict)
np.save('validation_pid_dict.npy', validation_pid_dict)

In [None]:
#np.save('validation_qid_dict.npy', validation_qid_dict)
#np.save('validation_pid_dict.npy', validation_pid_dict)


In [4]:
validation_qid_dict = np.load('validation_qid_dict.npy', allow_pickle= True).tolist()
validation_pid_dict = np.load('validation_pid_dict.npy', allow_pickle= True).tolist()

In [7]:
validation_rel_list = list(validation_data['relevancy'])
invert_index = inverted_index(validation_data['pid'], validation_passage_list)

In [8]:
avdl_sum = 0
for pid in list(validation_pid_dict.keys()):
    avdl_sum += len(validation_pid_dict[pid])

avdl = avdl_sum / len(validation_pid_dict)
avdl

34.203629355189584

In [9]:
def generate_rel_dict(qid_list, pid_list, rel_list):
    '''
    generate two dict according to the given data
    '''

    rel_dict = {}
    non_rel_dict = {}

    for i in range(len(qid_list)):
        qid = qid_list[i]
        pid = pid_list[i]
        rel = rel_list[i]

        if rel > 0:
            add_dict = {pid:i}
            if qid in rel_dict.keys():
                rel_dict[qid].update(add_dict)
            else:
                rel_dict[qid] = add_dict
        else:
            add_dict = {pid:i}
            if qid in non_rel_dict.keys():
                non_rel_dict[qid].update(add_dict)
            else:
                non_rel_dict[qid] = add_dict

    return rel_dict, non_rel_dict

rel_dic, non_rel_dic = generate_rel_dict(validation_qid_list, validation_pid_list, validation_rel_list)

In [11]:
def BM25 (qid, inverted_index_passage, pid_passages_dict, qid_query_dict, k1 = 1.2, k2 = 100, b = 0.75):

    query = qid_query_dict[qid]
    query_tokens_dict = Counter(query)
    query_tokens_dict = query_tokens_dict.most_common()
    #match all the same pid with the same qid in candidate-passages


    pid_index = [i for i,x in enumerate(validation_qid_list) if x == qid ]

    N = len(pid_passages_dict)

    score_dict = {}

    for i in pid_index:

        pid = validation_pid_list[i]
        passage = pid_passages_dict[pid]
        dl = len(passage)
        K = k1 * (0.25 + 0.75*(dl / avdl))
        score = 0

        for token_set in query_tokens_dict:
            token = token_set[0]
            qf = token_set[1]

            if token in passage:
                ni = len(inverted_index_passage[token])
                fi = inverted_index_passage[token][pid]
            else:
                ni = 0
                fi = 0

            score += np.log((1/((ni + 0.5) / (N - ni + 0.5)))) * (((k1 + 1) * fi) / (K + fi)) * (((k2 + 1) * qf) / (k2 + qf))

        add_dict = {pid:score}
        score_dict.update(add_dict)

    sorted_top_100 = dict(sorted(score_dict.items(), key=itemgetter(1), reverse = True)[: 100])

    return sorted_top_100

In [12]:
with open('BM25.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    qid_list = list(validation_qid_dict.keys())
    BM_25 = {}
    for qid in qid_list:
        top_100_dic = BM25(qid, invert_index, validation_pid_dict, validation_qid_dict)
        add_dict = {qid: top_100_dic}
        BM_25.update(add_dict)
        for pid in list(top_100_dic.keys()):
            writer.writerow([qid, pid, top_100_dic[pid]])

In [13]:
print(len(BM_25))

1148


In [14]:
def generate_AP(model, rel_dic, non_rel_dic):
    
    total_ap = []
    qid_list = list(model.keys())

    for qid in qid_list:
        N = 0
        R_rel = 0
        precision = 0
        model_pid_list= list(model[qid].keys())
        rel_pid_list = list(rel_dic[qid].keys())
        

        if len(set(model_pid_list) & set(rel_pid_list)) == 0:
            precision = 0
            total_ap.append(precision)

        else:
            for pid in model_pid_list:
                N += 1
                if pid in rel_pid_list:
                    R_rel += 1
                    precision += R_rel/N
                else:
                    continue
                if R_rel == len(rel_dic[qid]):
                    break

        
            total_ap.append(precision / R_rel)
    
    return total_ap

def generate_NDCG(model, rel_dic, non_rel_dic):

    NDCG = []
    qid_list = list(model.keys())
    
    for qid in qid_list:
        N = 0
        N_opt = 0
        DCG = 0
        DCG_opt = 0
        model_pid_list = list(model[qid].keys())
        rel_pid_list = list(rel_dic[qid].keys())

        if len(set(model_pid_list) & set(rel_pid_list)) == 0:
            DCG += 0
        

        else:
            rel_qid_dic = rel_dic[qid]
            for pid in model_pid_list:
                N += 1
                if pid in rel_pid_list:
                    rel_pid = 1
                else:
                    rel_pid = 0
            
                DCG += (2**rel_pid - 1)/np.log(1 + N)

        #find the opt DCG
        rel_qid_dic = rel_dic[qid]
        best_sort_ranking = dict(sorted(rel_qid_dic.items(), key=itemgetter(1), reverse = True)[: 100])
        opt_pid_list = list(best_sort_ranking.keys())
        
        for pid in opt_pid_list:
            rel_pid = 1
            N_opt += 1
            DCG_opt += (2**rel_pid - 1)/np.log(1 + N_opt)

        NDCG.append(DCG / DCG_opt)
    

    return NDCG

In [15]:
BM_25_AP_list = generate_AP(BM_25, rel_dic, non_rel_dic)
BM_25_AP = np.mean(BM_25_AP_list)
BM_25_AP

0.2328068614123093

In [16]:
BM_25_NDCG_list = generate_NDCG(BM_25, rel_dic, non_rel_dic)
BM_25_NDCG = np.mean(BM_25_NDCG_list)
BM_25_NDCG

0.351790647818926