In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [1]:
import re, string, unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd


In [2]:
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

In [3]:
f = open("../dataset/passage_collection_new.txt", 'r', encoding = 'utf-8') 

In [4]:
document = f.read()

In [5]:
header_list = ["qid", "pid", "query", "passage"]
candidate_passages_top1000 = pd.read_csv("../dataset/candidate_passages_top1000.tsv", sep='\t', names=header_list)

In [6]:
header_list = ["qid", "query"]
test_queries = pd.read_csv("../dataset/test-queries.tsv", sep='\t', names=header_list)

# Preprocessing

## Lowercase

In [7]:
# document = document.lower()

## Tokenizing

In [8]:
# tokens = nltk.word_tokenize(document)

## Remove Punctuation

In [9]:
def remove_punctuation(tokens):
    new_tokens = []
    for token in tokens:
        new_token = re.sub(r'[^\w\s]', '', token)
        if new_token != '':
            new_tokens.append(new_token)
    return new_tokens

In [10]:
# tokens = remove_punctuation(tokens)

## Remove Stopwords

In [11]:
def remove_stopwords(tokens):
    new_tokens = []
    stopword_set = set(stopwords.words('english'))
    for token in tokens:
        if token not in stopword_set:
            new_tokens.append(token)
    return new_tokens


In [12]:
# tokens = remove_stopwords(tokens)

## Lemmatization

In [13]:
def lemmatize_verbs(tokens):
    lemmatizer = WordNetLemmatizer()
    root_words = []
    for token in tokens:
        root_word = lemmatizer.lemmatize(token, pos='v')
#         root_word = lemmatizer.lemmatize(token, pos='n')
#         root_word = lemmatizer.lemmatize(token, pos='a')
        root_words.append(root_word)
    return root_words

In [14]:
# tokens = lemmatize_verbs(tokens)

## Remove numbers

In [15]:
def remove_numbers(tokens):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    new_tokens = []
    for token in tokens:
        if token.isdigit():
            pass
        else:
            new_tokens.append(token)
    return new_tokens


In [16]:
# tokens = remove_numbers(tokens)

In [17]:
def preprocessing(passage):
    passage = passage.lower()
    tokens = nltk.word_tokenize(passage)
    tokens = remove_punctuation(tokens)
    tokens = remove_stopwords(tokens)
    tokens = lemmatize_verbs(tokens)
    tokens = remove_numbers(tokens)
    return tokens

# Inverted Index

In [18]:
candidate_passages_top1000_no_dup = candidate_passages_top1000.drop_duplicates(subset=['pid'], inplace=False)


In [19]:
import time

start = time.time()
inverted_index = {}

for idx, row in candidate_passages_top1000_no_dup.iterrows():
    pid = row['pid']
    tokens = preprocessing(row['passage'])
    freqDist = nltk.FreqDist(tokens)
    passage_words_count = len(tokens)
    for token, freq in freqDist.items():
        inverted_index.setdefault(token, [])
        inverted_index[token].append((pid, freq, passage_words_count))

end = time.time()
print(end - start)

278.8622319698334


# Subtask 3

## Caluclate TF-IDF

In [20]:
import numpy as np
tf_idf = {}
number_of_passages = len(candidate_passages_top1000_no_dup)
for token, tuple_list in inverted_index.items():
    df = len(tuple_list)
    for pid, freq, passage_word_count in tuple_list:
        tf = freq/passage_word_count
        idf = np.log((number_of_passages+1)/(df+1))
        tf_idf[pid, token] = tf*idf

## Vectors Representation + top100

In [21]:
total_tokens = list(inverted_index.keys())

In [22]:
num_of_words = len(total_tokens)
number_of_passages = len(candidate_passages_top1000_no_dup)
def query_vector_generate(tokens):

    queryVector = np.zeros((num_of_words))
    
    queryFreqDist = nltk.FreqDist(tokens)
    words_count = len(tokens)
    
    for token in np.unique(tokens):
        
        tf = queryFreqDist[token]/words_count
        try:
            df = len(inverted_index[token])
        except:
            df = 0
        idf = math.log((number_of_passages+1)/(df+1))

        try:
            idx = total_tokens.index(token)
            queryVector[idx] = tf*idf
        except:
            pass
    return queryVector

In [23]:
def passage_vector_generate(tokens, pid):
    passageVector = np.zeros((num_of_words))
#     passageFreqDist = nltk.FreqDist(tokens)
#     words_count = len(tokens)
    
    for token in np.unique(tokens):
        try:
            idx = total_tokens.index(token)
            passageVector[idx] = tf_idf[(pid, token)]
        except:
            pass
    return passageVector
    

In [24]:
import numpy as np
def cosine_sim_formula(a, b):
    cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
    return cos_sim

In [25]:
import math
def cosine_similarity_cal(query, passage, pid):
    query_tokens = preprocessing(query)
    passage_tokens = preprocessing(passage)
    
    cosine_sim_values = []
    
    query_vector = query_vector_generate(query_tokens)
    passage_vector = passage_vector_generate(passage_tokens, pid)
    
    
    return cosine_sim_formula(query_vector, passage_vector)
    

In [26]:
def cosine_similarity_ranking(number_of_retrieved, cosine_sim_array):
    result = np.array(cosine_sim_array).argsort()[-number_of_retrieved:][::-1]
    return result

In [27]:
count = 0 
simlarity_values = []
for idx, row in candidate_passages_top1000.iterrows():
#     count += 1
#     print('count:', count)
    pid = row['pid']
    passage = row['passage']
    query = row['query']
    simlarity_values.append(cosine_similarity_cal(query, passage, pid))

    

In [28]:
results = cosine_similarity_ranking(100, simlarity_values)


In [29]:
tf_idf_top100 = candidate_passages_top1000.loc[results]['passage']

# BM25

In [30]:
def get_passage_average_length_and_total_word_occurences_corpus():
    number_of_passages = len(candidate_passages_top1000_no_dup)
    count_total_length = 0
    for idx, row in candidate_passages_top1000_no_dup.iterrows():
        count_total_length += len(preprocessing(row['passage']))
    return count_total_length, count_total_length/number_of_passages 


In [31]:
total_word_occurences, avdl = get_passage_average_length_and_total_word_occurences_corpus()

In [32]:
from math import log

k1 = 1.2
k2 = 100
b = 0.75
R = 0
r = 0
N = number_of_passages

def K_cal(dl):
    return k1 * ((1-b) + b * (float(dl)/float(avdl)) )


def BM25_cal(query, passage):
    query_tokens = preprocessing(query)
    passage_tokens = preprocessing(passage)
    query_length = len(query_tokens)
    query_token_freq_dict = nltk.FreqDist(query_tokens)
    passage_token_freq_dict = nltk.FreqDist(passage_tokens)
    dl = len(passage_tokens)
    K = K_cal(dl)
    score = 0
    for token in query_tokens:
        try:
            n = len(inverted_index[token])
        except:
            n = 0
        f = passage_token_freq_dict[token]
        qf = query_token_freq_dict[token]
        first_term = log( ( (r + 0.5) / (R - r + 0.5) ) / ( (n - r + 0.5) / (N - n - R + r + 0.5)) )
        second_term = ((k1 + 1) * f) / (K + f)
        third_term = ((k2+1) * qf) / (k2 + qf)
        score += first_term * second_term * third_term
    return score
    

In [33]:
bm25_rankings = []
for idx, row in candidate_passages_top1000.iterrows():
#     print('count:', idx+1)
    query = row['query']
    passage = row['passage']
    bm25_rankings.append(BM25_cal(query, passage))

    

In [34]:
results_bm25 = np.array(bm25_rankings).argsort()[-100:][::-1]


In [35]:
results_bm25

array([186357,   2345, 141256,  59743,  92208,   2346, 127880, 150942,
       144557,  39421, 147534, 128029, 111415,   6017, 147535,  38670,
       133820, 157379,  89606, 131976, 119616, 153936, 153937,  28581,
       183555,  99156, 140278, 152488, 126866, 153896, 148280,  57226,
       153935,  89593,  30705,  15991,   9503,   7144, 104996, 157384,
       155380, 118713,  86284, 104563,  78843,  28580, 118311,  65019,
         8693,  88483,  90918, 121817, 155382, 104069, 140279, 157378,
       141505, 153949,  32186, 107815,  28005,  75277,  72325, 181124,
        42799,  96989,   8290,   8041, 133300, 105617,  71504,  49341,
       158122, 104070, 140376, 147532, 171451, 185929,  57381, 117330,
       105531,  26718,  90268,   3976, 179125,  83926, 134104,  31035,
       133819,  41232, 169314, 182321, 173361,  87844,  21638, 161051,
        12745, 155521,  22307, 103992], dtype=int64)

# Subtask 4

In [56]:
## calculate Y
V = 0
list_temp = []
for idx, row in candidate_passages_top1000_no_dup.iterrows():
    tokens = preprocessing(row['passage'])
    for token in tokens:
        if token not in list_temp:
            list_temp.append(token)
    
V = len(list_temp)

SyntaxError: invalid syntax (<ipython-input-56-e70da2a40476>, line 7)

In [54]:
total_word_occurences

5889367

In [52]:
def QLModel_Laplace(query, passage):
    query_tokens = preprocessing(query)
    passage_tokens = preprocessing(passage)
    D = len(passage_tokens)
    passage_token_freq_dict = nltk.FreqDist(passage_tokens)
    distribution = 1
    for token in query_tokens:
        distribution *= ((passage_token_freq_dict[token] + 1)/(D + V))
    return distribution
    

In [53]:
laplace_rankings = []
for idx, row in candidate_passages_top1000.iterrows():
#     print('count:', idx+1)
    query = row['query']
    passage = row['passage']
    laplace_rankings.append(QLModel_Laplace(query, passage))

    

KeyboardInterrupt: 

In [None]:
results_laplace = np.array(laplace_rankings).argsort()[-100:][::-1]
results_laplace

In [None]:
laplace_top100 = candidate_passages_top1000.loc[results_laplace]
laplace_top100[:50]

In [None]:
from math import log
def QLModel_Lindstone(query, passage):
    query_tokens = preprocessing(query)
    passage_tokens = preprocessing(passage)
    D = len(passage_tokens)
    passage_token_freq_dict = nltk.FreqDist(passage_tokens)
    score = 0
    alpha = 0.5
    for token in query_tokens:
        score += log((passage_token_freq_dict[token] + alpha)/(D + V * alpha))
    return score
        

In [None]:
lindstone_rankings = []
for idx, row in candidate_passages_top1000.iterrows():
#     print('count:', idx+1)
    query = row['query']
    passage = row['passage']
    lindstone_rankings.append(QLModel_Lindstone(query, passage))

In [None]:
results_lindstone = np.array(laplace_rankings).argsort()[-100:][::-1]


In [None]:
lindstone_rankings

In [None]:
def get_corpus_freq(token):
    freq = 0
    try:
        for tup in inverted_index[token]:
            freq += tup[1]
    except:
        pass
    return freq

In [None]:
from math import log
def QLModel_Dirichlet(query, passage):
    query_tokens = preprocessing(query)
    passage_tokens = preprocessing(passage)
    passage_token_freq_dict = nltk.FreqDist(passage_tokens)
    N = len(passage_tokens)
    micro = 2000
    score = 0
    for token in query_tokens:
        corpus_freq = get_corpus_freq(token)
        lambda_dirichlet = N / (N + micro)
        one_minus_lambda_dirichlet = micro / (N + micro)
        first_term = lambda_dirichlet * (passage_token_freq_dict[token] / N)
        second_term = one_minus_lambda_dirichlet * (corpus_freq / total_word_occurences)
        if (first_term + second_term == 0):
            continue
        score += log(first_term + second_term)
    return score
        

In [None]:
dirichlet_rankings = []
for idx, row in candidate_passages_top1000.iterrows():
#     print('count:', idx+1)
    query = row['query']
    passage = row['passage']
    dirichlet_rankings.append(QLModel_Dirichlet(query, passage))

In [None]:
results_dirichlet = np.array(dirichlet_rankings).argsort()[-100:][::-1]


In [None]:
results_dirichlet

In [None]:
dirichlet_top100 = candidate_passages_top1000.loc[results_dirichlet]
dirichlet_top100[:50]