In [85]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import re, string, unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
import numpy as np
import smart_open
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import math
from math import log
from numpy.random import default_rng, rand


In [3]:
f = open("../dataset/passage_collection_new.txt", 'r', encoding = 'utf-8') 
document = f.read()

In [4]:
header_list = ["qid", "pid", "query", "passage"]
candidate_passages_top1000 = pd.read_csv("../dataset/candidate_passages_top1000.tsv", sep='\t', names=header_list)

In [5]:
candidate_passages_top1000

Unnamed: 0,qid,pid,query,passage
0,494835,7130104,"sensibilities, definition",This is the definition of RNA along with examp...
1,1128373,7130104,iur definition,This is the definition of RNA along with examp...
2,131843,7130104,definition of a sigmet,This is the definition of RNA along with examp...
3,20455,7130335,ar glasses definition,Best Answer: The AR designation comes from the...
4,719381,7130335,what is ar balance,Best Answer: The AR designation comes from the...
...,...,...,...,...
189872,1056204,79980,who was the first steam boat operator,Other operators with special formats accept mo...
189873,1132213,7998257,how long to hold bow in yoga,You may be surprised that to learn that yoga t...
189874,324211,7998651,how much money a united airline get as a capta...,Find cheap airline tickets & deals on flights ...
189875,1116341,7998709,closed ended mortgage definition,"What is a wrap-around mortgage, and who is it ..."


In [6]:
header_list = ["qid", "query"]
test_queries = pd.read_csv("../dataset/test-queries.tsv", sep='\t', names=header_list)

In [7]:
train_data = pd.read_csv("../part2/train_data.tsv", sep='\t')
validation_data = pd.read_csv("../part2/validation_data.tsv", sep='\t')


In [8]:
train_data.head(10)

Unnamed: 0,qid,pid,queries,passage,relevancy
0,188714,1000052,foods and supplements to lower blood sugar,Watch portion sizes: ■ Even healthy foods will...,0.0
1,995526,1000094,where is the federal penitentiary in ind,It takes THOUSANDS of Macy's associates to bri...,0.0
2,660957,1000115,what foods are good if you have gout?,The good news is that you will discover what g...,0.0
3,837202,1000252,what is the nutritional value of oatmeal,"Oats make an easy, balanced breakfast. One cup...",0.0
4,130825,1000268,definition for daring,Such a requirement would have three desirable ...,0.0
5,408149,1000288,is dhgate a scam,If you think you ve been targeted by a counter...,0.0
6,1019649,1000419,what study for mets to brain,Sorry he's having so much pain. The reason tha...,0.0
7,1099065,1000436,how far deep to plant beet early wonder,"The simplest way, and my preference, is to roa...",0.0
8,1084910,1000466,what disease do roof rats cause,1 A cage trap baited with peanut butter or a s...,0.0
9,959083,1000479,when was niagara falls created,"Bulbar Onset – ALS. ALS is like Niagara Falls,...",0.0


In [8]:
train_data.shape

(4364339, 5)

In [10]:
if 0.0:
    print("work")
else:
    print("no")

no


In [9]:
validation_data.head(10)

Unnamed: 0,qid,pid,queries,passage,relevancy
0,1082792,1000084,what does the golgi apparatus do to the protei...,"Start studying Bonding, Carbs, Proteins, Lipid...",0.0
1,995825,1000492,where is the graphic card located in the cpu,"For example, a “PC Expansion Card” maybe the j...",0.0
2,995825,1000494,where is the graphic card located in the cpu,The Common Cards & Buses. The most common type...,0.0
3,1091246,1000522,property premises meaning,The occurrence of since tells us that the firs...,0.0
4,1047854,1000585,what is printing mechanism,Windows desktop applications Develop Desktop t...,0.0
5,991832,1000599,who discovered the element carbon,1. 1 a nonmetallic element existing in the th...,0.0
6,185299,1000647,fastest cell phone processor,Tips for calling a cell phone in Greece: To ca...,0.0
7,574730,1000663,what are the three monetary policy tools of th...,"Federal Reserve updates including rates, news ...",0.0
8,1085008,1000675,what did maria theresa do for the serfs,"In this feudal system, the king awarded land g...",0.0
9,609628,1000771,what county is mitchell south dakota in,South Dakota: According to our research of Sou...,0.0


In [10]:
train_data.shape
validation_data.shape

(1103039, 5)

# Preprocessing

In [12]:
def remove_punctuation(tokens):
    new_tokens = []
    for token in tokens:
        new_token = re.sub(r'[^\w\s]', '', token)
        if new_token != '':
            new_tokens.append(new_token)
    return new_tokens

def remove_stopwords(tokens):
    new_tokens = []
    stopword_set = set(stopwords.words('english'))
    for token in tokens:
        if token not in stopword_set:
            new_tokens.append(token)
    return new_tokens

def lemmatize_verbs(tokens):
    lemmatizer = WordNetLemmatizer()
    root_words = []
    for token in tokens:
        root_word = lemmatizer.lemmatize(token, pos='v')
#         root_word = lemmatizer.lemmatize(token, pos='n')
#         root_word = lemmatizer.lemmatize(token, pos='a')
        root_words.append(root_word)
    return root_words

def remove_numbers(tokens):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    new_tokens = []
    for token in tokens:
        if token.isdigit():
            pass
        else:
            new_tokens.append(token)
    return new_tokens

def preprocessing(passage):
    passage = passage.lower()
    tokens = nltk.word_tokenize(passage)
    tokens = remove_punctuation(tokens)
    tokens = remove_stopwords(tokens)
    tokens = lemmatize_verbs(tokens)
    tokens = remove_numbers(tokens)
    return tokens

# Subtask 1

## BM25

In [13]:
validation_data_no_dup_passages = validation_data.drop_duplicates(subset=['pid'], inplace=False)


In [14]:
# validation_data_no_dup_passages.head(20)
validation_data_no_dup_passages.shape

(955211, 5)

In [10]:
def get_passage_average_length_and_total_word_occurences_corpus():
    number_of_passages = len(validation_data_no_dup_passages)
    count_total_length = 0
    for idx, row in validation_data_no_dup_passages.iterrows():
        count_total_length += len(preprocessing(row['passage']))
    return count_total_length, count_total_length/number_of_passages 


In [11]:
# total_word_occurences, avdl = get_passage_average_length_and_total_word_occurences_corpus()
total_word_occurences = 30757932 # for validation data
avdl = 32.200144261320276 # for validation data

In [12]:
# total_word_occurences
# avdl

In [303]:
from math import log

k1 = 1.2
k2 = 100
b = 0.75
R = 0
r = 0
N = len(validation_data_no_dup_passages)

def K_cal(dl):
    return k1 * ((1-b) + b * (float(dl)/float(avdl)) )


def BM25_cal(query, passage):
    query_tokens = preprocessing(query)
    passage_tokens = preprocessing(passage)
    query_length = len(query_tokens)
    query_token_freq_dict = nltk.FreqDist(query_tokens)
    passage_token_freq_dict = nltk.FreqDist(passage_tokens)
    dl = len(passage_tokens)
    K = K_cal(dl)
    score = 0
    for token in query_tokens:
        try:
            n = len(inverted_index[token])
        except:
            n = 0
        f = passage_token_freq_dict[token]
        qf = query_token_freq_dict[token]
        first_term = log( ( (r + 0.5) / (R - r + 0.5) ) / ( (n - r + 0.5) / (N - n - R + r + 0.5)) )
        second_term = ((k1 + 1) * f) / (K + f)
        third_term = ((k2+1) * qf) / (k2 + qf)
        score += first_term * second_term * third_term
    return score
    

NameError: name 'validation_data_no_dup_passages' is not defined

In [14]:
bm25_rankings = []
for idx, row in validation_data.iterrows():
#     print('count:', idx+1)
    query = row['queries']
    passage = row['passage']
    bm25_rankings.append(BM25_cal(query, passage))

    

In [15]:
ranking_k = 100
results_bm25 = np.array(bm25_rankings).argsort()[-ranking_k:][::-1]
results_bm25

array([ 218534, 1076853,   95986,   95987,  831065,  725920,  504442,
        654647,  341262,  281695,  950916,  586050,  871938,  793356,
        944762,  714508,  900185,  206973,  629358,  701682,  248618,
        259324,  702792,  286327,  205094,   98221,  519566,  242694,
        705667,  922032,  197484,  431189,  419681,  237993,  502543,
        793493,  646679,  559150,  303774,  638714,  778563,  155707,
         45281,  132857,  213755,  299424,  252540,    9338, 1075869,
        139233,  733889,  401291,  373812,  784890, 1008311,  911301,
        909449,  558879,  857483,  602553,  171827,  217514,  857646,
        539660,  195886, 1078616,  554316,  872234,  346989,   28250,
       1026344,  330458, 1026153,  612051,  395691,  530538,  313678,
        693646,  175037,  751525,  216400,  663797,   99840,  928996,
          5558,  428329,  546185,   46827,  112665,  118094,  752555,
       1017110,  455444,  910607,  648719,  793681, 1063650,  212436,
        415429,  654

In [16]:
ranking_list_df = validation_data.loc[results_bm25]
ranking_list_df

Unnamed: 0,qid,pid,queries,passage,relevancy
218534,1007691,7251254,"when allocating service department costs, the ...",Direct method allocates each service departmen...,1.0
1076853,1007691,7251259,"when allocating service department costs, the ...",The direct method is the most widely-used meth...,0.0
95986,1007691,7251251,"when allocating service department costs, the ...",service department provides a large amount of ...,0.0
95987,1007691,7251253,"when allocating service department costs, the ...","The rows sum to 100%, so that all services pro...",0.0
831065,1007691,7251255,"when allocating service department costs, the ...",The most defensible sequence is to start with ...,0.0
...,...,...,...,...,...
793681,1007691,4814576,"when allocating service department costs, the ...",Service Members | Veterans | Both. Military On...,0.0
1063650,1007691,6395207,"when allocating service department costs, the ...",that hospital emergency department services ar...,0.0
212436,1007691,6872353,"when allocating service department costs, the ...",Yelp Customer Service customer service phone n...,0.0
415429,1007691,4114248,"when allocating service department costs, the ...","A service fee, service charge, or surcharge is...",0.0


In [17]:
validation_data[(validation_data['relevancy'] < 1.0) & (validation_data['relevancy'] > 0)]


Unnamed: 0,qid,pid,queries,passage,relevancy


In [18]:
def average_precision_cal(ranking_list_df):
    ranking_list_df = ranking_list_df.reset_index(drop=True, inplace=False)
    total_relevant_retrieved = 0
    precision_sum = 0
    for idx, row in ranking_list_df.iterrows():
        relevancy = row['relevancy']
        if (relevancy):
#             isRelevant = True
            total_relevant_retrieved += 1
            precision = total_relevant_retrieved / (idx + 1)
            precision_sum += precision
    result = precision_sum / len(ranking_list_df)
    return result

In [19]:
average_precision_cal(ranking_list_df)

0.07445786781310981

In [20]:
ranking_list_df.sort_values(by=['relevancy'], ascending=False)


Unnamed: 0,qid,pid,queries,passage,relevancy
218534,1007691,7251254,"when allocating service department costs, the ...",Direct method allocates each service departmen...,1.0
950916,1089945,7079883,the __________ test is a quick and dirty test ...,• The Smell Test is familiar ground in most bu...,1.0
539660,1007691,423230,"when allocating service department costs, the ...",IT Service (ITILv3): A Service provided to one...,0.0
612051,1007691,994382,"when allocating service department costs, the ...",All UK telephone numbers beginning with the di...,0.0
1026153,1007691,3941750,"when allocating service department costs, the ...",Respite (Out-of-Home) Services [edit]. Respite...,0.0
...,...,...,...,...,...
431189,1007691,5146501,"when allocating service department costs, the ...",The Department offers service coordination and...,0.0
197484,1007691,5904987,"when allocating service department costs, the ...",If you are a delinquent juror who has been ins...,0.0
922032,1007691,5220119,"when allocating service department costs, the ...",There are different types of customer service ...,0.0
705667,1007691,7088169,"when allocating service department costs, the ...",Us Postal Service Customer Service Phone Numbe...,0.0


In [21]:
ranking_list_df

Unnamed: 0,qid,pid,queries,passage,relevancy
218534,1007691,7251254,"when allocating service department costs, the ...",Direct method allocates each service departmen...,1.0
1076853,1007691,7251259,"when allocating service department costs, the ...",The direct method is the most widely-used meth...,0.0
95986,1007691,7251251,"when allocating service department costs, the ...",service department provides a large amount of ...,0.0
95987,1007691,7251253,"when allocating service department costs, the ...","The rows sum to 100%, so that all services pro...",0.0
831065,1007691,7251255,"when allocating service department costs, the ...",The most defensible sequence is to start with ...,0.0
...,...,...,...,...,...
793681,1007691,4814576,"when allocating service department costs, the ...",Service Members | Veterans | Both. Military On...,0.0
1063650,1007691,6395207,"when allocating service department costs, the ...",that hospital emergency department services ar...,0.0
212436,1007691,6872353,"when allocating service department costs, the ...",Yelp Customer Service customer service phone n...,0.0
415429,1007691,4114248,"when allocating service department costs, the ...","A service fee, service charge, or surcharge is...",0.0


In [22]:
def get_IDCG(ranking_list_df):
    ranking_list_df_sorted = ranking_list_df.sort_values(by=['relevancy'], ascending=False)
#     ranking_list_df_sorted = ranking_list_df.reset_index(drop=True, inplace=False)
    ranking_list_df_sorted = ranking_list_df_sorted.reset_index().reindex(ranking_list_df_sorted.columns, axis=1)
    ideal_discounted_gain_sum = 0
    for idx, row in ranking_list_df_sorted.iterrows():
        index = idx + 1
        relevance_score = row['relevancy']
        gain = 2 ** relevance_score - 1
        discounted_gain = gain / math.log2(index + 1)
        ideal_discounted_gain_sum += discounted_gain
    return ideal_discounted_gain_sum

In [23]:
def get_NDCG(ranking_list_df):
    ranking_list_df = ranking_list_df.reset_index(drop=True, inplace=False)
    discounted_gain_sum = 0
    for idx, row in ranking_list_df.iterrows():
        index = idx + 1
        relevance_score = row['relevancy']
        gain = 2 ** relevance_score - 1
        discounted_gain = gain / math.log2(index + 1)
        discounted_gain_sum += discounted_gain
    IDCG = get_IDCG(ranking_list_df)
    
    nDCG = discounted_gain_sum / IDCG
    return nDCG

In [24]:
get_NDCG(ranking_list_df)

0.7841802768331765

# Subtask 2

In [89]:
# train_data = pd.read_csv("../part2/train_data.tsv", sep='\t')
# validation_data = pd.read_csv("../part2/validation_data.tsv", sep='\t')

In [90]:
train_data = train_data[:1000] # 일단 1000개만 해봄
validation_data = validation_data[:1000] # 일단 1000개만 해봄

## Preprocessing

In [91]:
# passages = validation_data.passage.values[:1000]
# validation_data_temp = validation_data[:1000]
# validation_data_temp['passage_cleaned']=validation_data_temp.passage.apply(lambda x: preprocessing(x))
# validation_data_temp['queries_cleaned']=validation_data_temp.queries.apply(lambda x: preprocessing(x))

In [92]:
train_data['passage_cleaned']=train_data.passage.apply(lambda x: preprocessing(x))
train_data['query_cleaned']=train_data.queries.apply(lambda x: preprocessing(x))

In [93]:
train_data

Unnamed: 0,qid,pid,queries,passage,relevancy,passage_cleaned,query_cleaned,co_similarity,bm25
0,188714,1000052,foods and supplements to lower blood sugar,Watch portion sizes: ■ Even healthy foods will...,0.0,"[watch, portion, size, even, healthy, foods, c...","[foods, supplement, lower, blood, sugar]",0.812210,28.648708
1,995526,1000094,where is the federal penitentiary in ind,It takes THOUSANDS of Macy's associates to bri...,0.0,"[take, thousands, macy, associate, bring, magi...","[federal, penitentiary, ind]",0.345645,8.308349
2,660957,1000115,what foods are good if you have gout?,The good news is that you will discover what g...,0.0,"[good, news, discover, go, action, spur, narro...","[foods, good, gout]",0.640991,21.800407
3,837202,1000252,what is the nutritional value of oatmeal,"Oats make an easy, balanced breakfast. One cup...",0.0,"[oats, make, easy, balance, breakfast, one, cu...","[nutritional, value, oatmeal]",0.670880,9.377437
4,130825,1000268,definition for daring,Such a requirement would have three desirable ...,0.0,"[requirement, would, three, desirable, consequ...","[definition, dare]",0.386194,5.959412
...,...,...,...,...,...,...,...,...,...
995,400803,1016366,is a revocable trust a separate legal entity,The income and deductions of the trust are rep...,0.0,"[income, deductions, trust, report, income, ta...","[revocable, trust, separate, legal, entity]",0.667951,21.828710
996,400803,1016370,is a revocable trust a separate legal entity,A grantor trust is a living revocable trust in...,0.0,"[grantor, trust, live, revocable, trust, grant...","[revocable, trust, separate, legal, entity]",0.699175,22.962073
997,544319,1016449,"weather in gig harbor, wa",The gig economy is the collection of markets t...,0.0,"[gig, economy, collection, market, match, prov...","[weather, gig, harbor, wa]",0.383561,11.130238
998,617246,1016466,what decisions rules can determine upheld or d...,"To claim a tax deduction for business mileage,...",0.0,"[claim, tax, deduction, business, mileage, sel...","[decisions, rule, determine, uphold, dismiss, ...",0.534107,17.704609


In [94]:
validation_data['passage_cleaned']=validation_data.passage.apply(lambda x: preprocessing(x))
validation_data['query_cleaned']=validation_data.queries.apply(lambda x: preprocessing(x))

## Loading word2vec embedding

In [95]:
# loading pre-trained embeddings, each word is represented as a 300 dimensional vector
import gensim
W2V_PATH="../GoogleNews-vectors-negative300.bin"
model_w2v = gensim.models.KeyedVectors.load_word2vec_format(W2V_PATH, binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


MemoryError: Unable to allocate 3.35 GiB for an array with shape (3000000, 300) and data type float32

# Embedding documents and queries

## embedding documents

### training data

In [None]:
# tokenize and pad every document to make them of the same size
passage_tokenizer=Tokenizer()
passage_tokenizer.fit_on_texts(train_data.passage_cleaned)
passage_max_length = 128 # document length including padding
query_max_length = 64 # query length including padding

In [None]:
train_data_length = train_data.shape[0]
document_word_embeddings_train=np.zeros((train_data_length, passage_max_length,300)) # 64 == padding
passages = train_data.passage_cleaned
for i in range(len(passages)):
    passage = passages[i]
    passage_length = len(passage)
    for j in range(passage_length): 
        word = passage[j]
        if word in model_w2v:
            document_word_embeddings_train[i][j] = model_w2v[word]
document_word_embeddings_train.shape

In [None]:
average_vector_list_length = document_word_embeddings_train.shape[0]
average_document_vectors_train = np.zeros((average_vector_list_length,300))
for i in range(average_vector_list_length):
    average_document_vectors_train[i] = np.mean(document_word_embeddings_train[i], axis=0)
average_document_vectors_train.shape

### validation data

In [None]:
# tokenize and pad every document to make them of the same size
passage_tokenizer=Tokenizer()
passage_tokenizer.fit_on_texts(validation_data.passage_cleaned)


In [None]:
validation_data_length = validation_data.shape[0]
document_word_embeddings_val=np.zeros((validation_data_length, passage_max_length,300)) # 64 == padding
passages = validation_data.passage_cleaned
for i in range(len(passages)):
    passage = passages[i]
    passage_length = len(passage)
    for j in range(passage_length): 
        word = passage[j]
        if word in model_w2v:
            document_word_embeddings_val[i][j] = model_w2v[word]
document_word_embeddings_val.shape

In [None]:
average_vector_list_length = document_word_embeddings_val.shape[0]
average_document_vectors_val = np.zeros((average_vector_list_length,300))
for i in range(average_vector_list_length):
    average_document_vectors_val[i] = np.mean(document_word_embeddings_val[i], axis=0)
average_document_vectors_val.shape

## embedding quries

## 문제점:
1. 특정 단어가 word embedding에 없는 경우가 있음
-> query_vector 의 모든 entry가 0임

-> cosine similarity 계산할 때 nan 이 나옴

temp solution:
cosine similarity 가 0일떄 결과값에 0 을 넣어줌

### training data

In [96]:
# tokenize and pad every document to make them of the same size
query_tokenizer=Tokenizer()
query_tokenizer.fit_on_texts(train_data.query_cleaned)


In [97]:
train_data_length = train_data.shape[0]
query_word_embeddings_train=np.zeros((train_data_length, query_max_length,300)) # 64 == padding
queries = train_data.query_cleaned
for i in range(len(queries)):
    query = queries[i]
    query_length = len(query)
    for j in range(query_length): 
        word = query[j]
        if word in model_w2v:
            query_word_embeddings_train[i][j] = model_w2v[word]
query_word_embeddings_train.shape

(1000, 64, 300)

In [98]:
average_vector_list_length = query_word_embeddings_train.shape[0]
average_query_vectors_train = np.zeros((average_vector_list_length,300))
for i in range(average_vector_list_length):
    average_query_vectors_train[i] = np.mean(query_word_embeddings_train[i], axis=0)
average_query_vectors_train.shape

(1000, 300)

### validation data

In [99]:
# tokenize and pad every document to make them of the same size
query_tokenizer=Tokenizer()
query_tokenizer.fit_on_texts(validation_data.query_cleaned)


In [100]:
validation_data_length = validation_data.shape[0]
query_word_embeddings_val=np.zeros((validation_data_length, query_max_length,300)) # 64 == padding
queries = validation_data.query_cleaned
for i in range(len(queries)):
    query = queries[i]
    query_length = len(query)
    for j in range(query_length): 
        word = query[j]
        if word in model_w2v:
            query_word_embeddings_val[i][j] = model_w2v[word]
query_word_embeddings_val.shape

(1000, 64, 300)

In [101]:
average_vector_list_length = query_word_embeddings_val.shape[0]
average_query_vectors_val = np.zeros((average_vector_list_length,300))
for i in range(average_vector_list_length):
    average_query_vectors_val[i] = np.mean(query_word_embeddings_val[i], axis=0)
average_query_vectors_val.shape

(1000, 300)

## creating training data(adding features)

In [102]:
num_of_features = 2 # cosine_sim, bm25

In [103]:
x_train = np.zeros((average_query_vectors_train.shape[0], num_of_features))
x_val = np.zeros((average_query_vectors_val.shape[0], num_of_features))
x_train.shape

(1000, 2)

In [104]:
x_train[0][0]

0.0

### Cosine Simliarity

In [105]:
def cosine_sim_formula(a, b):
    cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
    if np.isnan(cos_sim):
        cos_sim = 0
    return cos_sim

In [106]:
for i in range(x_train.shape[0]):
    query_vector = average_query_vectors_train[i]
    passage_vector = average_document_vectors_train[i]
    x_train[i][0] = cosine_sim_formula(query_vector, passage_vector)
    if np.isnan(x_train[i][0]):
        x_train[i][0] = 0


  


In [107]:
train_data['co_similarity'] = x_train[:,0]

In [108]:
train_data

Unnamed: 0,qid,pid,queries,passage,relevancy,passage_cleaned,query_cleaned,co_similarity,bm25
0,188714,1000052,foods and supplements to lower blood sugar,Watch portion sizes: ■ Even healthy foods will...,0.0,"[watch, portion, size, even, healthy, foods, c...","[foods, supplement, lower, blood, sugar]",0.812210,28.648708
1,995526,1000094,where is the federal penitentiary in ind,It takes THOUSANDS of Macy's associates to bri...,0.0,"[take, thousands, macy, associate, bring, magi...","[federal, penitentiary, ind]",0.345645,8.308349
2,660957,1000115,what foods are good if you have gout?,The good news is that you will discover what g...,0.0,"[good, news, discover, go, action, spur, narro...","[foods, good, gout]",0.640991,21.800407
3,837202,1000252,what is the nutritional value of oatmeal,"Oats make an easy, balanced breakfast. One cup...",0.0,"[oats, make, easy, balance, breakfast, one, cu...","[nutritional, value, oatmeal]",0.670880,9.377437
4,130825,1000268,definition for daring,Such a requirement would have three desirable ...,0.0,"[requirement, would, three, desirable, consequ...","[definition, dare]",0.386194,5.959412
...,...,...,...,...,...,...,...,...,...
995,400803,1016366,is a revocable trust a separate legal entity,The income and deductions of the trust are rep...,0.0,"[income, deductions, trust, report, income, ta...","[revocable, trust, separate, legal, entity]",0.667951,21.828710
996,400803,1016370,is a revocable trust a separate legal entity,A grantor trust is a living revocable trust in...,0.0,"[grantor, trust, live, revocable, trust, grant...","[revocable, trust, separate, legal, entity]",0.699175,22.962073
997,544319,1016449,"weather in gig harbor, wa",The gig economy is the collection of markets t...,0.0,"[gig, economy, collection, market, match, prov...","[weather, gig, harbor, wa]",0.383561,11.130238
998,617246,1016466,what decisions rules can determine upheld or d...,"To claim a tax deduction for business mileage,...",0.0,"[claim, tax, deduction, business, mileage, sel...","[decisions, rule, determine, uphold, dismiss, ...",0.534107,17.704609


In [109]:
for i in range(len(x_val)):
    query_vector = average_query_vectors_val[i]
    passage_vector = average_document_vectors_val[i]
    x_val[i][0] = cosine_sim_formula(query_vector, passage_vector)
    if np.isnan(x_val[i][0]):
        x_val[i][0] = 0

In [110]:
validation_data['co_similarity'] = x_val[:,0]

In [111]:
validation_data

Unnamed: 0,qid,pid,queries,passage,relevancy,passage_cleaned,query_cleaned,co_similarity,bm25
0,1082792,1000084,what does the golgi apparatus do to the protei...,"Start studying Bonding, Carbs, Proteins, Lipid...",0.0,"[start, study, bond, carbs, proteins, lipids, ...","[golgi, apparatus, proteins, lipids, arrive]",0.609541,19.798340
1,995825,1000492,where is the graphic card located in the cpu,"For example, a “PC Expansion Card” maybe the j...",0.0,"[example, pc, expansion, card, maybe, jargon, ...","[graphic, card, locate, cpu]",0.675188,19.714530
2,995825,1000494,where is the graphic card located in the cpu,The Common Cards & Buses. The most common type...,0.0,"[common, card, bus, common, type, expansion, c...","[graphic, card, locate, cpu]",0.762009,20.945883
3,1091246,1000522,property premises meaning,The occurrence of since tells us that the firs...,0.0,"[occurrence, since, tell, us, first, statement...","[property, premise, mean]",0.561729,12.263720
4,1047854,1000585,what is printing mechanism,Windows desktop applications Develop Desktop t...,0.0,"[windows, desktop, applications, develop, desk...","[print, mechanism]",0.612431,13.277623
...,...,...,...,...,...,...,...,...,...
995,1069028,1063432,what is a preliminary source,"In September 22 1862, after the Union's victor...",0.0,"[september, union, victory, antietam, lincoln,...","[preliminary, source]",0.296364,9.579258
996,14947,1063503,airport code mont tremblant,Cities > Norway > Airports near Ølen. The clos...,0.0,"[cities, norway, airports, near, ølen, closest...","[airport, code, mont, tremblant]",0.559477,20.494116
997,1036002,1063567,who is melvin booker,"Double or Nothing, a song by B.o.B and Big Boi...",0.0,"[double, nothing, song, bob, big, boi, album, ...","[melvin, booker]",0.396677,6.983769
998,731736,1063649,what is coastal erosion,1.1 DEFINING COASTAL AREAS. Coastal areas are ...,0.0,"[define, coastal, areas, coastal, areas, commo...","[coastal, erosion]",0.570154,12.559034


In [112]:
y_train = train_data.relevancy.values
y_val = validation_data.relevancy.values

### BM25

In [113]:
train_data_no_dup_passages = train_data.drop_duplicates(subset=['pid'], inplace=False)
validation_data_no_dup_passages = validation_data.drop_duplicates(subset=['pid'], inplace=False)

In [114]:
train_data_no_dup_passages.shape
validation_data_no_dup_passages.shape
N_train = train_data_no_dup_passages.shape[0]
N_val = validation_data_no_dup_passages.shape[0]

(701, 9)

(878, 9)

In [115]:
def get_passage_average_length(dataframe_no_dup_passages):
    number_of_passages = validation_data_no_dup_passages.shape[0]
    count_total_length = 0
    for idx, row in dataframe_no_dup_passages.iterrows():
        count_total_length += len(row['passage_cleaned'])
    return count_total_length/number_of_passages 


In [116]:
# total_word_occurences, avdl = get_passage_average_length_and_total_word_occurences_corpus()
avdl_train = get_passage_average_length(train_data_no_dup_passages)
avdl_val = get_passage_average_length(validation_data_no_dup_passages)

In [117]:
from math import log

k1 = 1.2
k2 = 100
b = 0.75
R = 0
r = 0
# N = len(validation_data_no_dup_passages)

def K_cal(dl, avdl):
    return k1 * ((1-b) + b * (float(dl)/float(avdl)) )


def BM25_cal_for_preprocessed_words(query_tokens, passage_tokens, N, avdl):
    query_length = len(query_tokens)
    query_token_freq_dict = nltk.FreqDist(query_tokens)
    passage_token_freq_dict = nltk.FreqDist(passage_tokens)
    dl = len(passage_tokens)
    K = K_cal(dl, avdl)
    score = 0
    for token in query_tokens:
        try:
            n = len(inverted_index[token])
        except:
            n = 0
        f = passage_token_freq_dict[token]
        qf = query_token_freq_dict[token]
        first_term = log( ( (r + 0.5) / (R - r + 0.5) ) / ( (n - r + 0.5) / (N - n - R + r + 0.5)) )
        second_term = ((k1 + 1) * f) / (K + f)
        third_term = ((k2+1) * qf) / (k2 + qf)
        score += first_term * second_term * third_term
    return score
    

In [118]:
for idx, row in train_data.iterrows():
    query_cleaned = row['query_cleaned']
    passage_cleaned = row['passage_cleaned']
    bm25_score = BM25_cal_for_preprocessed_words(query_cleaned, passage_cleaned, N_train, avdl_train)
    x_train[idx][1] = bm25_score

In [119]:
for idx, row in validation_data.iterrows():
    query_cleaned = row['query_cleaned']
    passage_cleaned = row['passage_cleaned']
    bm25_score = BM25_cal_for_preprocessed_words(query_cleaned, passage_cleaned, N_val, avdl_val)
    x_val[idx][1] = bm25_score

In [120]:
train_data['bm25'] = x_train[:,1]
validation_data['bm25'] = x_val[:,1]

In [121]:
train_data

Unnamed: 0,qid,pid,queries,passage,relevancy,passage_cleaned,query_cleaned,co_similarity,bm25
0,188714,1000052,foods and supplements to lower blood sugar,Watch portion sizes: ■ Even healthy foods will...,0.0,"[watch, portion, size, even, healthy, foods, c...","[foods, supplement, lower, blood, sugar]",0.812210,28.648708
1,995526,1000094,where is the federal penitentiary in ind,It takes THOUSANDS of Macy's associates to bri...,0.0,"[take, thousands, macy, associate, bring, magi...","[federal, penitentiary, ind]",0.345645,8.308349
2,660957,1000115,what foods are good if you have gout?,The good news is that you will discover what g...,0.0,"[good, news, discover, go, action, spur, narro...","[foods, good, gout]",0.640991,21.800407
3,837202,1000252,what is the nutritional value of oatmeal,"Oats make an easy, balanced breakfast. One cup...",0.0,"[oats, make, easy, balance, breakfast, one, cu...","[nutritional, value, oatmeal]",0.670880,9.377437
4,130825,1000268,definition for daring,Such a requirement would have three desirable ...,0.0,"[requirement, would, three, desirable, consequ...","[definition, dare]",0.386194,5.959412
...,...,...,...,...,...,...,...,...,...
995,400803,1016366,is a revocable trust a separate legal entity,The income and deductions of the trust are rep...,0.0,"[income, deductions, trust, report, income, ta...","[revocable, trust, separate, legal, entity]",0.667951,21.828710
996,400803,1016370,is a revocable trust a separate legal entity,A grantor trust is a living revocable trust in...,0.0,"[grantor, trust, live, revocable, trust, grant...","[revocable, trust, separate, legal, entity]",0.699175,22.962073
997,544319,1016449,"weather in gig harbor, wa",The gig economy is the collection of markets t...,0.0,"[gig, economy, collection, market, match, prov...","[weather, gig, harbor, wa]",0.383561,11.130238
998,617246,1016466,what decisions rules can determine upheld or d...,"To claim a tax deduction for business mileage,...",0.0,"[claim, tax, deduction, business, mileage, sel...","[decisions, rule, determine, uphold, dismiss, ...",0.534107,17.704609


## tf-idf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

word embedding cosine similarity 보다 정확하지 않을 가능성이 있고, time consuming 한 작업이라, 이 feature는 추가하지 않았다.

## Convert data into metrics form

In [54]:
x_train = x_train.reshape(x_train.shape[0],x_train.shape[1], 1)
x_val = x_val.reshape(x_val.shape[0],x_val.shape[1], 1)

## Logisitc Regression

## 현재 문제 
1. 딥러닝 과제처럼, 한 data씩 처리 할건지, 아니면 인터넷 예제처럼 모든 value를 metrics 에 넣어서 한번에 처리할건지
2. 딥러닝 과제처럼 처리했을 경우, gradient descent function은 무엇인지, 인터넷 예제처럼 했을 때도, 인터넷에 있는 gradient descent function 이 어떻게 derive 됬는지 알기

A: 결국엔 둘다 똑같은 방법인데, metrics 로 처리하는 경우는 한번에 모든 데이터를 처리하는 경우이고, 딥러닝 과제는 한 data sampling 을 하나씩 처리하는 방법이다. 결국엔 둘 다 똑같다.

In [454]:
x_train

array([[28.64870805, 28.64870805],
       [ 0.34564488,  8.30834856],
       [ 0.64099141, 21.80040673],
       ...,
       [ 0.38356136, 11.13023754],
       [ 0.53410694, 17.70460941],
       [ 0.57767753, 22.99640013]])

In [392]:
x_train.shape

(1000, 2)

In [464]:
x = x_train[1]
x

array([[0.34564488],
       [8.30834856]])

In [445]:
x.shape

(2, 1)

In [465]:
weights = np.random.rand(1, 2)


In [466]:
weights.shape

(1, 2)

In [467]:
weights

array([[0.21645978, 0.05495869]])

In [476]:
np.dot(weights, x)[0][0]

0.531434157722148

In [433]:
weights = weights.reshape(1,2)

In [435]:
weights.shape

(1, 2)

In [440]:
np.dot(x, weights)

array([45.76190532])

In [487]:
np.zeros((1, 2)).shape

(1, 2)

In [490]:
class LogisticRegression:

    def __init__(self, learning_rate=0.001, epoch=500):
        self.lr = learning_rate
        self.epoch = epoch
        self.weights = None
        self.bias = None
        self.loss_train = None
        self.loss_val = None

        self.acc_train = None
        self.train_correct = None
        self.val_correct = None
        
        self.losslist_train = []
        self.losslist_val = []
        self.acclist_train = []
        self.acclist_val = []
    

        
    def fit(self, trainxs, trainys, trainxs_val, trainys_val):
        n_samples, n_features, _ = trainxs.shape
        n_samples_val = trainxs_val.shape[0]

        # init parameters
#         self.weights = np.zeros((n_features, 1))
        self.weights = np.zeros((1, n_features))
        self.bias = 0
        
        
        
        # gradient descent
        for _ in range(self.epoch):
            idx = self.shuffleIdx(trainxs.shape[0])
            X = trainxs[idx]
            Y = trainys[idx]
            
            
            self.loss_train = 0
            self.loss_val = 0

            self.acc_train = 0
            self.train_correct = 0
            self.val_correct = 0
            total_weight_gradient1 = 0
            total_weight_gradient2 = 0
            
            total_bias_gradient = 0
            
            for i in range(trainxs.shape[0]):
                x = X[i]
                y = Y[i]
                # approximate output variable (y) with linear combination of weights and x, plus bias
                linear_equation = np.dot(self.weights, x)[0][0] + self.bias
                # apply sigmoid function
                prediction = self.sigmoid(linear_equation)
                if prediction >= 0.5:
                    yprime = 1
                else:
                    yprime = 0

                if yprime == y:
                    self.train_correct += 1

                self.loss_train += self.loss_function(y, prediction)

                # compute gradients
                dw1 = (prediction - y) * x[0]  #derivative w.r.t weight1
                dw2 = (prediction - y) * x[1] #derivative w.r.t weight2
                db = prediction - y  #derivative w.r.t bias
                    
                    
                total_weight_gradient1 += dw1
                total_weight_gradient2 += dw2
                total_bias_gradient += db
               
                 # COMPUTING LOSS AND ACCURACY OF VALIDATION SET
                if (i < trainxs_val.shape[0]):
                    val_x = trainxs_val[i]
                    val_y = trainys_val[i]
                    linear_equation = np.dot(self.weights, val_x)[0][0] + self.bias
                    prediction = self.sigmoid(linear_equation)

                    if prediction  >= 0.5:
                        yprime = 1
                    else:
                        yprime = 0

                    if yprime == val_y:
                        self.val_correct += 1

                    self.loss_val += ((val_y - prediction)**2)/2

                    
            # COMPUTING THE AVERAGE OF GRADIENTS FOR EACH EPOCH SINCE WE ARE DOING        
            # FULL-BATCH GRADIENT DESCEND AND UPDATING THE PARAMETERS AFTER EACH EPOCH
            total_weight_gradient1 = total_weight_gradient1 / n_samples
            total_weight_gradient2 = total_weight_gradient2 / n_samples
            total_bias_gradient = total_bias_gradient / n_samples

            # update parameters
            self.weights[0][0] -= self.lr * total_weight_gradient1
            self.weights[0][1] -= self.lr * total_weight_gradient2
            self.bias -= self.lr * total_bias_gradient
            print("total_weight_gradient1:", total_weight_gradient1)
            print("total_weight_gradient2:", total_weight_gradient2)
            
            self.loss_train = self.loss_train/n_samples
            self.losslist_train.append(self.loss_train)

            self.loss_val = self.loss_val/(n_samples_val)
            self.losslist_val.append(self.loss_val)

            self.train_correct = self.train_correct/n_samples
            self.acclist_train.append(self.train_correct)

            self.val_correct = self.val_correct/(n_samples_val)
            self.acclist_val.append(self.val_correct)

            
    def shuffleIdx(self, n):
        rng = default_rng()
        rand_idx = rng.permutation(n)
        return rand_idx

    def predict(self, X):
        linear_equation = np.dot(X, self.weights) + self.bias
        y_predicted = self.sigmoid(linear_equation)
        y_predicted_cls = [1 if i > 0.5 else 0 for i in y_predicted]
        return np.array(y_predicted_cls)

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    
    def loss_function(self, y, prediction):
        return -log((1 - prediction)**(1 - y)) - log(prediction**y)
    

In [491]:
lr = LogisticRegression(learning_rate=0.0001, epoch=1)
lr.fit(x_train, y_train, x_val, y_val)
# predictions = regressor.predict(xtest)


total_weight_gradient1: [0.30174978]
total_weight_gradient2: [7.99752143]


In [492]:
lr.losslist_train

[0.6931471805599322]

In [493]:
lr.bias

-5e-05

In [494]:
lr.weights[0][0]

-3.017497769225429e-05

# Subtask 3


In [55]:
import pandas as pd
import numpy as np
from xgboost import DMatrix,train



## Preprocessing the data into the form which XGboost can take

In [56]:
num_of_features = 2

## train data

In [57]:
train_data = train_data.sort_values(by=['queries'], ascending=False)
train_data


Unnamed: 0,qid,pid,queries,passage,relevancy,passage_cleaned,query_cleaned,co_similarity,bm25
376,1079831,1006123,world trade organization definition,Definition of free trade. : trade based on the...,0.0,"[definition, free, trade, trade, base, unrestr...","[world, trade, organization, definition]",0.652645,20.733095
104,1078920,100198,women benefits from taking dim,Effects of Medications during Pregnancy. We kn...,0.0,"[effect, medications, pregnancy, know, little,...","[women, benefit, take, dim]",0.478454,16.041879
676,1078752,1011079,withdrawal symptoms of amitriptyline,Amitriptyline for the treatment of depression....,0.0,"[amitriptyline, treatment, depression, amitrip...","[withdrawal, symptoms, amitriptyline]",0.693089,14.635104
74,1101861,1001625,windstream troubleshooting phone number,Fisher Price Smart Cycle Manual Troubleshootin...,0.0,"[fisher, price, smart, cycle, manual, troubles...","[windstream, troubleshoot, phone, number]",0.604961,11.993841
865,1068924,1014493,why hemorrhagic disease tests due to vitamin k...,Vitamin B12 is also said to help some sleep di...,0.0,"[vitamin, b12, also, say, help, sleep, disorde...","[hemorrhagic, disease, test, due, vitamin, k, ...",0.785148,28.156938
...,...,...,...,...,...,...,...,...,...
720,15063,1011490,alabama central credit union routing number,Generations Federal Credit Union Routing Numbe...,0.0,"[generations, federal, credit, union, rout, nu...","[alabama, central, credit, union, rout, number]",0.768005,41.377591
753,14151,1012028,age requirements for name change,"For example, someone's age might be an indepen...",0.0,"[example, someone, age, might, independent, va...","[age, requirements, name, change]",0.554912,18.958650
519,1086532,1008690,advanced weighing technology definition,La Crosse Technology brings you the most affor...,0.0,"[la, crosse, technology, bring, affordable, re...","[advance, weigh, technology, definition]",0.527106,15.648874
731,8854,1011732,________ disparity refers to the slightly diff...,Salaries for women in residency are lower than...,0.0,"[salaries, women, residency, lower, men, respe...","[________, disparity, refer, slightly, differe...",0.508428,10.672300


In [58]:
lambdaMart_x_train = np.zeros((train_data.shape[0], num_of_features))

In [59]:
lambdaMart_x_train

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       ...,
       [0., 0.],
       [0., 0.],
       [0., 0.]])

In [60]:
lambdaMart_x_train = train_data[['co_similarity', 'bm25']].values
lambdaMart_x_train

array([[ 0.65264477, 20.73309484],
       [ 0.47845439, 16.04187895],
       [ 0.6930888 , 14.63510436],
       ...,
       [ 0.52710561, 15.64887427],
       [ 0.50842772, 10.67230032],
       [ 0.78260784, 13.88766498]])

In [61]:
lambdaMart_y_train = train_data.relevancy.values

In [62]:
query_count_dict_train = train_data['queries'].value_counts().to_dict()

In [83]:
query_count_dict_train

{'what cities allow pit bulls': 5,
 'how much bitcoins are there in the world': 5,
 'is chromium reactive': 5,
 'what does rice stand for with a muscle injury': 5,
 'what are two main gases in': 4,
 'what does starling mean': 4,
 'price chopper locations in ct': 4,
 'gold price in ksa dammam': 4,
 'what is one role of the element phosphorus?': 3,
 'what medications could cause pemphigoid': 3,
 'where is bellevue canada?': 3,
 'what is the fdic and what does it do': 3,
 'average temperatures cairo november': 3,
 'how long does respite last': 3,
 'how big can leopard tortoises get': 3,
 'what is respite': 3,
 'price of silver per ounce history': 3,
 'what is chattahoochee': 3,
 'aspen dental corporate telephone number': 3,
 'how much is the average dj for a wedding': 3,
 'what is supply chain mps': 3,
 'how does a hydrate differ from an anhydrate': 3,
 'what country does fennel come from': 3,
 'what are the two major subdivisions of the nervous system?': 3,
 'who makes jammy dodgers': 3,

In [161]:
len(query_count_dict_train)

816

In [197]:
dgroup_train = []
dgroup_train

[]

In [198]:
checker = []
for idx, row in train_data.iterrows():
    query = row['queries']
    if query in checker:
        continue
    checker.append(query)
    group_entry_count = query_count_dict_train[query]
    dgroup_train.append(group_entry_count)

In [201]:
# dgroup_train = np.array(dgroup_train)

In [202]:
dgroup_train

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 2,
 1,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,


In [167]:
np.sum(dgroup_train)

1000

## validation data

In [88]:
validation_data = validation_data.sort_values(by=['queries'], ascending=False)
validation_data

Unnamed: 0,qid,pid,queries,passage,relevancy,passage_cleaned,query_cleaned,co_similarity,bm25
719,1101806,1046669,wow essential oil,Use approximately six drops of essential oil f...,0.0,"[use, approximately, six, drop, essential, oil...","[wow, essential, oil]",0.644733,24.310915
774,1078446,1050857,wine cabinets definition,"Terroir definition, the environmental conditio...",0.0,"[terroir, definition, environmental, condition...","[wine, cabinets, definition]",0.550765,17.995400
567,1101868,1037249,willie weeks net worth,Kirk Frost net worth: $600 Thousand. Kirk Fros...,0.0,"[kirk, frost, net, worth, thousand, kirk, fros...","[willie, weeks, net, worth]",0.661165,25.569472
907,1101868,1059211,willie weeks net worth,Updated Constance McCashin Net Worth in 2017. ...,0.0,"[update, constance, mccashin, net, worth, wiki...","[willie, weeks, net, worth]",0.684801,25.771400
915,1101868,1059337,willie weeks net worth,The Leonardo DiCaprio net worth total of $217 ...,0.0,"[leonardo, dicaprio, net, worth, total, millio...","[willie, weeks, net, worth]",0.812987,27.338163
...,...,...,...,...,...,...,...,...,...
996,14947,1063503,airport code mont tremblant,Cities > Norway > Airports near Ølen. The clos...,0.0,"[cities, norway, airports, near, ølen, closest...","[airport, code, mont, tremblant]",0.559477,20.494116
627,10264,1040388,access parallels cost,Some are available every day of the year. At t...,0.0,"[available, every, day, year, beach, surf, wat...","[access, parallel, cost]",0.341840,8.133568
968,1099488,1062356,a scar meaning,Last Week's Popular Questions for Celebrities ...,0.0,"[last, week, popular, question, celebrities, a...","[scar, mean]",0.646320,11.629403
969,1099488,1062358,a scar meaning,Last Week's Popular Questions for Celebrities....,0.0,"[last, week, popular, question, celebrities, a...","[scar, mean]",0.590652,10.590634


In [89]:
lambdaMart_x_val = np.zeros((validation_data.shape[0], num_of_features))

In [90]:
lambdaMart_x_val = validation_data[['co_similarity', 'bm25']].values
lambdaMart_x_val

array([[ 0.64473255, 24.31091456],
       [ 0.55076461, 17.99540048],
       [ 0.66116479, 25.56947225],
       ...,
       [ 0.64632024, 11.62940342],
       [ 0.59065184, 10.59063425],
       [ 0.84312551, 27.049819  ]])

In [91]:
lambdaMart_y_val = validation_data.relevancy.values

In [92]:
query_count_dict_val = validation_data['queries'].value_counts().to_dict()

In [195]:
dgroup_val = []
dgroup_val

[]

In [196]:
checker = []
for idx, row in validation_data.iterrows():
    query = row['queries']
    if query in checker:
        continue
    checker.append(query)
    group_entry_count = query_count_dict_val[query]
    dgroup_val.append(group_entry_count)

In [199]:
# dgroup_val = np.array(dgroup_val)

In [200]:
dgroup_val

[1,
 1,
 4,
 2,
 2,
 1,
 1,
 2,
 1,
 3,
 3,
 1,
 1,
 3,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 1,
 1,
 1,
 4,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 3,
 2,
 2,
 2,
 1,
 2,
 5,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 1,
 4,
 1,
 2,
 2,
 2,
 2,
 2,
 1,
 2,
 1,
 1,
 2,
 2,
 2,
 3,
 2,
 1,
 2,
 1,
 2,
 1,
 3,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 3,
 2,
 2,
 1,
 1,
 4,
 1,
 3,
 1,
 2,
 3,
 1,
 1,
 1,
 3,
 3,
 1,
 1,
 1,
 2,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 2,
 3,
 1,
 2,
 1,
 3,
 1,
 2,
 1,
 2,
 1,
 2,
 3,
 1,
 1,
 1,
 1,
 3,
 1,
 2,
 2,
 1,
 5,
 2,
 2,
 2,
 3,
 3,
 3,
 2,
 2,
 5,
 1,
 2,
 1,
 1,
 2,
 2,
 1,
 2,
 3,
 2,
 5,
 1,
 1,
 1,
 1,
 1,
 1,
 6,
 2,
 1,
 4,
 4,
 2,
 1,
 5,
 2,
 1,
 2,
 4,
 2,
 1,
 1,
 2,
 1,
 5,
 3,
 1,
 3,
 1,
 1,
 3,
 1,
 1,
 3,
 2,
 1,
 2,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 3,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 2,
 2,
 1,
 1,
 2,
 1,
 1,
 4,
 3,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 1,


## LambdaMart Implementation

In [203]:
# xgb_rank_params1 ={    
#     'booster' : 'gbtree',
#     'eta': 0.1,
#     'gamma' : 1.0 ,
#     'min_child_weight' : 0.1,
#     'objective' : 'rank:pairwise',
#     'eval_metric' : 'merror',
#     'max_depth' : 6,
#     'num_boost_round':10,
#     'save_period' : 0 
# }

# params = {
#     'bst:max_depth':2, 
#     'bst:eta':1, 'silent':1, 
#     'objective':'rank:pairwise',
#     'nthread':4,
#     'eval_metric':'ndcg'
# }
  



In [204]:
#generate training dataset
# dtrain= lambdaMart_x_train
# dtarget= lambdaMart_y_train
# dgroup_train = dgroup_train
# dtrain= lambdaMart_x_train[:2]
# dtarget= lambdaMart_y_train[:2]
# dgroup_train = dgroup_train[:2]
# dtrain.shape
# dtarget.shape

In [205]:
lambdaMart_x_train

array([[ 0.65264477, 20.73309484],
       [ 0.47845439, 16.04187895],
       [ 0.6930888 , 14.63510436],
       ...,
       [ 0.52710561, 15.64887427],
       [ 0.50842772, 10.67230032],
       [ 0.78260784, 13.88766498]])

In [206]:
dgroup_train

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 2,
 1,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,


In [207]:
# concate Train data, very import here !

train_dmatrix = DMatrix(lambdaMart_x_train, label = lambdaMart_y_train)
valid_dmatrix = DMatrix(lambdaMart_x_val, label = lambdaMart_y_val)
train_dmatrix.set_group(dgroup_train)
valid_dmatrix.set_group(dgroup_val)




In [208]:
params = {'objective': 'rank:ndcg', 'eta': 0.1, 'gamma': 1.0,
          'min_child_weight': 0.1, 'max_depth': 6}

In [209]:
lambdaRank_model = train(params, train_dmatrix, num_boost_round=4,
                      evals=[(valid_dmatrix, 'validation')])


[0]	validation-map:1
[1]	validation-map:1
[2]	validation-map:1
[3]	validation-map:1


In [None]:
# pred = lambdaRank_model.predict(test_dmatrix)

In [181]:
# # generate eval data -> validation data
# dtrain_eval= lambdaMart_x_val   
# xgbTrain_eval = DMatrix(lambdaMart_x_val, label = lambdaMart_y_val)

# evallist  = [(xgbTrain,'train'),(xgbTrain_eval, 'eval')]


In [182]:
# # train model
# # xgb_rank_params1加上 evals 这个参数会报错，还没找到原因
# # rankModel = train(xgb_rank_params1,xgbTrain,num_boost_round=10)
# rankModel = train(params,xgbTrain,num_boost_round=20,evals=evallist)



[0]	train-ndcg:1	eval-ndcg:1
[1]	train-ndcg:1	eval-ndcg:1
[2]	train-ndcg:1	eval-ndcg:1
[3]	train-ndcg:1	eval-ndcg:1
[4]	train-ndcg:1	eval-ndcg:1
[5]	train-ndcg:1	eval-ndcg:1
[6]	train-ndcg:1	eval-ndcg:1
[7]	train-ndcg:1	eval-ndcg:1
[8]	train-ndcg:1	eval-ndcg:1
[9]	train-ndcg:1	eval-ndcg:1
[10]	train-ndcg:1	eval-ndcg:1
[11]	train-ndcg:1	eval-ndcg:1
[12]	train-ndcg:1	eval-ndcg:1
[13]	train-ndcg:1	eval-ndcg:1
[14]	train-ndcg:1	eval-ndcg:1
[15]	train-ndcg:1	eval-ndcg:1
[16]	train-ndcg:1	eval-ndcg:1
[17]	train-ndcg:1	eval-ndcg:1
[18]	train-ndcg:1	eval-ndcg:1
[19]	train-ndcg:1	eval-ndcg:1


In [189]:
# #test dataset
# dtest= lambdaMart_x_val  
# dtestgroup=dgroup_val
# xgbTest = DMatrix(dtest)
# xgbTest.set_group(dgroup_train)


In [190]:
# print(rankModel.predict(xgbTest))

[0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.

## Subtask 4

In [57]:
import tensorflow as tf
from tensorflow.keras import layers, activations, losses, Model, Input
from tensorflow.nn import leaky_relu
import numpy as np
from itertools import combinations
from tensorflow.keras.utils import plot_model, Progbar
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# model architecture
class RankNet(Model):
    def __init__(self):
        super().__init__()
        self.dense = [layers.Dense(16, activation=leaky_relu), layers.Dense(8, activation=leaky_relu)]
        self.o = layers.Dense(1, activation='linear')
        self.oi_minus_oj = layers.Subtract()
    
    def call(self, inputs):
        xi, xj = inputs
        densei = self.dense[0](xi)
        densej = self.dense[0](xj)
        for dense in self.dense[1:]:
            densei = dense(densei)
            densej = dense(densej)
        oi = self.o(densei)
        oj= self.o(densej)
        oij = self.oi_minus_oj([oi, oj])
        output = layers.Activation('sigmoid')(oij)
        return output
    
    def build_graph(self):
        x = [Input(shape=(10)), Input(shape=(10))]
        return Model(inputs=x, outputs=self.call(x))

# visualize model architecture
# plot_model(RankNet().build_graph(), show_shapes=False)

In [122]:
train_data

Unnamed: 0,qid,pid,queries,passage,relevancy,passage_cleaned,query_cleaned,co_similarity,bm25
0,188714,1000052,foods and supplements to lower blood sugar,Watch portion sizes: ■ Even healthy foods will...,0.0,"[watch, portion, size, even, healthy, foods, c...","[foods, supplement, lower, blood, sugar]",0.812210,28.648708
1,995526,1000094,where is the federal penitentiary in ind,It takes THOUSANDS of Macy's associates to bri...,0.0,"[take, thousands, macy, associate, bring, magi...","[federal, penitentiary, ind]",0.345645,8.308349
2,660957,1000115,what foods are good if you have gout?,The good news is that you will discover what g...,0.0,"[good, news, discover, go, action, spur, narro...","[foods, good, gout]",0.640991,21.800407
3,837202,1000252,what is the nutritional value of oatmeal,"Oats make an easy, balanced breakfast. One cup...",0.0,"[oats, make, easy, balance, breakfast, one, cu...","[nutritional, value, oatmeal]",0.670880,9.377437
4,130825,1000268,definition for daring,Such a requirement would have three desirable ...,0.0,"[requirement, would, three, desirable, consequ...","[definition, dare]",0.386194,5.959412
...,...,...,...,...,...,...,...,...,...
995,400803,1016366,is a revocable trust a separate legal entity,The income and deductions of the trust are rep...,0.0,"[income, deductions, trust, report, income, ta...","[revocable, trust, separate, legal, entity]",0.667951,21.828710
996,400803,1016370,is a revocable trust a separate legal entity,A grantor trust is a living revocable trust in...,0.0,"[grantor, trust, live, revocable, trust, grant...","[revocable, trust, separate, legal, entity]",0.699175,22.962073
997,544319,1016449,"weather in gig harbor, wa",The gig economy is the collection of markets t...,0.0,"[gig, economy, collection, market, match, prov...","[weather, gig, harbor, wa]",0.383561,11.130238
998,617246,1016466,what decisions rules can determine upheld or d...,"To claim a tax deduction for business mileage,...",0.0,"[claim, tax, deduction, business, mileage, sel...","[decisions, rule, determine, uphold, dismiss, ...",0.534107,17.704609


In [123]:
validation_data

Unnamed: 0,qid,pid,queries,passage,relevancy,passage_cleaned,query_cleaned,co_similarity,bm25
0,1082792,1000084,what does the golgi apparatus do to the protei...,"Start studying Bonding, Carbs, Proteins, Lipid...",0.0,"[start, study, bond, carbs, proteins, lipids, ...","[golgi, apparatus, proteins, lipids, arrive]",0.609541,19.798340
1,995825,1000492,where is the graphic card located in the cpu,"For example, a “PC Expansion Card” maybe the j...",0.0,"[example, pc, expansion, card, maybe, jargon, ...","[graphic, card, locate, cpu]",0.675188,19.714530
2,995825,1000494,where is the graphic card located in the cpu,The Common Cards & Buses. The most common type...,0.0,"[common, card, bus, common, type, expansion, c...","[graphic, card, locate, cpu]",0.762009,20.945883
3,1091246,1000522,property premises meaning,The occurrence of since tells us that the firs...,0.0,"[occurrence, since, tell, us, first, statement...","[property, premise, mean]",0.561729,12.263720
4,1047854,1000585,what is printing mechanism,Windows desktop applications Develop Desktop t...,0.0,"[windows, desktop, applications, develop, desk...","[print, mechanism]",0.612431,13.277623
...,...,...,...,...,...,...,...,...,...
995,1069028,1063432,what is a preliminary source,"In September 22 1862, after the Union's victor...",0.0,"[september, union, victory, antietam, lincoln,...","[preliminary, source]",0.296364,9.579258
996,14947,1063503,airport code mont tremblant,Cities > Norway > Airports near Ølen. The clos...,0.0,"[cities, norway, airports, near, ølen, closest...","[airport, code, mont, tremblant]",0.559477,20.494116
997,1036002,1063567,who is melvin booker,"Double or Nothing, a song by B.o.B and Big Boi...",0.0,"[double, nothing, song, bob, big, boi, album, ...","[melvin, booker]",0.396677,6.983769
998,731736,1063649,what is coastal erosion,1.1 DEFINING COASTAL AREAS. Coastal areas are ...,0.0,"[define, coastal, areas, coastal, areas, commo...","[coastal, erosion]",0.570154,12.559034


In [134]:
qids = train_data.qid.values
doc_features = train_data[['co_similarity', 'bm25']].values
doc_scoures = train_data.relevancy.values

In [135]:
qids.shape

(1000,)

In [136]:
np.unique(qids).shape

(816,)

In [137]:
# generate data
# nb_query = 20
# query = np.array([i+1 for i in range(nb_query) for x in range(int(np.ceil(np.abs(np.random.normal(0,scale=15))+2)))])
# doc_features = np.random.random((len(query), 10))
# doc_scores = np.random.randint(5, size=len(query)).astype(np.float32)




# put data into pairs
xi = []
xj = []
pij = []
pair_id = []
pair_query_id = []
for qid in np.unique(qids):
    query_idx = np.where(query == qid)[0]
    for pair_idx in combinations(query_idx, 2):
        pair_query_id.append(qid)
        
        pair_id.append(pair_idx)
        i = pair_idx[0]
        j = pair_idx[1]
        xi.append(doc_features[i])
        xj.append(doc_features[j])
        
        if doc_scores[i] == doc_scores[j]:
            _pij = 0.5
        elif doc_scores[i] > doc_scores[j]:
            _pij = 1
        else: 
            _pij = 0
        pij.append(_pij)
        
xi = np.array(xi)
xj = np.array(xj)
pij = np.array(pij)
pair_query_id = np.array(pair_query_id)

xi_train, xi_test, xj_train, xj_test, pij_train, pij_test, pair_id_train, pair_id_test = train_test_split(
    xi, xj, pij, pair_id, test_size=0.2, stratify=pair_query_id)

In [138]:
# train model using compile and fit
ranknet = RankNet()
ranknet.compile(optimizer='adam', loss='binary_crossentropy')
history = ranknet.fit([xi_train, xj_train], pij_train, epochs=50, batch_size=1, validation_data=([xi_test, xj_test], pij_test))


Train on 260 samples, validate on 65 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
