In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [154]:
import re, string, unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
import numpy as np
import smart_open
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from math import log
from numpy.random import default_rng, rand


In [4]:
f = open("../dataset/passage_collection_new.txt", 'r', encoding = 'utf-8') 
document = f.read()

In [5]:
header_list = ["qid", "pid", "query", "passage"]
candidate_passages_top1000 = pd.read_csv("../dataset/candidate_passages_top1000.tsv", sep='\t', names=header_list)

In [6]:
header_list = ["qid", "query"]
test_queries = pd.read_csv("../dataset/test-queries.tsv", sep='\t', names=header_list)

In [7]:
train_data = pd.read_csv("../part2/train_data.tsv", sep='\t')
validation_data = pd.read_csv("../part2/validation_data.tsv", sep='\t')


In [7]:
train_data.head(10)

Unnamed: 0,qid,pid,queries,passage,relevancy
0,188714,1000052,foods and supplements to lower blood sugar,Watch portion sizes: ■ Even healthy foods will...,0.0
1,995526,1000094,where is the federal penitentiary in ind,It takes THOUSANDS of Macy's associates to bri...,0.0
2,660957,1000115,what foods are good if you have gout?,The good news is that you will discover what g...,0.0
3,837202,1000252,what is the nutritional value of oatmeal,"Oats make an easy, balanced breakfast. One cup...",0.0
4,130825,1000268,definition for daring,Such a requirement would have three desirable ...,0.0
5,408149,1000288,is dhgate a scam,If you think you ve been targeted by a counter...,0.0
6,1019649,1000419,what study for mets to brain,Sorry he's having so much pain. The reason tha...,0.0
7,1099065,1000436,how far deep to plant beet early wonder,"The simplest way, and my preference, is to roa...",0.0
8,1084910,1000466,what disease do roof rats cause,1 A cage trap baited with peanut butter or a s...,0.0
9,959083,1000479,when was niagara falls created,"Bulbar Onset – ALS. ALS is like Niagara Falls,...",0.0


In [8]:
train_data.shape

(4364339, 5)

In [9]:
validation_data.head(10)

Unnamed: 0,qid,pid,queries,passage,relevancy
0,1082792,1000084,what does the golgi apparatus do to the protei...,"Start studying Bonding, Carbs, Proteins, Lipid...",0.0
1,995825,1000492,where is the graphic card located in the cpu,"For example, a “PC Expansion Card” maybe the j...",0.0
2,995825,1000494,where is the graphic card located in the cpu,The Common Cards & Buses. The most common type...,0.0
3,1091246,1000522,property premises meaning,The occurrence of since tells us that the firs...,0.0
4,1047854,1000585,what is printing mechanism,Windows desktop applications Develop Desktop t...,0.0
5,991832,1000599,who discovered the element carbon,1. 1 a nonmetallic element existing in the th...,0.0
6,185299,1000647,fastest cell phone processor,Tips for calling a cell phone in Greece: To ca...,0.0
7,574730,1000663,what are the three monetary policy tools of th...,"Federal Reserve updates including rates, news ...",0.0
8,1085008,1000675,what did maria theresa do for the serfs,"In this feudal system, the king awarded land g...",0.0
9,609628,1000771,what county is mitchell south dakota in,South Dakota: According to our research of Sou...,0.0


In [10]:
train_data.shape
validation_data.shape

(1103039, 5)

# Preprocessing

In [8]:
def remove_punctuation(tokens):
    new_tokens = []
    for token in tokens:
        new_token = re.sub(r'[^\w\s]', '', token)
        if new_token != '':
            new_tokens.append(new_token)
    return new_tokens

def remove_stopwords(tokens):
    new_tokens = []
    stopword_set = set(stopwords.words('english'))
    for token in tokens:
        if token not in stopword_set:
            new_tokens.append(token)
    return new_tokens

def lemmatize_verbs(tokens):
    lemmatizer = WordNetLemmatizer()
    root_words = []
    for token in tokens:
        root_word = lemmatizer.lemmatize(token, pos='v')
#         root_word = lemmatizer.lemmatize(token, pos='n')
#         root_word = lemmatizer.lemmatize(token, pos='a')
        root_words.append(root_word)
    return root_words

def remove_numbers(tokens):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    new_tokens = []
    for token in tokens:
        if token.isdigit():
            pass
        else:
            new_tokens.append(token)
    return new_tokens

def preprocessing(passage):
    passage = passage.lower()
    tokens = nltk.word_tokenize(passage)
    tokens = remove_punctuation(tokens)
    tokens = remove_stopwords(tokens)
    tokens = lemmatize_verbs(tokens)
    tokens = remove_numbers(tokens)
    return tokens

# Subtask 1

## BM25

In [8]:
validation_data_no_dup_passages = validation_data.drop_duplicates(subset=['pid'], inplace=False)


In [9]:
# validation_data_no_dup_passages.head(20)
validation_data_no_dup_passages.shape

(955211, 5)

In [10]:
def get_passage_average_length_and_total_word_occurences_corpus():
    number_of_passages = len(validation_data_no_dup_passages)
    count_total_length = 0
    for idx, row in validation_data_no_dup_passages.iterrows():
        count_total_length += len(preprocessing(row['passage']))
    return count_total_length, count_total_length/number_of_passages 


In [11]:
# total_word_occurences, avdl = get_passage_average_length_and_total_word_occurences_corpus()
total_word_occurences = 30757932
avdl = 32.200144261320276

In [12]:
# total_word_occurences
# avdl

In [13]:
from math import log

k1 = 1.2
k2 = 100
b = 0.75
R = 0
r = 0
N = len(validation_data_no_dup_passages)

def K_cal(dl):
    return k1 * ((1-b) + b * (float(dl)/float(avdl)) )


def BM25_cal(query, passage):
    query_tokens = preprocessing(query)
    passage_tokens = preprocessing(passage)
    query_length = len(query_tokens)
    query_token_freq_dict = nltk.FreqDist(query_tokens)
    passage_token_freq_dict = nltk.FreqDist(passage_tokens)
    dl = len(passage_tokens)
    K = K_cal(dl)
    score = 0
    for token in query_tokens:
        try:
            n = len(inverted_index[token])
        except:
            n = 0
        f = passage_token_freq_dict[token]
        qf = query_token_freq_dict[token]
        first_term = log( ( (r + 0.5) / (R - r + 0.5) ) / ( (n - r + 0.5) / (N - n - R + r + 0.5)) )
        second_term = ((k1 + 1) * f) / (K + f)
        third_term = ((k2+1) * qf) / (k2 + qf)
        score += first_term * second_term * third_term
    return score
    

In [14]:
bm25_rankings = []
for idx, row in validation_data.iterrows():
#     print('count:', idx+1)
    query = row['queries']
    passage = row['passage']
    bm25_rankings.append(BM25_cal(query, passage))

    

In [15]:
ranking_k = 100
results_bm25 = np.array(bm25_rankings).argsort()[-ranking_k:][::-1]
results_bm25

array([ 218534, 1076853,   95986,   95987,  831065,  725920,  504442,
        654647,  341262,  281695,  950916,  586050,  871938,  793356,
        944762,  714508,  900185,  206973,  629358,  701682,  248618,
        259324,  702792,  286327,  205094,   98221,  519566,  242694,
        705667,  922032,  197484,  431189,  419681,  237993,  502543,
        793493,  646679,  559150,  303774,  638714,  778563,  155707,
         45281,  132857,  213755,  299424,  252540,    9338, 1075869,
        139233,  733889,  401291,  373812,  784890, 1008311,  911301,
        909449,  558879,  857483,  602553,  171827,  217514,  857646,
        539660,  195886, 1078616,  554316,  872234,  346989,   28250,
       1026344,  330458, 1026153,  612051,  395691,  530538,  313678,
        693646,  175037,  751525,  216400,  663797,   99840,  928996,
          5558,  428329,  546185,   46827,  112665,  118094,  752555,
       1017110,  455444,  910607,  648719,  793681, 1063650,  212436,
        415429,  654

In [16]:
ranking_list_df = validation_data.loc[results_bm25]
ranking_list_df

Unnamed: 0,qid,pid,queries,passage,relevancy
218534,1007691,7251254,"when allocating service department costs, the ...",Direct method allocates each service departmen...,1.0
1076853,1007691,7251259,"when allocating service department costs, the ...",The direct method is the most widely-used meth...,0.0
95986,1007691,7251251,"when allocating service department costs, the ...",service department provides a large amount of ...,0.0
95987,1007691,7251253,"when allocating service department costs, the ...","The rows sum to 100%, so that all services pro...",0.0
831065,1007691,7251255,"when allocating service department costs, the ...",The most defensible sequence is to start with ...,0.0
...,...,...,...,...,...
793681,1007691,4814576,"when allocating service department costs, the ...",Service Members | Veterans | Both. Military On...,0.0
1063650,1007691,6395207,"when allocating service department costs, the ...",that hospital emergency department services ar...,0.0
212436,1007691,6872353,"when allocating service department costs, the ...",Yelp Customer Service customer service phone n...,0.0
415429,1007691,4114248,"when allocating service department costs, the ...","A service fee, service charge, or surcharge is...",0.0


In [17]:
validation_data[(validation_data['relevancy'] < 1.0) & (validation_data['relevancy'] > 0)]


Unnamed: 0,qid,pid,queries,passage,relevancy


In [18]:
def average_precision_cal(ranking_list_df):
    ranking_list_df = ranking_list_df.reset_index(drop=True, inplace=False)
    total_relevant_retrieved = 0
    precision_sum = 0
    for idx, row in ranking_list_df.iterrows():
        relevancy = row['relevancy']
        if (relevancy):
            isRelevant = True
            total_relevant_retrieved += 1
        precision = total_relevant_retrieved / (idx + 1)
        precision_sum += precision
    result = precision_sum / len(ranking_list_df)
    return result

In [19]:
average_precision_cal(ranking_list_df)

0.07445786781310981

In [20]:
ranking_list_df.sort_values(by=['relevancy'], ascending=False)


Unnamed: 0,qid,pid,queries,passage,relevancy
218534,1007691,7251254,"when allocating service department costs, the ...",Direct method allocates each service departmen...,1.0
950916,1089945,7079883,the __________ test is a quick and dirty test ...,• The Smell Test is familiar ground in most bu...,1.0
539660,1007691,423230,"when allocating service department costs, the ...",IT Service (ITILv3): A Service provided to one...,0.0
612051,1007691,994382,"when allocating service department costs, the ...",All UK telephone numbers beginning with the di...,0.0
1026153,1007691,3941750,"when allocating service department costs, the ...",Respite (Out-of-Home) Services [edit]. Respite...,0.0
...,...,...,...,...,...
431189,1007691,5146501,"when allocating service department costs, the ...",The Department offers service coordination and...,0.0
197484,1007691,5904987,"when allocating service department costs, the ...",If you are a delinquent juror who has been ins...,0.0
922032,1007691,5220119,"when allocating service department costs, the ...",There are different types of customer service ...,0.0
705667,1007691,7088169,"when allocating service department costs, the ...",Us Postal Service Customer Service Phone Numbe...,0.0


In [21]:
ranking_list_df

Unnamed: 0,qid,pid,queries,passage,relevancy
218534,1007691,7251254,"when allocating service department costs, the ...",Direct method allocates each service departmen...,1.0
1076853,1007691,7251259,"when allocating service department costs, the ...",The direct method is the most widely-used meth...,0.0
95986,1007691,7251251,"when allocating service department costs, the ...",service department provides a large amount of ...,0.0
95987,1007691,7251253,"when allocating service department costs, the ...","The rows sum to 100%, so that all services pro...",0.0
831065,1007691,7251255,"when allocating service department costs, the ...",The most defensible sequence is to start with ...,0.0
...,...,...,...,...,...
793681,1007691,4814576,"when allocating service department costs, the ...",Service Members | Veterans | Both. Military On...,0.0
1063650,1007691,6395207,"when allocating service department costs, the ...",that hospital emergency department services ar...,0.0
212436,1007691,6872353,"when allocating service department costs, the ...",Yelp Customer Service customer service phone n...,0.0
415429,1007691,4114248,"when allocating service department costs, the ...","A service fee, service charge, or surcharge is...",0.0


In [22]:
def get_IDCG(ranking_list_df):
    ranking_list_df_sorted = ranking_list_df.sort_values(by=['relevancy'], ascending=False)
#     ranking_list_df_sorted = ranking_list_df.reset_index(drop=True, inplace=False)
    ranking_list_df_sorted = ranking_list_df_sorted.reset_index().reindex(ranking_list_df_sorted.columns, axis=1)
    ideal_discounted_gain_sum = 0
    for idx, row in ranking_list_df_sorted.iterrows():
        index = idx + 1
        relevance_score = row['relevancy']
        gain = 2 ** relevance_score - 1
        discounted_gain = gain / math.log2(index + 1)
        ideal_discounted_gain_sum += discounted_gain
    return ideal_discounted_gain_sum

In [23]:
def get_NDCG(ranking_list_df):
    ranking_list_df = ranking_list_df.reset_index(drop=True, inplace=False)
    discounted_gain_sum = 0
    for idx, row in ranking_list_df.iterrows():
        index = idx + 1
        relevance_score = row['relevancy']
        gain = 2 ** relevance_score - 1
        discounted_gain = gain / math.log2(index + 1)
        discounted_gain_sum += discounted_gain
    IDCG = get_IDCG(ranking_list_df)
    
    nDCG = discounted_gain_sum / IDCG
    return nDCG

In [24]:
get_NDCG(ranking_list_df)

0.7841802768331765

# Subtask 2

In [34]:
train_data = train_data[:1000] # 일단 1000개만 해봄
validation_data = validation_data[:1000] # 일단 1000개만 해봄

## Preprocessing

In [35]:
# passages = validation_data.passage.values[:1000]
# validation_data_temp = validation_data[:1000]
# validation_data_temp['passage_cleaned']=validation_data_temp.passage.apply(lambda x: preprocessing(x))
# validation_data_temp['queries_cleaned']=validation_data_temp.queries.apply(lambda x: preprocessing(x))

In [231]:
train_data['passage_cleaned']=train_data.passage.apply(lambda x: preprocessing(x))
train_data['query_cleaned']=train_data.queries.apply(lambda x: preprocessing(x))

In [232]:
train_data

Unnamed: 0,qid,pid,queries,passage,relevancy,passage_cleaned,queries_cleaned,query_cleaned
0,188714,1000052,foods and supplements to lower blood sugar,Watch portion sizes: ■ Even healthy foods will...,0.0,"[watch, portion, size, even, healthy, foods, c...","[foods, supplement, lower, blood, sugar]","[foods, supplement, lower, blood, sugar]"
1,995526,1000094,where is the federal penitentiary in ind,It takes THOUSANDS of Macy's associates to bri...,0.0,"[take, thousands, macy, associate, bring, magi...","[federal, penitentiary, ind]","[federal, penitentiary, ind]"
2,660957,1000115,what foods are good if you have gout?,The good news is that you will discover what g...,0.0,"[good, news, discover, go, action, spur, narro...","[foods, good, gout]","[foods, good, gout]"
3,837202,1000252,what is the nutritional value of oatmeal,"Oats make an easy, balanced breakfast. One cup...",0.0,"[oats, make, easy, balance, breakfast, one, cu...","[nutritional, value, oatmeal]","[nutritional, value, oatmeal]"
4,130825,1000268,definition for daring,Such a requirement would have three desirable ...,0.0,"[requirement, would, three, desirable, consequ...","[definition, dare]","[definition, dare]"
...,...,...,...,...,...,...,...,...
995,400803,1016366,is a revocable trust a separate legal entity,The income and deductions of the trust are rep...,0.0,"[income, deductions, trust, report, income, ta...","[revocable, trust, separate, legal, entity]","[revocable, trust, separate, legal, entity]"
996,400803,1016370,is a revocable trust a separate legal entity,A grantor trust is a living revocable trust in...,0.0,"[grantor, trust, live, revocable, trust, grant...","[revocable, trust, separate, legal, entity]","[revocable, trust, separate, legal, entity]"
997,544319,1016449,"weather in gig harbor, wa",The gig economy is the collection of markets t...,0.0,"[gig, economy, collection, market, match, prov...","[weather, gig, harbor, wa]","[weather, gig, harbor, wa]"
998,617246,1016466,what decisions rules can determine upheld or d...,"To claim a tax deduction for business mileage,...",0.0,"[claim, tax, deduction, business, mileage, sel...","[decisions, rule, determine, uphold, dismiss, ...","[decisions, rule, determine, uphold, dismiss, ..."


In [241]:
validation_data['passage_cleaned']=validation_data.passage.apply(lambda x: preprocessing(x))
validation_data['query_cleaned']=validation_data.queries.apply(lambda x: preprocessing(x))

## Loading word2vec embedding

In [47]:
# loading pre-trained embeddings, each word is represented as a 300 dimensional vector
import gensim
W2V_PATH="../GoogleNews-vectors-negative300.bin"
model_w2v = gensim.models.KeyedVectors.load_word2vec_format(W2V_PATH, binary=True)

# Embedding documents and queries

## embedding documents

### training data

In [229]:
# tokenize and pad every document to make them of the same size
passage_tokenizer=Tokenizer()
passage_tokenizer.fit_on_texts(train_data.passage_cleaned)
passage_max_length = 128 # document length including padding
query_max_length = 64 # query length including padding

In [None]:
# # creating embedding matrix, every row is a vector representation from the vocabulary indexed by the tokenizer index. 
# document_embedding_matrix=np.zeros((passage_vocab_size,300))
# for word,i in passage_tokenizer.word_index.items():
#     if word in model_w2v:
#         document_embedding_matrix[i]=model_w2v[word]
# # creating document-word embeddings
# train_data_length = tokenized_paded_documents.shape[0]
# passage_max_length = 64 # document length including padding
# document_word_embeddings_train=np.zeros((train_data_length, passage_max_length,300)) # 64 == padding
# for i in range(train_data_length):
#     for j in range(passage_max_length): 
#         document_word_embeddings_train[i][j] = document_embedding_matrix[tokenized_paded_documents[i][j]]
# document_word_embeddings_train.shape

In [219]:
train_data_length = train_data.shape[0]
document_word_embeddings_train=np.zeros((train_data_length, passage_max_length,300)) # 64 == padding
passages = train_data.passage_cleaned
for i in range(len(passages)):
    passage = passages[i]
    passage_length = len(passage)
    for j in range(passage_length): 
        word = passage[j]
        if word in model_w2v:
            document_word_embeddings_train[i][j] = model_w2v[word]
document_word_embeddings_train.shape

(1000, 128, 300)

In [258]:
average_vector_list_length = document_word_embeddings_train.shape[0]
average_document_vectors_train = np.zeros((average_vector_list_length,300))
for i in range(average_vector_list_length):
    average_document_vectors_train[i] = np.mean(document_word_embeddings_train[i], axis=0)
average_document_vectors_train.shape

(1000, 300)

### validation data

In [221]:
# tokenize and pad every document to make them of the same size
passage_tokenizer=Tokenizer()
passage_tokenizer.fit_on_texts(validation_data.passage_cleaned)


In [55]:
# # creating embedding matrix, every row is a vector representation from the vocabulary indexed by the tokenizer index. 
# document_embedding_matrix=np.zeros((passage_vocab_size,300))
# for word,i in passage_tokenizer.word_index.items():
#     if word in model_w2v:
#         document_embedding_matrix[i]=model_w2v[word]
# # creating document-word embeddings
# document_word_embeddings_val=np.zeros((len(tokenized_paded_documents),64,300))
# for i in range(len(tokenized_paded_documents)):
#     for j in range(len(tokenized_paded_documents[0])):
#         document_word_embeddings_val[i][j]=document_embedding_matrix[tokenized_paded_documents[i][j]]
# document_word_embeddings_val.shape

(1000, 64, 300)

In [226]:
validation_data_length = validation_data.shape[0]
document_word_embeddings_val=np.zeros((validation_data_length, passage_max_length,300)) # 64 == padding
passages = validation_data.passage_cleaned
for i in range(len(passages)):
    passage = passages[i]
    passage_length = len(passage)
    for j in range(passage_length): 
        word = passage[j]
        if word in model_w2v:
            document_word_embeddings_val[i][j] = model_w2v[word]
document_word_embeddings_val.shape

(1000, 128, 300)

In [257]:
average_vector_list_length = document_word_embeddings_val.shape[0]
average_document_vectors_val = np.zeros((average_vector_list_length,300))
for i in range(average_vector_list_length):
    average_document_vectors_val[i] = np.mean(document_word_embeddings_val[i], axis=0)
average_document_vectors_val.shape

(1000, 300)

## embedding quries

### training data

In [233]:
# tokenize and pad every document to make them of the same size
query_tokenizer=Tokenizer()
query_tokenizer.fit_on_texts(train_data.queries_cleaned)


In [234]:
# # creating embedding matrix, every row is a vector representation from the vocabulary indexed by the tokenizer index. 
# query_embedding_matrix=np.zeros((query_vocab_size,300))
# for word,i in query_tokenizer.word_index.items():
#     if word in model_w2v:
#         query_embedding_matrix[i]=model_w2v[word]
# # creating query-word embeddings
# query_word_embeddings_train=np.zeros((len(tokenized_paded_quries),32,300))
# for i in range(len(tokenized_paded_quries)):
#     for j in range(len(tokenized_paded_quries[0])):
#         query_word_embeddings_train[i][j]=query_embedding_matrix[tokenized_paded_quries[i][j]]
# query_word_embeddings_train.shape

In [236]:
train_data_length = train_data.shape[0]
query_word_embeddings_train=np.zeros((train_data_length, query_max_length,300)) # 64 == padding
queries = train_data.query_cleaned
for i in range(len(queries)):
    query = queries[i]
    query_length = len(query)
    for j in range(query_length): 
        word = query[j]
        if word in model_w2v:
            query_word_embeddings_train[i][j] = model_w2v[word]
query_word_embeddings_train.shape

(1000, 64, 300)

In [256]:
average_vector_list_length = query_word_embeddings_train.shape[0]
average_query_vectors_train = np.zeros((average_vector_list_length,300))
for i in range(average_vector_list_length):
    average_query_vectors_train[i] = np.mean(query_word_embeddings_train[i], axis=0)
average_query_vectors_train.shape

(1000, 300)

### validation data

In [238]:
# tokenize and pad every document to make them of the same size
query_tokenizer=Tokenizer()
query_tokenizer.fit_on_texts(validation_data.queries_cleaned)


In [63]:
# # creating embedding matrix, every row is a vector representation from the vocabulary indexed by the tokenizer index. 
# query_embedding_matrix=np.zeros((query_vocab_size,300))
# for word,i in query_tokenizer.word_index.items():
#     if word in model_w2v:
#         query_embedding_matrix[i]=model_w2v[word]
# # creating query-word embeddings
# query_word_embeddings_val=np.zeros((len(tokenized_paded_quries),32,300))
# for i in range(len(tokenized_paded_quries)):
#     for j in range(len(tokenized_paded_quries[0])):
#         query_word_embeddings_val[i][j]=query_embedding_matrix[tokenized_paded_quries[i][j]]
# query_word_embeddings_val.shape

(1000, 32, 300)

In [242]:
validation_data_length = validation_data.shape[0]
query_word_embeddings_val=np.zeros((validation_data_length, query_max_length,300)) # 64 == padding
queries = validation_data.query_cleaned
for i in range(len(queries)):
    query = queries[i]
    query_length = len(query)
    for j in range(query_length): 
        word = query[j]
        if word in model_w2v:
            query_word_embeddings_val[i][j] = model_w2v[word]
query_word_embeddings_val.shape

(1000, 64, 300)

In [254]:
average_vector_list_length = query_word_embeddings_val.shape[0]
average_query_vectors_val = np.zeros((average_vector_list_length,300))
for i in range(average_vector_list_length):
    average_query_vectors_val[i] = np.mean(query_word_embeddings_val[i], axis=0)
average_query_vectors_val.shape

(1000, 300)

In [249]:
query_word_embeddings_val[0].shape

(64, 300)

In [253]:
query_word_embeddings_val[0]

64

In [251]:
np.mean(query_word_embeddings_val[0], axis=0).shape

(300,)

## creating training data

In [259]:
def cosine_sim_formula(a, b):
    cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
    return cos_sim

In [260]:
x_train = np.zeros((average_query_vectors_train.shape[0], 1))
x_train_val = np.zeros((average_query_vectors_val.shape[0], 1))
x_train.shape

(1000, 1)

In [261]:
for i in range(x_train.shape[0]):
    query_vector = average_query_vectors_train[i]
    passage_vector = average_document_vectors_train[i]
    x_train[i] = cosine_sim_formula(query_vector, passage_vector)

  


In [262]:
for i in range(len(x_train_val)):
    query_vector = average_query_vectors_val[i]
    passage_vector = average_document_vectors_val[i]
    x_train_val[i] = cosine_sim_formula(query_vector, passage_vector)

In [263]:
y_train = train_data.relevancy.values
y_train_val = validation_data.relevancy.values

## Logisitc Regression

## 현재 문제 
1. 딥러닝 과제처럼, 한 data씩 처리 할건지, 아니면 인터넷 예제처럼 모든 value를 metrics 에 넣어서 한번에 처리할건지
2. 딥러닝 과제처럼 처리했을 경우, gradient descent function은 무엇인지, 인터넷 예제처럼 했을 때도, 인터넷에 있는 gradient descent function 이 어떻게 derive 됬는지 알기

In [155]:
class LogisticRegression:

    def __init__(self, learning_rate=0.001, epoch=500):
        self.lr = learning_rate
        self.epoch = epoch
        self.weights = None
        self.bias = None
        self.loss_train = None
        self.loss_val = None

        self.acc_train = None
        self.train_correct = None
        self.val_correct = None
        
        self.losslist_train = []
        self.losslist_val = []
        self.acclist_train = []
        self.acclist_val = []
    

        
    def fit(self, trainxs, trainys, trainxs_val, trainys_val):
        n_samples, n_features = trainxs.shape

        # init parameters
        self.weights = np.zeros(n_features)
        self.bias = 0
        
        
        
        # gradient descent
        for _ in range(self.epoch):
            idx = self.shuffleIdx(trainxs.shape[0])
            X = trainxs[idx]
            Y = trainys[idx]
            
            
            self.loss_train = 0
            self.loss_val = 0

            self.acc_train = 0
            self.train_correct = 0
            self.val_correct = 0
            
            for i in range(trainxs.shape[0]):
                x = X[i]
                y = Y[i]
                # approximate output variable (y) with linear combination of weights and x, plus bias
                linear_equation = np.dot(x, self.weights) + self.bias
                # apply sigmoid function
                prediction = self.sigmoid(linear_equation)
                print("prediction:", prediction)
                
                if prediction >= 0.5:
                    yprime = 1
                else:
                    yprime = 0

                if yprime == y:
                    self.train_correct += 1

                self.loss_train += self.loss_function(y, prediction)

                # compute gradients
                dw = (1 / n_samples) * np.dot(X.T, (prediction - y)) #derivative w.r.t weights
                db = (1 / n_samples) * np.sum(prediction - y)  #derivative w.r.t bias
                # update parameters
                self.weights -= self.lr * dw
                self.bias -= self.lr * db

                 # COMPUTING LOSS AND ACCURACY OF VALIDATION SET
                if (i < trainxs_val.shape[0]):
                    val_x = trainxs_val[i]
                    val_y = trainys_val[i]
                    linear_equation = np.dot(val_x, self.weights) + self.bias
                    prediction = self.sigmoid(linear_equation)

                    if prediction  >= 0.5:
                        yprime = 1
                    else:
                        yprime = 0

                    if yprime == val_y:
                        self.val_correct += 1

                    self.loss_val += ((val_y - prediction)**2)/2

            self.loss_train = self.loss_train/n_samples
            self.losslist_train.append(self.loss_train)

            self.loss_val = self.loss_val/(n_samples_val)
            self.losslist_val.append(self.loss_val)

            self.train_correct = self.train_correct/n_samples
            self.acclist_train.append(self.train_correct)

            self.val_correct = self.val_correct/(n_samples_val)
            self.acclist_val.append(self.val_correct)

            
    def shuffleIdx(self, n):
        rng = default_rng()
        rand_idx = rng.permutation(n)
        return rand_idx

    def predict(self, X):
        linear_equation = np.dot(X, self.weights) + self.bias
        y_predicted = self.sigmoid(linear_equation)
        y_predicted_cls = [1 if i > 0.5 else 0 for i in y_predicted]
        return np.array(y_predicted_cls)

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    
    def loss_function(self, y, prediction):
        return -log((1 - prediction)**(1 - y)) - log(prediction**y)
    
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

In [156]:
epochs=[i in range(1000)]
accuracy=[]

In [157]:
lr = LogisticRegression(learning_rate=0.0001, epoch=500)
lr.fit(x_train, y_train, x_train_val, y_train_val)
# predictions = regressor.predict(xtest)


prediction: 0.5


ValueError: non-broadcastable output operand with shape (1,) doesn't match the broadcast shape (1,1000)

In [74]:
lr.weights

In [114]:
lr.bias

-0.024742149807413506

# Subtask 3


In [12]:
import pandas as pd
import numpy as np
from xgboost import DMatrix,train



In [13]:
# xgb_rank_params1 ={    
#     'booster' : 'gbtree',
#     'eta': 0.1,
#     'gamma' : 1.0 ,
#     'min_child_weight' : 0.1,
#     'objective' : 'rank:pairwise',
#     'eval_metric' : 'merror',
#     'max_depth' : 6,
#     'num_boost_round':10,
#     'save_period' : 0 
# }

params = {
    'bst:max_depth':2, 
    'bst:eta':1, 'silent':1, 
    'objective':'rank:pairwise',
    'nthread':4,
    'eval_metric':'ndcg'
}
  



In [21]:
#generate training dataset
n_group=2
n_choice=3  
dtrain=np.random.uniform(0,100,[n_group*n_choice,2])    
#numpy.random.choice(a, size=None, replace=True, p=None)
dtarget=np.array([np.random.choice([0,1,2],3,False) for i in range(n_group)]).flatten()
#n_group用于表示从前到后每组各自有多少样本，前提是样本中各组是连续的，[3，3]表示一共6条样本中前3条是第一组，后3条是第二组
dgroup= np.array([n_choice for i in range(n_group)]).flatten()



In [22]:
dtarget

array([1, 2, 0, 2, 0, 1])

In [23]:
dtrain

array([[76.13114507, 72.51374314],
       [56.71265772, 39.39356257],
       [67.39643181, 34.78308006],
       [98.54392286, 32.87569918],
       [18.00958146, 23.27901363],
       [79.99957957, 23.68170943]])

[0, 1]

In [24]:
dgroup

array([3, 3])

In [None]:
# concate Train data, very import here !
xgbTrain = DMatrix(dtrain, label = dtarget)
xgbTrain.set_group(dgroup)

# generate eval data
dtrain_eval=np.random.uniform(0,100,[n_group*n_choice,2])        
xgbTrain_eval = DMatrix(dtrain_eval, label = dtarget)
xgbTrain_eval .set_group(dgroup)
evallist  = [(xgbTrain,'train'),(xgbTrain_eval, 'eval')]



In [None]:
# train model
# xgb_rank_params1加上 evals 这个参数会报错，还没找到原因
# rankModel = train(xgb_rank_params1,xgbTrain,num_boost_round=10)
rankModel = train(xgb_rank_params2,xgbTrain,num_boost_round=20,evals=evallist)

#test dataset
dtest=np.random.uniform(0,100,[n_group*n_choice,2])    
dtestgroup=np.array([n_choice for i in range(n_group)]).flatten()
xgbTest = DMatrix(dtest)
xgbTest.set_group(dgroup)

# test
print(rankModel.predict( xgbTest))