In [1]:
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
import re
import string
from operator import itemgetter
import csv
from collections import Counter
import time
import random
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import torch
from torch import nn, optim
import matplotlib.pyplot as plt
from tqdm import tqdm
import xgboost as xgb

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [29]:
def loadtext(filepath):
    with open(filepath, 'r') as f:
        txt = []
        for line in f.readlines():
            txt.append(line.strip().lower())
        
    return txt

def preprocessing(text, remove):
    #initial setting
    lemmatizer = WordNetLemmatizer()
    stemmer = SnowballStemmer('english')
    stop_words = set(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r'\s+', gaps = True)
    processed_txt = []

    for line in text:
        sentence = []
        #remove the punctuation
        table = str.maketrans(dict.fromkeys(string.punctuation))
        line = line.translate(table)

        line = re.sub(r'[^a-zA-Z\s]', u' ', line, flags = re.UNICODE)

        line_token = tokenizer.tokenize(line)

        #remove stop words
        if (remove == True):
            line_token = [w for w in line_token if not w in stop_words]
        

        for word in line_token:
            word = lemmatizer.lemmatize(word)
            word = stemmer.stem(word)
            sentence.append(word)

        processed_txt.append(sentence)
    
    return processed_txt

def inverted_index(pid_data, processed_data):

    dic = {}
    
    for i in range(len(pid_data)):
        processed_passage = processed_data[i]

        for token in processed_passage:
            token_num = processed_passage.count(token)

            if token not in dic.keys():
                dic[token] = {pid_data[i]: token_num}
            else:
                add_num = {pid_data[i]: token_num}
                dic[token].update(add_num)

    return dic


def generate_rel_dict(qid_list, pid_list, rel_list):
    '''
    generate two dict according to the given data
    '''

    rel_dict = {}
    non_rel_dict = {}

    for i in range(len(qid_list)):
        qid = qid_list[i]
        pid = pid_list[i]
        rel = rel_list[i]

        if rel > 0:
            add_dict = {pid:i}
            if qid in rel_dict.keys():
                rel_dict[qid].update(add_dict)
            else:
                rel_dict[qid] = add_dict
        else:
            add_dict = {pid:i}
            if qid in non_rel_dict.keys():
                non_rel_dict[qid].update(add_dict)
            else:
                non_rel_dict[qid] = add_dict

    return rel_dict, non_rel_dict

def generate_AP(model, rel_dic, non_rel_dic):
    
    total_ap = []
    qid_list = list(model.keys())

    for qid in qid_list:
        N = 0
        R_rel = 0
        precision = 0
        model_pid_list= list(model[qid].keys())
        rel_pid_list = list(rel_dic[qid].keys())
        

        if len(set(model_pid_list) & set(rel_pid_list)) == 0:
            precision = 0
            total_ap.append(precision)

        else:
            for pid in model_pid_list:
                N += 1
                if pid in rel_pid_list:
                    R_rel += 1
                    precision += R_rel/N
                else:
                    continue
                if R_rel == len(rel_dic[qid]):
                    break

        
            total_ap.append(precision / R_rel)
    
    return total_ap

def generate_NDCG(model, rel_dic, non_rel_dic):

    NDCG = []
    qid_list = list(model.keys())
    
    for qid in qid_list:
        N = 0
        N_opt = 0
        DCG = 0
        DCG_opt = 0
        model_pid_list = list(model[qid].keys())
        rel_pid_list = list(rel_dic[qid].keys())

        if len(set(model_pid_list) & set(rel_pid_list)) == 0:
            DCG += 0
        

        else:
            rel_qid_dic = rel_dic[qid]
            for pid in model_pid_list:
                N += 1
                if pid in rel_pid_list:
                    rel_pid = 1
                else:
                    rel_pid = 0
            
                DCG += (2**rel_pid - 1)/np.log(1 + N)

        #find the opt DCG
        rel_qid_dic = rel_dic[qid]
        best_sort_ranking = dict(sorted(rel_qid_dic.items(), key=itemgetter(1), reverse = True)[: 100])
        opt_pid_list = list(best_sort_ranking.keys())
        
        for pid in opt_pid_list:
            rel_pid = 1
            N_opt += 1
            DCG_opt += (2**rel_pid - 1)/np.log(1 + N_opt)

        NDCG.append(DCG / DCG_opt)
    

    return NDCG

In [5]:
train_data = pd.read_csv('train_data.tsv', sep = '\t', header = 0)
validation_data = pd.read_csv('validation_data.tsv', sep = '\t', header = 0)

train_qid_dict = np.load('train_qid_dict.npy', allow_pickle= True).tolist()
train_pid_dict = np.load('train_pid_dict.npy', allow_pickle= True).tolist()

train_qid_list = list(train_data['qid'])
train_pid_list = list(train_data['pid'])


train_rel_list = list(train_data['relevancy'])
train_rel_dic, train_non_rel_dic = generate_rel_dict(train_qid_list, train_pid_list, train_rel_list)

In [6]:
def get_row(dict):
    key_list = list(dict.keys())
    len_row = 0
    for key in key_list:
        len_row += len(dict[key])

    return len_row

print('the number of revelance rows is', get_row(train_rel_dic))#4797
print('the number of non-revelance rows is', get_row(train_non_rel_dic))#4359542

the number of revelance rows is 4797
the number of non-revelance rows is 4359542


In [7]:
def subsampling(pid_dict, non_rel_dic):
    
    qid_list = list(non_rel_dic.keys())


    save_index_list = []
    for qid in qid_list:

        non_rel_pid_dict = non_rel_dic[qid]

        non_rel_index_list = list(non_rel_pid_dict.values())
        save_len = int(len(non_rel_index_list) * 0.025)
        #shuffle the pid then delete by specific ratio
        random.shuffle(non_rel_index_list)
        new_non_rel_index_list = non_rel_index_list[0:save_len]

        #upadte the pid revelant
        save_index_list.extend(new_non_rel_index_list)

        #new_rel_dic[qid] = add_non_rel

    #new_pid_dict = {key:val for key, val in pid_dict.items() if key in save_pid_list}

    return save_index_list

new_non_index = subsampling(train_pid_dict, train_non_rel_dic)



rel_index = []
for qid in list(train_rel_dic.keys()):
    rel_index_list = list(train_rel_dic[qid].values())
    rel_index.extend(rel_index_list)

new_train_index = new_non_index + rel_index
print('current length of new dataset is', len(new_train_index))

new_train_data = []
for index in new_train_index:
    new_train_data.append(train_data[index:index + 1])

random.shuffle(new_train_data)

new_train_data = pd.concat(new_train_data, axis = 0, ignore_index=True)

print(new_train_data)

train_passage = preprocessing(new_train_data['passage'], True)
train_queries = preprocessing(new_train_data['queries'], True)


train_passage_dict = dict(zip(new_train_data['pid'], train_passage))
train_queries_dict = dict(zip(new_train_data['qid'], train_queries))

validation_passage_dict = np.load('validation_pid_dict.npy', allow_pickle= True).tolist()
validation_queries_dict = np.load('validation_qid_dict.npy', allow_pickle= True).tolist()

validation_passage = preprocessing(validation_data['passage'], True)
validation_queries = preprocessing(validation_data['queries'], True)

validation_qid_list = list(validation_data['qid'])
validation_pid_list = list(validation_data['pid'])

validation_rel_list = list(validation_data['relevancy'])
validation_rel_dic, validation_non_rel_dic = generate_rel_dict(validation_qid_list, validation_pid_list, validation_rel_list)

current length of new dataset is 109468
            qid      pid                                            queries  \
0       1011811  4354002                     which is positive and negative   
1        467597  6683905                           oak ridge is what county   
2       1013797  8040628    what types of animals live in the tropical zone   
3        118365  7153226                                    define biobanks   
4        826518  6084159  what is the job description for insurance company   
...         ...      ...                                                ...   
109463  1085535  8102331                             cardiovascular meaning   
109464  1087904   608735                           what age do moles appear   
109465   472359  7571063                         paulding oh in what county   
109466  1084076  2933694                         what does cilostazol treat   
109467   960003  5930777                             when was stew invented   

           

In [8]:
with open('validation_passage.txt', 'w') as f:
    for i in range(len(validation_passage)):
        f.write(' '.join(validation_passage[i]) + '\n')

sentences = LineSentence('validation_passage.txt')
model_validation_passage = Word2Vec(sentences, sg = 1, vector_size=100, window = 5, min_count=1, negative = 5, hs = 0, workers = 4)

In [9]:
with open('validation_queries.txt', 'w') as f:
    for i in range(len(validation_queries)):
        f.write(' '.join(validation_queries[i]) + '\n')

sentences = LineSentence('validation_queries.txt')
model_validation_queries = Word2Vec(sentences, sg = 1, vector_size=100, window = 5, min_count=1, negative = 5, hs = 0, workers = 4)

In [10]:
with open('train_passage.txt', 'w') as f:
    for i in range(len(train_passage)):
        f.write(' '.join(train_passage[i]) + '\n')

sentences = LineSentence('train_passage.txt')
model_train_passage = Word2Vec(sentences, sg = 1, vector_size=100, window = 5, min_count=1, negative = 5, hs = 0, workers = 4)

In [11]:
with open('train_queries.txt', 'w') as f:
    for i in range(len(train_queries)):
        f.write(' '.join(train_queries[i]) + '\n')

sentences = LineSentence('train_queries.txt')
model_train_queries = Word2Vec(sentences, sg = 1, vector_size=100, window = 5, min_count=1, negative = 5, hs = 0, workers = 4)

In [12]:
def average_embedding(data_dict, model):

    return_dic= {}
    for i in list(data_dict.keys()):
        data = data_dict[i]

        if len(data) != 0:
            token_vector = model.wv[data]
            avg_vector = np.mean(token_vector, axis = 0)

        add_dict = {i:avg_vector}
        return_dic.update(add_dict)

    return return_dic

def generate_data_for_models(data, embedding_query_dict, embedding_passgae_dict, rel_dict):

    qid_list = []
    pid_list = []
    queries_list = []
    passgaes_list = []
    rel_list = []
    for i in range(len(data)):
        qid = data.qid[i]
        pid = data.pid[i]

        if qid in embedding_query_dict.keys() and pid in embedding_passgae_dict.keys():
            qid_list.append(qid)
            pid_list.append(pid)
            queries_list.append(embedding_query_dict[qid].reshape(-1))
            passgaes_list.append(embedding_passgae_dict[pid].reshape(-1))

            if pid in rel_dict[qid].keys():
                rel = 1
            else:
                rel = 0
            rel_list.append(rel)
    
    query_data = np.array(queries_list)
    passages_data = np.array(passgaes_list)

    x_data = np.concatenate((query_data, passages_data), axis = 1)
    y_data = np.array(rel_list)
    
    return x_data, y_data

In [13]:
train_queries_embedding = average_embedding(train_queries_dict, model_train_queries)
train_passages_embedding = average_embedding(train_passage_dict, model_train_passage)

train_x_data, train_y_data = generate_data_for_models(new_train_data, train_queries_embedding, train_passages_embedding, train_rel_dic)

print("The size of x for training dataset is", train_x_data.shape)
print("The size of y for training dataset is", train_y_data.shape)


validation_queries_embedding = average_embedding(validation_queries_dict, model_validation_queries)
validation_passages_embedding = average_embedding(validation_passage_dict, model_validation_passage)

validation_x_data, validation_y_data = generate_data_for_models(validation_data, validation_queries_embedding, validation_passages_embedding, validation_rel_dic)

print("The size of x for validation dataset is", validation_x_data.shape)
print("The size of y for validation dataset is", validation_y_data.shape)

The size of x for training dataset is (109468, 200)
The size of y for training dataset is (109468,)
The size of x for validation dataset is (1103039, 200)
The size of y for validation dataset is (1103039,)


In [14]:
def make_group(qid_list, xTr, yTr):
  qid_array = np.array(qid_list)
  idx = np.argsort(qid_array)
  qid_array = qid_array[idx]
  xTr = xTr[idx]
  yTr = yTr[idx].reshape(-1, 1)
  unique, count = np.unique(qid_array, return_counts = True)
  group = list(count)
               
  return xTr, yTr, group


In [15]:
model_list = []
lr_list = [0.1, 0.05, 0.01, 0.001]
estimator_list = [100, 200, 300]
depth_list = [5, 6, 7]
train_x, train_y, train_group = make_group(list(new_train_data['qid']), train_x_data, train_y_data)

for lr in tqdm(lr_list):
  for est in tqdm(estimator_list):
    for dep in tqdm(depth_list):
      model = xgb.XGBRanker(
          booster = 'gbtree',
          objective = 'rank:pairwise',
          eta = lr,
          max_depth = dep,
          n_estimators = est
      )
      model_list.append(model.fit(train_x, train_y, group = train_group, verbose = 0))

  0%|          | 0/4 [00:00<?, ?it/s]
  0%|          | 0/3 [00:00<?, ?it/s][A

  0%|          | 0/3 [00:00<?, ?it/s][A[A

 33%|███▎      | 1/3 [02:40<05:21, 160.68s/it][A[A

 67%|██████▋   | 2/3 [05:59<03:03, 183.03s/it][A[A

100%|██████████| 3/3 [09:54<00:00, 198.01s/it]

 33%|███▎      | 1/3 [09:54<19:48, 594.04s/it][A

  0%|          | 0/3 [00:00<?, ?it/s][A[A

 33%|███▎      | 1/3 [05:10<10:20, 310.37s/it][A[A

 67%|██████▋   | 2/3 [11:31<05:52, 352.04s/it][A[A

100%|██████████| 3/3 [18:54<00:00, 378.17s/it]

 67%|██████▋   | 2/3 [28:48<15:11, 911.97s/it][A

  0%|          | 0/3 [00:00<?, ?it/s][A[A

 33%|███▎      | 1/3 [07:36<15:13, 456.94s/it][A[A

 67%|██████▋   | 2/3 [16:58<08:38, 518.71s/it][A[A

100%|██████████| 3/3 [27:57<00:00, 559.05s/it]

100%|██████████| 3/3 [56:45<00:00, 1135.24s/it]
 25%|██▌       | 1/4 [56:45<2:50:17, 3405.74s/it]
  0%|          | 0/3 [00:00<?, ?it/s][A

  0%|          | 0/3 [00:00<?, ?it/s][A[A

 33%|███▎      | 1/3 [02:43<05

In [16]:
def gengerate_LM_score(qid, model, queries_embedding, passages_embedding):
  
  qid_dict = {}
  qid_vector = queries_embedding[qid]
  pid_index = [i for i,x in enumerate(validation_qid_list) if x == qid ]
  pred_list = []
  for index in pid_index:
    pid = validation_pid_list[index]
    pid_vector = passages_embedding[pid]
    qid_pid_vector = np.hstack((qid_vector, pid_vector))
    qid_pid_vector = qid_pid_vector.reshape(-1)
    pred_list.append(qid_pid_vector)
  
  pred_array = np.array(pred_list)
  pred = model.predict(pred_array)

  for i in range(len(pred)):
    pid = validation_pid_list[pid_index[i]]
    score = float(pred[i])
    add_dict = {pid:score}
    qid_dict.update(add_dict)

  sorted_top_100 = dict(sorted(qid_dict.items(), key=itemgetter(1), reverse = True)[: 100])

  return sorted_top_100

In [17]:
score_dict_list = []
for i in tqdm(range(len(model_list))):
  LM_dict = {}
  for qid in validation_qid_list:
    if qid in LM_dict.keys():
      continue
    else:
      qid_score_dict = gengerate_LM_score(qid, model_list[i], validation_queries_embedding, validation_passages_embedding)
      add_dict = {qid: qid_score_dict}
      LM_dict.update(add_dict)
  score_dict_list.append(LM_dict)


100%|██████████| 36/36 [1:03:50<00:00, 106.41s/it]


In [18]:
for i in range(len(score_dict_list)):
  LM_AP_list = generate_AP(score_dict_list[i], validation_rel_dic, validation_non_rel_dic)
  LM_AP = np.mean(LM_AP_list)
  print('MAP for model ' + str(i + 1) + ': ' + str(LM_AP))

MAP for model 1: 0.012821884082367141
MAP for model 2: 0.010564471085824244
MAP for model 3: 0.010066165931645723
MAP for model 4: 0.01036002140485415
MAP for model 5: 0.009682296644411647
MAP for model 6: 0.009905109325958155
MAP for model 7: 0.010366439657542975
MAP for model 8: 0.00909928625117986
MAP for model 9: 0.009570690510520853
MAP for model 10: 0.010721145410539803
MAP for model 11: 0.01204384717798579
MAP for model 12: 0.013116383530122873
MAP for model 13: 0.011025947851044503
MAP for model 14: 0.012012977642358393
MAP for model 15: 0.010256827987842916
MAP for model 16: 0.010428246595554353
MAP for model 17: 0.010244443742248343
MAP for model 18: 0.009512697653894975
MAP for model 19: 0.009626103741977659
MAP for model 20: 0.010067879641273165
MAP for model 21: 0.008712986708855253
MAP for model 22: 0.010856201511759415
MAP for model 23: 0.00921308699965868
MAP for model 24: 0.00932692742282686
MAP for model 25: 0.009190556583230794
MAP for model 26: 0.009052930491427313


In [19]:
for i in range(len(score_dict_list)):
  LM_NDCG_list = generate_NDCG(score_dict_list[i], validation_rel_dic, validation_non_rel_dic)
  LM_NDCG = np.mean(LM_NDCG_list)
  print('NDCG for model ' + str(i + 1) + ': ' + str(LM_NDCG))

NDCG for model 1: 0.03293901744426016
NDCG for model 2: 0.03153329243651167
NDCG for model 3: 0.02908159330345498
NDCG for model 4: 0.029713059623724263
NDCG for model 5: 0.03080196384328192
NDCG for model 6: 0.03115271762781139
NDCG for model 7: 0.03054438482160823
NDCG for model 8: 0.02907099510940176
NDCG for model 9: 0.029946105774373382
NDCG for model 10: 0.030969871974781783
NDCG for model 11: 0.032562280345111105
NDCG for model 12: 0.03379108869328335
NDCG for model 13: 0.030700833505978915
NDCG for model 14: 0.033151321990398296
NDCG for model 15: 0.03008007415084328
NDCG for model 16: 0.03090258341858005
NDCG for model 17: 0.029484425473997093
NDCG for model 18: 0.02932160247456011
NDCG for model 19: 0.031160939630486946
NDCG for model 20: 0.03113836993647027
NDCG for model 21: 0.03090392730827075
NDCG for model 22: 0.03105586275781211
NDCG for model 23: 0.03177943151703813
NDCG for model 24: 0.030156630657410224
NDCG for model 25: 0.02902874866833616
NDCG for model 26: 0.0295

In [None]:
LM_dict= score_dict_list[11]
with open('LM.txt','w') as f:
    for i in range(len(LM_dict.keys())):
        qid = list(LM_dict.keys())[i]
        pids = list(LM_dict[qid].keys())
        #if not equals 100, delete it
        if len(pids) < 100:
          continue
        for j in range(100):
          pid = pids[j]
          # qid A2 pid rank score algoname
          f.writelines([str(qid), '  A2  ', str(pid),'  ', str(j+1),'  ',str(float(LM_dict[qid][pid])), '  LM', '\n'])
f.close()