In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader,Dataset
from torch.autograd import Variable
import string
import matplotlib.pyplot as plt
import numpy as np
import csv
import pandas as pd
import re
import os
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import SnowballStemmer 
import collections
from collections import Counter
import random
import gensim
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.models.keyedvectors import KeyedVectors
from tqdm import tqdm
from operator import itemgetter
import torch.nn.functional as F
from torch.optim import Adam

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def preprocessing(text, remove):
    #initial setting
    lemmatizer = WordNetLemmatizer()
    stemmer = SnowballStemmer('english')
    stop_words = set(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r'\s+', gaps = True)
    processed_txt = []

    for line in text:
        sentence = []
        #remove the punctuation
        table = str.maketrans(dict.fromkeys(string.punctuation))
        line = line.translate(table)

        line = re.sub(r'[^a-zA-Z\s]', u' ', line, flags = re.UNICODE)

        line_token = tokenizer.tokenize(line)

        #remove stop words
        if (remove == True):
            line_token = [w for w in line_token if not w in stop_words]
        

        for word in line_token:
            word = lemmatizer.lemmatize(word)
            word = stemmer.stem(word)
            sentence.append(word)

        processed_txt.append(sentence)
    
    return processed_txt

def generate_rel_dict(qid_list, pid_list, rel_list):
    '''
    generate two dict according to the given data
    '''

    rel_dict = {}
    non_rel_dict = {}

    for i in range(len(qid_list)):
        qid = qid_list[i]
        pid = pid_list[i]
        rel = rel_list[i]

        if rel > 0:
            add_dict = {pid:i}
            if qid in rel_dict.keys():
                rel_dict[qid].update(add_dict)
            else:
                rel_dict[qid] = add_dict
        else:
            add_dict = {pid:i}
            if qid in non_rel_dict.keys():
                non_rel_dict[qid].update(add_dict)
            else:
                non_rel_dict[qid] = add_dict

    return rel_dict, non_rel_dict

def generate_AP(model, rel_dic, non_rel_dic):
    
    total_ap = []
    qid_list = list(model.keys())

    for qid in qid_list:
        N = 0
        R_rel = 0
        precision = 0
        model_pid_list= list(model[qid].keys())
        rel_pid_list = list(rel_dic[qid].keys())
        

        if len(set(model_pid_list) & set(rel_pid_list)) == 0:
            precision = 0
            total_ap.append(precision)

        else:
            for pid in model_pid_list:
                N += 1
                if pid in rel_pid_list:
                    R_rel += 1
                    precision += R_rel/N
                else:
                    continue
                if R_rel == len(rel_dic[qid]):
                    break

        
            total_ap.append(precision / R_rel)
    
    return total_ap

def generate_NDCG(model, rel_dic, non_rel_dic):

    NDCG = []
    qid_list = list(model.keys())
    
    for qid in qid_list:
        N = 0
        N_opt = 0
        DCG = 0
        DCG_opt = 0
        model_pid_list = list(model[qid].keys())
        rel_pid_list = list(rel_dic[qid].keys())

        if len(set(model_pid_list) & set(rel_pid_list)) == 0:
            DCG += 0
        

        else:
            rel_qid_dic = rel_dic[qid]
            for pid in model_pid_list:
                N += 1
                if pid in rel_pid_list:
                    rel_pid = 1
                else:
                    rel_pid = 0
            
                DCG += (2**rel_pid - 1)/np.log(1 + N)

        #find the opt DCG
        rel_qid_dic = rel_dic[qid]
        best_sort_ranking = dict(sorted(rel_qid_dic.items(), key=itemgetter(1), reverse = True)[: 100])
        opt_pid_list = list(best_sort_ranking.keys())
        
        for pid in opt_pid_list:
            rel_pid = 1
            N_opt += 1
            DCG_opt += (2**rel_pid - 1)/np.log(1 + N_opt)

        NDCG.append(DCG / DCG_opt)
    

    return NDCG

In [3]:
train_data = pd.read_csv('train_data.tsv', sep = '\t', header = 0)
validation_data = pd.read_csv('validation_data.tsv', sep = '\t', header = 0)

train_qid_dict = np.load('train_qid_dict.npy', allow_pickle= True).tolist()
train_pid_dict = np.load('train_pid_dict.npy', allow_pickle= True).tolist()

validation_qid_dict = np.load('validation_qid_dict.npy', allow_pickle= True).tolist()
validation_pid_dict = np.load('validation_pid_dict.npy', allow_pickle= True).tolist()

In [4]:
train_qid_list = list(train_data['qid'])
train_pid_list = list(train_data['pid'])


train_rel_list = list(train_data['relevancy'])
train_rel_dic, train_non_rel_dic = generate_rel_dict(train_qid_list, train_pid_list, train_rel_list)

In [5]:
def get_row(dict):
    key_list = list(dict.keys())
    len_row = 0
    for key in key_list:
        len_row += len(dict[key])

    return len_row

print('the number of revelance rows is', get_row(train_rel_dic))#4797
print('the number of non-revelance rows is', get_row(train_non_rel_dic))#4359542

the number of revelance rows is 4797
the number of non-revelance rows is 4359542


In [6]:
def subsampling(pid_dict, non_rel_dic):
    
    qid_list = list(non_rel_dic.keys())


    save_index_list = []
    for qid in qid_list:

        non_rel_pid_dict = non_rel_dic[qid]

        non_rel_index_list = list(non_rel_pid_dict.values())
        save_len = int(len(non_rel_index_list) * 0.02)
        #shuffle the pid then delete by specific ratio
        random.shuffle(non_rel_index_list)
        new_non_rel_index_list = non_rel_index_list[0:save_len]

        #upadte the pid revelant
        save_index_list.extend(new_non_rel_index_list)

        #new_rel_dic[qid] = add_non_rel

    #new_pid_dict = {key:val for key, val in pid_dict.items() if key in save_pid_list}

    return save_index_list

new_non_index = subsampling(train_pid_dict, train_non_rel_dic)



rel_index = []
for qid in list(train_rel_dic.keys()):
    rel_index_list = list(train_rel_dic[qid].values())
    rel_index.extend(rel_index_list)

new_train_index = new_non_index + rel_index
print('current length of new dataset is', len(new_train_index))

current length of new dataset is 87647


In [7]:
new_train_data = []
for index in new_train_index:
    new_train_data.append(train_data[index:index + 1])

new_train_data = pd.concat(new_train_data, axis = 0, ignore_index=True)

print(new_train_data)

          qid      pid                                            queries  \
0      188714  8523351         foods and supplements to lower blood sugar   
1      188714  6947934         foods and supplements to lower blood sugar   
2      188714  7352565         foods and supplements to lower blood sugar   
3      188714  3387416         foods and supplements to lower blood sugar   
4      188714  1130808         foods and supplements to lower blood sugar   
...       ...      ...                                                ...   
87642  401287   860900  is a written prescription required for hydroco...   
87643  541272   876066                        was wilson a good president   
87644  845529   882642              what is the salary range of a dentist   
87645  850361   926854              what is the temperature in washington   
87646  969974   956426               where did the the trail of tears end   

                                                 passage  relevancy  
0    

In [8]:
embedding_dict = {}
with open("glove.6B.100d.txt", 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], 'float32')
        embedding_dict[word] = vector

len(embedding_dict)

400002

In [9]:
max_length = 200

In [10]:
train_passages = list(train_pid_dict.values())
train_queries = list(train_qid_dict.values())
test_passages = list(validation_pid_dict.values())
test_queries = list(validation_qid_dict.values())

In [11]:
def word_table(datasets,model):
    token_index_dict = {} # tokens to indexes
    index_vector_dict = {} # indexes to word vectors
    i = 0
    
    for dataset in tqdm(datasets):
        for sentence in dataset: # for each query/passage
            for token in sentence: # for each token of the sentence
                # if this word is not token_to_ind
                if(token_index_dict.get(token) == None):
                    if token in model.keys():
                    # if this word exists is the word model
                        i += 1
                        token_index_dict[token] = i
                        index_vector_dict[i] = model[token]

    return token_index_dict, index_vector_dict

In [12]:
token_index, index_vector = word_table([train_passages, train_queries, test_passages, test_queries], embedding_dict)
len(token_index)

100%|██████████| 4/4 [00:23<00:00,  5.90s/it]


124824

In [13]:
def generate_idx(text, token_index_dict, max_length):

  return_list = []
  for sentence in text:
    sen_list = []
    for token in sentence:
      if (token_index_dict.get(token) != None):
        sen_list.append(token_index_dict[token])
    
    #padding
    if len(sen_list) < max_length:
      sen_list.extend(0 for _ in range(abs(len(sen_list) - max_length)))

  return_list.append(sen_list)

  return np.array(return_list)


In [14]:
idx = generate_idx(train_passages[100], token_index, max_length)
idx.shape

(1, 200)

In [15]:

class datas(Dataset):
  def __init__(self, df, qid_dict, pid_dict):

    self.qid_list = df['qid'].values
    self.pid_list = df['pid'].values

    self.label_list = df['relevancy'].values

    self.qid_dict = qid_dict
    self.pid_dict = pid_dict

    self.length = len(self.qid_list)

  def __len__(self):
    return self.length
  
  def __getitem__(self, index):
    qid = self.qid_list[index]
    pid = self.pid_list[index]

    label = self.label_list[index]

    idx_qid = generate_idx([self.qid_dict[qid]], token_index, 200)
    idx_pid = generate_idx([self.pid_dict[pid]], token_index, 200)

    idx_input = np.concatenate((idx_pid, idx_qid), axis = 1)
    
    return idx_input, label

In [16]:
class TextCNN(nn.Module):
  def __init__(self,vocab_size = len(token_index), embedding_dim = 200, dropout = 0.5):
      super(TextCNN, self).__init__()

      self.embed = nn.Embedding(vocab_size, embedding_dim)
      
      self.convs = nn.ModuleList(
          [nn.Conv2d(1, 256, (k, 200)) for k in (2, 3, 4)]
      )
      self.dropout = nn.Dropout(dropout)
      self.fc = nn.Linear(256*3, 1)


  def conv_and_pool_layers(self, x , conv):
    x = conv(x)
    x = F.relu(x)
    x = x.squeeze(3)
    x = F.max_pool1d(x, x.size(2))
    x = x.squeeze(2)

    return x

  def forward(self, input_idx):

    
    out = self.embed(input_idx)
    #print(out.shape)
    #out = out.unsqueeze(1)

    out = torch.cat([self.conv_and_pool_layers(out, conv) for conv in self.convs], 1)

    out = self.dropout(out)

    #print(out.shape)

    result = self.fc(out)
    result = nn.Sigmoid()(result)

    return result

In [17]:
def train(model, train, lr = 1e-6, epochs = 5):

  train_data = datas(train, train_qid_dict, train_pid_dict)
  train_data_dataloader = DataLoader(train_data, batch_size = 15, shuffle = True)

  criterion = nn.BCELoss()
  optimizer = Adam(model.parameters(), lr = lr)

  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")

  if use_cuda:
    model = model.cuda()
    criterion = criterion.cuda()

  for epoch in range(epochs):

    train_acc = 0
    train_loss = 0
    
    for idx_input, train_label in tqdm(train_data_dataloader):

      train_label = train_label.to(device)
      idx_input = idx_input.to(device)
      #model.to(device)

      
      optimizer.zero_grad()

      output = model(idx_input)
      batch_loss = criterion(output.squeeze(), train_label.to(torch.float32))

      #optimizer.zero_grad()
      train_loss += batch_loss.item()


      #acc = int(torch.count_nonzero(output.squeeze() != train_label.to(torch.float32)))
      
      #train_acc += acc

      batch_loss.backward()
      optimizer.step()


    print(f'''Epochs: {epoch + 1} | train loss: {train_loss / len(train): .3f}
          ''')

In [18]:
model = TextCNN()
train(model, new_train_data)

100%|██████████| 5844/5844 [02:41<00:00, 36.23it/s]


Epochs: 1 | train loss:  0.020
          


100%|██████████| 5844/5844 [02:36<00:00, 37.34it/s]


Epochs: 2 | train loss:  0.015
          


100%|██████████| 5844/5844 [02:36<00:00, 37.35it/s]


Epochs: 3 | train loss:  0.015
          


100%|██████████| 5844/5844 [02:36<00:00, 37.31it/s]


Epochs: 4 | train loss:  0.015
          


100%|██████████| 5844/5844 [02:37<00:00, 37.13it/s]

Epochs: 5 | train loss:  0.015
          





In [19]:
#torch.save(model, '/content/drive/MyDrive/model_cnn.pth')

In [None]:
#model = torch.load('model_cnn.pth')
#model.cuda()

In [26]:
val_set = datas(validation_data, validation_qid_dict, validation_pid_dict)
val_loader = DataLoader(val_set, batch_size = 3, shuffle = False)

In [None]:
y_pred = []
with torch.no_grad():
      model.eval()
      for idx_input, train_label in tqdm(val_loader):
        
        
        use_cuda = torch.cuda.is_available()
        device = torch.device("cuda" if use_cuda else "cpu")
        #device = torch.device("cpu")

        model.to(device)
        train_label = train_label.to(device)
        idx_input = idx_input.to(device)

        output = model(idx_input)
        
        output = output.squeeze()
        output = output.cpu()
        output = output.numpy()
        output = output.tolist()
        if (type(output) == float):
          y_pred.append(output)
        else:
          y_pred.extend(output)

In [None]:
validation_qid_list = list(validation_data['qid'])
validation_pid_list = list(validation_data['pid'])
validation_rel_list = list(validation_data['relevancy'])
CNN_dict = {}
for qid in tqdm(list(validation_qid_dict.keys())):
  pid_index = [i for i,x in enumerate(validation_qid_list) if x == qid ]
  qid_dict = {}
  for i in pid_index:
    y_pid = y_pred[i]
    pid = validation_pid_list[i]
    add_dict = {pid:y_pid}
    qid_dict.update(add_dict)

  sorted_top_100 = dict(sorted(qid_dict.items(), key=itemgetter(1), reverse = True)[: 100])
  CNN_dict.update({qid: sorted_top_100})

In [None]:
def generate_rel_dict(qid_list, pid_list, rel_list):
    '''
    generate two dict according to the given data
    '''

    rel_dict = {}
    non_rel_dict = {}

    for i in range(len(qid_list)):
        qid = qid_list[i]
        pid = pid_list[i]
        rel = rel_list[i]

        if rel > 0:
            add_dict = {pid:i}
            if qid in rel_dict.keys():
                rel_dict[qid].update(add_dict)
            else:
                rel_dict[qid] = add_dict
        else:
            add_dict = {pid:i}
            if qid in non_rel_dict.keys():
                non_rel_dict[qid].update(add_dict)
            else:
                non_rel_dict[qid] = add_dict

    return rel_dict, non_rel_dict


validation_rel_dic, validation_non_rel_dic = generate_rel_dict(validation_qid_list, validation_pid_list, validation_rel_list)

In [None]:
NN_AP_list = generate_AP(CNN_dict, validation_rel_dic, validation_non_rel_dic)
NN_AP = np.mean(NN_AP_list)
NN_AP

In [None]:
NN_NDCG_list = generate_NDCG(CNN_dict, validation_rel_dic, validation_non_rel_dic)
NN_NDCG = np.mean(NN_NDCG_list)
NN_NDCG

In [None]:
with open('NN.txt','w') as f:
    for i in range(len(CNN_dict.keys())):
        qid = list(CNN_dict.keys())[i]
        pids = list(CNN_dict[qid].keys())
        #if not equals 100, delete it
        if len(pids) < 100:
          continue
        for j in range(100):
          pid = pids[j]
          # qid A2 pid rank score algoname
          f.writelines([str(qid), '  A2  ', str(pid),'  ', str(j+1),'  ',str(float(CNN_dict[qid][pid])), '  NN', '\n'])
f.close()