In [None]:
# Mount Google Drive at /content/drive; the index, model checkpoints and data
# files read by later cells all live under this mount point (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize

import psutil
import humanize
import os
import GPUtil as GPU

GPUs = GPU.getGPUs()
# Colab exposes at most one GPU and does not guarantee one, so fall back to
# None instead of raising IndexError on a CPU-only runtime.
gpu = GPUs[0] if GPUs else None
def printm():
    """Print free system RAM, this process's resident size and GPU memory usage.

    Reads the module-level ``gpu`` handle captured when this cell ran; if no
    GPU was detected, the GPU line is replaced by a short notice.
    """
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available), " |     Proc size: " + humanize.naturalsize(process.memory_info().rss))
    if gpu is None:
        print("GPU RAM Free: n/a (no GPU detected)")
        return
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total     {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

Collecting gputil
  Downloading https://files.pythonhosted.org/packages/ed/0e/5c61eedde9f6c87713e89d794f01e378cfd9565847d4576fa627d758c554/GPUtil-1.4.0.tar.gz
Building wheels for collected packages: gputil
  Building wheel for gputil (setup.py) ... [?25l[?25hdone
  Created wheel for gputil: filename=GPUtil-1.4.0-cp36-none-any.whl size=7411 sha256=7228e7063f348458edae0d20a56e33d251fa884aed939fc8e5b8e40fec891b9c
  Stored in directory: /root/.cache/pip/wheels/3d/77/07/80562de4bb0786e5ea186911a2c831fdd0018bda69beab71fd
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.4.0
Gen RAM Free: 26.4 GB  |     Proc size: 111.5 MB
GPU RAM Free: 16280MB | Used: 0MB | Util   0% | Total     16280MB


In [None]:
!pip3 install transformers
!pip3 install unidecode
!pip3 install pyserini

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/3a/83/e74092e7f24a08d751aa59b37a9fc572b2e4af3918cb66f7766c3affb1b4/transformers-3.5.1-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 6.9MB/s 
Collecting tokenizers==0.9.3
[?25l  Downloading https://files.pythonhosted.org/packages/4c/34/b39eb9994bc3c999270b69c9eea40ecc6f0e97991dba28282b9fd32d44ee/tokenizers-0.9.3-cp36-cp36m-manylinux1_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 44.7MB/s 
Collecting sentencepiece==0.1.91
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 51.1MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |██

In [None]:
import numpy as np
import pandas as pd
import os
from os import listdir
from os.path import join
import unidecode
import re
import logging
from tqdm.notebook import tnrange
import glob
import json

# For plotting results
import matplotlib.pyplot as plt

# DL Libraries
from transformers import BertModel, AdamW, BertTokenizer, BertConfig, RobertaTokenizer, RobertaModel, RobertaForSequenceClassification
from keras.preprocessing.sequence import pad_sequences
import torch
import torch.nn as nn
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from statistics import mean 

#For anserini
from pyserini.search import SimpleSearcher
from pyserini import analysis, index

In [None]:
# Root folder on the mounted Drive that holds the Lucene index.
# NOTE(review): this lives under 'project_nlp', while the model and data paths
# used later live under 'man_mihir_project' — confirm both folders exist.
indexes= '/content/drive/My Drive/project_nlp/submission/lucene_indexing'

# Open the index named 'lucene-index-wiki'; `searcher` is used as a global by
# get_candidate_passages().
searcher = SimpleSearcher(join(indexes, 'pyserini/indexes/lucene-index-wiki'))
# BM25 scoring with arguments (0.4, 0.1) — presumably k1 and b, the same values
# given to the rank_bm25 baseline at the end of the notebook; confirm against
# the pyserini documentation.
searcher.set_bm25(0.4, 0.1)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU. `device`
# is read as a global by create_dataloader() and by the evaluation cells.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Number of CUDA devices torch can see; in the visible cells it is only printed.
n_gpu = torch.cuda.device_count()
print("device: {} n_gpu: {}".format(device, n_gpu))

device: cuda n_gpu: 1


## Dataloader

In [None]:
def create_dataloader(tokenizer, df, batch_size=1, max_length=512):
    """Tokenise question/answer pairs and wrap them in a sequential DataLoader.

    Args:
        tokenizer: object exposing ``encode_plus`` (the notebook passes a
            RobertaTokenizer).
        df: DataFrame with ``question`` and ``answer`` columns; the two are
            joined row by row as ``question + " [SEP] " + str(answer)``.
        batch_size: rows per batch. Defaults to 1, which the evaluation cells
            in this notebook rely on (they squeeze the batch dimension).
        max_length: every sequence is padded or truncated to this many tokens.

    Returns:
        DataLoader yielding ``(input_ids, attention_mask)`` tensors, already
        on the global ``device``, in the same order as the rows of ``df``.
    """
    # NOTE(review): the pair is joined with the literal text " [SEP] " while
    # the tokenizer used in this notebook is a RoBERTa one — confirm this
    # matches the preprocessing used when the models were trained before
    # changing it.
    pair_texts = [
        question + " [SEP] " + str(answer)
        for question, answer in zip(df.question, df.answer)
    ]

    input_ids = []
    attention_masks = []
    for text in pair_texts:
        encoded_text = tokenizer.encode_plus(
            text,
            max_length=max_length,
            add_special_tokens=True,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            truncation=True,
        )
        input_ids.append(encoded_text['input_ids'])
        attention_masks.append(encoded_text['attention_mask'])

    inputs = torch.tensor(input_ids).to(device)
    masks = torch.tensor(attention_masks).to(device)

    data = TensorDataset(inputs, masks)
    # Sequential sampling keeps the scores aligned with the candidate order.
    return DataLoader(data, sampler=SequentialSampler(data), batch_size=batch_size)

## Evaluation

In [None]:
def precision_at_k(r, k):
  """Fraction of the first ``k`` entries of ``r`` that are non-zero.

  Args:
    r: relevance scores in rank order (any non-zero value counts as relevant).
    k: cut-off rank, must be at least 1.

  Raises:
    ValueError: if ``r`` holds fewer than ``k`` entries.
  """
  assert k >= 1
  hits = np.not_equal(np.asarray(r)[:k], 0)
  if hits.size == k:
    return np.mean(hits)
  raise ValueError('Relevance score length < k')

In [None]:
def recall_at_k(actual, predicted, k):
  """Share of the items in ``actual`` found among the first ``k`` of ``predicted``.

  Args:
    actual: the relevant (gold) items.
    predicted: ranked predictions, best first.
    k: cut-off rank.

  Returns:
    A float in [0, 1]. Returns 0.0 when ``actual`` is empty instead of
    dividing by zero.
  """
  if len(actual) == 0:
    return 0.0
  top_k = predicted[0:k]
  found = sum(1 for item in actual if item in top_k)
  return found / len(actual)

In [None]:
def average_precision(r):
  """Mean of precision@rank taken at every rank where ``r`` is non-zero.

  Returns 0 when ``r`` contains no relevant entry (or is empty).
  """
  relevant = np.asarray(r) != 0
  precisions = []
  for rank in range(relevant.size):
    if relevant[rank]:
      precisions.append(precision_at_k(relevant, rank + 1))

  return np.mean(precisions) if precisions else 0

def mean_average_precision(rs):
  """Average of average_precision() over a collection of relevance lists."""
  per_query_ap = []
  for r in rs:
    per_query_ap.append(average_precision(r))
  return np.mean(per_query_ap)

In [None]:
def mean_reciprocal_rank(rs):
  """Mean over queries of 1 / (rank of the first non-zero entry).

  A query whose relevance list has no non-zero entry contributes 0.
  """
  reciprocal_ranks = []
  for r in rs:
    hit_positions = np.asarray(r).nonzero()[0]
    if hit_positions.size:
      reciprocal_ranks.append(1. / (hit_positions[0] + 1))
    else:
      reciprocal_ranks.append(0.)
  return np.mean(reciprocal_ranks)

## Model and Algorithm

In [None]:
def get_candidate_passages(query, k=20):
  """Retrieve the raw text of the top-``k`` hits for ``query``.

  Uses the module-level ``searcher``. Double quotes are stripped from each
  passage before it is returned.

  Args:
    query: question text passed to the searcher.
    k: number of hits to request (defaults to 20, the value used throughout
      this notebook).

  Returns:
    List of passage strings in the order the searcher ranked them.
  """
  hits = searcher.search(query, k=k)
  return [searcher.doc(str(hit.docid)).raw().replace('"', '') for hit in hits]

In [None]:
# Classification model: load the sequence-classification checkpoint stored
# under model_path and move it to the selected device.

model_path= '/content/drive/MyDrive/man_mihir_project/cls_experiment/model_sqcls'

model= RobertaForSequenceClassification.from_pretrained(model_path)
model.to(device)

# The tokenizer comes from the stock 'roberta-base' files, not from model_path.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




In [None]:
# Folder on the mounted Drive that holds the evaluation data.
load_data = '/content/drive/My Drive/man_mihir_project/data'

# doc_passages is indexed by the evaluation cells as
# doc_passages[str(DocumentID)][str(passage_no)] -> passage text.
with open(join(load_data,'document_passages.json'),'r') as f:
  doc_passages= json.load(f)

# Test questions; the evaluation cells read the Question, DocumentID and
# RelevantPassages (comma-separated passage numbers) columns.
data_df= pd.read_csv(join(load_data, 'test.csv'))

In [None]:
# Evaluate the classification re-ranker: retrieve 20 candidates per question,
# score each with the classifier, re-rank by score and compute ranking metrics.
precision_at_5=[]
precision_at_10=[]
recall_at_5=[]
recall_at_10=[]
recall_at_20=[]
rs=[]

# Inference mode only needs to be set once, not once per question.
model.eval()

# The loop variable is named `row_idx` (not `index`) so it does not clobber the
# `index` module imported from pyserini at the top of the notebook.
for row_idx, row in data_df.iterrows():
  answers= get_candidate_passages(row.Question)
  temp_df= pd.DataFrame(answers, columns=['answer'])
  temp_df['question']=row.Question
  dataloader= create_dataloader(tokenizer, temp_df)
  pos_prob= []
  with torch.no_grad():
    for batch in dataloader:
      ip_ids, masks= (t.to(device) for t in batch)
      # The dataloader yields one pair per batch, so squeeze(0) leaves the
      # per-class logits of that single pair.
      logits = model(ip_ids, attention_mask=masks)[0].squeeze(0)
      # NOTE(review): this is the raw logit at index 1, not a softmax
      # probability — confirm ranking by the raw logit is intended.
      pos_prob.append(logits.cpu().numpy()[1])

  temp_df['score']=pos_prob
  temp_df= temp_df.sort_values(by='score', ascending=False)
  top_k_ans= temp_df[0:20].answer.tolist()

  # Gold passages for this question, looked up by document id and passage number.
  doc= doc_passages[str(row.DocumentID)]
  passages_no= row.RelevantPassages.split(',')
  org_ans= [doc[str(no)] for no in passages_no]

  # Binary relevance of each re-ranked candidate (exact string match).
  r= [1 if ans in org_ans else 0 for ans in top_k_ans]
  # precision_at_k raises ValueError on lists shorter than k; ranks that were
  # never retrieved count as non-relevant, so pad with zeros up to 20.
  r.extend([0] * (20 - len(r)))

  rs.append(r)
  precision_at_5.append(precision_at_k(r,5))
  precision_at_10.append(precision_at_k(r,10))
  recall_at_5.append(recall_at_k(org_ans, top_k_ans, 5))
  recall_at_10.append(recall_at_k(org_ans, top_k_ans, 10))
  recall_at_20.append(recall_at_k(org_ans, top_k_ans, 20))

print(mean(precision_at_5))
print(mean(precision_at_10))
print(mean(recall_at_5))
print(mean(recall_at_10))
print(mean(recall_at_20))
print(mean_average_precision(rs))
print(mean_reciprocal_rank(rs))

0.09134615384615385
0.05480769230769231
0.22129693223443223
0.2473385989010989
0.2725789835164835
0.3025580595168406
0.31782559250390136


## SIA Model

In [None]:
#Class for Regression
class Regressor(nn.Module):
  """RoBERTa encoder followed by one linear unit producing a score per input.

  Args:
    model_path: directory or model name passed to RobertaModel.from_pretrained.
  """

  def __init__(self, model_path):
    super(Regressor, self).__init__()
    self.bert = RobertaModel.from_pretrained(model_path)
    self.out = nn.Linear(self.bert.config.hidden_size, 1)

  def forward(self, input_ids, attention_mask):
    """Return the linear head applied to the encoder's pooled output.

    The result has one row per input sequence and a single column.
    """
    encoder_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    # Take the pooled output by position rather than by tuple-unpacking:
    # unpacking only works when the encoder returns a plain tuple, whereas
    # indexing also works when it returns a mapping-style output object
    # (unpacking a mapping would yield its keys instead of tensors).
    pooler_out = encoder_outputs[1]
    return self.out(pooler_out)

In [None]:
# Model initialization
# NOTE: this rebinds the global `model`, replacing the classification model
# loaded earlier; cells below this point score with the Regressor.

model_path= '/content/drive/MyDrive/man_mihir_project/sia_experiment/model'

# Load model: the encoder weights come from model_path via from_pretrained,
# the linear head from the separately saved 'model_state.bin'.
model= Regressor(model_path)
# map_location lets a head that was saved from a GPU session load on the CPU
# fallback selected by `device`.
lr_weights= torch.load(join(model_path, 'model_state.bin'), map_location=device)
model.out.load_state_dict(lr_weights)
model.to(device)

In [None]:
# Evaluate the SIA regressor as a re-ranker: retrieve 20 candidates per
# question, score each one, re-rank by score and compute ranking metrics.
precision_at_5=[]
precision_at_10=[]
recall_at_5=[]
recall_at_10=[]
recall_at_20=[]
rs=[]

# Inference mode only needs to be set once, not once per question.
model.eval()

# The loop variable is named `row_idx` (not `index`) so it does not clobber the
# `index` module imported from pyserini at the top of the notebook.
for row_idx, row in data_df.iterrows():
  answers= get_candidate_passages(row.Question)
  temp_df= pd.DataFrame(answers, columns=['answer'])
  temp_df['question']=row.Question
  dataloader= create_dataloader(tokenizer, temp_df)
  pos_prob= []
  with torch.no_grad():
    for batch in dataloader:
      ip_ids, masks= (t.to(device) for t in batch)
      score = model(ip_ids, attention_mask=masks)
      # One pair per batch means `score` holds a single element; store it as a
      # plain float so the 'score' column is numeric and sorts numerically,
      # rather than a column of (1, 1) arrays.
      pos_prob.append(score.item())

  temp_df['score']=pos_prob
  temp_df= temp_df.sort_values(by='score', ascending=False)
  top_k_ans= temp_df[0:20].answer.tolist()

  # Gold passages for this question, looked up by document id and passage number.
  doc= doc_passages[str(row.DocumentID)]
  passages_no= row.RelevantPassages.split(',')
  org_ans= [doc[str(no)] for no in passages_no]

  # Binary relevance of each re-ranked candidate (exact string match).
  r= [1 if ans in org_ans else 0 for ans in top_k_ans]
  # precision_at_k raises ValueError on lists shorter than k; ranks that were
  # never retrieved count as non-relevant, so pad with zeros up to 20.
  r.extend([0] * (20 - len(r)))

  rs.append(r)
  precision_at_5.append(precision_at_k(r,5))
  precision_at_10.append(precision_at_k(r,10))
  recall_at_5.append(recall_at_k(org_ans, top_k_ans, 5))
  recall_at_10.append(recall_at_k(org_ans, top_k_ans, 10))
  recall_at_20.append(recall_at_k(org_ans, top_k_ans, 20))

print(mean(precision_at_5))
print(mean(precision_at_10))
print(mean(recall_at_5))
print(mean(recall_at_10))
print(mean(recall_at_20))
print(mean_average_precision(rs))
print(mean_reciprocal_rank(rs))

0.09230769230769231
0.054086538461538464
0.22410141941391942
0.24773923992673993
0.264566163003663
0.2992625758702875
0.3089393619862184


## Anserini Only

In [None]:
# Baseline: evaluate the searcher's own ranking, with no neural re-ranking.
precision_at_5=[]
precision_at_10=[]
recall_at_5=[]
recall_at_10=[]
recall_at_20=[]
rs=[]
# Number of questions for which no gold passage appears among the retrieved hits.
count=0

# The loop variable is named `row_idx` (not `index`) so it does not clobber the
# `index` module imported from pyserini at the top of the notebook.
for row_idx, row in data_df.iterrows():
  answers= get_candidate_passages(row.Question)
  # Gold passages for this question, looked up by document id and passage number.
  doc= doc_passages[str(row.DocumentID)]
  passages_no= row.RelevantPassages.split(',')
  org_ans= [doc[str(no)] for no in passages_no]

  # Binary relevance of each retrieved passage (exact string match).
  r= [1 if ans in org_ans else 0 for ans in answers]
  # precision_at_k raises ValueError on lists shorter than k; ranks that were
  # never retrieved count as non-relevant, so pad with zeros up to 20.
  r.extend([0] * (20 - len(r)))

  # any() also copes with an empty hit list, where max() would raise.
  if not any(r):
    count+=1

  rs.append(r)
  precision_at_5.append(precision_at_k(r,5))
  precision_at_10.append(precision_at_k(r,10))
  recall_at_5.append(recall_at_k(org_ans, answers, 5))
  recall_at_10.append(recall_at_k(org_ans, answers, 10))
  recall_at_20.append(recall_at_k(org_ans, answers, 20))

print(mean(precision_at_5))
print(mean(precision_at_10))
print(mean(recall_at_5))
print(mean(recall_at_10))
print(mean(recall_at_20))
print(mean_average_precision(rs))
print(mean_reciprocal_rank(rs))
print(count)

0.06490384615384616
0.04014423076923077
0.19811698717948717
0.24573603479853479
0.2825950091575092
0.21090602390195673
0.21888767924682356
247


## BM25

In [None]:
!pip3 install rank_bm25

Collecting rank_bm25
  Downloading https://files.pythonhosted.org/packages/16/5a/23ed3132063a0684ea66fb410260c71c4ffda3b99f8f1c021d1e245401b5/rank_bm25-0.2.1-py3-none-any.whl
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.1


In [None]:
# Flatten every document's passages into a single corpus list, keeping the
# document order and, within each document, the passage order of doc_passages.
passages = [
  passage
  for key in doc_passages
  for passage in doc_passages[str(key)].values()
]

In [None]:
from rank_bm25 import BM25Okapi

In [None]:
# Tokenise each passage by splitting on single spaces only; no lowercasing or
# punctuation handling is applied.
tokenized_corpus = [doc.split(" ") for doc in passages]

In [None]:
# Okapi BM25 over the tokenised passages with k1=0.4 and b=0.1, the same two
# values passed to searcher.set_bm25 earlier in the notebook.
# NOTE(review): epsilon=1 is set explicitly — confirm its effect against the
# rank_bm25 documentation.
bm25 = BM25Okapi(tokenized_corpus, k1=0.4, b=0.1, epsilon=1)

In [None]:
# Baseline: rank_bm25 over the in-memory passage list, top 5 per question.
precision_at_5=[]
rs=[]
# Number of questions for which no gold passage appears in the top 5.
count=0

# The loop variable is named `row_idx` (not `index`) so it does not clobber the
# `index` module imported from pyserini at the top of the notebook.
for row_idx, row in data_df.iterrows():

  # Same single-space tokenisation as the corpus.
  query = row.Question
  tokenized_query = query.split(" ")

  answers= bm25.get_top_n(tokenized_query, passages, n=5)

  # Gold passages for this question, looked up by document id and passage number.
  doc= doc_passages[str(row.DocumentID)]
  passages_no= row.RelevantPassages.split(',')
  org_ans= [doc[str(no)] for no in passages_no]

  # Binary relevance of each returned passage (exact string match).
  r= [1 if ans in org_ans else 0 for ans in answers]
  # precision_at_k raises ValueError on lists shorter than k; pad with zeros
  # in case fewer than 5 passages are returned.
  r.extend([0] * (5 - len(r)))

  # any() also copes with an empty result list, where max() would raise.
  if not any(r):
    count+=1

  rs.append(r)
  precision_at_5.append(precision_at_k(r,5))

print(mean(precision_at_5))
print(mean_average_precision(rs))
print(mean_reciprocal_rank(rs))
print(count)

0.07067307692307692
0.2255742521367521
0.22932692307692307
286
