# Necessary Installations section

In [61]:
!pip3 install transformers
!pip3 install unidecode
!pip install pandas
!pip install keras 
!pip install tensorflow
!pip install torch
!pip install git+https://github.com/AndriyMulyar/semantic-text-similarity
!pip install urllib3==1.25.10
!pip install tqdm




Collecting git+https://github.com/AndriyMulyar/semantic-text-similarity
  Cloning https://github.com/AndriyMulyar/semantic-text-similarity to /tmp/pip-req-build-zrf5husd
  Running command git clone -q https://github.com/AndriyMulyar/semantic-text-similarity /tmp/pip-req-build-zrf5husd
Building wheels for collected packages: semantic-text-similarity
  Building wheel for semantic-text-similarity (setup.py) ... [?25ldone
[?25h  Created wheel for semantic-text-similarity: filename=semantic_text_similarity-1.0.3-py3-none-any.whl size=416023 sha256=6fe74a2f759688899760a304a009dadab95550815e480250b27d79e9e71de901
  Stored in directory: /tmp/pip-ephem-wheel-cache-j5v041x9/wheels/53/38/40/8492fa5fef9e81bbdf64927c1c1b8ef9b5ac39cfec09f526dc
Successfully built semantic-text-similarity


In [62]:
import numpy as np
import pandas as pd
from os import listdir
from os.path import join
import unidecode
import re
import logging
from tqdm.notebook import tnrange
import glob
import json
import os

#For ploting results
import matplotlib.pyplot as plt

# DL Libraries
from transformers import BertModel, AdamW, BertTokenizer, BertConfig, get_linear_schedule_with_warmup
from keras.preprocessing.sequence import pad_sequences
import torch
import torch.nn as nn
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,TensorDataset)
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr

# #NLTK Libraries
# import nltk
# nltk.download('punkt')
# from nltk.tokenize import word_tokenize
# from nltk.corpus import stopwords
# nltk.download('stopwords')
# stop_words = set(stopwords.words('english'))
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')

In [63]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# print("device: {} n_gpu: {}".format(device, n_gpu))
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)
# print(logger)

Sample Model Testing

In [64]:
from semantic_text_similarity.models import WebBertSimilarity

web_model = WebBertSimilarity(device='cuda', batch_size=10) #defaults to GPU prediction

#clinical_model = ClinicalBertSimilarity(device='cuda', batch_size=10) #defaults to GPU prediction

web_model.predict([("She won an olympic gold medal","The women is an olympic champion")])

10/24/2020 23:10:42 - INFO - pytorch_transformers.tokenization_utils -   Model name '/home/tharun/.cache/torch/semantic_text_similarity/448b457fca135ab38bb3d6af49c5f220a8167e6b6ce9012f7df8172aa29865f9' not found in model shortcut name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese, bert-base-german-cased, bert-large-uncased-whole-word-masking, bert-large-cased-whole-word-masking, bert-large-uncased-whole-word-masking-finetuned-squad, bert-large-cased-whole-word-masking-finetuned-squad, bert-base-cased-finetuned-mrpc). Assuming '/home/tharun/.cache/torch/semantic_text_similarity/448b457fca135ab38bb3d6af49c5f220a8167e6b6ce9012f7df8172aa29865f9' is a path or url to a directory containing tokenizer files.
10/24/2020 23:10:42 - INFO - pytorch_transformers.tokenization_utils -   Didn't find file /home/tharun/.cache/torch/semantic_text_similarity/448b457fca135ab38bb3d6af49c5f220a81

array([3.0079887], dtype=float32)

# Data Preprocessing section

Function Definitions

In [65]:
def generate_df(df, start,end):
    new_df = pd.DataFrame(columns=['question-id','passage-id','query','passage_title','sentence', 'passage_len'])
    pd.set_option('display.max_seq_items', None)
    num_psges =  0
    print("max length of the data : ",len(df))

    for i in tnrange(start,end):
        cntxt_len = len(df['context'][i])
        num_psges  = num_psges + cntxt_len

        # query = []
        # query.append(df['question'][i])
        # query.append(df['answer'][i])

        query = str(df['question'][i] + df['answer'][i])
        if len(query) < 2:
            continue

        for j  in range(0,cntxt_len):
            senList = df['context'][i][j][1]
            passage_title =  df['context'][i][j][0]
            k = 1
            for each_sen in senList:
                # filtering empty sentences
                if len(each_sen) < 2 or len(passage_title)< 2:
                    continue

                ques_id = str(i+1)
                psge_id  =  str(j+1)
                new_row = {'question-id':ques_id,'passage-id':psge_id, 'query':query,'passage_title': passage_title,'sentence': each_sen,'passage_len': len(senList)}
                # print(new_row)
                new_df =  new_df.append(new_row,ignore_index= True)
    print(end,"length=", len(new_df))
    return  new_df

In [66]:
def generate_files(inpfilename,  foldername, start,end, forcegenerate = False):
    df = pd.read_json(inpfilename);
    str1 = foldername 
    str3 = ".csv"
    fullfilename = "".join((str1,str(start), "_", str(end),str3))
    
    os.makedirs(os.path.dirname(fullfilename), exist_ok=True)

    if os.path.isfile(fullfilename) and forcegenerate == False:
        print(fullfilename + " already  created -  so skipping  generation")
    else:
        new_df  =  generate_df(df,start,end)
        new_df.to_csv(fullfilename,index = False)
        print(fullfilename + " generated")


Generation

In [67]:

start = 10
end = 12
FOLDER = '/home/tharun/Downloads/Gouthami/NLP_Project/Training_Data/Submission'

JSON_FILE =  FOLDER +"/hotpot_train_v1.1.json"
FILE_PREFIX  =  FOLDER +  '/parsed_df/parsed_data_'

In [68]:
generate_files(JSON_FILE, FILE_PREFIX, start,end, True)

max length of the data :  90447


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


12 length= 73
/home/tharun/Downloads/Gouthami/NLP_Project/Training_Data/Submission/parsed_df/parsed_data_10_12.csv generated


In [69]:
print(JSON_FILE)

/home/tharun/Downloads/Gouthami/NLP_Project/Training_Data/Submission/hotpot_train_v1.1.json


In [70]:
from semantic_text_similarity.models import WebBertSimilarity

In [71]:
import pandas as pd

cur_file_name = "".join((FILE_PREFIX, str(start),"_",str(end),".csv"))
new_df = pd.read_csv(cur_file_name)

final_df = pd.DataFrame(columns=['question-id', 'passage-id','passage_title','passage_len','sentence_1','sentence_2','score'])
final_df['question-id'] = new_df['question-id']
final_df['passage-id'] =  new_df['passage-id']
final_df['passage_title'] = new_df['passage_title']
final_df['passage_len'] = new_df['passage_len']
final_df['sentence_1'] = new_df['query']
final_df['sentence_2'] =  new_df['sentence']
print(len(final_df))


73


# Scores Generation section

In [72]:
web_model = WebBertSimilarity(device='cuda', batch_size=16) #defaults to GPU prediction

for i in tnrange(len(final_df)):
    sts_score  = web_model.predict([(final_df['sentence_1'][i],final_df['sentence_2'][i])])
    final_df['score'][i]= np.round(sts_score,2)[0]


10/24/2020 23:10:51 - INFO - pytorch_transformers.tokenization_utils -   Model name '/home/tharun/.cache/torch/semantic_text_similarity/448b457fca135ab38bb3d6af49c5f220a8167e6b6ce9012f7df8172aa29865f9' not found in model shortcut name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese, bert-base-german-cased, bert-large-uncased-whole-word-masking, bert-large-cased-whole-word-masking, bert-large-uncased-whole-word-masking-finetuned-squad, bert-large-cased-whole-word-masking-finetuned-squad, bert-base-cased-finetuned-mrpc). Assuming '/home/tharun/.cache/torch/semantic_text_similarity/448b457fca135ab38bb3d6af49c5f220a8167e6b6ce9012f7df8172aa29865f9' is a path or url to a directory containing tokenizer files.
10/24/2020 23:10:51 - INFO - pytorch_transformers.tokenization_utils -   Didn't find file /home/tharun/.cache/torch/semantic_text_similarity/448b457fca135ab38bb3d6af49c5f220a81

HBox(children=(FloatProgress(value=0.0, max=73.0), HTML(value='')))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['score'][i]= np.round(sts_score,2)[0]





In [73]:
final_df.head()

Unnamed: 0,question-id,passage-id,passage_title,passage_len,sentence_1,sentence_2,score
0,11,1,Lights Out Paris,3,"Fast Cars, Danger, Fire and Knives includes gu...",Lights Out Paris is the first studio album by ...,1.16
1,11,1,Lights Out Paris,3,"Fast Cars, Danger, Fire and Knives includes gu...","It was released July 28, 2005 on Doomtree Rec...",1.35
2,11,1,Lights Out Paris,3,"Fast Cars, Danger, Fire and Knives includes gu...",The album was re-released with four remixes a...,0.98
3,11,2,El-P,2,"Fast Cars, Danger, Fire and Knives includes gu...","Jaime Meline (born March 2, 1975), better know...",2.42
4,11,2,El-P,2,"Fast Cars, Danger, Fire and Knives includes gu...","Originally a member of Company Flow, El-P has...",1.45


In [74]:
      
SCORED_FOLDER = "".join((FOLDER, "/scored_df/"))

filename = SCORED_FOLDER +"with_scores_"+ str(start) +"_"+str(end) +".csv"

os.makedirs(os.path.dirname(filename), exist_ok=True)
final_df.to_csv(filename,index = False)

Data Frame 

In [75]:
print(filename)
result_df= pd.read_csv(filename)

/home/tharun/Downloads/Gouthami/NLP_Project/Training_Data/Submission/scored_df/with_scores_10_12.csv


In [76]:
result_df

Unnamed: 0,question-id,passage-id,passage_title,passage_len,sentence_1,sentence_2,score
0,11,1,Lights Out Paris,3,"Fast Cars, Danger, Fire and Knives includes gu...",Lights Out Paris is the first studio album by ...,1.16
1,11,1,Lights Out Paris,3,"Fast Cars, Danger, Fire and Knives includes gu...","It was released July 28, 2005 on Doomtree Rec...",1.35
2,11,1,Lights Out Paris,3,"Fast Cars, Danger, Fire and Knives includes gu...",The album was re-released with four remixes a...,0.98
3,11,2,El-P,2,"Fast Cars, Danger, Fire and Knives includes gu...","Jaime Meline (born March 2, 1975), better know...",2.42
4,11,2,El-P,2,"Fast Cars, Danger, Fire and Knives includes gu...","Originally a member of Company Flow, El-P has...",1.45
...,...,...,...,...,...,...,...
68,12,10,Bruce Harwood,5,"Gunmen from Laredo starred which narrator of ""...","Bruce Harwood (born April 29, 1963) is a Canad...",0.90
69,12,10,Bruce Harwood,5,"Gunmen from Laredo starred which narrator of ""...","In addition to ""The X-Files"", Harwood portray...",1.28
70,12,10,Bruce Harwood,5,"Gunmen from Laredo starred which narrator of ""...",He has also played other roles with a strong ...,0.71
71,12,10,Bruce Harwood,5,"Gunmen from Laredo starred which narrator of ""...",He was a founding member of the Vancouver sum...,0.38


# Data Processing Section

In [77]:
output_df = pd.DataFrame(columns=['question', 'passage_title','sia_score'])
i =  0
while i < len(result_df):
  p_len = result_df['passage_len'][i]
  score_sum = 0
  k  = 0
  while k < p_len:
    score_sum = score_sum +  result_df['score'][i]
    k = k+1
    i  = i+1
  avg_score =  (score_sum/ p_len)
  #print(avg_score)
  new_row = {'question': result_df['sentence_1'][i-1], 'passage_title' : result_df['passage_title'][i-1], 'sia_score' : avg_score }
  output_df =  output_df.append(new_row,ignore_index= True)

In [78]:
output_df

Unnamed: 0,question,passage_title,sia_score
0,"Fast Cars, Danger, Fire and Knives includes gu...",Lights Out Paris,1.163333
1,"Fast Cars, Danger, Fire and Knives includes gu...",El-P,1.935
2,"Fast Cars, Danger, Fire and Knives includes gu...",Born and Raised (EP),1.2675
3,"Fast Cars, Danger, Fire and Knives includes gu...",Lord Steppington,1.45
4,"Fast Cars, Danger, Fire and Knives includes gu...",Control Freek,1.235
5,"Fast Cars, Danger, Fire and Knives includes gu...",Hip hop,1.156
6,"Fast Cars, Danger, Fire and Knives includes gu...","Fast Cars, Danger, Fire and Knives",1.565
7,"Fast Cars, Danger, Fire and Knives includes gu...",Longterm Mentality,1.2725
8,"Fast Cars, Danger, Fire and Knives includes gu...",Experimental hip hop,1.403333
9,"Fast Cars, Danger, Fire and Knives includes gu...",Changes (Alyson Avenue album),1.123333


In [79]:
output_df = output_df.round({'sia_score': 2})


OUT_FOLDER = "".join((FOLDER, "/output_df/"))
filename = OUT_FOLDER +"output_"+ str(start) +"_"+str(end) +".csv"
os.makedirs(os.path.dirname(filename), exist_ok=True)
output_df.to_csv(filename,index = False)


# The End #

# Experiments on MODELs



In [80]:
# #Class for Regression
# class Regressor(nn.Module):

#   def __init__(self,  model_path):
#     super(Regressor, self).__init__()
#     # self.bert = BertModel.from_pretrained(model_path)
#     self.bert = BertModel.from_pretrained('/content/drive/My Drive/Courses/NLP/Project/model')
#     self.out = nn.Linear(self.bert.config.hidden_size, 1)

#   def forward(self, input_ids, attention_mask):
#     output, pooler_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)Preprocessing
#     score= self.out(pooler_out)
#     return score

In [81]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# n_gpu = torch.cuda.device_count()
# print("device: {} n_gpu: {}".format(device, n_gpu)) 

In [82]:
# logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
#                     datefmt = '%m/%d/%Y %H:%M:%S',
#                     level = logging.INFO)
# logger = logging.getLogger(__name__)
# print(logger)

In [83]:
# # memory footprint support libraries/code
# !ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
# !pip install gputil
# !pip install psutil
# !pip install humanize

# import psutil
# import humanize
# import os
# import GPUtil as GPU

# GPUs = GPU.getGPUs()
# # XXX: only one GPU on Colab and isn’t guaranteed
# gpu = GPUs[0]
# def printm():
#     process = psutil.Process(os.getpid())
#     print("Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available), " |     Proc size: " + humanize.naturalsize(process.memory_info().rss))
#     print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total     {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
# printm()

# TRAINING AND EVALUATION

In [84]:
# output_dir= '/content/drive/My Drive/Courses/NLP/Project/model'
# output_result= '/content/drive/My Drive/Courses/NLP/Project/results'

# if not os.path.exists(output_dir):
#   os.makedirs(output_dir)

# if not os.path.exists(output_result):
#   os.makedirs(output_result)

In [85]:
# for iteration in tnrange(epochs, desc='Epochs'):
#   model.train()
#   logger.info("Running for iteration: {}".format(iteration+1))

#   training_loss, training_steps=0,0
#   true_labels, predicted_labels= list(), list()
  
#   for step, batch in enumerate(train_dataloader):
#     batch = tuple(t.to(device) for t in batch)
#     ip_ids, masks, gold_labels= batch
#     score = model(ip_ids, attention_mask=masks)
#     score = score.squeeze(1)
#     loss= mse_loss(score, gold_labels.float())
#     loss.backward()
#     optimizer.step()
#     optimizer.zero_grad()
#     training_loss+=loss.item()
#     training_steps+=1
#     if (step+1)%1000 == 0:
#       print(step+1)

#     true_labels.extend(gold_labels.cpu().numpy())
#     predicted_labels.extend(score.detach().cpu().numpy())
  
#   training_loss_for_epoch= training_loss/training_steps
#   pcc= pearsonr(true_labels, predicted_labels)
#   result = {'loss': training_loss_for_epoch, 'PCC': pcc[0]}
#   print(result)

#   model_to_save = model.bert.module if hasattr(model.bert, 'module') else model.bert
#   model_to_save.save_pretrained(output_dir)

#   torch.save(model.out.state_dict(), join(output_dir, 'model_state.bin'))

#   print("Running validation for epoch: {}".format(iteration+1))

#   validation_loss, validation_steps=0,0
#   true_labels, predicted_labels= list(), list()

#   for step, batch in enumerate(dev_dataloader):
#     batch = tuple(t.to(device) for t in batch)
#     ip_ids, masks, gold_labels= batch
#     score = model(ip_ids, attention_mask=masks)
#     score = score.squeeze(1)
#     loss= mse_loss(score, gold_labels)
#     validation_loss+=loss.item()
#     validation_steps+=1

#     true_labels.extend(gold_labels.cpu().numpy())
#     predicted_labels.extend(score.detach().cpu().numpy())
  
#   val_loss_for_epoch= validation_loss/validation_steps
#   pcc= pearsonr(true_labels, predicted_labels)
#   result = {'loss':val_loss_for_epoch, 'PCC': pcc[0]}
#   print(result)
  
#   #Testing

#   print("Running evaluation for epoch: {}".format(iteration+1))

#   true_labels, predicted_labels= list(), list()Preprocessing
#   model.eval()
#   with torch.no_grad():
#     for step, batch in enumerate(test_dataloader):
#       batch = tuple(t.to(device) for t in batch)
#       ip_ids, masks, gold_labels= batch
#       score = model(ip_ids, attention_mask=masks)
#       score = score.squeeze(1)

#       true_labels.extend(gold_labels.cpu().numpy())
#       predicted_labels.extend(score.detach().cpu().numpy())
  
#   pcc= pearsonr(true_labels, predicted_labels)
#   test_report= {'PCC': pcc[0]}
#   print(test_report)

#   with open(join(output_result, 'result_'+str(iteration+1)+'.json'), 'w') as fp:
#     json.dump(test_report, fp)

Extras

In [86]:
# def sts_score_generator(df):
#   tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#   #tokenizer = BertTokenizer.from_pretrained('/content/drive/My Drive/Courses/NLP/Project/model')
#   sts_dataloader = create_dataloader(tokenizer,df)
#   sts_score  = []
#   for step, batch in enumerate(sts_dataloader):
#     batch = tuple(t.to(device) for t in batch)
#     ip_ids,masks = batch
#     score = model(ip_ids, attention_mask = masks)
#     score  = score.squeeze(1)
#     sts_score.extend(score.detach().cpu().numpy())
#   return sts_score

In [87]:
# def sts_score_generator(df):
#   # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#   sts_dataloader = create_dataloader(tokenizer,df)
#   sts_score  = []
#   for step, batch in enumerate(sts_dataloader):
#     batch = tuple(t.to(device) for t in batch)
#     ip_ids,masks = batch
#     score = model(ip_ids, attention_mask = masks)
#     score  = score.squeeze(1)
#     sts_score.extend(score.detach().cpu().numpy())
#   return sts_score

In [88]:
# load_data= '/content/drive/My Drive/Courses/NLP/Project/data'
# train_df= pd.read_csv(join(load_data,'train.csv'))
# train_df.columns =['caption', 'MSR', 'test','id', 'label','sentence_1','sentence_2','url','url_2']
# dev_df= pd.read_csv(join(load_data,'dev.csv'))
# dev_df.columns =['caption', 'MSR', 'test','id', 'label','sentence_1','sentence_2','url','url_2']
# test_df= pd.read_csv(join(load_data,'test.csv'))
# #test_df = new_df
# test_df.columns =['caption', 'MSR', 'test','id', 'label','sentence_1','sentence_2','url','url_2']

In [89]:
# #Model Intialization

# #epochs=10

# #Load Model
# model_path  = '/content/drive/My Drive/Courses/NLP/Project/model'
# model= Regressor(model_path)
# weights_score = torch.load(join(model_path,'model_state.bin'))
# model.out.load_state_dict(weights_score)
# model.to(device)

# #To tokenize  the data
# #tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# # # Prepare optimizer
# # optimizer = AdamW(model.parameters(),lr=2e-5)

# # #Loss Function
# # mse_loss= nn.MSELoss().to(device)

In [90]:
# tokenizer = BertTokenizer.from_pretrained('/content/drive/My Drive/Courses/NLP/Project/model')
# #tokenizer = BertTokenizer.from_pretrained('/content/drive/My Drive/Courses/NLP/Project/model')
# # test_dataloader = create_dataloader(tokenizer, train_df)
# # train_dataloader = create_dataloader(tokenizer, train_df)
# # dev_dataloader = create_dataloader(tokenizer, dev_df)
