# Necessary Installations section

In [None]:
# Install the third-party packages this notebook imports into the current runtime.
# NOTE(review): most packages are unpinned, so a fresh run may resolve different
# versions than the ones the notebook was developed against - consider pinning.
!pip3 install transformers
!pip3 install unidecode
!pip install pandas
!pip install keras 
!pip install tensorflow
!pip install torch
# Provides the `semantic_text_similarity` package (WebBertSimilarity), installed straight from GitHub.
!pip install git+https://github.com/AndriyMulyar/semantic-text-similarity
# urllib3 and botocore are pinned to exact versions - presumably to keep
# awscli/awsebcli installable together; TODO confirm the reason.
!pip install urllib3==1.25.10
!pip install awscli awsebcli botocore==1.18.18 --upgrade
!pip install tqdm


In [2]:
import numpy as np
import pandas as pd
from os import listdir
from os.path import join
import unidecode
import re
import logging
from tqdm.notebook import tnrange
import glob
import json
import os

# For plotting results
import matplotlib.pyplot as plt

# DL Libraries
from transformers import BertModel, AdamW, BertTokenizer, BertConfig, get_linear_schedule_with_warmup
from keras.preprocessing.sequence import pad_sequences
import torch
import torch.nn as nn
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,TensorDataset)
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr
from semantic_text_similarity.models import WebBertSimilarity

# #NLTK Libraries
# import nltk
# nltk.download('punkt')
# from nltk.tokenize import word_tokenize
# from nltk.corpus import stopwords
# nltk.download('stopwords')
# stop_words = set(stopwords.words('english'))
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')

In [3]:
# Logging: timestamped INFO-level records, formatted as
# "<time> - <level> - <logger name> -   <message>".
logging.basicConfig(
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
)
logger = logging.getLogger(__name__)

# Compute device: use the GPU when torch can see one, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
n_gpu = torch.cuda.device_count()

In [6]:
# Mount Google Drive at /content/drive; the data folder used later in the
# notebook lives under this mount point.
# NOTE(review): `google.colab` is presumably only importable inside Google Colab - confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Sample Model Testing

In [None]:
# Smoke test: score a single sentence pair with the pretrained web STS model
# to confirm that the package and its weights load.
# `WebBertSimilarity` and `torch` are already imported in the import cell at
# the top of the notebook. The device is picked at runtime so this cell also
# runs on a CPU-only runtime instead of failing on a hard-coded 'cuda'.
web_model = WebBertSimilarity(
    device='cuda' if torch.cuda.is_available() else 'cpu',
    batch_size=10,
)

# A clinical-domain variant can be swapped in here (requires importing
# ClinicalBertSimilarity from the same package first):
# clinical_model = ClinicalBertSimilarity(device='cuda', batch_size=10)

web_model.predict([("She won an olympic gold medal","The women is an olympic champion")])

# Data Preprocessing section

Function Definitions

In [4]:

def generate_df(df, start, end):
    """Flatten questions ``start`` .. ``end - 1`` of ``df`` into one row per context sentence.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``question``, ``answer`` and ``context`` columns. Each
        ``context`` value is a sequence of passages; ``passage[1]`` is read as
        the list of sentence strings of that passage (``passage[0]`` is unused).
    start, end : int
        Half-open range of index labels of ``df`` to process.

    Returns
    -------
    pandas.DataFrame
        Columns ``question-id``, ``passage-id``, ``query``, ``passage``,
        ``sentence`` and ``passage_len``:

        * ``question-id`` / ``passage-id`` are 1-based positions as strings.
        * ``query`` is the question text immediately followed by the answer.
        * ``passage`` is all sentences of the passage joined without separator.
        * ``passage_len`` is the number of sentences in the passage, including
          the ones that are skipped below, so it can exceed the number of rows
          emitted for that passage.

    Notes
    -----
    Questions whose query is shorter than two characters and sentences shorter
    than two characters are skipped. Rows are collected in a list and turned
    into a DataFrame once: ``DataFrame.append`` in a loop is quadratic and was
    removed in pandas 2.0.
    """
    columns = ['question-id', 'passage-id', 'query', 'passage', 'sentence', 'passage_len']

    # Kept from the original implementation: callers may rely on this global
    # display option being set as a side effect of the call.
    pd.set_option('display.max_seq_items', None)
    print("max length of the data : ", len(df))

    questions, answers, contexts = df['question'], df['answer'], df['context']
    records = []

    for i in tnrange(start, end):
        query = str(questions[i] + answers[i])
        if len(query) < 2:
            continue

        ques_id = str(i + 1)
        for j, context_entry in enumerate(contexts[i]):
            sen_list = context_entry[1]
            passage = "".join(sen_list)
            psge_id = str(j + 1)

            for each_sen in sen_list:
                # Filter out empty / one-character sentences.
                if len(each_sen) < 2:
                    continue
                records.append({
                    'question-id': ques_id,
                    'passage-id': psge_id,
                    'query': query,
                    'passage': passage,
                    'sentence': each_sen,
                    'passage_len': len(sen_list),
                })

    # Passing `columns` keeps the expected schema even when no row was produced.
    return pd.DataFrame(records, columns=columns)

In [5]:
def generate_files(inpfilename, start, end):
    """Load the JSON dataset and lay it out as sentence pairs ready for scoring.

    Reads ``inpfilename`` with ``pd.read_json``, expands the questions in
    ``[start, end)`` through ``generate_df`` and copies the result into a frame
    with the columns ``question-id``, ``passage-id``, ``passage``,
    ``passage_len``, ``sentence_1`` (the query), ``sentence_2`` (the context
    sentence) and ``score``. The ``score`` column is created here but not
    filled in.
    """
    sentence_rows = generate_df(pd.read_json(inpfilename), start, end)

    # Target column -> column of `sentence_rows` it is copied from.
    column_sources = {
        'question-id': 'question-id',
        'passage-id': 'passage-id',
        'passage': 'passage',
        'passage_len': 'passage_len',
        'sentence_1': 'query',
        'sentence_2': 'sentence',
    }

    pair_df = pd.DataFrame(columns=['question-id', 'passage-id','passage','passage_len','sentence_1','sentence_2','score'])
    for target_col, source_col in column_sources.items():
        pair_df[target_col] = sentence_rows[source_col]
    return pair_df
    #new_df  =  generate_df(df,start,end)
    # str1 = foldername 
    # str3 = ".csv"
    # fullfilename = "".join((str1,str(start), "_", str(end),str3))
    
    # os.makedirs(os.path.dirname(fullfilename), exist_ok=True)

    # if os.path.isfile(fullfilename) and forcegenerate == False:
    #     print(fullfilename + " already  created -  so skipping  generation")
    # else:
    #     new_df.to_csv(fullfilename,index = False)
    #     print(fullfilename + " generated")


Generation

In [8]:

# Half-open range [start, end) of question indices processed in this run.
start, end = 0, 5

# Data directory on the mounted Google Drive and the hotpot training JSON inside it.
FOLDER = '/content/drive/My Drive/Courses/NLP/Project/data'
JSON_FILE = f"{FOLDER}/hotpot_train_v1.1.json"
# FILE_PREFIX  =  FOLDER +  '/parsed_df/parsed_data_'

In [9]:
JSON_FILE

'/content/drive/My Drive/Courses/NLP/Project/data/hotpot_train_v1.1.json'

In [10]:
# Build the sentence-pair frame (one row per query / context-sentence pair)
# for the questions in [start, end); its `score` column is filled in later.
final_df =  generate_files(JSON_FILE,start, end)

10/30/2020 06:12:24 - INFO - numexpr.utils -   NumExpr defaulting to 2 threads.


max length of the data :  90447


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




# Scores Generation section

In [None]:
# Score every (query, sentence) pair of `final_df` with the pretrained web STS
# model and store the result, rounded to two decimals, in the `score` column.
# The device is picked at runtime so the cell does not fail on a CPU-only
# runtime.
web_model = WebBertSimilarity(
    device='cuda' if torch.cuda.is_available() else 'cpu',
    batch_size=16,
)

sentence_pairs = list(zip(final_df['sentence_1'], final_df['sentence_2']))

# Collect the scores first and assign the whole column in one step. The former
# `final_df['score'][i] = ...` was chained assignment, which pandas does not
# guarantee to write through to `final_df` (and which is a no-op under
# copy-on-write).
pair_scores = []
for i in tnrange(len(sentence_pairs)):
    sts_score = web_model.predict([sentence_pairs[i]])
    pair_scores.append(float(np.round(sts_score, 2)[0]))

final_df['score'] = pair_scores


# Data Processing Section

In [14]:
# Collapse the sentence-level scores into one average score per
# (question, passage): `question` is the query text, `sentence` is the full
# passage text and `sia_score` is the mean sentence score of that passage.
#
# Rows are grouped by their question/passage ids instead of stepping
# `passage_len` rows at a time. `passage_len` counts every sentence of a
# passage, but sentences shorter than two characters never become rows in
# `generate_df`, so fixed-size stepping could run into the next passage or past
# the end of the frame. The mean is therefore taken over the sentences that
# were actually scored.
# Building the result in one go also avoids the row-by-row `DataFrame.append`,
# which is quadratic and was removed in pandas 2.0.
scored_pairs = final_df.assign(score=pd.to_numeric(final_df['score']))

output_df = (
    scored_pairs
    .groupby(['question-id', 'passage-id'], sort=False)  # keep first-seen order
    .agg(
        question=('sentence_1', 'first'),
        sentence=('passage', 'first'),
        sia_score=('score', 'mean'),
    )
    .reset_index(drop=True)
)

In [17]:
# Round the per-passage scores to two decimals and write them to a CSV on
# Drive whose name encodes the processed question range.
output_df = output_df.round({'sia_score': 2})

OUT_FOLDER = f"{FOLDER}/output_df/"
filename = f"{OUT_FOLDER}output_with_passage_{start}_{end}.csv"

# Create the target directory on first use; a no-op when it already exists.
os.makedirs(os.path.dirname(filename), exist_ok=True)
output_df.to_csv(filename, index=False)


In [None]:
# Display the aggregated per-passage scores that were just written to disk.
output_df

# The End #

# Experiments on MODELs



In [None]:
# #Class for Regression
# class Regressor(nn.Module):

#   def __init__(self,  model_path):
#     super(Regressor, self).__init__()
#     # self.bert = BertModel.from_pretrained(model_path)
#     self.bert = BertModel.from_pretrained('/content/drive/My Drive/Courses/NLP/Project/model')
#     self.out = nn.Linear(self.bert.config.hidden_size, 1)

#   def forward(self, input_ids, attention_mask):
#     output, pooler_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
#     score= self.out(pooler_out)
#     return score

In [None]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# n_gpu = torch.cuda.device_count()
# print("device: {} n_gpu: {}".format(device, n_gpu)) 

In [None]:
# logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
#                     datefmt = '%m/%d/%Y %H:%M:%S',
#                     level = logging.INFO)
# logger = logging.getLogger(__name__)
# print(logger)

In [None]:
# # memory footprint support libraries/code
# !ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
# !pip install gputil
# !pip install psutil
# !pip install humanize

# import psutil
# import humanize
# import os
# import GPUtil as GPU

# GPUs = GPU.getGPUs()
# # XXX: only one GPU on Colab and isn’t guaranteed
# gpu = GPUs[0]
# def printm():
#     process = psutil.Process(os.getpid())
#     print("Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available), " |     Proc size: " + humanize.naturalsize(process.memory_info().rss))
#     print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total     {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
# printm()

# TRAINING AND EVALUATION

In [None]:
# output_dir= '/content/drive/My Drive/Courses/NLP/Project/model'
# output_result= '/content/drive/My Drive/Courses/NLP/Project/results'

# if not os.path.exists(output_dir):
#   os.makedirs(output_dir)

# if not os.path.exists(output_result):
#   os.makedirs(output_result)

In [None]:
# for iteration in tnrange(epochs, desc='Epochs'):
#   model.train()
#   logger.info("Running for iteration: {}".format(iteration+1))

#   training_loss, training_steps=0,0
#   true_labels, predicted_labels= list(), list()
  
#   for step, batch in enumerate(train_dataloader):
#     batch = tuple(t.to(device) for t in batch)
#     ip_ids, masks, gold_labels= batch
#     score = model(ip_ids, attention_mask=masks)
#     score = score.squeeze(1)
#     loss= mse_loss(score, gold_labels.float())
#     loss.backward()
#     optimizer.step()
#     optimizer.zero_grad()
#     training_loss+=loss.item()
#     training_steps+=1
#     if (step+1)%1000 == 0:
#       print(step+1)

#     true_labels.extend(gold_labels.cpu().numpy())
#     predicted_labels.extend(score.detach().cpu().numpy())
  
#   training_loss_for_epoch= training_loss/training_steps
#   pcc= pearsonr(true_labels, predicted_labels)
#   result = {'loss': training_loss_for_epoch, 'PCC': pcc[0]}
#   print(result)

#   model_to_save = model.bert.module if hasattr(model.bert, 'module') else model.bert
#   model_to_save.save_pretrained(output_dir)

#   torch.save(model.out.state_dict(), join(output_dir, 'model_state.bin'))

#   print("Running validation for epoch: {}".format(iteration+1))

#   validation_loss, validation_steps=0,0
#   true_labels, predicted_labels= list(), list()

#   for step, batch in enumerate(dev_dataloader):
#     batch = tuple(t.to(device) for t in batch)
#     ip_ids, masks, gold_labels= batch
#     score = model(ip_ids, attention_mask=masks)
#     score = score.squeeze(1)
#     loss= mse_loss(score, gold_labels)
#     validation_loss+=loss.item()
#     validation_steps+=1

#     true_labels.extend(gold_labels.cpu().numpy())
#     predicted_labels.extend(score.detach().cpu().numpy())
  
#   val_loss_for_epoch= validation_loss/validation_steps
#   pcc= pearsonr(true_labels, predicted_labels)
#   result = {'loss':val_loss_for_epoch, 'PCC': pcc[0]}
#   print(result)
  
#   #Testing

#   print("Running evaluation for epoch: {}".format(iteration+1))

#   true_labels, predicted_labels= list(), list()
#   model.eval()
#   with torch.no_grad():
#     for step, batch in enumerate(test_dataloader):
#       batch = tuple(t.to(device) for t in batch)
#       ip_ids, masks, gold_labels= batch
#       score = model(ip_ids, attention_mask=masks)
#       score = score.squeeze(1)

#       true_labels.extend(gold_labels.cpu().numpy())
#       predicted_labels.extend(score.detach().cpu().numpy())
  
#   pcc= pearsonr(true_labels, predicted_labels)
#   test_report= {'PCC': pcc[0]}
#   print(test_report)

#   with open(join(output_result, 'result_'+str(iteration+1)+'.json'), 'w') as fp:
#     json.dump(test_report, fp)

Extras

In [None]:
# def sts_score_generator(df):
#   tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#   #tokenizer = BertTokenizer.from_pretrained('/content/drive/My Drive/Courses/NLP/Project/model')
#   sts_dataloader = create_dataloader(tokenizer,df)
#   sts_score  = []
#   for step, batch in enumerate(sts_dataloader):
#     batch = tuple(t.to(device) for t in batch)
#     ip_ids,masks = batch
#     score = model(ip_ids, attention_mask = masks)
#     score  = score.squeeze(1)
#     sts_score.extend(score.detach().cpu().numpy())
#   return sts_score

In [None]:
# def sts_score_generator(df):
#   # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#   sts_dataloader = create_dataloader(tokenizer,df)
#   sts_score  = []
#   for step, batch in enumerate(sts_dataloader):
#     batch = tuple(t.to(device) for t in batch)
#     ip_ids,masks = batch
#     score = model(ip_ids, attention_mask = masks)
#     score  = score.squeeze(1)
#     sts_score.extend(score.detach().cpu().numpy())
#   return sts_score

In [None]:
# load_data= '/content/drive/My Drive/Courses/NLP/Project/data'
# train_df= pd.read_csv(join(load_data,'train.csv'))
# train_df.columns =['caption', 'MSR', 'test','id', 'label','sentence_1','sentence_2','url','url_2']
# dev_df= pd.read_csv(join(load_data,'dev.csv'))
# dev_df.columns =['caption', 'MSR', 'test','id', 'label','sentence_1','sentence_2','url','url_2']
# test_df= pd.read_csv(join(load_data,'test.csv'))
# #test_df = new_df
# test_df.columns =['caption', 'MSR', 'test','id', 'label','sentence_1','sentence_2','url','url_2']

In [None]:
# #Model Initialization

# #epochs=10

# #Load Model
# model_path  = '/content/drive/My Drive/Courses/NLP/Project/model'
# model= Regressor(model_path)
# weights_score = torch.load(join(model_path,'model_state.bin'))
# model.out.load_state_dict(weights_score)
# model.to(device)

# #To tokenize  the data
# #tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# # # Prepare optimizer
# # optimizer = AdamW(model.parameters(),lr=2e-5)

# # #Loss Function
# # mse_loss= nn.MSELoss().to(device)

In [None]:
# tokenizer = BertTokenizer.from_pretrained('/content/drive/My Drive/Courses/NLP/Project/model')
# #tokenizer = BertTokenizer.from_pretrained('/content/drive/My Drive/Courses/NLP/Project/model')
# # test_dataloader = create_dataloader(tokenizer, train_df)
# # train_dataloader = create_dataloader(tokenizer, train_df)
# # dev_dataloader = create_dataloader(tokenizer, dev_df)
