In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files there.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip3 install transformers
!pip3 install unidecode

In [None]:
import numpy as np
import pandas as pd
import os
from os import listdir
from os.path import join
import unidecode
import re
import logging
from tqdm.notebook import tnrange
import glob
import json

# For plotting results
import matplotlib.pyplot as plt

# DL Libraries
from transformers import BertModel,RobertaModel,RobertaTokenizer, AdamW, BertTokenizer, BertConfig, get_linear_schedule_with_warmup
from keras.preprocessing.sequence import pad_sequences
import torch
import torch.nn as nn
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,TensorDataset)
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr

#NLTK Libraries
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU,
# and report how many CUDA devices are visible.
n_gpu = torch.cuda.device_count()
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("device: {} n_gpu: {}".format(device, n_gpu))

In [None]:
# Root logging configuration: INFO level, records prefixed with a timestamp,
# the level name and the logger name.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
)
# Logger named after the current module; printed so its level shows in the cell output.
logger = logging.getLogger(__name__)
print(logger)

# Data Processing

In [None]:
# Read the JSONL file and collect its fields to build a DataFrame of the full dataset
!pip install jsonlines
import jsonlines
# Parallel per-question columns parsed from the training JSONL file.
# NOTE(review): `id` shadows the Python builtin; the name is kept because it is
# a notebook-level variable that other cells may rely on.
id = []             # question identifiers
question = []       # question stems
label_list = []     # raw list of {'label', 'text'} choice dicts per question
A_choice = []       # text of the choice labelled "A" (likewise B, C, D below)
B_choice = []
C_choice = []
D_choice = []
Actual_Answer = []  # label of the correct choice ('answerKey')
fact_1 = []         # supporting fact ('fact1')

with jsonlines.open('/content/drive/My Drive/NLP/train_complete.jsonl') as f:
    for line in f.iter():
        question.append(line['question']['stem'])
        label_list.append(line['question']['choices'])
        id.append(line["id"])
        Actual_Answer.append(line['answerKey'])
        fact_1.append(line['fact1'])

# Look each choice up by its label instead of scanning the first four positions.
# A question that lacks one of the labels now fails loudly with a KeyError;
# previously the per-label lists would drift out of step and zip() below would
# silently pair choices with the wrong question.
for choices in label_list:
    text_by_label = {choice['label']: choice['text'] for choice in choices}
    A_choice.append(text_by_label['A'])
    B_choice.append(text_by_label['B'])
    C_choice.append(text_by_label['C'])
    D_choice.append(text_by_label['D'])

# One row per question.
merged_list = tuple(zip(id,question,Actual_Answer,A_choice,B_choice,C_choice,D_choice,fact_1))
data=pd.DataFrame(merged_list,columns=['ID','Question','Actual Answer','A','B','C','D','Fact 1'])
data

In [None]:
# answer_candidates[i] holds the texts of the first four choices of question i.
answer_candidates = [
    [label_list[idx][pos]['text'] for pos in range(4)]
    for idx in range(len(A_choice))
]
flag = False

# exactanswer_candidates[i] is [question, correct answer text]; it is left empty
# when none of the first four choices carries the answer-key label.
exactanswer_candidates = []
flag = False
for idx in range(len(A_choice)):
    # Lazily walk the four positions so the scan stops at the first match.
    hit = next(
        (label_list[idx][pos] for pos in range(4)
         if label_list[idx][pos]['label'] == Actual_Answer[idx]),
        None,
    )
    exactanswer_candidates.append([] if hit is None else [question[idx], hit['text']])

In [None]:
exactanswer_candidates

In [None]:
# Build a BM25 index over the corpus; it is used below to retrieve the top-matching sentences for each answer choice
!pip install rank_bm25
from rank_bm25 import BM25Okapi
# Load the retrieval corpus: one document per non-blank line of names.txt.
# Each line is stripped because readlines() keeps the trailing "\n", which would
# stick to the last token of every document (so that word never matches a query
# token) and would also leak into the retrieved sentences.
with open('/content/drive/My Drive/NLP/names.txt', 'r') as f:
    corpus = [line.strip() for line in f if line.strip()]

# Whitespace tokenisation; split() without arguments also avoids empty tokens
# when a line contains repeated spaces.
tokenized_corpus = [doc.split() for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)



In [None]:
# For every question, retrieve the two best-matching corpus sentences for each
# of its four answer choices: finallist[i][j] is the list of top hits for
# choice j of question i.
question_list = []
finallist = []
for candidates in answer_candidates:
    question_list = []
    for j in range(0, 4):
        tokenized_query = candidates[j].split(" ")
        # get_top_n ranks the corpus for the query on its own, so the separate
        # (and previously unused) get_scores call has been dropped.
        question_list.append(bm25.get_top_n(tokenized_query, corpus, n=2))
    finallist.append(question_list)

In [None]:
# For each question build nine [sentence_1, sentence_2] pairs, where sentence_1
# is always "<question> <correct answer text>": eight pairs against the
# retrieved sentences (4 choices x 2 hits) followed by one against the
# question's own fact.
another_list = []
for idx, retrieved in enumerate(finallist):
    hypothesis = question[idx] + ' ' + exactanswer_candidates[idx][1]
    choice_list = [
        [hypothesis, retrieved[choice_pos][hit_pos]]
        for choice_pos in range(4)
        for hit_pos in range(2)
    ]
    choice_list.append([hypothesis, fact_1[idx]])
    another_list.append(choice_list)

In [None]:
# Flatten the first nine pairs of every question into two parallel lists:
# q holds the first element of each pair, a the second.
q = [rows[pos][0] for rows in another_list for pos in range(9)]
a = [rows[pos][1] for rows in another_list for pos in range(9)]

In [None]:
# Pair the flattened lists row by row and wrap them in a two-column frame.
merged_list = tuple(zip(q, a))
final_data = pd.DataFrame(
    data=merged_list,
    columns=['Sentence_1', 'Sentence_2'],
)

In [None]:
final_data

Unnamed: 0,Sentence_1,Sentence_2
0,"The sun is responsible for plants sprouting, b...",hush puppies are food\n
1,"The sun is responsible for plants sprouting, b...",performing tricks is a learned behavior\n
2,"The sun is responsible for plants sprouting, b...",children inherit characteristics of parents\n
3,"The sun is responsible for plants sprouting, b...",reaping something is getting it\n
4,"The sun is responsible for plants sprouting, b...",a vase is an object\n
...,...,...
44608,Harriet wants to know the area of a rectangula...,a mouse is a prey to a cat\n
44609,Harriet wants to know the area of a rectangula...,a slash is a wound\n
44610,Harriet wants to know the area of a rectangula...,a mouse is a prey to a cat\n
44611,Harriet wants to know the area of a rectangula...,a slash is a wound\n


# Web STS BERT Model

In [None]:
!pip install git+https://github.com/AndriyMulyar/semantic-text-similarity
!pip install urllib3==1.25.10

In [None]:
from semantic_text_similarity.models import WebBertSimilarity
# Load the web STS BERT model on the device detected earlier in the notebook
# rather than hard-coding 'cuda', so the cell also runs on CPU-only runtimes.
web_model = WebBertSimilarity(device=device.type, batch_size=32)

# Quick sanity check on a single sentence pair.
web_model.predict([("She won an olympic gold medal","The women is an olympic champion")])

In [None]:
# Notebook-level name kept for cells that reference it; predict_score no longer
# appends to it.
sts_score = []

def predict_score(df, model=None, verbose=False):
  """Score every (sentence_1, sentence_2) row of `df` for semantic similarity.

  Args:
    df: DataFrame with 'sentence_1' and 'sentence_2' columns.
    model: object exposing ``predict(list_of_pairs)``. When omitted, a
      WebBertSimilarity model is created on 'cuda' with batch_size=32, as the
      function always did before; pass an already-loaded model to avoid
      reloading the weights on every call.
    verbose: when True, print index, both sentences and the score for every row.

  Returns:
    A fresh list with one score per row, in row order. An empty frame yields [].
  """
  # Iterate positionally: df[col][i] is label-based and breaks on any frame
  # whose index is not 0..n-1.
  pairs = list(zip(df['sentence_1'], df['sentence_2']))
  if not pairs:
    return []
  if model is None:
    model = WebBertSimilarity(device='cuda', batch_size=32) #defaults to GPU prediction
  # One predict call over all pairs lets the model batch the work itself instead
  # of running a separate call per row. Results go into a local list so repeated
  # calls never accumulate scores from earlier runs.
  scores = list(model.predict(pairs))
  if verbose:
    for i, ((first, second), score) in enumerate(zip(pairs, scores)):
      print(i, first, second, score)
  return scores

In [None]:
final_data

In [None]:
# Score every sentence pair with the web STS model and keep the result,
# rounded to two decimals, next to the sentences.
final_df = pd.DataFrame()
new_df2 = final_data
final_df['sentence_1'] = final_data['Sentence_1']
final_df['sentence_2'] =  final_data['Sentence_2']
# Reset the notebook-level list before scoring: predict_score may append into
# `sts_score`, so re-running this cell without the reset can yield more scores
# than rows and make the column assignment below fail.
sts_score = []
sts_score = predict_score(final_df)
final_df['score'] = sts_score
final_df = final_df.round({'score': 2})

In [None]:
final_df[50:70]

In [None]:
# Repeat each question nine times so the list lines up with the nine
# sentence pairs generated per question.
choice_list = [
    question[idx]
    for idx in range(len(finallist))
    for _ in range(9)
]

In [None]:
choice_list

In [None]:
# Combine question, second sentence and similarity score into one frame,
# then scale the score column by 0.8.
merged = tuple(zip(choice_list, final_df['sentence_2'], final_df['score']))
pr = pd.DataFrame(
    data=merged,
    columns=['Question', 'Sentence', 'Sia_Score'],
)
pr['Sia_Score'] = pr['Sia_Score'].mul(0.8)

In [None]:
pr

In [None]:
# Persist the question / sentence / scaled-score table to Google Drive as CSV.
pr.to_csv('/content/drive/My Drive/NLP/NLPDataset.csv')

# Roberta Model

In [None]:
def create_dataloader(tokenizer, df, shuffle=False):
    """Tokenise the sentence pairs of `df` and wrap them in a DataLoader.

    Each row becomes the single string "<sentence_1> [SEP] <sentence_2>", which
    is encoded to exactly 128 token ids (padded or truncated) with an
    attention mask.
    NOTE(review): "[SEP]" is inserted as literal text; whether the tokenizer
    treats it as a separator token depends on the tokenizer -- confirm this
    matches how the scoring model was fine-tuned.

    Args:
        tokenizer: object exposing ``encode_plus`` in the Hugging Face style.
        df: DataFrame with ``sentence_1`` and ``sentence_2`` columns.
        shuffle: when False (default) batches follow the row order of `df`, so
            predictions collected batch by batch line up with the rows. Pass
            True to get the random order this function used previously.

    Returns:
        DataLoader yielding ``(input_ids, attention_mask)`` batches of size 4,
        with tensors already placed on the notebook-level ``device``.
    """
    print("Shape: {}".format(df.shape))
    input_ids = []
    attention_masks = []

    for first, second in zip(df.sentence_1, df.sentence_2):
        sentence = str(first) + " [SEP] " + str(second)
        encoded_text = tokenizer.encode_plus(sentence, max_length=128, add_special_tokens=True, return_token_type_ids=False,
                                             padding='max_length', return_attention_mask=True, truncation=True)
        input_ids.append(encoded_text['input_ids'])
        attention_masks.append(encoded_text['attention_mask'])

    inputs = torch.tensor(input_ids).to(device)
    masks = torch.tensor(attention_masks).to(device)
    data = TensorDataset(inputs, masks)
    # Callers assign the collected scores back to df positionally, so a random
    # sampler would attach every score to the wrong row.
    sampler = RandomSampler(data) if shuffle else SequentialSampler(data)
    dataloader = DataLoader(data, sampler=sampler, batch_size=4)
    return dataloader

In [None]:
class Regressor(nn.Module):
  """RoBERTa encoder followed by a single linear unit that outputs one score."""

  def __init__(self,  model_path):
    """Load the encoder from `model_path` and create the 1-unit output layer.

    Args:
      model_path: directory (or model identifier) passed to
        ``RobertaModel.from_pretrained``. It was previously ignored in favour
        of a hard-coded path; the notebook passes that same path, so existing
        calls behave as before.
    """
    super(Regressor, self).__init__()
    self.bert = RobertaModel.from_pretrained(model_path)
    self.out = nn.Linear(self.bert.config.hidden_size, 1)

  def forward(self, input_ids, attention_mask):
    """Return the linear layer applied to the encoder's pooled output.

    Args:
      input_ids: token id tensor forwarded to the encoder.
      attention_mask: attention mask tensor forwarded to the encoder.

    Returns:
      Tensor whose last dimension is 1 (one score per input sequence).
    """
    outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    # Take the pooled output by position: this works for a plain tuple and for
    # the dict-like output objects of newer transformers releases, where tuple
    # unpacking would yield the field names instead of tensors.
    pooler_out = outputs[1]
    score = self.out(pooler_out)
    return score

In [None]:
# Build the RoBERTa regressor and load the saved weights of its output layer.
model_path  = '/content/drive/My Drive/NLP/roberta-model'
model= Regressor(model_path)
# map_location lets a checkpoint that was saved on a GPU load on a CPU-only runtime.
weights_score = torch.load(join(model_path,'model_state.bin'), map_location=device)
model.out.load_state_dict(weights_score)
# The model is only used for scoring: switch to eval mode so dropout is off and
# the same sentence pair always receives the same score.
model.eval()
model.to(device)

In [None]:
def sts_score_generator(df):
  """Score every sentence pair in `df` with the notebook-level RoBERTa `model`.

  Args:
    df: DataFrame with ``sentence_1`` and ``sentence_2`` columns.

  Returns:
    List with one score per row of `df`, in row order.
  """
  tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
  sts_dataloader = create_dataloader(tokenizer,df)
  # The scores are assigned back to df positionally by the caller, so iterate
  # the encoded dataset strictly in row order regardless of which sampler the
  # loader was built with.
  dataset = sts_dataloader.dataset
  ordered_loader = DataLoader(dataset, sampler=SequentialSampler(dataset),
                              batch_size=sts_dataloader.batch_size)
  # Inference only: disable dropout and skip gradient bookkeeping.
  model.eval()
  sts_score  = []
  with torch.no_grad():
    for batch in ordered_loader:
      ip_ids, masks = tuple(t.to(device) for t in batch)
      score = model(ip_ids, attention_mask = masks)
      score  = score.squeeze(1)
      sts_score.extend(score.detach().cpu().numpy())
  return sts_score

In [None]:
# Copy the sentence pairs into a fresh frame, score them with the RoBERTa
# regressor and display the result with scores rounded to two decimals
# (the stored frame itself keeps the unrounded scores).
final_df_r = pd.DataFrame({
    'sentence_1': final_data['Sentence_1'],
    'sentence_2': final_data['Sentence_2'],
})

sts_score = sts_score_generator(final_df_r)
final_df_r['score'] = sts_score
final_df_r.round({'score': 2})


Shape: (44613, 2)


Unnamed: 0,sentence_1,sentence_2,score
0,"The sun is responsible for plants sprouting, b...",hush puppies are food\n,0.43
1,"The sun is responsible for plants sprouting, b...",performing tricks is a learned behavior\n,1.08
2,"The sun is responsible for plants sprouting, b...",children inherit characteristics of parents\n,2.32
3,"The sun is responsible for plants sprouting, b...",reaping something is getting it\n,1.49
4,"The sun is responsible for plants sprouting, b...",a vase is an object\n,2.17
...,...,...,...
44608,Harriet wants to know the area of a rectangula...,a mouse is a prey to a cat\n,0.24
44609,Harriet wants to know the area of a rectangula...,a slash is a wound\n,3.52
44610,Harriet wants to know the area of a rectangula...,a mouse is a prey to a cat\n,2.37
44611,Harriet wants to know the area of a rectangula...,a slash is a wound\n,0.63


In [None]:
# Persist the RoBERTa-scored sentence pairs to Google Drive as CSV.
final_df_r.to_csv('/content/drive/My Drive/NLP/roberta-model-res.csv')

# Bert Model


In [None]:
def create_dataloader(tokenizer, df, shuffle=False):
    """Tokenise the sentence pairs of `df` and wrap them in a DataLoader.

    Each row becomes the single string "<sentence_1> [SEP] <sentence_2>", which
    is encoded to exactly 128 token ids (padded or truncated) with an
    attention mask.
    NOTE(review): "[SEP]" is inserted as literal text; whether the tokenizer
    treats it as a separator token depends on the tokenizer -- confirm this
    matches how the scoring model was fine-tuned.

    Args:
        tokenizer: object exposing ``encode_plus`` in the Hugging Face style.
        df: DataFrame with ``sentence_1`` and ``sentence_2`` columns.
        shuffle: when False (default) batches follow the row order of `df`, so
            predictions collected batch by batch line up with the rows. Pass
            True to get the random order this function used previously.

    Returns:
        DataLoader yielding ``(input_ids, attention_mask)`` batches of size 4,
        with tensors already placed on the notebook-level ``device``.
    """
    print("Shape: {}".format(df.shape))
    input_ids = []
    attention_masks = []

    for first, second in zip(df.sentence_1, df.sentence_2):
        sentence = str(first) + " [SEP] " + str(second)
        encoded_text = tokenizer.encode_plus(sentence, max_length=128, add_special_tokens=True, return_token_type_ids=False,
                                             padding='max_length', return_attention_mask=True, truncation=True)
        input_ids.append(encoded_text['input_ids'])
        attention_masks.append(encoded_text['attention_mask'])

    inputs = torch.tensor(input_ids).to(device)
    masks = torch.tensor(attention_masks).to(device)
    data = TensorDataset(inputs, masks)
    # Callers assign the collected scores back to df positionally, so a random
    # sampler would attach every score to the wrong row.
    sampler = RandomSampler(data) if shuffle else SequentialSampler(data)
    dataloader = DataLoader(data, sampler=sampler, batch_size=4)
    return dataloader

In [None]:
class Regressor(nn.Module):
  """BERT encoder followed by a single linear unit that outputs one score."""

  def __init__(self,  model_path):
    """Load the encoder from `model_path` and create the 1-unit output layer.

    Args:
      model_path: directory (or model identifier) passed to
        ``BertModel.from_pretrained``. It was previously ignored in favour of
        a hard-coded path; the notebook passes that same path, so existing
        calls behave as before.
    """
    super(Regressor, self).__init__()
    self.bert = BertModel.from_pretrained(model_path)
    self.out = nn.Linear(self.bert.config.hidden_size, 1)

  def forward(self, input_ids, attention_mask):
    """Return the linear layer applied to the encoder's pooled output.

    Args:
      input_ids: token id tensor forwarded to the encoder.
      attention_mask: attention mask tensor forwarded to the encoder.

    Returns:
      Tensor whose last dimension is 1 (one score per input sequence).
    """
    outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    # Take the pooled output by position: this works for a plain tuple and for
    # the dict-like output objects of newer transformers releases, where tuple
    # unpacking would yield the field names instead of tensors.
    pooler_out = outputs[1]
    score = self.out(pooler_out)
    return score

In [None]:
# Build the BERT regressor and load the saved weights of its output layer.
model_path  = '/content/drive/My Drive/NLP/model'
model= Regressor(model_path)
# map_location lets a checkpoint that was saved on a GPU load on a CPU-only runtime.
weights_score = torch.load(join(model_path,'model_state.bin'), map_location=device)
model.out.load_state_dict(weights_score)
# The model is only used for scoring: switch to eval mode so dropout is off and
# the same sentence pair always receives the same score.
model.eval()
model.to(device)

Regressor(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  

In [None]:
def sts_score_generator(df):
  """Score every sentence pair in `df` with the notebook-level BERT `model`.

  Args:
    df: DataFrame with ``sentence_1`` and ``sentence_2`` columns.

  Returns:
    List with one score per row of `df`, in row order.
  """
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
  sts_dataloader = create_dataloader(tokenizer,df)
  # The scores are assigned back to df positionally by the caller, so iterate
  # the encoded dataset strictly in row order regardless of which sampler the
  # loader was built with.
  dataset = sts_dataloader.dataset
  ordered_loader = DataLoader(dataset, sampler=SequentialSampler(dataset),
                              batch_size=sts_dataloader.batch_size)
  # Inference only: disable dropout and skip gradient bookkeeping.
  model.eval()
  sts_score  = []
  with torch.no_grad():
    for batch in ordered_loader:
      ip_ids, masks = tuple(t.to(device) for t in batch)
      score = model(ip_ids, attention_mask = masks)
      score  = score.squeeze(1)
      sts_score.extend(score.detach().cpu().numpy())
  return sts_score

In [None]:
# Copy the sentence pairs into a fresh frame, score them with the BERT
# regressor and display the result with scores rounded to two decimals
# (the stored frame itself keeps the unrounded scores).
final_df_b = pd.DataFrame({
    'sentence_1': final_data['Sentence_1'],
    'sentence_2': final_data['Sentence_2'],
})
sts_score = []
sts_score = sts_score_generator(final_df_b)
final_df_b['score'] = sts_score
final_df_b.round({'score': 2})

# With Facts


In [None]:
# Persist the BERT-scored sentence pairs to Google Drive as CSV.
final_df_b.to_csv('/content/drive/My Drive/NLP/bert-model-res.csv')

In [None]:
# For each question build twelve [sentence_1, sentence_2] pairs, where
# sentence_1 is always "<question> <fact>": eight pairs against the retrieved
# sentences (4 choices x 2 hits), then one pair per answer choice A-D.
another_fact = []
for idx in range(len(finallist)):
    premise = question[idx] + ' ' + fact_1[idx]
    choice_list = [
        [premise, finallist[idx][choice_pos][hit_pos]]
        for choice_pos in range(4)
        for hit_pos in range(2)
    ]
    for option_text in (A_choice[idx], B_choice[idx], C_choice[idx], D_choice[idx]):
        choice_list.append([premise, option_text])
    another_fact.append(choice_list)

In [None]:
another_fact[0:20]

In [None]:
# Flatten the per-question pair lists into two parallel lists:
# q_f holds the first element of each pair, a_f the second.
# Iterate another_fact itself: the loop was previously bounded by the length of
# the unrelated `another_list` and by a hard-coded pair count of 12.
q_f = []
a_f = []
for rows in another_fact:
    for pair in rows:
        q_f.append(pair[0])
        a_f.append(pair[1])

In [None]:
# Pair the flattened lists row by row and wrap them in a two-column frame.
merged_fact = tuple(zip(q_f, a_f))
final_fact = pd.DataFrame(
    data=merged_fact,
    columns=['Sentence_1', 'Sentence_2'],
)

In [None]:
# Score the fact-augmented sentence pairs with the web STS model and keep the
# result, rounded to two decimals, next to the sentences.
final_df_f = pd.DataFrame()
new_df2_f = final_fact
final_df_f['sentence_1'] = final_fact['Sentence_1']
final_df_f['sentence_2'] =  final_fact['Sentence_2']
sts_score  = []
sts_score = predict_score(final_df_f)
final_df_f['score'] = sts_score
# Round the frame's own score column; np.round on the dict {'score': 2} does
# not touch the frame and replaced it with an unrelated value.
final_df_f = final_df_f.round({'score': 2})

NameError: ignored

In [None]:
final_fact