In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the dataset and
# output paths used later in this notebook live under that mount point.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


GPU and available memory check

In [None]:
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize

import psutil
import humanize
import os
import GPUtil as GPU

GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available), " |     Proc size: " + humanize.naturalsize(process.memory_info().rss))
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total     {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

Gen RAM Free: 26.3 GB  |     Proc size: 111.7 MB
GPU RAM Free: 16130MB | Used: 0MB | Util   0% | Total     16130MB


Transformers for general purpose NLP models

Unidecode for ASCII translation of Unicode text.

In [None]:
!pip3 install transformers
!pip3 install unidecode



# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import os
from os import listdir
from os.path import join
import unidecode
import re
import logging
from tqdm.notebook import tnrange
import glob
import json

# For plotting results
import matplotlib.pyplot as plt

# DL Libraries
from transformers import BertModel, AdamW, BertTokenizer, BertConfig, RobertaTokenizer, RobertaModel
from keras.preprocessing.sequence import pad_sequences
import torch
import torch.nn as nn
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error

In [None]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Number of CUDA devices visible to this process (0 on a CPU-only runtime).
n_gpu = torch.cuda.device_count()
print("device: {} n_gpu: {}".format(device, n_gpu))

device: cuda n_gpu: 1


In [None]:
# Configure the root logger: INFO level, timestamped records.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
)

# Logger for this notebook; printed so the effective level is visible.
logger = logging.getLogger(__name__)
print(logger)

<Logger __main__ (INFO)>


# Data Processing

Function to tokenize the input dataframe (columns `question`, `answer`, `sia_score`) and return a PyTorch DataLoader over a TensorDataset holding the corresponding input IDs, attention masks and labels.

In [None]:
def create_dataloader(tokenizer, df, batch_size=8, max_length=512, shuffle=True):
    """Tokenize question/answer pairs and wrap them, with their scores, in a DataLoader.

    Args:
        tokenizer: Tokenizer object exposing ``encode_plus(text, text_pair, ...)``.
        df: DataFrame with ``question``, ``answer`` and ``sia_score`` columns.
        batch_size: Number of examples per batch (default 8, the previous
            hard-coded value).
        max_length: Length every encoded pair is padded/truncated to
            (default 512, the previous hard-coded value).
        shuffle: When True (default, previous behaviour) batches are drawn
            with a RandomSampler; when False rows are served in order.

    Returns:
        A DataLoader yielding ``(input_ids, attention_mask, labels)`` batches.
        The tensors are left on the CPU; the consumer is expected to move each
        batch to the compute device, so the full dataset never occupies GPU
        memory.
    """
    print("Shape: {}".format(df.shape))

    input_ids = list()
    attention_masks = list()

    for question, answer in zip(df.question, df.answer):
        # Pass the two texts as a pair so the tokenizer inserts its own
        # separator tokens. A literal " [SEP] " is not a special token for
        # every tokenizer and would then be encoded as ordinary text.
        # str() on both sides keeps missing/non-string cells from crashing.
        encoded_text = tokenizer.encode_plus(
            str(question), str(answer),
            max_length=max_length, add_special_tokens=True,
            return_token_type_ids=False, padding='max_length',
            return_attention_mask=True, truncation=True)
        input_ids.append(encoded_text['input_ids'])
        attention_masks.append(encoded_text['attention_mask'])

    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)
    gold_labels = torch.tensor(df.sia_score.tolist())

    data = TensorDataset(inputs, masks, gold_labels)
    sampler = RandomSampler(data) if shuffle else SequentialSampler(data)
    dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size)

    return dataloader

In [None]:
# Folder on the mounted Drive that holds the three SIA splits.
load_data = '/content/drive/My Drive/man_mihir_project/data/sia_data'

# Read the train / dev / test CSVs, in that order, into DataFrames.
train_df, dev_df, test_df = [
    pd.read_csv(join(load_data, csv_name))
    for csv_name in ('train_sia_data.csv', 'dev_sia_data.csv', 'test_sia_data.csv')
]

Loading the pretrained 'roberta-base' tokenizer and creating dataloaders for the train, dev and test dataframes.

In [None]:
#Dataloaders
# Pretrained tokenizer shared by all three splits.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Build one dataloader per split, in train / dev / test order.
train_dataloader, dev_dataloader, test_dataloader = (
    create_dataloader(tokenizer, split_df)
    for split_df in (train_df, dev_df, test_df)
)

Shape: (51012, 3)
Shape: (6111, 3)
Shape: (6312, 3)


# Model

Model: 'roberta-base' with a Linear layer on top to generate SIA scores.

In [None]:
#Class for Regression
class Regressor(nn.Module):
  """Pretrained 'roberta-base' encoder with a one-unit linear layer on its pooled output."""

  def __init__(self):
    super(Regressor, self).__init__()
    self.bert = RobertaModel.from_pretrained('roberta-base')
    self.out = nn.Linear(self.bert.config.hidden_size, 1)

  def forward(self, input_ids, attention_mask):
    """Return the linear layer's score for each row of the batch.

    Args:
      input_ids: Token id tensor passed to the encoder as ``input_ids``.
      attention_mask: Mask tensor passed to the encoder as ``attention_mask``.

    Returns:
      The output of ``self.out`` applied to the encoder's pooled output
      (one value per example, trailing dimension of size 1).
    """
    encoder_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    # Select the pooled output by position instead of tuple-unpacking the
    # result: a dict-like encoder output iterates over its *keys*, so
    # `a, b = outputs` would bind strings. Integer indexing works for both
    # the plain-tuple and the dict-like return types.
    pooler_out = encoder_outputs[1]
    score = self.out(pooler_out)
    return score
    

Model Initialization with:

* 10 epochs

* 'AdamW' optimizer

* Mean Squared Error (MSE) Loss function

In [None]:
# Model initialization

# Number of passes over the training data.
epochs = 10

# Build the regressor and place it on the selected device
# (nn.Module.to returns the module itself, so this is a single statement).
model = Regressor().to(device)

# Optimizer over every model parameter.
# NOTE(review): the AdamW class exported by `transformers` is reported as
# deprecated upstream in favour of torch.optim.AdamW - confirm against the
# installed version before re-running.
optimizer = AdamW(model.parameters(), lr=2e-5)

# Regression criterion: mean squared error.
mse_loss = nn.MSELoss().to(device)

# Training and Evaluation

Creating output directory:

model path: '/content/drive/My Drive/man_mihir_project/sia_experiment/model'

result path: '/content/drive/My Drive/man_mihir_project/sia_experiment/results'

In [None]:
# Where the fine-tuned weights and the per-epoch result files are written.
output_dir = '/content/drive/My Drive/man_mihir_project/sia_experiment/model'
output_result = '/content/drive/My Drive/man_mihir_project/sia_experiment/results'

# exist_ok=True makes the cell idempotent and removes the gap between the
# existence check and the creation that the check-then-create form had.
for directory in (output_dir, output_result):
  os.makedirs(directory, exist_ok=True)

Model training followed by model evaluation

In [None]:
def _regression_metrics(true_labels, predicted_labels):
  """Return ``(PCC, RMSE)`` for the two label sequences as plain Python floats."""
  pcc = pearsonr(true_labels, predicted_labels)[0]
  # RMSE is taken as sqrt(MSE) explicitly: the `squared=False` keyword of
  # mean_squared_error was deprecated and later removed from scikit-learn.
  rmse = np.sqrt(mean_squared_error(true_labels, predicted_labels))
  return float(pcc), float(rmse)


def _evaluate(dataloader):
  """Run the model over `dataloader` without gradients.

  Returns:
    ``(mean MSE loss per batch, PCC, RMSE)`` computed over every example.
  """
  model.eval()
  total_loss, steps = 0.0, 0
  true_labels, predicted_labels = list(), list()

  with torch.no_grad():
    for batch in dataloader:
      ip_ids, masks, gold_labels = tuple(t.to(device) for t in batch)
      score = model(ip_ids, attention_mask=masks).squeeze(1)
      total_loss += mse_loss(score, gold_labels.float()).item()
      steps += 1

      true_labels.extend(gold_labels.cpu().numpy())
      predicted_labels.extend(score.cpu().numpy())

  pcc, rmse = _regression_metrics(true_labels, predicted_labels)
  return total_loss / steps, pcc, rmse


# Lowest validation loss seen so far; None until the first epoch has been scored.
best_val_loss = None

for iteration in tnrange(epochs, desc='Epochs'):
  model.train()
  logger.info("Running for iteration: {}".format(iteration+1))

  training_loss, training_steps = 0, 0
  true_labels, predicted_labels = list(), list()

  for step, batch in enumerate(train_dataloader):
    batch = tuple(t.to(device) for t in batch)
    ip_ids, masks, gold_labels = batch
    # Clear gradients before the forward pass so nothing left over from an
    # interrupted earlier run of this cell leaks into the first update.
    optimizer.zero_grad()
    score = model(ip_ids, attention_mask=masks)
    score = score.squeeze(1)
    loss = mse_loss(score, gold_labels.float())
    loss.backward()
    optimizer.step()
    training_loss += loss.item()
    training_steps += 1

    true_labels.extend(gold_labels.cpu().numpy())
    predicted_labels.extend(score.detach().cpu().numpy())

  training_loss_for_epoch = training_loss/training_steps
  train_pcc, train_rmse = _regression_metrics(true_labels, predicted_labels)
  result = {'loss': training_loss_for_epoch, 'PCC': train_pcc, 'RMSE': train_rmse}
  print(result)

  #Validation
  print("Running validation for epoch: {}".format(iteration+1))
  val_loss_for_epoch, val_pcc, val_rmse = _evaluate(dev_dataloader)
  # RMSE is kept as a string in the dev/test reports so their shape matches
  # the result files written by earlier runs of this notebook.
  val_report = {'loss': val_loss_for_epoch, 'PCC': val_pcc, 'RMSE': str(val_rmse)}
  print(val_report)

  # Keep only the checkpoint with the lowest validation loss instead of
  # overwriting it every epoch, so later (over-fitted) epochs cannot replace
  # a better model on disk.
  if best_val_loss is None or val_loss_for_epoch < best_val_loss:
    best_val_loss = val_loss_for_epoch
    model_to_save = model.bert.module if hasattr(model.bert, 'module') else model.bert
    model_to_save.save_pretrained(output_dir)
    torch.save(model.out.state_dict(), join(output_dir, 'model_state.bin'))
    logger.info("Saved checkpoint for epoch {} (validation loss {:.4f})".format(iteration+1, val_loss_for_epoch))

  #Testing
  print("Running evaluation for epoch: {}".format(iteration+1))
  _, test_pcc, test_rmse = _evaluate(test_dataloader)
  test_report = {'PCC': test_pcc, 'RMSE': str(test_rmse)}
  print(test_report)

  with open(join(output_result, 'result_'+str(iteration+1)+'.json'), 'w') as fp:
    json.dump(test_report, fp)

HBox(children=(FloatProgress(value=0.0, description='Epochs', max=10.0, style=ProgressStyle(description_width=…

11/20/2020 00:52:08 - INFO - __main__ -   Running for iteration: 1


{'loss': 0.5266859620644233, 'PCC': 0.6040618884444051, 'RMSE': 0.7257527}
Running validation for epoch: 1
{'loss': 0.44674821646094165, 'PCC': 0.5872597255492085, 'RMSE': '0.6684408'}
Running evaluation for epoch: 1


11/20/2020 01:21:50 - INFO - __main__ -   Running for iteration: 2


{'PCC': 0.592949521688731, 'RMSE': '0.68295664'}
{'loss': 0.3708119323374626, 'PCC': 0.7429421213960177, 'RMSE': 0.6089458}
Running validation for epoch: 2
{'loss': 0.42568886601878086, 'PCC': 0.5913124039912669, 'RMSE': '0.65247697'}
Running evaluation for epoch: 2


11/20/2020 01:51:31 - INFO - __main__ -   Running for iteration: 3


{'PCC': 0.6126418709092778, 'RMSE': '0.6523529'}
{'loss': 0.30547364057118087, 'PCC': 0.7942870213236641, 'RMSE': 0.55265903}
Running validation for epoch: 3
{'loss': 0.4825591450625377, 'PCC': 0.5832100631178629, 'RMSE': '0.69468254'}
Running evaluation for epoch: 3


11/20/2020 02:21:13 - INFO - __main__ -   Running for iteration: 4


{'PCC': 0.5912090434949465, 'RMSE': '0.7045721'}
{'loss': 0.2626739875671245, 'PCC': 0.8261565396289542, 'RMSE': 0.51253074}
Running validation for epoch: 4
{'loss': 0.4720981094488845, 'PCC': 0.5765449539810705, 'RMSE': '0.68713367'}
Running evaluation for epoch: 4


11/20/2020 02:50:52 - INFO - __main__ -   Running for iteration: 5


{'PCC': 0.5735503591775546, 'RMSE': '0.71010953'}
{'loss': 0.22121057712825087, 'PCC': 0.8559527235805487, 'RMSE': 0.47032884}
Running validation for epoch: 5
{'loss': 0.44745645154735647, 'PCC': 0.5886052882870334, 'RMSE': '0.66895795'}
Running evaluation for epoch: 5


11/20/2020 03:20:33 - INFO - __main__ -   Running for iteration: 6


{'PCC': 0.5913106390632916, 'RMSE': '0.6895179'}
{'loss': 0.19220350798614053, 'PCC': 0.8761858711654654, 'RMSE': 0.4384154}
Running validation for epoch: 6
{'loss': 0.507854274976316, 'PCC': 0.5794808926938192, 'RMSE': '0.7126828'}
Running evaluation for epoch: 6


11/20/2020 03:50:13 - INFO - __main__ -   Running for iteration: 7


{'PCC': 0.5699463893014216, 'RMSE': '0.7450327'}
{'loss': 0.164368638072929, 'PCC': 0.895173403431865, 'RMSE': 0.4054337}
Running validation for epoch: 7
{'loss': 0.5093861876874772, 'PCC': 0.5819130517244827, 'RMSE': '0.7137616'}
Running evaluation for epoch: 7


11/20/2020 04:19:55 - INFO - __main__ -   Running for iteration: 8


{'PCC': 0.5803771199860133, 'RMSE': '0.7331443'}
{'loss': 0.14269512340756357, 'PCC': 0.9096889489739736, 'RMSE': 0.3777525}
Running validation for epoch: 8
{'loss': 0.5032455213367939, 'PCC': 0.5778445675264673, 'RMSE': '0.70934534'}
Running evaluation for epoch: 8


11/20/2020 04:49:36 - INFO - __main__ -   Running for iteration: 9


{'PCC': 0.5772541905671027, 'RMSE': '0.73568857'}
{'loss': 0.13165219620544763, 'PCC': 0.916998482720494, 'RMSE': 0.36283407}
Running validation for epoch: 9
{'loss': 0.5117116411660276, 'PCC': 0.5725776146692011, 'RMSE': '0.7153465'}
Running evaluation for epoch: 9


11/20/2020 05:19:20 - INFO - __main__ -   Running for iteration: 10


{'PCC': 0.5729123339037223, 'RMSE': '0.7356315'}
{'loss': 0.11269743514643164, 'PCC': 0.9294081509450887, 'RMSE': 0.33569396}
Running validation for epoch: 10
{'loss': 0.5233996129224164, 'PCC': 0.5790363937733175, 'RMSE': '0.7234673'}
Running evaluation for epoch: 10
{'PCC': 0.582014837780855, 'RMSE': '0.7417391'}

