# **Install and Load all the Necessary Libraries**

In [None]:
# Install transformers and unidecode into the notebook runtime.
# The versions are not pinned; the recorded output of this cell resolved
# transformers 3.4.0 (with tokenizers 0.9.2 and sacremoses 0.0.43).
!pip3 install transformers
!pip3 install unidecode

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/2c/4e/4f1ede0fd7a36278844a277f8d53c21f88f37f3754abf76a5d6224f76d4a/transformers-3.4.0-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 10.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 57.5MB/s 
[?25hCollecting tokenizers==0.9.2
[?25l  Downloading https://files.pythonhosted.org/packages/7c/a5/78be1a55b2ac8d6a956f0a211d372726e2b1dd2666bb537fea9b03abd62c/tokenizers-0.9.2-cp36-cp36m-manylinux1_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 59.9MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893257 sha256=b651edecd8e1af1

In [None]:
import numpy as np
import pandas as pd
import os
from os import listdir
from os.path import join
import unidecode
import re
import logging
from tqdm.notebook import tnrange
import glob
import json

# For plotting results
import matplotlib.pyplot as plt

# DL Libraries
from transformers import BertModel, AdamW, BertTokenizer,RobertaTokenizer, BertConfig, get_linear_schedule_with_warmup,RobertaModel
from keras.preprocessing.sequence import pad_sequences
import torch
import torch.nn as nn
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,TensorDataset)
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr

#NLTK Libraries
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

# **Path to the Directory of Model and Dataset**

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the model and dataset paths used
# later in this notebook all live under that mount point.
drive.mount('/content/drive')

Mounted at /content/drive


# **Tokenizing the input Data**

In [None]:
def create_dataloader(tokenizer, df, max_length=128, batch_size=4, shuffle=False):
    """Tokenize the (Query, Sentence) pairs of ``df`` and wrap them in a DataLoader.

    Every row is rendered as ``"[CLS] <Query> [SEP] <Sentence> [SEP]"`` and
    encoded with ``tokenizer.encode_plus``, padded/truncated to ``max_length``.

    Args:
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        df: DataFrame with ``Query`` and ``Sentence`` columns.
        max_length: Padded/truncated sequence length (default 128).
        batch_size: Number of rows per batch (default 4).
        shuffle: When False (default) batches follow the row order of ``df``,
            so per-row outputs can be written straight back as a column.
            Pass True to draw batches with a ``RandomSampler`` instead.

    Returns:
        DataLoader yielding ``(input_ids, attention_mask)`` tensor batches
        placed on the module-level ``device``.
    """
    print("Shape: {}".format(df.shape))

    # str() on both columns so NaN / numeric cells do not break concatenation.
    # NOTE(review): the [CLS]/[SEP] markers are written into the text while
    # add_special_tokens=True is also passed, so the tokenizer may add a second
    # set. Left unchanged because the checkpoint was presumably trained on this
    # exact format -- confirm before altering.
    pair_texts = [
        "[CLS] " + str(query) + " [SEP] " + str(sentence) + " [SEP]"
        for query, sentence in zip(df.Query, df.Sentence)
    ]

    input_ids = []
    attention_masks = []
    for text in pair_texts:
        encoded_text = tokenizer.encode_plus(
            text,
            max_length=max_length,
            add_special_tokens=True,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            truncation=True,
        )
        input_ids.append(encoded_text['input_ids'])
        attention_masks.append(encoded_text['attention_mask'])

    # Explicit integer dtype keeps an empty frame from producing float tensors.
    inputs = torch.tensor(input_ids, dtype=torch.long).to(device)
    masks = torch.tensor(attention_masks, dtype=torch.long).to(device)

    data = TensorDataset(inputs, masks)

    # Sequential by default: a random sampler would permute the rows, and any
    # scores collected batch-by-batch would no longer line up with df.
    sampler = RandomSampler(data) if shuffle else SequentialSampler(data)

    # The full id/mask lists are intentionally not printed: for a large frame
    # that floods the notebook output channel.
    return DataLoader(data, sampler=sampler, batch_size=batch_size)

## **MODEL SETUP**

In [None]:
#Class for Regression
class Regressor(nn.Module):
  """BERT encoder followed by one linear unit that emits a single score per input."""

  def __init__(self,  model_path):
    """Build the encoder and the regression head.

    Args:
      model_path: Name or directory handed to ``BertModel.from_pretrained``.
    """
    super().__init__()
    self.bert = BertModel.from_pretrained(model_path)
    # One output unit: the model predicts a scalar per sequence.
    self.out = nn.Linear(self.bert.config.hidden_size, 1)

  def forward(self, input_ids, attention_mask):
    """Return the score tensor produced from the encoder's second output.

    Args:
      input_ids: Token id tensor passed to the encoder.
      attention_mask: Mask tensor passed to the encoder.

    Returns:
      Output of the linear head applied to the encoder's pooled output.
    """
    encoder_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    # Index rather than tuple-unpack: position 1 is the pooled output both when
    # the encoder returns a plain tuple and when it returns a dict-like
    # ModelOutput (unpacking the latter would yield its key strings).
    pooler_out = encoder_outputs[1]
    return self.out(pooler_out)

GPU Initialization Section

In [None]:
# Count the visible GPUs, then run on CUDA when it is usable and on the CPU otherwise.
n_gpu = torch.cuda.device_count()
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("device: {} n_gpu: {}".format(device, n_gpu))

device: cuda n_gpu: 1


In [None]:
# Root logger: timestamped INFO-level records; `logger` is this module's handle.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
)
logger = logging.getLogger(__name__)
print(logger)

<Logger __main__ (INFO)>


## **Load the Model and the Tokenizer**

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

model_path  = '/content/drive/My Drive/pretraining/model'
# The encoder weights come from model_path; only the regression head is
# restored from the separate model_state.bin file below.
model= Regressor(model_path)
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime too.
weights_score = torch.load(join(model_path,'model_state.bin'), map_location=device)
model.out.load_state_dict(weights_score)
model.to(device)
# This notebook only runs inference: switch off dropout so scores are
# deterministic. eval() returns the module, so the cell still displays it.
model.eval()


10/24/2020 03:52:46 - INFO - filelock -   Lock 139853576403544 acquired on /root/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…

10/24/2020 03:52:46 - INFO - filelock -   Lock 139853576403544 released on /root/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084.lock





Regressor(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  

# **SIA Score Generator**

In [None]:
def sia_score_generator(df):
  """Score every row of ``df`` with the notebook-level ``model``.

  The frame is tokenized through ``create_dataloader`` (using the global
  ``tokenizer``) and pushed through ``model`` batch by batch on ``device``.

  Args:
    df: DataFrame accepted by ``create_dataloader``.

  Returns:
    List with one numpy scalar per row of ``df``, in row order, so it can be
    assigned directly as a new column.
  """
  base_loader = create_dataloader(tokenizer, df)
  # The caller assigns the scores positionally, so iterate the dataset in its
  # original order regardless of which sampler the helper attached.
  dataset = base_loader.dataset
  ordered_loader = DataLoader(dataset, sampler=SequentialSampler(dataset),
                              batch_size=base_loader.batch_size)

  sia_score = []
  was_training = model.training
  # Inference only: disable dropout so repeated runs give the same scores.
  model.eval()
  try:
    # No autograd graph is needed for scoring; skipping it saves memory.
    with torch.no_grad():
      for ip_ids, masks in ordered_loader:
        ip_ids = ip_ids.to(device)
        masks = masks.to(device)
        score = model(ip_ids, attention_mask=masks).squeeze(1)
        sia_score.extend(score.cpu().numpy())
  finally:
    # Leave the model in whatever mode the caller had it in.
    model.train(was_training)
  return sia_score

# **Loading the Data**



In [None]:
# Load the preprocessed QASC data from Drive: the training pairs and the dev set.
qasc_train=pd.read_csv('/content/drive/My Drive/QASC-DATASET/data/QASC_Dataset/train_factc_ans.csv')
qasc_dev=pd.read_csv('/content/drive/My Drive/QASC-DATASET/data/QASC_Dataset/dev_df.csv')

**Generate the SIA Scores**

In [None]:
# Score every row of the training frame and store the result as a new column.
# NOTE(review): the assignment is positional, so it is only meaningful if
# sia_score_generator returns the scores in the row order of qasc_train.
sia_score = sia_score_generator(qasc_train)
qasc_train['score'] = sia_score

Shape: (89474, 2)


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# Preview the first 22 rows together with the newly added score column.
qasc_train.head(22)

Unnamed: 0,Query,Sentence,score
0,Beads of water can be formed by clouds.,beads of water are formed by water vapor conde...,2.230989
1,Beads of water can be formed by clouds.,Clouds are made of water vapor.,3.353326
2,Beads of water can be formed by clouds.,Beads of water can be formed by clouds.,1.251877
3,Beads of water can be formed by clouds.,About Pearls Types of Pearls Akoya pearls are ...,2.013541
4,Beads of water can be formed by clouds.,Params stream is the stream reference of the s...,1.205536
5,Beads of water can be formed by clouds.,"Ch 4, shell over shell, shell over shell.",1.945948
6,Beads of water can be formed by clouds.,Diamonds are guaranteed to be diamonds.,1.494088
7,Beads of water can be formed by clouds.,Rain is rain.,1.997201
8,Beads of water can be formed by clouds.,"Trade beads, make beads, wear beads, share bea...",3.413424
9,Beads of water can be formed by clouds.,Ducks are Cool Ducks are Cool Ducks are cool.,1.861501


# **Export the Generated Scores to csv**

In [None]:
# Write the scored training frame to Drive; index=False keeps the DataFrame index out of the CSV.
qasc_train.to_csv("/content/drive/My Drive/QASC-DATASET/data/QASC_Dataset/Results/bert_train_factc_answer_result.csv",index=False)