# Question Answering

##Preparing data

### Importing libraries and data collection



In [2]:
import tensorflow as tf
import numpy as np
import json
import pandas as pd
import nltk
import os
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics.pairwise import cosine_similarity
import re
import math
import requests
import torch
import torch.nn as nn
import pickle
from collections import Counter
import random
import time
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.metrics import classification_report, confusion_matrix , accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [4]:
# How much of the data to use for training and testing
DATA_MAX = 5000
TEST_RATIO = .2

# Use pickled previously preprocessed corpus instead of preprocessing again to save time
use_pickle = True
pickle_file = 'train_squad.pickle'
train_pickle = 'https://drive.google.com/uc?export=download&id=19Bmeb6zVyKId5UJSw1fgGhiCH88GYwGv'


In [5]:
def download_data(urls):
  for url in urls:
    r = requests.get(url)
    file_name = url.split('/')[-1]
    print(f"Downloading {file_name}...")
    with open(file_name, 'wb') as fd:
      for chunk in r.iter_content():
        fd.write(chunk)

In [6]:
urls = ['https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json', 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json']
download_data(urls)

Downloading train-v2.0.json...
Downloading dev-v2.0.json...


In [7]:
train_set = json.load(open('train-v2.0.json'))['data']
valid_set = json.load(open('dev-v2.0.json'))['data']

###Data Exploration


In [8]:
print("Number of titles (titles hold several paragraphs, each with their own questions and answers): ", len(train_set))
print("Keys for the title and paragraphs:", train_set[0].keys())
print("Number of paragraphs: ", len(train_set[0]['paragraphs']))
print("Keys inside every paragraph: ", train_set[0]['paragraphs'][0].keys())
print("Number of questions in first paragraph: ", len(train_set[0]['paragraphs'][0]['qas']))
print("Keys for 'qas': ", train_set[0]['paragraphs'][0]['qas'][0].keys())
print("Keys in every answer:", train_set[0]['paragraphs'][0]['qas'][0]['answers'][0].keys())
print("Sample answer: ", train_set[0]['paragraphs'][0]['qas'][0]['answers'][0])
print("Sliced answer to be checked with the previous sample to see if answer_start describes the answer: ", train_set[0]['paragraphs'][0]['context'][269:301])

Number of titles (titles hold several paragraphs, each with their own questions and answers):  442
Keys for the title and paragraphs: dict_keys(['title', 'paragraphs'])
Number of paragraphs:  66
Keys inside every paragraph:  dict_keys(['qas', 'context'])
Number of questions in first paragraph:  15
Keys for 'qas':  dict_keys(['question', 'id', 'answers', 'is_impossible'])
Keys in every answer: dict_keys(['text', 'answer_start'])
Sample answer:  {'text': 'in the late 1990s', 'answer_start': 269}
Sliced answer to be checked with the previous sample to see if answer_start describes the answer:  in the late 1990s as lead singer


In [9]:
cols = ['qa_id', 'title', 'context', 'question', 'answers','answer_start', 'is_impossible'] # all column names

In [10]:
# Convert dataset from json format to df format
def json_list_to_df(dataset):
  qa_id_list = [] #question-answer id list
  title_list = []
  context_list = []
  question_list = []
  answers_list = []
  answer_start=[]
  is_impossible_list = [] #answer not provided
  for item in dataset:
    sum_qas = 0 #all question and answer pairs in dataset
    title = item['title']
    paragraphs = item['paragraphs']
    for paragraph in paragraphs:
      context = paragraph['context']
      qas = paragraph['qas']
      sum_qas += len(qas)
      for qa in qas:
        question = qa['question']
        qa_id = qa['id']
        answers = qa['answers'] # list
        answers_temp = []
        answers_temp_pos = []
        for answer in answers:
          answers_temp.append(answer['text']) # remove the start index
          answers_temp_pos.append(answer['answer_start']) # remove the start index
        is_impossible = qa['is_impossible']
        question_list.append(question)
        qa_id_list.append(qa_id)
        answers_list.append(answers_temp)
        answer_start.append(answers_temp_pos)
        is_impossible_list.append(is_impossible)
      for _ in range(len(qas)):
        context_list.append(context)
    for _ in range(sum_qas):
      title_list.append(title)
  df_dict = {'title': title_list,'context': context_list,'question':question_list,'qa_id':qa_id_list,'is_impossible':is_impossible_list,'answers':answers_list, 'answer_start':answer_start} #compile all columns and corresponding lists into a dict
  # Create df
  df = pd.DataFrame(df_dict)
  return df

In [11]:
df_train = json_list_to_df(train_set)
print(df_train.shape) #(questions, features)

(130319, 7)


###Data Cleaning

In [12]:
df_train.head(2)

Unnamed: 0,title,context,question,qa_id,is_impossible,answers,answer_start
0,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,56be85543aeaaa14008c9063,False,[in the late 1990s],[269]
1,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,56be85543aeaaa14008c9065,False,[singing and dancing],[207]


In [13]:
index_unanswered_train = df_train[df_train['is_impossible'] == True].index
df_train.drop(index_unanswered_train,inplace = True)

In [14]:
print(df_train.isnull().sum())

title            0
context          0
question         0
qa_id            0
is_impossible    0
answers          0
answer_start     0
dtype: int64


In [15]:
print(df_train['question'].duplicated().sum())
index_dup_train = df_train[df_train['question'].duplicated()].index
df_train.drop(index_dup_train,inplace = True)

52


###Data Pre-Processing

In [16]:
# this function takes in a string, and outputs the same string but processed according to the following steps:
# 1. Tokenized
# 2. Stop words removed
# 3. POS tagged
# 4. Lemmatized all words
# 5. Returned to string format
# some flags can be used to control the output of the function according to the user's liking
def process_corpus(text,unique = False,token_output = True):
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    tokens = nltk.word_tokenize(text)
    if unique:
       tokens = list(set(tokens))
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]
    tokens = nltk.pos_tag(tokens)
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token[0]) for token in tokens]
    if token_output:
      return tokens
    else:
      return ' '.join(tokens)

In [17]:
df_sample_train = df_train[:DATA_MAX]
df_sample_test = df_train[DATA_MAX:DATA_MAX+int(DATA_MAX*TEST_RATIO)]

if(use_pickle):
  # run this cell instead of pre-processing the whole corpus as that can take a while
  print(f"Downloading pickle file...")
  r = requests.get(train_pickle)
  with open(pickle_file, 'wb') as fd:
    for chunk in r.iter_content():
      fd.write(chunk)
  # Download pickle files and place each type in its list:
  data_pickle = open(pickle_file,'rb')
  train_pre_processed = pickle.load(data_pickle)
  paragraph_corpus = train_pre_processed['context'][:DATA_MAX]
  question_corpus = train_pre_processed['question'][:DATA_MAX]
  answer_corpus =  train_pre_processed['answers'][:DATA_MAX]
else:
  #optional: run these commands to pre-process the corpus manually.
  #NOTE: it will take WAY longer than to just download it pre-processed already and stored in a pickle file like shown above.
  paragraph_corpus = [process_corpus(paragraph) for paragraph in df_sample_train['context']]
  question_corpus  = [process_corpus(question) for question in df_sample_train['question']]
  answer_corpus = [process_corpus(answer[0]) for answer in df_sample_train['answers']]

Downloading pickle file...


In [18]:
corpus = paragraph_corpus + question_corpus + answer_corpus

In [19]:
len(corpus)

15000

In [20]:
df_train['answers'] = df_train['answers'].apply(lambda x: ','.join(map(str, x)))
df_train['answer_start'] = df_train['answer_start'].apply(lambda x: ','.join(map(str, x)))
df_train

Unnamed: 0,title,context,question,qa_id,is_impossible,answers,answer_start
0,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,56be85543aeaaa14008c9063,False,in the late 1990s,269
1,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,56be85543aeaaa14008c9065,False,singing and dancing,207
2,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,56be85543aeaaa14008c9066,False,2003,526
3,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,56bf6b0f3aeaaa14008c9601,False,"Houston, Texas",166
4,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,56bf6b0f3aeaaa14008c9602,False,late 1990s,276
...,...,...,...,...,...,...,...
130046,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",In what US state did Kathmandu first establish...,5735d259012e2f140011a09d,False,Oregon,229
130047,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",What was Yangon previously known as?,5735d259012e2f140011a09e,False,Rangoon,414
130048,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",With what Belorussian city does Kathmandu have...,5735d259012e2f140011a09f,False,Minsk,476
130049,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",In what year did Kathmandu create its initial ...,5735d259012e2f140011a0a0,False,1975,199


In [21]:
val_df = df_train.iloc[-10000:]
val_df

Unnamed: 0,title,context,question,qa_id,is_impossible,answers,answer_start
115243,Antibiotics,"With advances in medicinal chemistry, most mod...",What type of antibiotics include penicilin?,5733b6a2d058e614000b6123,False,beta-lactam antibiotics,159
115244,Antibiotics,"With advances in medicinal chemistry, most mod...",What are the type of antibiotics which are tak...,5733b6a2d058e614000b6124,False,aminoglycosides,365
115245,Antibiotics,"With advances in medicinal chemistry, most mod...","How are the slufonamides,quinolones, and oxazo...",5733b6a2d058e614000b6125,False,synthesis,513
115251,Antibiotics,The emergence of resistance of bacteria to ant...,What does emergence of resistance reflect?,5731c593e17f3d14004223c5,False,evolutionary processes,118
115252,Antibiotics,The emergence of resistance of bacteria to ant...,What is the purpose of antibiotic treatment?,5731c593e17f3d14004223c6,False,survive high doses of antibiotics,299
...,...,...,...,...,...,...,...
130046,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",In what US state did Kathmandu first establish...,5735d259012e2f140011a09d,False,Oregon,229
130047,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",What was Yangon previously known as?,5735d259012e2f140011a09e,False,Rangoon,414
130048,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",With what Belorussian city does Kathmandu have...,5735d259012e2f140011a09f,False,Minsk,476
130049,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",In what year did Kathmandu create its initial ...,5735d259012e2f140011a0a0,False,1975,199


In [22]:
train_df = df_train.iloc[:50000]

####Dataset Sampling

In [23]:
def print_squad_sample(df: pd.core.frame.DataFrame, line_length: int=20, separator_length: int=120) -> None:
  sample = df.iloc[random.randint(0,df.shape[0])]
  title = sample.title.replace('_', ' ')
  print('TITLE: ')
  print(title)
  print('='*separator_length)
  context = sample.context.split()
  print('CONTEXT: ')
  lines = [' '.join(context[idx:idx+line_length]) for idx in range(0, len(context), line_length)]
  for l in lines:
      print(l)
  print('='*separator_length)
  questions = df[df.context.values==sample.context]
  max_len = len(max(questions.question, key=len)) + 5
  print("{: <{max_len}} {: <{max_len}}".format('QUESTION:','ANSWER:', max_len=max_len))
  for idx, row in questions.iterrows():
    question = row.question
    answer = row['answers']
    print("{: <{max_len}} {: <{max_len}}".format(question,answer, max_len=max_len))

print_squad_sample(df_train)

TITLE: 
Northwestern University
CONTEXT: 
Like other American research universities, Northwestern was transformed by World War II. Franklyn B. Snyder led the university from 1939
to 1949, when nearly 50,000 military officers and personnel were trained on the Evanston and Chicago campuses. After the war,
surging enrollments under the G.I. Bill drove drastic expansion of both campuses. In 1948 prominent anthropologist Melville J. Herskovits founded
the Program of African Studies at Northwestern, the first center of its kind at an American academic institution. J. Roscoe
Miller's tenure as president from 1949 to 1970 was responsible for the expansion of the Evanston campus, with the construction
of the lakefill on Lake Michigan, growth of the faculty and new academic programs, as well as polarizing Vietnam-era student
protests. In 1978, the first and second Unabomber attacks occurred at Northwestern University. Relations between Evanston and Northwestern were strained
throughout much of t

##Built-in AlBert- Model 1

In [24]:
!pip install transformers



In [25]:
train_df=df_train

In [26]:
df_train

Unnamed: 0,title,context,question,qa_id,is_impossible,answers,answer_start
0,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,56be85543aeaaa14008c9063,False,in the late 1990s,269
1,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,56be85543aeaaa14008c9065,False,singing and dancing,207
2,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,56be85543aeaaa14008c9066,False,2003,526
3,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,56bf6b0f3aeaaa14008c9601,False,"Houston, Texas",166
4,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,56bf6b0f3aeaaa14008c9602,False,late 1990s,276
...,...,...,...,...,...,...,...
130046,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",In what US state did Kathmandu first establish...,5735d259012e2f140011a09d,False,Oregon,229
130047,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",What was Yangon previously known as?,5735d259012e2f140011a09e,False,Rangoon,414
130048,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",With what Belorussian city does Kathmandu have...,5735d259012e2f140011a09f,False,Minsk,476
130049,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",In what year did Kathmandu create its initial ...,5735d259012e2f140011a0a0,False,1975,199


##Tokenize data

In [27]:
def add_end_idx(answer, start_idx , context):
#     print(answer,start_idx,context)
    end_idx = int(start_idx) + len(answer)
    if context[int(start_idx):end_idx] == answer:
        return start_idx, end_idx
    else:
        for offset in [1,2]:
            if context[int(start__idx-offset):end_idx-offset] == answer:
                return start_idx-offset, end_idx-offset
train_df[['answer_start','answer_end']]= train_df[['answers','answer_start','context']]\
.apply(lambda x: add_end_idx(x[0],x[1],x[2]),axis=1,result_type="expand")
val_df[['answer_start','answer_end']]= train_df[['answers','answer_start','context']]\
.apply(lambda x: add_end_idx(x[0],x[1],x[2]),axis=1,result_type="expand")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df[['answer_start','answer_end']]= train_df[['answers','answer_start','context']]\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df[['answer_start','answer_end']]= train_df[['answers','answer_start','context']]\


In [28]:
train_contexts, train_questions = train_df['context'].tolist(),train_df['question'].tolist()
train_answers = [{'text': ans, 'answer_start':ans_start, 'answer_end':ans_end}
                 for ans,ans_start,ans_end in
                 zip(train_df['answers'].tolist(),train_df['answer_start'].tolist(),train_df['answer_end'].tolist())]

In [29]:
val_contexts, val_questions = val_df['context'].tolist(),val_df['question'].tolist()
val_answers = [{'text': ans, 'answer_start':ans_start, 'answer_end':ans_end}
                 for ans,ans_start,ans_end in
                 zip(val_df['answers'].tolist(),val_df['answer_start'].tolist(),val_df['answer_end'].tolist())]

In [30]:
val_answers[:5]

[{'text': 'beta-lactam antibiotics', 'answer_start': '159', 'answer_end': 182},
 {'text': 'aminoglycosides', 'answer_start': '365', 'answer_end': 380},
 {'text': 'synthesis', 'answer_start': '513', 'answer_end': 522},
 {'text': 'evolutionary processes', 'answer_start': '118', 'answer_end': 140},
 {'text': 'survive high doses of antibiotics',
  'answer_start': '299',
  'answer_end': 332}]

In [31]:

from transformers import AlbertTokenizerFast

tokenizer = AlbertTokenizerFast.from_pretrained('albert-base-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

In [32]:
train_encodings = tokenizer(train_contexts, train_questions, truncation = True, padding = True)

In [33]:
val_encodings = tokenizer(val_contexts, val_questions, truncation = True, padding = True)

In [34]:
train_encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [35]:
# torch.backends.cudnn.enabled
torch.cuda.is_available()
# torch.zeros(1).cuda()

True

In [36]:
def add_token_positions(encodings,answers):
    start_pos, end_pos = [], []
    for i in range(len(answers)):
        start_pos.append(encodings.char_to_token(i,int(answers[i]['answer_start'])))
        end_pos.append(encodings.char_to_token(i,int(answers[i]['answer_end'])))
        if start_pos[-1] is None:
            start_pos[-1] = tokenizer.model_max_length
        go_back = 1
        while end_pos[-1] is None:
            end_pos[-1] = encodings.char_to_token(i,answers[i]['answer_end']-go_back)
            go_back+=1

    encodings.update({
        'start_positions': start_pos,
        'end_positions': end_pos
    })

add_token_positions(train_encodings,train_answers)

In [37]:
add_token_positions(val_encodings,val_answers)

##Training data

In [38]:
# create a pytorch data object to load the data using pytorch dataloader later
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self,idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)


In [39]:
from transformers import AlbertForQuestionAnswering
model = AlbertForQuestionAnswering.from_pretrained('albert-base-v2')
from torch.utils.data import DataLoader
from transformers import AdamW
import torch


model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForQuestionAnswering were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
train_encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [41]:
model.eval()

AlbertForQuestionAnswering(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias

In [42]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
optim = AdamW(model.parameters(),lr=5e-5)

train_loader = DataLoader(train_dataset, batch_size = 16, shuffle = True)

for epoch in range(1):
    model.train()
    # to use tdm
    loop = tqdm(train_loader)
    for batch in loop:
        # initialize gradients to zero for each batch
        optim.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        outputs = model(input_ids,attention_mask = attention_mask,
                       start_positions= start_positions,
                       end_positions = end_positions)

        loss = outputs[0]
        loss.backward()
        optim.step()

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

Epoch 0: 100%|██████████| 5424/5424 [2:27:59<00:00,  1.64s/it, loss=0.468]


In [43]:
!nvidia-smi


Tue Feb  6 08:50:38 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   76C    P0              68W /  70W |  11235MiB / 15360MiB |     68%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

##Evaluating data-Accuracy, F1 and EM scores

In [44]:
model.eval()

AlbertForQuestionAnswering(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias

In [45]:
val_loader = DataLoader(val_dataset,batch_size = 8)
acc = []
predicted_answers = []
# to use tdm
loop = tqdm(val_loader)
for batch in loop:
    # stop pytorch from calculating any gradients
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)

        outputs = model(input_ids,attention_mask = attention_mask)
        start_preds = torch.argmax(outputs['start_logits'],dim=1)
        end_preds = torch.argmax(outputs['end_logits'],dim=1)
        acc.append(((start_preds == start_true).sum()/len(start_preds)).item())
        acc.append(((end_preds == end_true).sum()/len(end_preds)).item())
#         predicted_raw_answers = [ (start_idx,end_idx) for start_idx,end_idx in zip(start_preds,end_preds)]
#         predicted_answers += []
#         break

100%|██████████| 1250/1250 [06:14<00:00,  3.34it/s]


In [46]:
sum(acc)/len(acc)

0.76475