# **Exploring Question Answering (QA) on Tweets using BERT and its variants**
---
### **CS533 - Final Project Submission**
### *By Shivali Singh, Judhajit Roy, Krishnan C.S.*
### Department of Computer Science, University of Illinois at Chicago







# IMPORT NECCESSARY LIBRARIES

In [1]:
!pip install simpletransformers
!pip install setuptools==59.5.0



In [2]:
import torch
import os
import random
import json 

from google.colab import drive
from setuptools import distutils
from simpletransformers.question_answering import QuestionAnsweringModel
import nltk
import copy
nltk.download('wordnet')
from nltk.corpus import wordnet 

#stop words list
stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 
			'ours', 'ourselves', 'you', 'your', 'yours', 
			'yourself', 'yourselves', 'he', 'him', 'his', 
			'himself', 'she', 'her', 'hers', 'herself', 
			'it', 'its', 'itself', 'they', 'them', 'their', 
			'theirs', 'themselves', 'what', 'which', 'who', 
			'whom', 'this', 'that', 'these', 'those', 'am', 
			'is', 'are', 'was', 'were', 'be', 'been', 'being', 
			'have', 'has', 'had', 'having', 'do', 'does', 'did',
			'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
			'because', 'as', 'until', 'while', 'of', 'at', 
			'by', 'for', 'with', 'about', 'against', 'between',
			'into', 'through', 'during', 'before', 'after', 
			'above', 'below', 'to', 'from', 'up', 'down', 'in',
			'out', 'on', 'off', 'over', 'under', 'again', 
			'further', 'then', 'once', 'here', 'there', 'when', 
			'where', 'why', 'how', 'all', 'any', 'both', 'each', 
			'few', 'more', 'most', 'other', 'some', 'such', 'no', 
			'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 
			'very', 's', 't', 'can', 'will', 'just', 'don', 
			'should', 'now', '']

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
!nvidia-smi

Sat May  7 00:04:59 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   60C    P8    11W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
# !pip install wandb
#import wandb
#wandb.init(project="test-project", entity="cs533_spring2022_jks")

# HELPER FUNCS


In [5]:
# these functions are heavily influenced by the metrics.py script from the SQuAD dataset website
def normalize_text(s):
    """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

In [6]:
#To compute f1 score using redefiend precision and recall
def compute_f1(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()
    
    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)
    
    common_tokens = set(pred_tokens) & set(truth_tokens)
    
    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0
    
    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)
    
    return 2 * (prec * rec) / (prec + rec)

In [7]:
def compute_final_scores(result, text ):
  f1_score = 0
  
  no_correct = result.get("correct")
  no_incorrect = result.get("incorrect")
  no_similar = result.get("similar")
  count_examples = no_correct + no_incorrect + no_similar

  correct_text = text.get("correct_text")
  similar_text = text.get("similar_text")
  incorrect_text = text.get('incorrect_text')

  

  #iterating over incorrect preds to computer f1
  for key, val in incorrect_text.items():
    truth = val.get("truth")
    predicted = val.get("predicted")
    f = compute_f1(predicted, truth)
    f1_score += f

  #iterating over similar preds to computer f1
  for key, val in similar_text.items():
    truth = val.get("truth")
    predicted = val.get("predicted")
    f = compute_f1(predicted, truth)
    f1_score += f

  #adding correct preds since for it f1=1
  for i in range(no_correct):
    f1_score += 1

  #Averaging and printing F1 and Exact scores 
  print("F1: " + str(f1_score/count_examples))
  print("Exact match: " + str(no_correct/count_examples))

# LOADING DATA

In [8]:
!wget https://sites.cs.ucsb.edu/~xwhan/datasets/tweetqa.zip

--2022-05-07 00:06:19--  https://sites.cs.ucsb.edu/~xwhan/datasets/tweetqa.zip
Resolving sites.cs.ucsb.edu (sites.cs.ucsb.edu)... 128.111.27.164
Connecting to sites.cs.ucsb.edu (sites.cs.ucsb.edu)|128.111.27.164|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1573980 (1.5M) [application/zip]
Saving to: ‘tweetqa.zip.3’


2022-05-07 00:06:21 (1.91 MB/s) - ‘tweetqa.zip.3’ saved [1573980/1573980]



In [9]:
!unzip /content/tweetqa.zip

Archive:  /content/tweetqa.zip
replace TweetQA_data/tweetqa_eval.py? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: TweetQA_data/tweetqa_eval.py  
  inflating: TweetQA_data/test.json  
  inflating: TweetQA_data/train.json  
  inflating: TweetQA_data/dev.json   


In [10]:
with open('/content/TweetQA_data/dev.json', 'r') as f:
    dev_file = json.load(f)

In [11]:
with open('/content/TweetQA_data/train.json', 'r') as f:
    data = json.load(f)

# PREPROCESSING DATA

In [12]:
# method to search answer start index in tweet
def answer_index_in_document(answer_list, document):

    for answer_string_in_doc in answer_list:
        index = document.lower().find(answer_string_in_doc)
        if index != -1:
            return answer_string_in_doc, index
    return answer_list[0], -1

In [13]:
#convert  dev data to squad format
dev_data = []
for content in dev_file:
  answer,ind = answer_index_in_document(content['Answer'], content['Tweet'])
  dev_data.append({'context':content['Tweet'].lower(),'qas':[{'id':content['qid'],'question':content['Question'],'answers':[{'text':answer,'answer_start':ind }]}]})


In [14]:
# convert train data to squad format
train_data = []
for content in data:
  answer,ind = answer_index_in_document(content['Answer'], content['Tweet'])
  train_data.append({'context':content['Tweet'].lower(),'qas':[{'id':content['qid'],'question':content['Question'],'answers':[{'text':answer,'answer_start':ind }]}]})


In [15]:
dev_data[0]

{'context': 'this forecast is deflated as much as new england patriots footballs! i apologize. w nj has the most to lose. dave curren (@davecurren) january 27, 2015',
 'qas': [{'id': 'f867d1c3361549952be5639ca433895f',
   'question': 'who has the most to lose',
   'answers': [{'text': 'w nj', 'answer_start': 82}]}]}

In [16]:
train_data[0]

{'context': 'our prayers are with the students, educators & families at independence high school & all the first responders on the scene. #patriotpride— doug ducey (@dougducey) february 12, 2016',
 'qas': [{'id': '0c871b7e5320d0816d5b2979d67c2649',
   'question': 'at which school were first responders on the scene for?',
   'answers': [{'text': 'independence high school', 'answer_start': 59}]}]}

In [17]:
tweetqa_as_squad_dev = {'data': train_data, 'version': '2.0'}

In [18]:
tweetqa_as_squad_train = {'data': train_data, 'version': '2.0'}

In [19]:
with open('tweetqa_as_squad_dev.json', 'w', encoding='utf-8') as outfile:
        json.dump(tweetqa_as_squad_dev, outfile, indent=2, sort_keys=True, ensure_ascii=False)

In [20]:
with open('tweetqa_as_squad_train.json', 'w', encoding='utf-8') as outfile:
        json.dump(tweetqa_as_squad_train, outfile, indent=2, sort_keys=True, ensure_ascii=False)

In [21]:
with open('/content/tweetqa_as_squad_dev.json', 'r') as f:
    dev = json.load(f)

dev = [topic for topic in dev['data']]

In [22]:
with open('/content/tweetqa_as_squad_train.json', 'r') as f:
    train = json.load(f)

train = [topic for topic in train['data']]

We had trained the models below using the same code and these models can be run by changing the model name in the below line. 
The models can be found at [Huggingface](https://huggingface.co/models)
```
model = QuestionAnsweringModel('model-type', 'model-name', args=train_args)
```

1. tinyBERT 
2. miniBERT 
3. ALBERT 
4. DistilBERT
5. RoBERTa
6. BERT

For the purpose of this submission, we are only showing the performance of the best model - BERT, BERT-SQUAD and BERT-SQUAD-SR


# VANILLA BERT MODEL

 Trying out pretrained vanilla bert base uncased model from huggingface

In [49]:
#initializing model
train_args = {
    'learning_rate': 3e-5,
    'num_train_epochs': 3,
    'max_seq_length': 128,
    'doc_stride': 64,
    'overwrite_output_dir': True,
    'reprocess_input_data': False,
    'train_batch_size': 2,
    'fp16': False,

}

vanilla_bert_model = QuestionAnsweringModel('bert', 'bert-base-uncased', args=train_args)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

In [50]:
#training model on train data
vanilla_bert_model.train_model(train)

convert squad examples to features:   0%|          | 0/10692 [00:00<?, ?it/s]Could not find answer: '' vs. 'instagram'
Could not find answer: '' vs. '5:09 pm- jul 22, 2016.'
Could not find answer: '' vs. 'eric'
Could not find answer: '' vs. 'tax it'
Could not find answer: '' vs. 'greg schiano.'
Could not find answer: '' vs. 'steal from us'
Could not find answer: '' vs. '15 years old'
Could not find answer: '' vs. 'sony pictures'
Could not find answer: '' vs. 'suarez and the world cup'
Could not find answer: '' vs. 'bill cosby'
Could not find answer: '' vs. 'sad and ashamed'
Could not find answer: '' vs. 'fake pics'
Could not find answer: '' vs. 'they could wanna be quarantined.'
Could not find answer: '' vs. 'donald trump's response to question'
Could not find answer: '' vs. 'victoria secret fashion show'
Could not find answer: '' vs. 'ramsey nouah'
Could not find answer: '' vs. 'tweet does not exist'
Could not find answer: '' vs. 'think big and aim high.'
Could not find answer: '' vs.

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/3076 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/3076 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/3076 [00:00<?, ?it/s]

(9228, 0.8660785325463545)

In [51]:
result, text = vanilla_bert_model.eval_model(dev)
compute_final_scores(result, text)

convert squad examples to features: 100%|██████████| 10692/10692 [00:10<00:00, 1020.67it/s]
add example index and unique id: 100%|██████████| 10692/10692 [00:00<00:00, 828493.01it/s]


Running Evaluation:   0%|          | 0/1337 [00:00<?, ?it/s]

F1: 0.6956293145235672
Exact match: 0.5446513395401862


# BERT SQUAD MODEL

In [32]:
train_args = {
    'learning_rate': 3e-5,
    'num_train_epochs': 3,
    'max_seq_length': 128,
    'doc_stride': 64,
    'overwrite_output_dir': True,
    'reprocess_input_data': False,
    'train_batch_size': 2,
    'fp16': False
}

bert_squad_model = QuestionAnsweringModel('bert', 'deepset/bert-base-uncased-squad2', args=train_args)

Downloading:   0%|          | 0.00/465 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

In [33]:
# training using train data
bert_squad_model.train_model(train)

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/3076 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/3076 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/3076 [00:00<?, ?it/s]

(9228, 0.6162040902598768)

In [34]:
#eval model on dev data
result, text = bert_squad_model.eval_model(dev)
compute_final_scores(result, text)

Running Evaluation:   0%|          | 0/1337 [00:00<?, ?it/s]

F1: 0.7107744412307279
Exact match: 0.5481664449933498


# HYPER PARAMETER TUNING

In [None]:
!pip install wandb
import wandb
wandb.init(project="test-project", entity="cs533_spring2022_jks")

In [43]:
sweep_config = {
    'method': 'grid', #grid, random
    'metric': {
      'name': 'eval_loss',
      'goal': 'minimize'   
    },
    'parameters': {
   
        'learning_rate': {
            'values': [ 5e-5, 3e-5, 2e-5]
        },
        'epochs':{
            'values':[2,3]
        }

    }
}
sweep_defaults = {
            'learning_rate': 5e-5,
       
        'epochs':2
}

sweep_id = wandb.sweep(sweep_config,project="bert hyperparameter")

Create sweep with ID: qd3jkzpq
Sweep URL: https://wandb.ai/cs533_spring2022_jks/bert%20hyperparameter/sweeps/qd3jkzpq


In [None]:
def trainQA():
  from simpletransformers.question_answering import QuestionAnsweringModel
  wandb.init()
  train_args = {
      'learning_rate': wandb.config.learning_rate,
      'num_train_epochs': 2,
      'max_seq_length': 128,
      'doc_stride': 64,
      'overwrite_output_dir': True,
      'reprocess_input_data': False,
      'train_batch_size': 2,
      'fp16': False,
      'wandb_project': "latest"
  }

  modelbert = QuestionAnsweringModel('bert', 'twmkn9/bert-base-uncased-squad2', args=train_args)
  modelbert.train_model(train_data)

wandb.agent(sweep_id,function=trainQA)

[34m[1mwandb[0m: Agent Starting Run: len4hulb with config:
[34m[1mwandb[0m: 	epochs: 2
[34m[1mwandb[0m: 	learning_rate: 5e-05


Downloading:   0%|          | 0.00/465 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.0 [00:00<?, ?B/s]


convert squad examples to features:   0%|          | 0/10692 [00:00<?, ?it/s]Could not find answer: '' vs. 'instagram'
Could not find answer: '' vs. '5:09 pm- jul 22, 2016.'
Could not find answer: '' vs. 'eric'
Could not find answer: '' vs. 'tax it'
Could not find answer: '' vs. 'greg schiano.'
Could not find answer: '' vs. 'steal from us'
Could not find answer: '' vs. '15 years old'
Could not find answer: '' vs. 'sony pictures'
Could not find answer: '' vs. 'suarez and the world cup'
Could not find answer: '' vs. 'victoria secret fashion show'
Could not find answer: '' vs. 'tweet does not exist'
Could not find answer: '' vs. 'donald trump'
Could not find answer: '' vs. 'on a comet'
Could not find answer: '' vs. 'the diversity of our country.'
Could not find answer: '' vs. 'spirt day for glaad'
Could not find answer: '' vs. 'reports of this person being aggressive.'
Could not find answer: '' vs. 'investigators.'
Could not find answer: '' vs. 'death and destruction'
Could not find ans

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

Running Epoch 0 of 2:   0%|          | 0/3076 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/3076 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Training loss,▃▄▂▅▁▁▁▁▁▆▁▆▁▂▂▂▁▃▁▁▁▁▁▄▁▂▁▁█▁▁▁▁▁▁▁▄▁▂▁
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
lr,▂▅████▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁

0,1
Training loss,0.00045
global_step,6150.0
lr,0.0


[34m[1mwandb[0m: Agent Starting Run: j1xd2bed with config:
[34m[1mwandb[0m: 	epochs: 2
[34m[1mwandb[0m: 	learning_rate: 3e-05
[34m[1mwandb[0m: Currently logged in as: [33mjroy13n96[0m ([33mcs533_spring2022_jks[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

Running Epoch 0 of 2:   0%|          | 0/3076 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/3076 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Training loss,▇▃▄▂▁▁▄▇▃▁▁▃▁▄▁▄▃▁▃▁▁▁▁▁▂▁▁▁▁▂▁▂▁▂▁█▁▁▁▂
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
lr,▂▅████▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁

0,1
Training loss,0.57297
global_step,6150.0
lr,0.0


[34m[1mwandb[0m: Agent Starting Run: wbt3pr8s with config:
[34m[1mwandb[0m: 	epochs: 2
[34m[1mwandb[0m: 	learning_rate: 2e-05


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

Running Epoch 0 of 2:   0%|          | 0/3076 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/3076 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Training loss,▇▁▁█▄▂▂▃▂▁▂▄▂▂▁▅▁▂▂▆▁▁▄▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁█▁
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
lr,▂▅████▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁

0,1
Training loss,0.00367
global_step,6150.0
lr,0.0


[34m[1mwandb[0m: Agent Starting Run: uvoqkt8i with config:
[34m[1mwandb[0m: 	epochs: 3
[34m[1mwandb[0m: 	learning_rate: 5e-05


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

Running Epoch 0 of 2:   0%|          | 0/3076 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/3076 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Training loss,▁▄▁▂▃▄▂▂▃▃▁█▁▂▃▄▄▂▁▁▂▁▃▁▁▁▁▁▁▃▁▁▁▂▁▁▁▂▁▁
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
lr,▂▅████▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁

0,1
Training loss,1e-05
global_step,6150.0
lr,0.0


[34m[1mwandb[0m: Agent Starting Run: anfa97ki with config:
[34m[1mwandb[0m: 	epochs: 3
[34m[1mwandb[0m: 	learning_rate: 3e-05


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

Running Epoch 0 of 2:   0%|          | 0/3076 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/3076 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Training loss,█▃▃▁▂▁▅▁▂▂▂▅▂▁▄▅▇▁▅▂▁▁▃▁▁▃▁▁▁▆▁▂▁▁▁▁▁▁▁▃
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
lr,▂▅████▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁

0,1
Training loss,0.65328
global_step,6150.0
lr,0.0


[34m[1mwandb[0m: Agent Starting Run: o3sv16se with config:
[34m[1mwandb[0m: 	epochs: 3
[34m[1mwandb[0m: 	learning_rate: 2e-05


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

Running Epoch 0 of 2:   0%|          | 0/3076 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/3076 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Training loss,▄█▃▂▁▁▂▁▃▂▁▃▁▄▁▃▁▁▂▁▁▁▁▁▁▁▁▁▅▁▁▁▂▅▁▃▁▃▁▁
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
lr,▂▅████▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁

0,1
Training loss,0.00275
global_step,6150.0
lr,0.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


# Augmenting Data with Easy Data Augmentation (Synonym Replacement)

In [26]:
# get synonyms for word from nltk wordnet
def get_synonyms(word):
	synonyms = set()
	for syn in wordnet.synsets(word): 
		for l in syn.lemmas(): 
			synonym = l.name().replace("_", " ").replace("-", " ").lower()
			synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm'])
			synonyms.add(synonym) 
	if word in synonyms:
		synonyms.remove(word)
	return list(synonyms)

# replaces random word in question with synonym
def synonym_replacement(words):
	new_words = words.copy()
	random_word_list = list(set([word for word in words if word not in stop_words and word not in answer.split()]))
	random.shuffle(random_word_list)
	num_replaced = 0
	for random_word in random_word_list:
		synonyms = get_synonyms(random_word)
		if len(synonyms) >= 1:
			synonym = random.choice(list(synonyms))
			new_words = [synonym if word == random_word else word for word in new_words]
			# print("replaced", random_word, "with", synonym)
			num_replaced += 1
		if num_replaced >= 1: #only replace up to n words
			break
	#this is stupid but we need it, trust me
	sentence = ' '.join(new_words)
  # new_words = sentence.split(' ')

	return sentence

# augment all the 5% of questions with synonym replacement
def tweetq_augment(problem,total,aug_per=0.05):
  aug = 0
  newproblem = []
  random.shuffle(problem)
  for prob in problem:
    # print(type(prob))
    newprob = copy.deepcopy(prob)
    # print(newprob['qas'][0]['question'].split())
    newprob['qas'][0]['question'] = synonym_replacement(prob['qas'][0]['question'].split())
    newprob['qas'][0]['id'] += 'aug'
    aug+=1
    newproblem.append(newprob)
    # print(prob['qas'][0]['question'])
    if aug > aug_per*total:
      break
  print(aug)  
  return newproblem     

In [27]:
tweetqa_as_squad_train_augmented = {'data': train_data + tweetq_augment(train_data,len(train_data)), 'version': '3.0'}
with open('tweetqa_as_squad_train_augmented.json', 'w', encoding='utf-8') as outfile:
        json.dump(tweetqa_as_squad_train_augmented, outfile, indent=2, sort_keys=True, ensure_ascii=False)

535


In [28]:
with open('/content/tweetqa_as_squad_train_augmented.json', 'r') as f:
    train_aug = json.load(f)

train_aug = [topic for topic in train_aug['data']]

# BERT SQUAD TRAINED ON AUGMENTED DATA

In [29]:
train_args = {
    'learning_rate': 3e-5,
    'num_train_epochs': 3,
    'max_seq_length': 128,
    'doc_stride': 64,
    'overwrite_output_dir': True,
    'reprocess_input_data': False,
    'train_batch_size': 2,
    'fp16': False
}

bert_squad_aug = QuestionAnsweringModel('bert', 'deepset/bert-base-uncased-squad2', args=train_args)

In [30]:
# training using train data
bert_squad_aug.train_model(train_aug)


convert squad examples to features:   0%|          | 0/11227 [00:00<?, ?it/s]Could not find answer: '' vs. 'of thai nationality.'
Could not find answer: '' vs. 'hilary clinton'
Could not find answer: '' vs. 'white.'
Could not find answer: '' vs. 'it won't happen.'
Could not find answer: '' vs. 'gov. rick snyder'
Could not find answer: '' vs. 'they are ministers?'
Could not find answer: '' vs. 'goodell must go'
Could not find answer: '' vs. 'associations support'
Could not find answer: '' vs. 'adega de tia matilde'
Could not find answer: '' vs. 'he was mugged in gateshead'
Could not find answer: '' vs. 'funny things.'
Could not find answer: '' vs. 'abby's southern friends.'
Could not find answer: '' vs. 'because she is part of lafc.'
Could not find answer: '' vs. 'free shrimp.'
Could not find answer: '' vs. 'rams organization and city of st. louis'
Could not find answer: '' vs. 'whoever holds this hammer'
Could not find answer: '' vs. 'senator scott brown'
Could not find answer: '' vs

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/3230 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/3230 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/3230 [00:00<?, ?it/s]

(9690, 0.585175969818734)

In [31]:
#eval model on dev data
result, text = bert_squad_aug.eval_model(dev)
compute_final_scores(result, text)

Running Evaluation:   0%|          | 0/1337 [00:00<?, ?it/s]

F1: 0.7127639336817904
Exact match: 0.5450313509405282
