# **Assignment 3 for Computational Semantics**

**Topic**: SemEval 2020 Task 4 Commonsense validation, explanation and generation

**Member**: Sijie Ju

**Introduction**: The task is to test whether a model can differentiate natural language statements that make sense from those that do not. The task contains three subtasks. The following code is my solution to subtask A, for which I fine-tuned the whole BERT model.

### **Subtask A**: Commonsense verification

**Subtask A** is to decide, given two natural language statements with similar wording, which one makes sense and which one does not.

**Examples**

>Which statement of the two is against common sense?
>
>Statement 1: He put a turkey into the fridge. (correct, i.e. this one makes sense)
>
>Statement 2: He put an elephant into the fridge.

### **1. General preparation**

In [None]:
# INSTALL MISSING PACKAGES
from importlib.util import find_spec
import pip

# Install through `python -m pip` of the running interpreter: pip does not
# support being driven through its Python API (`pip.main`), and this form
# installs into the environment this kernel is actually using.
import subprocess
import sys

# Every third-party package imported by this notebook. tqdm is part of the
# list, so no separate shell install is needed.
required_packages = ['torch', 'pandas', 'datasets', 'transformers', 'tqdm',
                     'scipy', 'matplotlib', 'seaborn']

for package in required_packages:
  # find_spec returns None when the package cannot be imported
  if find_spec(package) is None:
    print(f'Installing package: {package}...')
    # check_call raises CalledProcessError if the installation fails
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])



In [None]:
# IMPORT PACKAGES
import pandas as pd
from pandas import DataFrame
import numpy as np
from scipy.spatial.distance import cosine
import matplotlib.pyplot as plt
import torch
from torch.utils.data import DataLoader, _utils, RandomSampler, SequentialSampler
import datasets
from datasets import Dataset, DatasetDict
from tqdm import tqdm
import time

import transformers
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, AdamW, BertConfig
from transformers import pipeline
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification

from transformers import TextClassificationPipeline
from transformers import TrainerCallback

# WordPiece tokenizer of the 'bert-base-uncased' checkpoint; used by
# encode_batch to turn sentence pairs into model inputs.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# No bare `BertModel` is loaded here: the model that is trained and evaluated
# is the `BertForSequenceClassification` created in section 3, which rebinds
# `model`. The encoder previously loaded at this point was never used, so it
# only cost download time and memory.

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cpu":
    print('No GPU available, using the CPU instead.')

### **2. Data processing**

In [None]:
# download the data
# The clone is skipped when the target directory already exists, so the cell
# can be re-run without git failing on a non-empty destination. check=True
# turns a failed clone into an exception instead of a silently ignored error.
import os
import subprocess

repo_dir = 'SemEval2020-Task4-Commonsense-Validation-and-Explanation'
if not os.path.isdir(repo_dir):
    subprocess.run(
        ['git', 'clone',
         'https://github.com/wangcunxiang/SemEval2020-Task4-Commonsense-Validation-and-Explanation.git'],
        check=True,
    )

fatal: destination path 'SemEval2020-Task4-Commonsense-Validation-and-Explanation' already exists and is not an empty directory.


In [None]:
# load the data

def read_data(text_path,answer_path):
    """Load one split: the sentence-pair table and the answer table.

    The first row of the sentence file is treated as a header and replaced by
    the column names 'ID', 'Sentence 0', 'Sentence 1'. The answer file is read
    without a header row and gets the column names 'ID', 'Answer'.

    Returns a ``(text, answer)`` tuple of DataFrames.
    """
    sentence_columns = ['ID', 'Sentence 0', 'Sentence 1']
    answer_columns = ['ID', 'Answer']

    sentences = pd.read_csv(text_path, names=sentence_columns, header=0)
    answers = pd.read_csv(answer_path, names=answer_columns, header=None)
    return sentences, answers

# Read the sentence and answer files of each split from the cloned repository.
# Keep the doubled space in 'Training  Data': it is part of the path.
train_text, train_answer = read_data(
    '/content/SemEval2020-Task4-Commonsense-Validation-and-Explanation/ALL data/Training  Data/subtaskA_data_all.csv',
    '/content/SemEval2020-Task4-Commonsense-Validation-and-Explanation/ALL data/Training  Data/subtaskA_answers_all.csv',
)
val_text, val_answer = read_data(
    '/content/SemEval2020-Task4-Commonsense-Validation-and-Explanation/ALL data/Dev Data/subtaskA_dev_data.csv',
    '/content/SemEval2020-Task4-Commonsense-Validation-and-Explanation/ALL data/Dev Data/subtaskA_gold_answers.csv',
)
test_text, test_answer = read_data(
    '/content/SemEval2020-Task4-Commonsense-Validation-and-Explanation/ALL data/Test Data/subtaskA_test_data.csv',
    '/content/SemEval2020-Task4-Commonsense-Validation-and-Explanation/ALL data/Test Data/subtaskA_gold_answers.csv',
)

def data_process(text,answer):
    """Attach the answer to each sentence pair and drop the 'ID' join key.

    Rows are matched on the 'ID' column with a left join, so every row of
    ``text`` is kept. A new DataFrame is returned; the inputs are not
    modified.
    """
    merged = text.merge(answer, how='left', on='ID')
    return merged.drop(columns='ID')

# Build one labelled table per split (sentences joined with their answers).
train_data, val_data, test_data = (
    data_process(sentences, answers)
    for sentences, answers in (
        (train_text, train_answer),
        (val_text, val_answer),
        (test_text, test_answer),
    )
)



In [None]:
# convert to dataset
# Wrap each pandas table in a `datasets.Dataset` ...
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_data, val_data, test_data)
)

# ... and group the three splits under the keys used by the rest of the notebook.
dataset = DatasetDict(
    {'train': train_dataset, 'validation': val_dataset, 'test': test_dataset}
)

In [None]:
# encode the data
def encode_batch(batch):
    """Tokenize a batch of sentence pairs with the module-level ``tokenizer``.

    'Sentence 0' and 'Sentence 1' are passed as the first and second text of
    each pair. Every encoded pair is truncated and padded to exactly 64 tokens.
    """
    first_sentences = batch['Sentence 0']
    second_sentences = batch['Sentence 1']
    return tokenizer(
        first_sentences,
        second_sentences,
        padding="max_length",
        truncation=True,
        max_length=64,
    )

# Tokenize every split in batches, then rename the gold column to "labels",
# the key the training and evaluation loops read from each batch.
dataset = dataset.map(encode_batch, batched=True).rename_column("Answer", "labels")

# Expose only these three columns, as torch tensors, when batches are read.
# NOTE(review): "token_type_ids" is not listed here and the model calls in this
# notebook do not pass it; presumably the tokenizer emits segment ids for the
# sentence pairs, in which case they never reach the model — confirm this is
# intended.
dataset.set_format(columns=["input_ids", "attention_mask", "labels"], type="torch")


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/997 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

### **3. Load the pretrained model and optimizer**

In [None]:
# load the model
# Pretrained BERT encoder with a classification head that has two output
# classes; attention maps and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False
)

# Move the model to the device selected earlier. An unconditional .cuda() call
# fails on machines without a GPU even though a CPU fallback was chosen.
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# load the optimizer
# Use PyTorch's AdamW: the AdamW class shipped with `transformers` is
# deprecated in favour of torch.optim.AdamW. weight_decay is set to 0.0
# explicitly because PyTorch defaults to 0.01, whereas the transformers
# implementation defaulted to 0.0 — this keeps the same hyperparameters.
optimizer = torch.optim.AdamW(model.parameters(), lr = 1e-5, eps = 1e-8, weight_decay = 0.0)



### **4. Train the model**

In [None]:
# prediction function
def predict(outputs):
    """Return the predicted class index for each row of ``outputs["logits"]``.

    The logits are turned into probabilities with a softmax over dimension 1;
    the index of the largest probability in each row is the prediction.
    """
    return outputs["logits"].softmax(dim=1).argmax(dim=1)

In [None]:
# set parameter
epoch = 10          # number of passes over the training split
batch_size = 16     # examples per batch for both loaders below

# get training data: batches of `batch_size` examples drawn in random order
train_dataloader = DataLoader(
    dataset['train'],
    batch_size=batch_size,
    sampler=RandomSampler(dataset['train']),
)
# get validation data: same batching and sampling as the training loader
val_dataloader = DataLoader(
    dataset['validation'],
    batch_size=batch_size,
    sampler=RandomSampler(dataset['validation']),
)

In [None]:
# start training
# For every epoch: one optimisation pass over the training batches, then one
# accuracy measurement on the validation batches.
training_loss = []  # mean training loss of each epoch, in order

for epoch_i in range(epoch):
    print('Epoch %s/%s' % (epoch_i + 1, epoch))

    # Switch to training mode at the start of EVERY epoch. The validation pass
    # at the end of each epoch calls model.eval(), so enabling training mode
    # only once before the loop would leave dropout disabled from the second
    # epoch onwards.
    model.train()

    epoch_loss = 0

    pbar = tqdm(train_dataloader)

    for batch in pbar:

        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # passing `labels` makes the loss available in the outputs
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs['loss']
        loss.backward()
        optimizer.step()

        # show progress along with metrics
        pbar.set_postfix({'Loss': '{:.3f}'.format(loss.item())})

        epoch_loss += loss.item()

    pbar.close()

    # average the summed batch losses over the number of batches
    epoch_loss /= len(train_dataloader)
    training_loss.append(epoch_loss)

    # evaluate the model accuracy
    model.eval()
    correct = 0
    count = 0

    # no gradients are needed for evaluation
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

            predictions = predict(outputs)
            correct += predictions.eq(labels).sum().item()
            count += len(labels)

    # calculate the accuracy once, after all validation batches; NaN instead
    # of a NameError/ZeroDivisionError when the validation loader is empty
    accuracy = correct / count if count else float('nan')

    print("Accuracy: {0:.3f}".format(accuracy))

In [None]:
# visualize the training loss
# training_loss holds one mean loss per epoch, so the curve is plotted against
# epoch numbers starting at 1 (matching the "Epoch i/n" lines printed during
# training) and the title names epochs rather than iterations.
plt.plot(range(1, len(training_loss) + 1), training_loss, label="Training Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Epoch vs Training Loss")
plt.legend()
plt.show()

### **5. Test the model**

In [None]:
# load test data
# No sampler and no shuffling: batches follow the row order of the test split,
# so `record` lines up with the rows of test_data when the predictions are
# attached to it afterwards.
test_loader = torch.utils.data.DataLoader(
            dataset['test'],
            batch_size = batch_size
        )

# Start testing
model.eval()

correct_num = 0
count = 0
record = []               # one predicted label per test example, in order
accuracy = float('nan')   # stays NaN if the test loader yields no batches

with torch.no_grad():

    pbar = tqdm(test_loader)
    for batch in pbar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs['loss']

        predictions = predict(outputs)
        correct_num += predictions.eq(labels).sum().item()
        count += len(labels)
        # running accuracy over the examples seen so far
        accuracy = correct_num / count

        # show progress along with metrics
        pbar.set_postfix({
            'loss': '{:.3f}'.format(loss.item()),
            'accuracy': '{:.3f}'.format(accuracy)
        })
        # extend() stores one entry per example for any batch size;
        # int(predictions) only works when a batch holds a single example
        record.extend(predictions.tolist())

    pbar.close()

print("\nThe accuracy on the test data: {:.2%}".format(accuracy))

In [None]:
# error analysis

# Attach the model's predictions to the test table, one per row.
test_data['Prediction answer'] = record

# Select the misclassified rows with a boolean mask instead of walking the
# table with a manual row counter and per-row .iloc lookups.
is_wrong = test_data['Answer'] != test_data['Prediction answer']

# Sentence pair plus the (wrong) predicted label for every misclassified row.
wrong_predictions = (
    test_data.loc[is_wrong, ['Sentence 0', 'Sentence 1', 'Prediction answer']]
    .rename(columns={'Prediction answer': 'Wrong prediction'})
    .reset_index(drop=True)
)

pd.set_option('display.max_rows', None)
wrong_predictions.to_csv('Wrong prediction.csv',index=False)

In [None]:
# Write the current test table to 'Test data sol2.csv', without the index column.
test_data.to_csv('Test data sol2.csv',index = False)

In [None]:
# Display the confusion Matrix
import seaborn as sns

# rows: gold answers, columns: predicted answers
crosstab = pd.crosstab(test_data['Answer'],test_data['Prediction answer'])

# Recompute the accuracy from the same two columns that are plotted. The global
# `accuracy` is also assigned by the training cell (validation accuracy), so
# reading it here could put a stale or unrelated number in the title.
matrix_accuracy = (test_data['Answer'] == test_data['Prediction answer']).mean()

sns.heatmap(crosstab, cmap='Oranges', annot=True, fmt='g', linewidths=5)
plt.title('Confusion Matrix (Accuracy:{:.2%})'.format(matrix_accuracy))
plt.show()