In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 10.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 35.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 4.5 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 14.4 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 44.8 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existin

In [3]:
import torch.nn as nn
import torch
import transformers
import pandas as pd
import numpy as np
from sklearn import model_selection
from transformers import AdamW, get_linear_schedule_with_warmup
from scipy import stats

import warnings 
warnings.filterwarnings('ignore')

In [4]:
class BERTBaseUncased(nn.Module):
    def __init__(self):
        super(BERTBaseUncased, self).__init__()
        self.bert = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.out = nn.Linear(768, 3)
        self.softmax = nn.Softmax(dim=1)
    
    def forward(self, ids, mask, token_type_ids):
        _, output = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        output = self.out(output)
        return self.softmax(output)

class BERTDatasetTraining:
    def __init__(self, question, context, targets, tokenizer, max_len):
        self.question = question
        self.context = context
        self.targets = targets
        
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.question)
    
    def __getitem__(self, item):
        question= str(self.question[item])
        context = str(self.context[item])

        inputs = self.tokenizer.encode_plus(
            question,
            context,
            add_special_tokens=True
        )

        ids = inputs['input_ids']
        token_type_ids = inputs['token_type_ids']
        mask = inputs['attention_mask']
        
        padding_len = self.max_len - len(ids)
        
        ids = ids[:self.max_len] + ([0] * padding_len) 
        token_type_ids = token_type_ids[:self.max_len] + ([0] * padding_len)
        mask = mask[:self.max_len] + ([0] * padding_len)
        

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[item], dtype=torch.long)
        }


def loss_fn(outputs, targets):
    return nn.CrossEntropyLoss()(outputs, targets)


def train_loop_fn(data_loader, model, optimizer, device, scheduler=None):
    model.train()
    for bi, d in enumerate(data_loader):
        ids = d['ids']
        mask = d['mask']
        token_type_ids = d['token_type_ids']
        targets = d['targets']
        
        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.long)
        
        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
        if bi % 50 == 0:
            print(f'bi={bi}, loss={loss}')

            
def eval_loop_fn(data_loader, model, device):
    model.eval()
    fin_targets = []
    fin_outputs = []
    for bi, d in enumerate(data_loader):
        with torch.no_grad():
            ids = d['ids'].to(device, dtype=torch.long)
            mask = d['mask'].to(device, dtype=torch.long)
            token_type_ids = d['token_type_ids'].to(device, dtype=torch.long)
            targets = d['targets'].to(device, dtype=torch.long)
          
            outputs = model(ids, mask, token_type_ids)
            #loss = loss_fn(outputs, targets)
          
            fin_targets.append(targets.cpu().detach().numpy())
            fin_outputs.append(outputs.cpu().detach().numpy())

    return np.vstack(fin_outputs), np.hstack(fin_targets)


def run():
    MAX_LEN = 512
    TRAIN_BATCH_SIZE = 4
    EPOCHS = 4
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    
    df_train = pd.read_json('./drive/MyDrive/qa_finetuning/data/pqal_fold0/train_set.json', orient='index')
    df_valid = pd.read_json('./drive/MyDrive/qa_finetuning/data/pqal_fold0/dev_set.json', orient='index')
    

    target_cols = 'final_decision'
    train_targets_str = df_train[target_cols].values
    valid_targets_str = df_valid[target_cols].values
    
    # Transformer the target col to numeric values
    target_dict = {'yes': 0, 'maybe': 1, 'no': 2}
    train_targets = [target_dict[key] for key in train_targets_str]
    valid_targets = [target_dict[key] for key in valid_targets_str]
    

    tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
    model = BERTBaseUncased().to(device)

    train_dataset = BERTDatasetTraining(
        question=df_train.QUESTION.values,
        context=df_train.CONTEXTS.values,
        targets=train_targets,
        tokenizer=tokenizer,
        max_len=MAX_LEN
    )
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=TRAIN_BATCH_SIZE,
        shuffle=True
    )
    
    valid_dataset = BERTDatasetTraining(
        question=df_valid.QUESTION.values,
        context=df_valid.CONTEXTS.values,
        targets=valid_targets,
        tokenizer=tokenizer,
        max_len=MAX_LEN
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=4,
        shuffle=True,
        drop_last=True
    )
    
    optimizer = AdamW(model.parameters(), lr=2e-5)
    num_training_steps = int(len(train_dataset) / TRAIN_BATCH_SIZE * EPOCHS)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=0,
        num_training_steps=num_training_steps
    )
    
    for epoch in range(EPOCHS):
        train_loop_fn(train_data_loader, model, optimizer, device, scheduler)
        output, target = eval_loop_fn(valid_data_loader, model, device)
        acc = (output.argmax(1) == target).sum() / len(target)
        print(f'epoch: {epoch}, acc: {acc}')
    
    # Inference
    TEST_BATCH_SIZE = 4
    df_test = pd.read_json('./drive/MyDrive/qa_finetuning/data/test_set.json', orient='index')
    test_targets_str = df_test[target_cols].values
    test_targets = [target_dict[key] for key in test_targets_str]


    test_dataset = BERTDatasetTraining(
        question=df_test.QUESTION.values,
        context=df_test.CONTEXTS.values,
        targets=test_targets, # This is not used
        tokenizer=tokenizer,
        max_len=MAX_LEN
    )
    test_data_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=TEST_BATCH_SIZE,
        shuffle=False
    )

    predictions = []
    model.eval()

    for bi, d in enumerate(test_data_loader):
        ids = d["ids"]
        mask = d["mask"]
        token_type_ids = d["token_type_ids"]

        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        
        with torch.no_grad():
            outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids).cpu().detach().numpy()
            predictions.append(outputs.argmax(1))
    pmids = df_test.index

    return pmids, np.hstack(predictions)

In [5]:
pmids, predictions = run()

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Token indices sequence length is longer than the specified maximum sequence length for this model (629 > 512). Running this sequence through the model will

bi=0, loss=1.161617636680603
bi=50, loss=1.2221460342407227
bi=100, loss=1.0360175371170044
epoch: 0, acc: 0.5625
bi=0, loss=0.8508382439613342
bi=50, loss=0.8237094879150391
bi=100, loss=0.9995380640029907
epoch: 1, acc: 0.5625
bi=0, loss=1.0087517499923706
bi=50, loss=0.8305184841156006
bi=100, loss=1.0278972387313843
epoch: 2, acc: 0.5416666666666666
bi=0, loss=1.0612764358520508
bi=50, loss=0.8188626170158386
bi=100, loss=0.8771536946296692
epoch: 3, acc: 0.5208333333333334


In [11]:
import json
from sklearn.metrics import accuracy_score, f1_score
ground_truth = pd.read_json('./drive/MyDrive/qa_finetuning/data/test_ground_truth.json', orient='index')

In [14]:
truth_list = ground_truth[0].tolist()
target_dict = {'yes': 0, 'maybe': 1, 'no': 2}
truth_list = [target_dict[key] for key in truth_list]

In [16]:
(truth_list == predictions).sum() / len(predictions)

0.536