In [1]:
import pandas as pd
import numpy as np
import json
import os 

import torch.nn as nn
import torch
import transformers
from transformers import get_linear_schedule_with_warmup

### Change the path here

In [2]:
training_filepath = '../../Data/BioASQ_data/BioASQ-training7b/trainining7b.json'
test_directory = '../../Data/BioASQ_data/Task7BGoldenEnriched/'

In [3]:
training_text = []
test_text = []

# Filter out the training data
with open (training_filepath, "r") as f:
    data = json.loads(f.read())
    texts = data['questions']
    for text in texts:
        if 'exact_answer' in text.keys():
            if text['exact_answer'] == 'yes' or text['exact_answer'] == 'no':
                training_text.append(text)

# Filter out the text data
directory = test_directory
for filename in os.listdir(directory):
    with open (directory+filename, "r") as f:
        data = json.loads(f.read())
        texts = data['questions']
        for text in texts:
            if 'exact_answer' in text.keys():
                if text['exact_answer'] == 'yes' or text['exact_answer'] == 'no':
                    test_text.append(text)

In [4]:
def process_data(texts):
    question_list = []
    context_list = []
    target_list = []
    for text in texts:
        question_list.append(text['body'])
        context_list.append(' '. join([x['text'] for x in text['snippets']]))
        target_list.append(1 if text['exact_answer'] == 'yes' else 0)
    df = pd.DataFrame(zip(question_list, context_list, target_list), columns=['question', 'context', 'target'])
    return df

In [6]:
class BERTBaseUncased(nn.Module):
    def __init__(self):
        super(BERTBaseUncased, self).__init__()
        self.bert = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.out = nn.Linear(768, 2)
        self.softmax = nn.Softmax(dim=1)
    
    def forward(self, ids, mask, token_type_ids):
        _, output = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        output = self.out(output)
        return self.softmax(output)

class BERTDatasetTraining:
    def __init__(self, question, context, target, tokenizer, max_len):
        self.question = question
        self.context = context
        self.target = target
        
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.question)
    
    def __getitem__(self, item):
        question= str(self.question[item])
        context = str(self.context[item])

        inputs = self.tokenizer.encode_plus(
            question,
            context,
            add_special_tokens=True
        )

        ids = inputs['input_ids']
        token_type_ids = inputs['token_type_ids']
        mask = inputs['attention_mask']
        
        padding_len = self.max_len - len(ids)
        
        ids = ids[:self.max_len] + ([0] * padding_len) 
        token_type_ids = token_type_ids[:self.max_len] + ([0] * padding_len)
        mask = mask[:self.max_len] + ([0] * padding_len)
        

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'target': torch.tensor(self.target[item], dtype=torch.long)
        }


def loss_fn(outputs, target):
    return nn.CrossEntropyLoss()(outputs, target)


def train_loop_fn(data_loader, model, optimizer, device, scheduler=None):
    model.train()
    for bi, d in enumerate(data_loader):
        ids = d['ids']
        mask = d['mask']
        token_type_ids = d['token_type_ids']
        target = d['target']
        
        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        target = target.to(device, dtype=torch.long)
        
        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, target)
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
        if bi % 20 == 0:
            print(f'bi={bi}, loss={loss}')

            
def eval_loop_fn(data_loader, model, device):
    model.eval()
    fin_targets = []
    fin_outputs = []
    for bi, d in enumerate(data_loader):
        with torch.no_grad():
            ids = d['ids'].to(device, dtype=torch.long)
            mask = d['mask'].to(device, dtype=torch.long)
            token_type_ids = d['token_type_ids'].to(device, dtype=torch.long)
            target = d['target'].to(device, dtype=torch.long)
          
            outputs = model(ids, mask, token_type_ids)
          
            fin_targets.append(target.cpu().detach().numpy())
            fin_outputs.append(outputs.cpu().detach().numpy())

    return np.vstack(fin_outputs), np.hstack(fin_targets)

In [8]:
MAX_LEN = 512
TRAIN_BATCH_SIZE = 16
TEST_BATCH_SIZE = 4
EPOCHS = 20
LEARNING_RATE = 3e-5
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
model = BERTBaseUncased().to(device)

train_df = process_data(training_text)
test_df = process_data(test_text)

train_dataset = BERTDatasetTraining(
    question=train_df.question.values,
    context=train_df.context.values,
    target=train_df.target.values,
    tokenizer=tokenizer,
    max_len=MAX_LEN
)
train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True
)

test_dataset = BERTDatasetTraining(
    question=test_df.question.values,
    context=test_df.context.values,
    target=test_df.target.values,
    tokenizer=tokenizer,
    max_len=MAX_LEN
)
test_data_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=TEST_BATCH_SIZE,
    shuffle=True,
    drop_last=True
)

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
num_training_steps = int(len(train_dataset) / TRAIN_BATCH_SIZE * EPOCHS)
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

for epoch in range(EPOCHS):
    train_loop_fn(train_data_loader, model, optimizer, device, scheduler)
    output, target = eval_loop_fn(test_data_loader, model, device)
    acc = (output.argmax(1) == target).sum() / len(target)
    print(f'epoch: {epoch}, acc: {acc}')
    # print(output.tolist())
    # print(target.tolist())
    print('---------------------')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Token indices sequence length is longer than the specified maximum sequence length for this model (1156 > 512). Running this sequence through the model wil

bi=0, loss=0.6000276803970337
bi=20, loss=0.6239089369773865
bi=40, loss=0.5026217699050903
epoch: 0, acc: 0.6714285714285714
---------------------
bi=0, loss=0.5018689036369324
bi=20, loss=0.4391683340072632
bi=40, loss=0.4387642443180084
epoch: 1, acc: 0.6714285714285714
---------------------
bi=0, loss=0.625644326210022
bi=20, loss=0.5630885362625122
bi=40, loss=0.5008623003959656
epoch: 2, acc: 0.6714285714285714
---------------------
bi=0, loss=0.5632413625717163
bi=20, loss=0.438466340303421
bi=40, loss=0.5633181929588318
epoch: 3, acc: 0.6714285714285714
---------------------
bi=0, loss=0.37593668699264526
bi=20, loss=0.4383500814437866
bi=40, loss=0.6254837512969971
epoch: 4, acc: 0.6714285714285714
---------------------
bi=0, loss=0.37587085366249084
bi=20, loss=0.5630156397819519
bi=40, loss=0.5008226633071899
epoch: 5, acc: 0.6714285714285714
---------------------
bi=0, loss=0.562738299369812
bi=20, loss=0.3137112855911255
bi=40, loss=0.501624584197998
epoch: 6, acc: 0.67142

In [7]:
torch.cuda.empty_cache()