In [74]:
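# Dependencies (uncomment to install into the kernel's environment):
# %pip install torch
# %pip install transformers
# %pip install scikit-learn
# %pip install numpy
# %pip install matplotlib
# %pip install tqdm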



In [75]:
import json
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, BertConfig
import torch
from sklearn.model_selection import train_test_split
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch.optim as optim
import matplotlib.pyplot as plt
import tqdm


In [76]:
def get_data_lists(data):
    """Split an iterable of example mappings into three parallel lists.

    Args:
        data: Iterable of mappings, each providing the keys ``'question'``,
            ``'choice_list'`` and ``'label'``.

    Returns:
        A tuple ``(questions, choices, labels)`` of lists, in input order.

    Raises:
        KeyError: If an example lacks one of the three keys.
    """
    # Read the three fields of every example in a single pass over the data.
    rows = [
        (record['question'], record['choice_list'], record['label'])
        for record in data
    ]
    questions = [row[0] for row in rows]
    choices = [row[1] for row in rows]
    labels = [row[2] for row in rows]
    return questions, choices, labels

class Brain_Teaser(Dataset):
    """Dataset of multiple-choice brain teasers, tokenized once at construction.

    Every example is rendered as the prompt
    ``"Question : <question> Options: 0 <choice0> 1 <choice1> ..."`` and passed
    through ``tokenizer``. Indexing returns ``(encoding, label)`` where
    ``encoding`` is exactly what the tokenizer returned for that prompt.

    Args:
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt",
            max_length=max_len, padding="max_length", truncation=True,
            add_special_tokens=True)``.
        questions: Sequence of question strings.
        choices: Sequence of per-question choice lists (strings).
        labels: Sequence of labels, one per question.
        max_len: Length every prompt is padded/truncated to. Defaults to 205,
            the value that used to be hard-coded in the tokenizer call, so
            default behaviour is unchanged.

    Raises:
        ValueError: If ``questions``, ``choices`` and ``labels`` differ in length.
    """

    def __init__(self, tokenizer, questions, choices, labels, max_len=205):
        if not (len(questions) == len(choices) == len(labels)):
            raise ValueError(
                "questions, choices and labels must have the same length, got "
                f"{len(questions)}, {len(choices)} and {len(labels)}"
            )

        self.questions = questions
        self.choices = choices
        self.labels = labels

        self.max_len = max_len
        self.tokenizer = tokenizer
        # Kept for compatibility with code that may read these attributes;
        # nothing in this class fills them.
        self.inputs = []
        self.targets = []
        self.question_options_encoded = []

        self.build_questions()

    def build_questions(self):
        """Render and tokenize every example into ``question_options_encoded``."""
        for question, options in zip(self.questions, self.choices):
            # Each option is preceded by its index and separated by a single
            # space, so an option's text never runs into the next index.
            numbered = " ".join(f"{pos} {text}" for pos, text in enumerate(options))
            question_options = "Question : " + question + " Options: " + numbered
            self.question_options_encoded.append(
                self.tokenizer(
                    question_options,
                    return_tensors="pt",
                    max_length=self.max_len,
                    padding="max_length",
                    truncation=True,
                    add_special_tokens=True,
                )
            )

    def __len__(self):
        """Return the number of encoded examples."""
        return len(self.question_options_encoded)

    def __getitem__(self, id):
        """Return ``(encoding, label)`` for the example at position ``id``."""
        return self.question_options_encoded[id], self.labels[id]
    
  


     

In [70]:
# Load the training file and the practice evaluation file.
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects, which can
# execute code on load -- only use files from a trusted source.
train_data = np.load("data/SP-train.npy", allow_pickle=True)
# test_data is loaded here but not referenced by any other visible cell.
test_data = np.load("data/SP_eval_data_for_practice.npy", allow_pickle = True)

# Hold out 10% of the training examples for validation (fixed seed, so the split
# is repeatable). train_data is rebound to the remaining 90%, so re-running this
# cell without re-loading would split the already-reduced set again.
train_data, val_data = train_test_split(train_data, test_size=0.1, random_state=42)
train_questions, train_choices, train_labels = get_data_lists(train_data)
val_questions, val_choices, val_labels = get_data_lists(val_data)

In [71]:
class MCQBert(nn.Module):
    """BERT encoder followed by a linear layer that scores the answer choices.

    Args:
        model_name: Checkpoint identifier handed to ``BertModel.from_pretrained``.
            Defaults to ``"bert-base-uncased"``.
        num_choices: Number of output logits (one per answer choice).
            Defaults to 4.
    """

    def __init__(self, model_name="bert-base-uncased", num_choices=4):
        super().__init__()
        self.pre_trained = BertModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.pre_trained.config.hidden_size, num_choices)

    def forward(self, input_ids, attention_mask):
        """Return one row of ``num_choices`` logits per input sequence.

        Args:
            input_ids: Token ids, forwarded unchanged to the encoder.
            attention_mask: Attention mask, forwarded unchanged to the encoder.

        Returns:
            The classifier output computed from the encoder's
            ``last_hidden_state`` at sequence position 0.
        """
        encoded = self.pre_trained(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the sequence is the [CLS] token; it is used as the
        # summary representation for prediction.
        cls_embedding = encoded['last_hidden_state'][:, 0, :]
        return self.classifier(cls_embedding)
    
# Build the tokenizer from the "bert-base-uncased" checkpoint and create the
# classifier with its default arguments.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model_bert = MCQBert()
# Switch to evaluation mode; as the last expression of the cell this also
# displays the module structure shown in the output below.
model_bert.eval()

MCQBert(
  (pre_trained): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [77]:
# Wrap both splits in Brain_Teaser datasets; every example is tokenized inside
# the constructor, so this cell does all the tokenization work up front.
train_dataset = Brain_Teaser(tokenizer, train_questions, train_choices, train_labels)
val_dataset = Brain_Teaser(tokenizer, val_questions, val_choices, val_labels)



In [81]:


def _prepare_batch(inputs, targets, device):
    """Move one collated batch to ``device`` as 2-D model inputs.

    ``inputs['input_ids']`` and ``inputs['attention_mask']`` are reshaped to
    ``(-1, last_dim)``. A collated batch that carries an extra singleton
    dimension per example, e.g. ``(batch, 1, seq_len)``, therefore becomes
    ``(batch, seq_len)``, while an already flat batch is left unchanged.
    """
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    input_ids = input_ids.reshape(-1, input_ids.size(-1)).to(device)
    attention_mask = attention_mask.reshape(-1, attention_mask.size(-1)).to(device)
    # CrossEntropyLoss expects class indices of dtype long.
    return input_ids, attention_mask, targets.long().to(device)


def train(train_dataset, val_dataset, model=model_bert, epochs=5, learning_rate=1e-3):
    """Train ``model`` with Adam and cross-entropy, validate every epoch, plot losses.

    Args:
        train_dataset: Dataset whose items are ``(inputs, target)``, where
            ``inputs`` maps ``'input_ids'`` and ``'attention_mask'`` to tensors.
        val_dataset: Dataset with the same item layout, used for validation.
        model: Module called as ``model(input_ids, attention_mask)`` that returns
            per-class logits. The default is bound when this function is defined.
        epochs: Number of passes over ``train_dataset``.
        learning_rate: Learning rate for the Adam optimizer.
            NOTE(review): the 1e-3 default is kept for backward compatibility;
            fine-tuning a pre-trained BERT usually calls for a much smaller
            value -- pass one explicitly.

    Returns:
        None. Per-epoch metrics are printed and the train/validation loss
        curves are shown with matplotlib.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    # Validation metrics do not depend on batch order, so no shuffling is needed.
    val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)

    train_losses = []
    val_losses = []

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        t_correct = 0
        t_total = 0
        for inputs, targets in tqdm.tqdm(train_dataloader):
            # Feed the whole batch: indexing the collated ids with [0] would
            # pass only the first example, which then broadcasts silently
            # against the batch-sized attention mask.
            input_ids, attention_mask, targets = _prepare_batch(inputs, targets, device)
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

            predicted = outputs.argmax(dim=1)
            t_total += targets.size(0)
            # Compare predictions with the labels (not with the running count).
            t_correct += (predicted == targets).sum().item()

        # Validation
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, targets in val_dataloader:
                input_ids, attention_mask, targets = _prepare_batch(inputs, targets, device)
                outputs = model(input_ids, attention_mask)
                val_loss += criterion(outputs, targets).item()

                predicted = outputs.argmax(dim=1)
                total += targets.size(0)
                correct += (predicted == targets).sum().item()

        # max(..., 1) keeps an empty dataset from raising ZeroDivisionError.
        train_loss = running_loss / max(len(train_dataloader), 1)
        val_loss = val_loss / max(len(val_dataloader), 1)
        train_losses.append(train_loss)
        val_losses.append(val_loss)

        print(f"Epoch [{epoch+1}/{epochs}], "
              f"Train Loss: {train_loss:.4f}, "
              f"Train Accuracy: {(100 * t_correct / max(t_total, 1)):.2f}% , "
              f"Val Loss: {val_loss:.4f}, "
              f"Val Accuracy: {(100 * correct / max(total, 1)):.2f}%")

    # Plotting
    plt.figure(figsize=(10, 5))
    plt.plot(range(1, epochs + 1), train_losses, label='Training Loss')
    plt.plot(range(1, epochs + 1), val_losses, label='Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss')
    plt.legend()
    plt.show()

# Example usage:
# Pass a small learning rate explicitly: train()'s default of 1e-3 is far above
# the range normally used when fine-tuning all weights of a pre-trained BERT.
train(train_dataset, val_dataset, model=model_bert, epochs=5, learning_rate=2e-5)


 11%|█         | 6/57 [00:17<02:19,  2.73s/it]

In [63]:
# Runs train() with its defaults (model=model_bert, epochs=5, learning_rate=1e-3).
# NOTE(review): this cell's execution count (63) is lower than that of the cell
# defining train() (81), and the tensor shapes shown below are not printed by
# the train() defined above -- this looks like stale output; confirm or remove.
train(train_dataset, val_dataset)

torch.Size([8, 768])
torch.Size([8, 4])


In [15]:
# Sanity check: tokenize a short sentence, padded/truncated to 10 tokens.
tokenised = tokenizer("Hi Hello I am Nalish", return_tensors="pt", max_length = 10, padding="max_length", truncation=True, add_special_tokens = True)


In [16]:
# Display the encoding (input_ids, token_type_ids, attention_mask) built above.
tokenised

{'input_ids': tensor([[  101,  7632,  7592,  1045,  2572,  6583, 13602,   102,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])}

In [18]:
# Decode the ids back to text to see the special and padding tokens that were added.
tokenizer.decode(tokenised['input_ids'][0])

'[CLS] hi hello i am nalish [SEP] [PAD] [PAD]'