In [1]:
from transformers import AutoTokenizer, DistilBertForSequenceClassification
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from accelerate import Accelerator
from torch.utils.data import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np



  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classif

In [2]:


class BertClassifier(nn.Module):
    """Pretrained DistilBERT backbone with an extra linear classification layer.

    The backbone is loaded through ``DistilBertForSequenceClassification``, but
    its own logits are not used: ``forward`` reads the hidden states instead and
    feeds position 0 of the last one to ``self.classifier``.

    Args:
        bert_name: Checkpoint name or path passed to ``from_pretrained``.
        num_classes: Number of output logits produced per example.
    """

    def __init__(self, bert_name, num_classes):
        super().__init__()

        # Pretrained backbone (attribute name kept so state_dict keys are stable).
        self.bert = DistilBertForSequenceClassification.from_pretrained(bert_name)

        # Linear head sized from the backbone's configured hidden width.
        hidden_dim = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, inputs):
        """Return classification logits for a mapping of tokenizer outputs.

        ``inputs`` is unpacked as keyword arguments into the backbone, so it
        must be a mapping (e.g. the tokenizer's return value).
        """
        encoded = self.bert(**inputs, output_hidden_states=True)

        # Last entry of the hidden-state tuple; index 0 along axis 1 is taken
        # as the pooled representation (presumably the [CLS] token - confirm).
        final_layer = encoded.hidden_states[-1]
        first_position = final_layer[:, 0, :]

        return self.classifier(first_position)


In [3]:

# Fixed seed so the 20k-row sample and both splits are identical on every
# Restart & Run All; change it here to draw a different sample.
SEED = 42

# Raw string literal: the Windows path contains backslashes, and "\d" / "\s"
# are invalid escape sequences in a normal string (a warning today, an error
# in future Python versions). The resulting path is unchanged.
df = pd.read_csv(r'C:\data_projects\subjects-questions.csv').sample(20000, random_state=SEED)

# 70% train, then the remaining 30% is halved into test and validation (15% each).
train, test = train_test_split(df, test_size=0.3, random_state=SEED)
test, val = train_test_split(test, test_size=0.5, random_state=SEED)
class CustomImageDataset(Dataset):
    """Map-style dataset yielding ``(text, one_hot_label)`` pairs from a table.

    Despite the name, items are read from a two-column table: column 0 holds
    the sample and column 1 holds its class name.

    Args:
        data: Table supporting ``.shape`` and positional ``.iloc[row, col]``.
        tokenizer: Stored on the instance; not used by this class itself.
        labels: Ordered list of class names; a name's position in this list is
            the index set to 1 in the one-hot target.
    """

    def __init__(self, data,tokenizer,labels):
        self.data = data
        self.tokenizer = tokenizer
        self.labels = labels

    def __len__(self):
        """Number of rows in the underlying table."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return the raw sample at ``idx`` and its float one-hot target.

        Raises:
            ValueError: If the row's class name is not present in ``labels``.
        """
        sample = self.data.iloc[idx, 0]
        class_name = self.data.iloc[idx, 1]

        # Position of the class name in the label list selects the hot entry.
        class_pos = self.labels.index(class_name)
        one_hot = torch.zeros(len(self.labels), dtype=torch.float)
        one_hot[class_pos] = 1

        return sample, one_hot
# Single source of truth for the class order. The position of a name in this
# list is both its one-hot index in the datasets and its slot in `numcls`, so
# every consumer must see exactly the same list.
LABELS = ['Biology', 'Chemistry', 'Maths', 'Physics']

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

train_dataset = CustomImageDataset(data=train, tokenizer=tokenizer, labels=LABELS)
val_dataset = CustomImageDataset(data=val, tokenizer=tokenizer, labels=LABELS)
test_dataset = CustomImageDataset(data=test, tokenizer=tokenizer, labels=LABELS)

# Per-class sample counts of the training split, ordered like LABELS.
# `.loc` raises KeyError if a class is absent from the training split.
numcls = train.Subject.value_counts().loc[LABELS].tolist()


In [4]:


def class_weight(num_per_class,beta):
    """Compute normalised per-class weights from class sample counts.

    Each class with ``n`` samples gets the raw weight
    ``(1 - beta) / (1 - beta ** n)`` (the "effective number of samples"
    form); the raw weights are then rescaled so they sum to 1.

    Args:
        num_per_class: Sequence of sample counts, one per class.
        beta: Base of the power term; with ``0 < beta < 1`` rarer classes
            receive larger weights.

    Returns:
        A tensor built from the NumPy result (float64 for float inputs).
    """
    inverse_effective = (1.0 - beta) / (1.0 - np.power(beta, num_per_class))
    normalised = inverse_effective / inverse_effective.sum()
    return torch.tensor(normalised)
class_weight([5,100,20,15,1000,7000,35000],0.9)


tensor([0.2762, 0.1131, 0.1288, 0.1425, 0.1131, 0.1131, 0.1131],
       dtype=torch.float64)

In [9]:



def train(model,tokenizer, train_dataloader, val_dataloader, epochs, lr, warmup_steps,beta,num_per_class):
    """Fine-tune ``model`` and print train/validation metrics after every epoch.

    Args:
        model: Module called as ``model(batch)`` on the tokenizer output; it is
            expected to return per-class logits (argmax is taken over dim 1).
        tokenizer: Callable applied to every raw batch with truncation and
            padding, returning PyTorch tensors.
        train_dataloader: Yields ``(raw_batch, labels)`` pairs for optimisation.
        val_dataloader: Yields ``(raw_batch, labels)`` pairs for evaluation; its
            ``.dataset`` length is the accuracy denominator.
        epochs: Number of passes over ``train_dataloader``.
        lr: Learning rate handed to AdamW.
        warmup_steps: Warm-up steps of the linear schedule.
        beta: Forwarded to ``class_weight``.
        num_per_class: Per-class sample counts forwarded to ``class_weight``.

    Returns:
        None. Metrics are printed.
    """
    accelerator = Accelerator(mixed_precision="fp16")
    device = accelerator.device

    optimizer = AdamW(model.parameters(), lr=lr)
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
    )

    # class_weight() returns float64. Cast to float32 so the loss is computed
    # in the logits' precision instead of being silently promoted to double
    # (and so the weight dtype is also valid for class-index targets).
    weight = class_weight(num_per_class, beta).to(dtype=torch.float32)
    loss_fn = nn.CrossEntropyLoss(weight=weight)

    model, optimizer, train_dataloader, scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, scheduler
    )
    model.to(device)
    loss_fn.to(device)  # moves the class-weight buffer next to the logits

    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        total_loss = 0
        for batch, labels in train_dataloader:
            batch = tokenizer(batch, truncation=True, padding=True, return_tensors="pt")
            batch = batch.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            logits = model(batch)
            loss = loss_fn(logits, labels)
            accelerator.backward(loss)
            total_loss += loss.item()
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch+1} / {epochs}:")
        print(f"  Training loss: {avg_train_loss:.4f}")

        # ---- validation pass ----
        model.eval()
        total_val_loss = 0
        total_val_accuracy = 0
        with torch.no_grad():
            for batch, labels in val_dataloader:
                batch = tokenizer(batch, truncation=True, padding=True, return_tensors="pt")
                batch = batch.to(device)
                labels = labels.to(device)

                logits = model(batch)
                loss = loss_fn(logits, labels)
                total_val_loss += loss.item()

                preds = torch.argmax(logits, dim=1)
                # Labels may be one-hot / probability rows of shape (B, C).
                # Reduce them to class indices first: comparing (B,) predictions
                # against (B, C) targets either fails to broadcast or counts
                # element-wise matches against 0/1 values.
                targets = labels.argmax(dim=1) if labels.dim() > 1 else labels
                total_val_accuracy += torch.sum(preds == targets).item()

        avg_val_loss = total_val_loss / len(val_dataloader)
        avg_val_accuracy = total_val_accuracy / len(val_dataloader.dataset)
        print(f"  Validation loss: {avg_val_loss:.4f}")
        print(f"  Validation accuracy: {avg_val_accuracy:.4f}")
# Batch loaders over the datasets built earlier (default, non-shuffled order).
train_dataloader = DataLoader(dataset=train_dataset, batch_size=5)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=16)

# Four output classes, one per subject.
model = BertClassifier(bert_name="distilbert-base-uncased", num_classes=4)

# Single-epoch run; arguments spelled out by name for readability.
train(
    model=model,
    tokenizer=tokenizer,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    epochs=1,
    lr=0.00001,
    warmup_steps=3,
    beta=0.9,
    num_per_class=numcls,
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'pre_classifier.

KeyboardInterrupt: 