In [1]:
# Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import numpy as np
import time
from sklearn.metrics import classification_report, accuracy_score
import os

In [2]:
# Constants
tEPOCHS = 3
EPOCHS = 10  # Set distillation epochs to 1 for demonstration purposes


# Load the dataset
data = pd.read_excel('/kaggle/input/ubmec-ii/UBMEC.xlsx')
#data = data.sample(frac=1, random_state=42).reset_index(drop=True)
#data = data.tail(1000)

# Split the data
X = data['text'].astype(str).tolist()
y = data['classes']
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Device configuration
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device}')

# Utility functions
def encode_texts(texts, tokenizer, max_len=512):
    return tokenizer(texts, padding=True, truncation=True, max_length=max_len, return_tensors='pt')

def calculate_accuracy(preds, labels):
    preds_argmax = torch.argmax(preds, dim=1)
    correct = (preds_argmax == labels).float()
    accuracy = correct.sum() / len(correct)
    return accuracy.item()

# Train Teacher Model (BERT)
#teacher_name = 'bert-base-multilingual-cased'
#tokenizer = BertTokenizer.from_pretrained(teacher_name)
#teacher_model = BertForSequenceClassification.from_pretrained(teacher_name, num_labels=6)

# Load model directly
#from transformers import AutoTokenizer, AutoModelForPreTraining

#model = AutoModelForPreTraining.from_pretrained("csebuetnlp/banglabert")

Using device: cuda


In [3]:
# Load Teacher Model (BERT)
teacher_name = 'csebuetnlp/banglabert'
tokenizer = AutoTokenizer.from_pretrained(teacher_name)
teacher_model = AutoModelForSequenceClassification.from_pretrained(teacher_name, num_labels=6)
teacher_model.to(device)

# Checkpoint path for the teacher model
checkpoint_path = 'best_teacher_model.pth'

""""
# Load best teacher model checkpoint if it exists
if os.path.exists(checkpoint_path):
    checkpoint = torch.load(checkpoint_path)
    teacher_model.load_state_dict(checkpoint['model_state_dict'])
    best_val_acc = checkpoint['val_acc']
    print(f"Loaded best teacher model checkpoint with val_acc: {best_val_acc*100:.4f}%")
"""
train_encodings = encode_texts(X_train, tokenizer)
test_encodings = encode_texts(X_test, tokenizer)

y_train = torch.tensor(y_train)
y_test = torch.tensor(y_test)

train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], y_train)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], y_test)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(teacher_model.parameters(), lr=1e-5)

# Training function for Teacher Model
def train(model, iterator, optimizer, criterion):
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    
    for input_ids, attention_mask, labels in iterator:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        
        loss = criterion(logits, labels)
        acc = calculate_accuracy(logits, labels)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

# Evaluation function for Teacher Model
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for input_ids, attention_mask, labels in iterator:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            
            loss = criterion(logits, labels)
            acc = calculate_accuracy(logits, labels)

            all_preds.extend(logits.argmax(dim=1).cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            
            epoch_loss += loss.item()
            epoch_acc += acc
    
    return epoch_loss / len(iterator), epoch_acc / len(iterator), all_preds, all_labels

# Initialize the best validation accuracy variables
best_val_acc = 0.0
best_val_acc_student = 0.0

# Early stopping parameters
patience = 3
patience_counter = 0

tokenizer_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/586 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/528k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at csebuetnlp/banglabert and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Training Teacher Model with Checkpointing and Early Stopping
for epoch in range(tEPOCHS):
    start_time = time.time()
    
    train_loss, train_acc = train(teacher_model, train_loader, optimizer, criterion)
    val_loss, val_acc, all_preds, all_labels = evaluate(teacher_model, test_loader, criterion)
    
    end_time = time.time()
    epoch_mins, epoch_secs = divmod(end_time - start_time, 60)
    
    print(f'Epoch: {epoch+1:02}/{tEPOCHS} | Epoch Time: {int(epoch_mins)}m {int(epoch_secs)}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.4f}%')
    print(f'\tVal Loss: {val_loss:.3f} | Val Acc: {val_acc*100:.4f}%')
    
    # Checkpointing for the best teacher model
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': teacher_model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_acc': val_acc,
            'val_loss': val_loss,
        }, checkpoint_path)
        print(f'Saved new best teacher model with val_acc: {val_acc*100:.4f}%')
        patience_counter = 0  # Reset patience counter
    else:
        patience_counter += 1

    # Early stopping
    if patience_counter >= patience:
        print("Early stopping triggered")
        break


Epoch: 01/3 | Epoch Time: 10m 20s
	Train Loss: 1.312 | Train Acc: 50.0279%
	Val Loss: 1.100 | Val Acc: 58.2217%
Saved new best teacher model with val_acc: 58.2217%
Epoch: 02/3 | Epoch Time: 10m 20s
	Train Loss: 0.953 | Train Acc: 64.8065%
	Val Loss: 1.052 | Val Acc: 60.7515%
Saved new best teacher model with val_acc: 60.7515%
Epoch: 03/3 | Epoch Time: 10m 21s
	Train Loss: 0.739 | Train Acc: 73.5863%
	Val Loss: 1.104 | Val Acc: 60.0446%


In [5]:
# Generate classification report for Teacher Model
class_names = ['joy', 'disgust', 'anger', 'sadness', 'surprise', 'fear']
teacher_model.load_state_dict(torch.load(checkpoint_path)['model_state_dict'])
_, _, all_preds_teacher, all_labels_teacher = evaluate(teacher_model, test_loader, criterion)
report_teacher = classification_report(all_labels_teacher, all_preds_teacher, target_names=class_names, digits=4)
print("Teacher Model Classification Report:\n", report_teacher)
# Generate classification report for Teacher Model

#report = classification_report(all_labels, all_preds, target_names=class_names, digits=4)
#print("Teacher Model Classification Report:\n", report)


Teacher Model Classification Report:
               precision    recall  f1-score   support

         joy     0.6193    0.5764    0.5970       491
     disgust     0.3920    0.4231    0.4069       416
       anger     0.8063    0.4691    0.5931       275
     sadness     0.8099    0.8027    0.8063       674
    surprise     0.4942    0.7061    0.5814       541
        fear     0.6740    0.4192    0.5169       291

    accuracy                         0.6075      2688
   macro avg     0.6326    0.5661    0.5836      2688
weighted avg     0.6318    0.6075    0.6079      2688



In [6]:
# Load the best teacher model for distillation
#teacher_model.load_state_dict(torch.load(checkpoint_path)['model_state_dict'])

# Define a custom distillation model
class DistillationModel(nn.Module):
    def __init__(self, student_model, teacher_model, temperature=2, alpha=0.5):
        super(DistillationModel, self).__init__()
        self.student_model = student_model
        self.teacher_model = teacher_model
        self.temperature = temperature
        self.alpha = alpha

    def forward(self, input_ids, attention_mask):
        # Compute student logits
        student_outputs = self.student_model(input_ids, attention_mask=attention_mask)
        student_logits = student_outputs.logits
        
        # Compute teacher logits (no gradients needed)
        with torch.no_grad():
            teacher_outputs = self.teacher_model(input_ids, attention_mask=attention_mask)
            teacher_logits = teacher_outputs.logits
        
        return student_logits, teacher_logits

# Training and evaluation functions for Student Model
def train_student(model, iterator, optimizer, criterion, temperature=2, alpha=0.5):
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    
    for input_ids, attention_mask, labels in iterator:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        
        student_logits, teacher_logits = model(input_ids, attention_mask=attention_mask)
        
        student_loss = criterion(student_logits, labels)
        distillation_loss = nn.KLDivLoss()(nn.functional.log_softmax(student_logits / temperature, dim=1),
                                           nn.functional.softmax(teacher_logits / temperature, dim=1)) * (temperature * temperature)
        loss = alpha * student_loss + (1 - alpha) * distillation_loss
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += calculate_accuracy(student_logits, labels)

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate_student(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    all_preds_student = []
    all_labels_student = []
    
    with torch.no_grad():
        for input_ids, attention_mask, labels in iterator:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            
            student_logits, _ = model(input_ids, attention_mask=attention_mask)
            
            loss = criterion(student_logits, labels)
            acc = calculate_accuracy(student_logits, labels)

            all_preds_student.extend(student_logits.argmax(dim=1).cpu().numpy())
            all_labels_student.extend(labels.cpu().numpy())
            
            epoch_loss += loss.item()
            epoch_acc += acc
    
    return epoch_loss / len(iterator), epoch_acc / len(iterator), all_preds_student, all_labels_student

# Load Student Model (DistilBERT with Sequence Classification Head)
student_name = 'h4g3n/distilbert-mini-multilingual-cased'
student_tokenizer = AutoTokenizer.from_pretrained(student_name)
student_model = AutoModelForSequenceClassification.from_pretrained(student_name, num_labels=6)

student_model.to(device)

distillation_model = DistillationModel(student_model, teacher_model)
distillation_model.to(device)

optimizer_student = optim.AdamW(distillation_model.student_model.parameters(), lr=5e-5)

# Checkpoint path for the student model
checkpoint_path_student = 'best_student_model.pth'


tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.92M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/208M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at h4g3n/distilbert-mini-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Training Student Model with Checkpointing and Early Stopping
for epoch in range(EPOCHS):
    start_time = time.time()
    
    train_loss, train_acc = train_student(distillation_model, train_loader, optimizer_student, criterion, temperature=2, alpha=0.5)
    val_loss, val_acc, all_preds_student, all_labels_student = evaluate_student(distillation_model, test_loader, criterion)
    
    end_time = time.time()
    epoch_mins, epoch_secs = divmod(end_time - start_time, 60)
    
    print(f'Epoch: {epoch+1:02}/{EPOCHS} | Epoch Time: {int(epoch_mins)}m {int(epoch_secs)}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.4f}%')
    print(f'\tVal Loss: {val_loss:.3f} | Val Acc: {val_acc*100:.4f}%')
    
    # Checkpointing for the best student model
    if val_acc > best_val_acc_student:
        best_val_acc_student = val_acc
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': distillation_model.student_model.state_dict(),
            'optimizer_state_dict': optimizer_student.state_dict(),
            'val_acc': val_acc,
            'val_loss': val_loss,
        }, checkpoint_path_student)
        print(f'Saved new best student model with val_acc: {val_acc*100:.4f}%')
        patience_counter = 0  # Reset patience counter
    else:
        patience_counter += 1

    # Early stopping
    if patience_counter >= patience:
        print("Early stopping triggered")
        break




Epoch: 01/10 | Epoch Time: 5m 5s
	Train Loss: 0.876 | Train Acc: 33.4542%
	Val Loss: 1.500 | Val Acc: 38.8021%
Saved new best student model with val_acc: 38.8021%
Epoch: 02/10 | Epoch Time: 5m 5s
	Train Loss: 0.685 | Train Acc: 50.1209%
	Val Loss: 1.332 | Val Acc: 49.1443%
Saved new best student model with val_acc: 49.1443%
Epoch: 03/10 | Epoch Time: 5m 5s
	Train Loss: 0.513 | Train Acc: 65.1600%
	Val Loss: 1.396 | Val Acc: 47.7679%
Epoch: 04/10 | Epoch Time: 5m 5s
	Train Loss: 0.389 | Train Acc: 75.9766%
	Val Loss: 1.490 | Val Acc: 48.3631%
Epoch: 05/10 | Epoch Time: 5m 5s
	Train Loss: 0.300 | Train Acc: 83.9844%
	Val Loss: 1.620 | Val Acc: 47.2470%
Early stopping triggered


In [8]:
# Load the best teacher and student model checkpoints for evaluation
teacher_model.load_state_dict(torch.load(checkpoint_path)['model_state_dict'])
distillation_model.student_model.load_state_dict(torch.load(checkpoint_path_student)['model_state_dict'])

# Generate evaluation reports for the best models
_, _, all_preds_teacher, all_labels_teacher = evaluate(teacher_model, test_loader, criterion)
_, _, all_preds_student, all_labels_student = evaluate_student(distillation_model, test_loader, criterion)

# Generate classification reports for both models
report_teacher = classification_report(all_labels_teacher, all_preds_teacher, target_names=class_names, digits=4)
report_student = classification_report(all_labels_student, all_preds_student, target_names=class_names, digits=4)

print("Teacher Model Classification Report:\n", report_teacher)
print("Distilled Student Model Classification Report:\n", report_student)

Teacher Model Classification Report:
               precision    recall  f1-score   support

         joy     0.6193    0.5764    0.5970       491
     disgust     0.3920    0.4231    0.4069       416
       anger     0.8063    0.4691    0.5931       275
     sadness     0.8099    0.8027    0.8063       674
    surprise     0.4942    0.7061    0.5814       541
        fear     0.6740    0.4192    0.5169       291

    accuracy                         0.6075      2688
   macro avg     0.6326    0.5661    0.5836      2688
weighted avg     0.6318    0.6075    0.6079      2688

Distilled Student Model Classification Report:
               precision    recall  f1-score   support

         joy     0.4451    0.6110    0.5150       491
     disgust     0.2786    0.1346    0.1815       416
       anger     0.5227    0.4182    0.4646       275
     sadness     0.6539    0.7596    0.7028       674
    surprise     0.3897    0.5194    0.4453       541
        fear     0.6404    0.1959    0.3000   

# Calculating size

In [9]:
import os

def get_model_size_and_params(model):
    param_count = sum(p.numel() for p in model.parameters())
    temp_model_path = "temp_model.pth"
    torch.save(model.state_dict(), temp_model_path)
    model_size = os.path.getsize(temp_model_path)
    os.remove(temp_model_path)
    return param_count, model_size

# Calculate size and param count for the distilled student model
#param_count_student, model_size_student = get_model_size_and_params(distillation_model.student_model)
#print(f"Distilled Student Model - Params: {param_count_student}, Size: {model_size_student / 1e6:.2f}MB")

In [10]:
# Save the distilled student model
#student_model_save_path = "distilled_student_model.pth"
#torch.save(distillation_model.student_model.state_dict(), student_model_save_path)

# Verify the saved model size
#model_size_student = os.path.getsize(student_model_save_path)
#print(f"Distilled Student Model Size: {model_size_student / 1e6:.2f} MB")


In [11]:
# Calculate size and param count before distillation for the student model
param_count_before_distillation, model_size_before_distillation = get_model_size_and_params(teacher_model)
print(f"Original Teacher Model - Params: {param_count_before_distillation}, Size: {model_size_before_distillation / 1e6:.2f}MB")

# Calculate size and param count after distillation for the student model
param_count_after_distillation, model_size_after_distillation = get_model_size_and_params(distillation_model.student_model)
print(f"Distilled Student Model - Params: {param_count_after_distillation}, Size: {model_size_after_distillation / 1e6:.2f}MB")


Original Teacher Model - Params: 110621958, Size: 442.57MB
Distilled Student Model - Params: 52164870, Size: 208.68MB
