In [23]:
# Imports for the whole notebook (assumes `transformers` is already installed; nothing is installed here)
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, classification_report

In [None]:
'''
The classifier is built on top of a pretrained BERT model, which produces a contextual representation of the input text.
On top of BERT's pooled output we add a dropout layer for regularization and a linear layer that maps it to class scores.
CaptionClassifier takes input IDs and attention masks and runs them through BERT followed by these extra layers.
It returns the raw class scores (logits).
'''
class CaptionClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        bert_model_name: Name or path handed to ``BertModel.from_pretrained``.
        num_labels: Number of output classes (size of the logits' last dim).
    """

    def __init__(self, bert_model_name, num_labels=2):
        super().__init__()
        # Attribute names are part of the saved state_dict keys; keep them stable.
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class scores for a batch of token ids and masks."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Classify from BERT's pooled output, regularized with dropout.
        return self.fc(self.dropout(encoder_out.pooler_output))


In [None]:
"""
    Dataset for the caption classifier.
    Each sample is a tuple: (original_caption, generated_caption, occlusion_level, label).
    The input text is formed as: <original_caption> <sep> <generated_caption> <sep> <occlusion_level>, where <sep> is the tokenizer's separator token.
"""
class CaptionClassifierDataset(Dataset):
    """Tokenized (original caption, generated caption, occlusion level) samples.

    Args:
        data: Indexable collection of
            ``(original_caption, generated_caption, occlusion_level, label)`` tuples.
        tokenizer: Tokenizer exposing ``sep_token`` and ``encode_plus``.
        max_length: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` tensors for sample ``idx``."""
        orig_cap, gen_cap, occlusion_level, label = self.data[idx]
        # Use the tokenizer owned by this dataset, not a notebook-level global,
        # so the dataset works regardless of cell execution order.
        sep = self.tokenizer.sep_token
        input_text = f"{orig_cap} {sep} {gen_cap} {sep} {occlusion_level}"
        encoding = self.tokenizer.encode_plus(
            input_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        # Drop only the leading batch dim added by return_tensors='pt'; a bare
        # squeeze() would also collapse the sequence dim when max_length == 1.
        input_ids = encoding['input_ids'].squeeze(0)  # shape: [max_length]
        attention_mask = encoding['attention_mask'].squeeze(0)

        return input_ids, attention_mask, torch.tensor(label, dtype=torch.long)


In [None]:
'''
This code loads the two per-model CSV files, labels them, and combines them into a single DataFrame.
The rows are then split by image_id into train, validation and test sets (70% / 10% / 20%).
'''

df_smol = pd.read_csv("/kaggle/input/partc-bert/occlusion_details_SmolVLM.csv")
df_custom = pd.read_csv("/kaggle/input/partc-bert/occlusion_details_custom.csv")

# Add label: 0 for SmolVLM, 1 for Custom model
df_smol["label"] = 0
df_custom["label"] = 1
# The row index is used as the image id, so row i of both files ends up in the
# same split. NOTE(review): this assumes both CSVs list the same samples in
# the same row order — confirm against the files.
df_smol["image_id"] = df_smol.index
df_custom["image_id"] = df_custom.index
df_combined = pd.concat([df_smol, df_custom], ignore_index=True)

# Shuffle with a dedicated seeded generator so the train/val/test membership
# is identical on every Restart & Run All.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)
unique_ids = split_rng.permutation(df_smol["image_id"].unique())

# 70% train / 10% validation / 20% test, split on ids rather than rows.
n = len(unique_ids)
train_ids = unique_ids[:int(0.7 * n)]
val_ids = unique_ids[int(0.7 * n):int(0.8 * n)]
test_ids = unique_ids[int(0.8 * n):]

def get_split(df, ids):
    """Return the rows of ``df`` whose ``image_id`` value is contained in ``ids``."""
    in_split = df["image_id"].isin(ids)
    return df.loc[in_split]

# Materialize the three splits from the combined frame, one per id set.
df_train, df_val, df_test = (
    get_split(df_combined, split_ids)
    for split_ids in (train_ids, val_ids, test_ids)
)

# Report split sizes as a quick sanity check.
print(f"Train: {len(df_train)} rows, Val: {len(df_val)} rows, Test: {len(df_test)} rows")
def create_data_list(df):
    """Convert a split DataFrame into a list of sample tuples.

    Each tuple is ``(original_caption, generated_caption, occlusion_level, label)``,
    taken from the columns of the same names, in row order.
    """
    fields = ("original_caption", "generated_caption", "occlusion_level", "label")
    # Values are passed through as read from each row; no type conversion is applied.
    return [tuple(record[name] for name in fields) for _, record in df.iterrows()]

# Turn each split frame into the list-of-tuples form the dataset class consumes.
train_data, val_data, test_data = map(create_data_list, (df_train, df_val, df_test))



Train: 5196 rows, Val: 742 rows, Test: 1486 rows


In [None]:
'''
train_classifier() takes the model, data loader, optimizer, loss criterion, device, and number of epochs.
For each epoch it puts the model into training mode and runs through every batch from the data loader.
For each batch, it moves the input IDs, attention masks, and labels to the device, clears the optimizer's gradients, feeds the inputs to the model, and backpropagates the loss before stepping the optimizer.
'''
def train_classifier(model, dataloader, optimizer, criterion, device, epochs, scheduler=None):
    """Train ``model`` in place for ``epochs`` passes over ``dataloader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` returning logits.
        dataloader: Sized iterable of ``(input_ids, attention_mask, labels)`` batches.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the model and every batch are moved to.
        epochs: Number of passes over ``dataloader``.
        scheduler: Optional learning-rate scheduler; when given, ``step()`` is
            called once after every optimizer step. Defaults to ``None``
            (learning rate is left untouched).

    Prints the loss every 10th batch and on the last batch of an epoch, plus
    the average loss per epoch. Returns ``None``.
    """
    model.to(device)
    num_batches = len(dataloader)

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        print(f"\n--- Epoch {epoch+1}/{epochs} ---")

        for batch_idx, (input_ids, attention_mask, labels) in enumerate(dataloader):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
            # Advance the LR schedule per batch, after the optimizer update.
            if scheduler is not None:
                scheduler.step()

            batch_loss = loss.item()
            total_loss += batch_loss
            if (batch_idx + 1) % 10 == 0 or (batch_idx + 1) == num_batches:
                print(f"  Batch {batch_idx+1}/{num_batches} - Loss: {batch_loss:.4f}")

        # An empty dataloader has no meaningful average; report NaN instead of
        # raising ZeroDivisionError.
        avg_loss = total_loss / num_batches if num_batches else float('nan')
        print(f"Epoch [{epoch+1}/{epochs}] Average Loss: {avg_loss:.4f}")



def evaluate_classifier(model, dataloader, device):
    """Score ``model`` on every batch of ``dataloader`` without tracking gradients.

    Returns:
        A tuple ``(accuracy, report)`` produced by ``accuracy_score`` and
        ``classification_report`` over all collected labels and predictions.
    """
    model.to(device)
    model.eval()

    y_pred, y_true = [], []

    with torch.no_grad():
        for input_ids, attention_mask, labels in dataloader:
            logits = model(input_ids.to(device), attention_mask.to(device))
            # Predicted class = index of the largest logit in each row.
            y_pred += torch.max(logits, dim=1).indices.cpu().tolist()
            y_true += labels.tolist()

    return accuracy_score(y_true, y_pred), classification_report(y_true, y_pred)


In [28]:
# Run configuration shared by the cells below.
bert_model_name = 'bert-base-uncased'  # pretrained BERT checkpoint name
num_classes = 2  # number of target classes (labels 0 and 1)
num_epochs = 3  # number of training epochs
learning_rate = 2e-5  # learning rate given to the optimizer

In [29]:
# Seed every RNG in play so weight init, dropout and batch shuffling are
# repeatable across Restart & Run All.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Set device and initialize the tokenizer for BERT
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Load the tokenizer for the same checkpoint the model uses.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

max_length = 128
batch_size = 32

# Create dataset objects for train, validation, and test splits
train_dataset = CaptionClassifierDataset(train_data, tokenizer, max_length=max_length)
val_dataset = CaptionClassifierDataset(val_data, tokenizer, max_length=max_length)
test_dataset = CaptionClassifierDataset(test_data, tokenizer, max_length=max_length)


# Create DataLoaders (only the training data is shuffled)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Initialize the classifier, optimizer, and loss function
model = CaptionClassifier(bert_model_name, num_classes).to(device)
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_loader) * num_epochs
# NOTE(review): the scheduler is constructed here but is not passed to
# train_classifier in the call below, so it has no effect on the learning
# rate unless the training loop is made to step it.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
criterion = nn.CrossEntropyLoss()

# Resume from previously saved weights when present. Training below still runs
# either way.
# NOTE(review): this absolute path and the relative save path further down
# refer to the same file only when the working directory is /kaggle/working.
weights_path = "/kaggle/working/caption_classifier.pt"
if os.path.exists(weights_path):
    model.load_state_dict(torch.load(weights_path, map_location=device))
    print(f"Loaded model weights from {weights_path}")
else:
    print("No saved weights found. Training from scratch.")


# Train the classifier; reuse num_epochs so the epoch count always matches the
# total_steps the scheduler was sized with.
epochs = num_epochs
train_classifier(model, train_loader, optimizer, criterion, device, epochs)
model_save_path = "caption_classifier.pt"
torch.save(model.state_dict(), model_save_path)
print(f"Classifier saved to {model_save_path}")




No saved weights found. Training from scratch.

--- Epoch 1/3 ---
  Batch 10/163 - Loss: 0.5576
  Batch 20/163 - Loss: 0.3615
  Batch 30/163 - Loss: 0.2858
  Batch 40/163 - Loss: 0.2082
  Batch 50/163 - Loss: 0.1521
  Batch 60/163 - Loss: 0.1077
  Batch 70/163 - Loss: 0.1974
  Batch 80/163 - Loss: 0.0171
  Batch 90/163 - Loss: 0.0315
  Batch 100/163 - Loss: 0.0486
  Batch 110/163 - Loss: 0.0218
  Batch 120/163 - Loss: 0.0044
  Batch 130/163 - Loss: 0.0168
  Batch 140/163 - Loss: 0.0061
  Batch 150/163 - Loss: 0.0863
  Batch 160/163 - Loss: 0.0439
  Batch 163/163 - Loss: 0.0664
Epoch [1/3] Average Loss: 0.1294

--- Epoch 2/3 ---
  Batch 10/163 - Loss: 0.0667
  Batch 20/163 - Loss: 0.0061
  Batch 30/163 - Loss: 0.0453
  Batch 40/163 - Loss: 0.0404
  Batch 50/163 - Loss: 0.0439
  Batch 60/163 - Loss: 0.0021
  Batch 70/163 - Loss: 0.1007
  Batch 80/163 - Loss: 0.0018
  Batch 90/163 - Loss: 0.0465
  Batch 100/163 - Loss: 0.0037
  Batch 110/163 - Loss: 0.0014
  Batch 120/163 - Loss: 0.0025
 

In [30]:
# Evaluate on validation and test sets
# evaluate_classifier returns an (accuracy, classification report) pair for the given loader.
val_accuracy, val_report = evaluate_classifier(model, val_loader, device)
print(f"Validation Accuracy: {val_accuracy:.4f}")  # val_report is kept but not printed

test_accuracy, test_report = evaluate_classifier(model, test_loader, device)
print(f"Test Accuracy: {test_accuracy:.4f}")
print(test_report)  # full report for the test split

Validation Accuracy: 0.9798
Test Accuracy: 0.9838
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       743
           1       0.99      0.98      0.98       743

    accuracy                           0.98      1486
   macro avg       0.98      0.98      0.98      1486
weighted avg       0.98      0.98      0.98      1486

