In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import wandb
from tqdm import tqdm
import matplotlib.pyplot as plt
from pathlib import Path
import kagglehub

import warnings
warnings.filterwarnings("ignore")

import string # for milestone 1
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_recall_curve, average_precision_score

from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader, Dataset as TorchDataset
from transformers import (
        AutoTokenizer, 
        AutoModelForSequenceClassification,
        AutoConfig
    )

from torch.optim import AdamW # Import AdamW from torch
from torch.optim.lr_scheduler import StepLR # Import StepLR

from kaggle_secrets import UserSecretsClient # For secure API key access

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print every file it contains.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Quieten library logging: disable the Hugging Face tokenizer parallelism
# warning and lower TensorFlow's C++ log verbosity.
os.environ.update({
    "TOKENIZERS_PARALLELISM": "false",
    'TF_CPP_MIN_LOG_LEVEL': '2',
})

# Use the "spawn" multiprocessing start method, which can prevent deadlocks
# in notebook environments. Failure to set it is reported but not fatal.
try:
    torch.multiprocessing.set_start_method("spawn", force=True)
except RuntimeError as e:
    print(f"Note: Could not set start method: {e}")
else:
    print("\nSet torch multiprocessing start method to 'spawn'.")

/kaggle/input/2025-sep-dl-gen-ai-project/sample_submission.csv
/kaggle/input/2025-sep-dl-gen-ai-project/train.csv
/kaggle/input/2025-sep-dl-gen-ai-project/test.csv
/kaggle/input/bert-emotion-classifier/pytorch/v5/5/spm.model
/kaggle/input/bert-emotion-classifier/pytorch/v5/5/config.json
/kaggle/input/bert-emotion-classifier/pytorch/v5/5/tokenizer.json
/kaggle/input/bert-emotion-classifier/pytorch/v5/5/tokenizer_config.json
/kaggle/input/bert-emotion-classifier/pytorch/v5/5/model.safetensors
/kaggle/input/bert-emotion-classifier/pytorch/v5/5/special_tokens_map.json
/kaggle/input/bert-emotion-classifier/pytorch/v5/5/added_tokens.json

Set torch multiprocessing start method to 'spawn'.


# 1. Configuration & Globals

In [2]:
# Run-mode switch:
#   True  -> first run: train the model and upload it
#   False -> submission run: skip training and only do inference
DO_TRAIN_AND_UPLOAD = False

# When True, train on 100% of the data (no validation split is created).
USE_FULL_DATA = False

# Per-label decision thresholds found in previous tuning runs. They are used
# when USE_FULL_DATA=True or during inference, and are applied by index in the
# order of `emotion_labels`.
# Earlier candidates, kept for reference:
#   [0.4755, 0.1980, 0.6224, 0.2633, 0.1163]
#   [0.8, 0.25, 0.77, 0.59, 0.86]
MANUAL_THRESHOLDS = [
    0.79,
    0.48000000000000004,
    0.62,
    0.75,
    0.92,
]

# When True, inference ensembles several models instead of using a single one.
ENSEMBLE_MODELS = False

In [3]:
# Central hyper-parameter / path configuration read by the cells below.
config = dict(
    # Model checkpoint and data locations
    MODEL_NAME="yangheng/deberta-v3-base-absa-v1.1",
    TRAIN_FILE="/kaggle/input/2025-sep-dl-gen-ai-project/train.csv",
    TEST_FILE="/kaggle/input/2025-sep-dl-gen-ai-project/test.csv",
    # Data split and reproducibility
    VALIDATION_SPLIT_SIZE=0.1,
    RANDOM_SEED=42,
    # Optimisation settings
    EPOCHS=25,
    STARTING_LR=5e-5,
    LR_BASE_FACTOR=0.95,  # for discriminative learning rate
    TRAIN_BATCH_SIZE=16,
    EVAL_BATCH_SIZE=32,
    # Model / tokenisation settings
    CLASSIFIER_DROPOUT=0.1,
    MAX_TOKEN_LENGTH=128,
)

In [4]:
# Target emotions for the multi-label task; list order defines the label index.
emotion_labels = ['anger', 'fear', 'joy', 'sadness', 'surprise']
num_labels = len(emotion_labels)

# Index <-> name lookup tables derived from the list order.
id2label = dict(enumerate(emotion_labels))
label2id = {name: idx for idx, name in id2label.items()}

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

Using device: cuda


# 2. Data Loading & Preparation

In [5]:
# Read the labelled training data and the unlabelled test data.
all_train_df = pd.read_csv(config["TRAIN_FILE"])
test_df = pd.read_csv(config["TEST_FILE"])

if not USE_FULL_DATA:
    # Hold out a validation split, sized and seeded from the config.
    df_train, df_val = train_test_split(
        all_train_df,
        random_state=config["RANDOM_SEED"],
        test_size=config["VALIDATION_SPLIT_SIZE"],
    )
else:
    # Train on everything; the validation frame is left empty.
    df_train, df_val = all_train_df, pd.DataFrame()

print(f"Training split shape: {df_train.shape}")
print(f"Validation split shape: {df_val.shape}")

Training split shape: (6144, 8)
Validation split shape: (683, 8)


In [6]:
# Per-label positive-class weights (negatives / positives), computed on the
# frame we actually train on. A label with no positive examples gets 1.0.
total_train_samples = len(df_train)
label_pos_counts = [df_train[label].sum() for label in emotion_labels]

pos_weights_list = [
    (total_train_samples - pos_count) / pos_count if pos_count > 0 else 1.0
    for pos_count in label_pos_counts
]

pos_weight_tensor = torch.tensor(pos_weights_list, dtype=torch.float).to(device)

print(f"pos_weight vector: {pos_weights_list}")

pos_weight vector: [7.43956043956044, 0.7716262975778547, 3.1042084168336674, 2.133095359510454, 2.417130144605117]


In [7]:
# Loading tokenizer
# Load the tokenizer that matches the checkpoint named in config["MODEL_NAME"].
# NOTE: `tokenizer` is read as a module-level global by preprocess_function()
# and collate_fn_test(), and the inference cells rebind it when they load a
# saved model, so whichever tokenizer is bound at batch time is the one used.
tokenizer = AutoTokenizer.from_pretrained(config["MODEL_NAME"])

tokenizer_config.json:   0%|          | 0.00/372 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

In [8]:
def preprocess_function(batch_texts, batch_labels):
    """Tokenize a batch of texts and package it with its labels.

    Args:
        batch_texts: list of raw strings to tokenize.
        batch_labels: one label row per text (lists or NumPy arrays).

    Returns:
        dict with "input_ids" and "attention_mask" tensors from the global
        `tokenizer` (truncated / padded to config["MAX_TOKEN_LENGTH"]) and
        "labels" as a float tensor.
    """
    tokenized_inputs = tokenizer(
        batch_texts,
        truncation=True,
        padding="max_length",
        max_length=config["MAX_TOKEN_LENGTH"],
        return_tensors="pt"
    )
    # Stack the rows into one ndarray first: building a tensor directly from a
    # list of NumPy arrays is a known slow path in PyTorch (it emits a
    # UserWarning, which this notebook silences globally).
    labels_tensor = torch.tensor(np.asarray(batch_labels), dtype=torch.float)
    return {
        "input_ids": tokenized_inputs["input_ids"],
        "attention_mask": tokenized_inputs["attention_mask"],
        "labels": labels_tensor
    }

In [9]:
class EmotionDataset(TorchDataset):
    """Dataset of raw texts, paired with label rows unless it is a test set.

    Items are `(text, labels)` tuples. For a test dataset the labels element
    is an empty list, and no `labels` attribute is stored on the instance.
    """

    def __init__(self, df, is_test=False):
        self.is_test = is_test
        self.texts = df['text'].tolist()
        if self.is_test:
            # Test frames carry no label columns.
            return
        self.labels = df[emotion_labels].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        if not self.is_test:
            return text, self.labels[idx]
        # Empty placeholder keeps the (text, labels) tuple shape for test items.
        return text, []

In [10]:
def collate_fn(batch):
    """Turn a list of (text, labels) items into one tokenized model batch."""
    # Transpose the batch: one sequence of texts, one sequence of label rows.
    text_column, label_column = map(list, zip(*batch))
    return preprocess_function(text_column, label_column)

def collate_fn_test(batch):
    """Tokenize a batch of test items; the label placeholders are dropped."""
    text_column, _unused_labels = zip(*batch)
    encoded = tokenizer(
        list(text_column),
        max_length=config["MAX_TOKEN_LENGTH"],
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    # Only the two tensors the model call needs are passed on.
    return {key: encoded[key] for key in ("input_ids", "attention_mask")}

In [11]:
# Build the DataLoaders. num_workers=0 everywhere to prevent notebook deadlocks.

# Training loader: shuffled every epoch.
train_dataset = EmotionDataset(df_train)
train_loader = DataLoader(
    train_dataset,
    collate_fn=collate_fn,
    batch_size=config["TRAIN_BATCH_SIZE"],
    num_workers=0,
    shuffle=True,
)

# Validation loader: exists only when a validation split was held out.
if USE_FULL_DATA:
    val_loader = None
else:
    val_dataset = EmotionDataset(df_val)
    val_loader = DataLoader(
        val_dataset,
        collate_fn=collate_fn,
        batch_size=config["EVAL_BATCH_SIZE"],
        num_workers=0,
        shuffle=False,
    )

# Test loader: always created; uses the label-free collate function.
test_dataset_obj = EmotionDataset(test_df, is_test=True)
test_loader = DataLoader(
    test_dataset_obj,
    collate_fn=collate_fn_test,
    batch_size=config["EVAL_BATCH_SIZE"],
    num_workers=0,
    shuffle=False,
)

steps_per_epoch = len(train_loader)
print(f"Steps per epoch: {steps_per_epoch}")

Steps per epoch: 384


# 3. Training, WandB Logging & Upload

In [12]:
# Training cell: fine-tunes the model, optionally tunes per-label thresholds on
# the validation split, and saves the resulting checkpoint to ./final_model.
# Runs only when DO_TRAIN_AND_UPLOAD is True.
if DO_TRAIN_AND_UPLOAD:
    # 1. WandB Init 
    os.environ["WANDB_SILENT"] = "true"
    # Bind `run` before the try block so the `run is not None` checks below
    # work (instead of raising NameError) when W&B login/init fails, and so a
    # stale run object from an earlier execution is never reused.
    run = None
    try:
        user_secrets = UserSecretsClient()
        api_key = user_secrets.get_secret("wandb_api")
        wandb.login(key=api_key)
        print("W&B login successful.")
        run = wandb.init(
            project="multi-label-emotion-bert",
            job_type="train-upload",
            config=config,
            name=f"{config['MODEL_NAME']}-final-{wandb.util.generate_id()}"
        )
        wandb.config.update({"pos_weights": pos_weights_list})
    except Exception as e:
        # Logging is best-effort: training continues without W&B.
        print(f"W&B init failed: {e}")

    print("\n--- Starting Model Training ---")

    # 2. Model & Optimizer Setup
    model = AutoModelForSequenceClassification.from_pretrained(
        config["MODEL_NAME"],
        num_labels=num_labels,  # This forces it to shape (batch_Size, 5)
        problem_type="multi_label_classification",
        ignore_mismatched_sizes=True 
    ).to(device)

    # Discriminative LR (DLR) Implementation
    base_lr = config["STARTING_LR"] * config["LR_BASE_FACTOR"]
    head_lr = config["STARTING_LR"]                             
    
    optimizer_grouped_parameters = [
        {
            "params": model.deberta.parameters(), # deBERTa encoder parameters
            "lr": base_lr,
            "weight_decay": 0.01
        },
        {
            "params": model.classifier.parameters(), # Classification head parameters
            "lr": head_lr,
            "weight_decay": 0.0
        }
    ]
    
    optimizer = AdamW(optimizer_grouped_parameters, lr=head_lr)

    print("Using StepLR scheduler")
    scheduler = StepLR(
        optimizer,
        step_size=2, # Decay every 2 epochs
        gamma=0.5    # Halve the learning rate
    )

    loss_fct = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight_tensor)

    # Best validation score seen so far and a snapshot of the weights that
    # produced it. The snapshot stays None when there is no validation split,
    # in which case the final-epoch weights are the ones saved.
    best_val_f1 = -1.0
    best_model_state = None
    global_step = 0

    # --- 3. Training Loop ---
    for epoch in range(config["EPOCHS"]):
        print(f"\n--- Starting Epoch {epoch+1}/{config['EPOCHS']} ---")
        model.train() 
        
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}", disable=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = loss_fct(logits, labels)
            
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            total_loss += loss.item()
            
            # WandB Log Step
            if run is not None:
                wandb.log({
                    "train/step_loss": loss.item(),
                    "train/lr_head": optimizer.param_groups[1]['lr'],
                    "train/lr_base": optimizer.param_groups[0]['lr'],
                    "global_step": global_step
                })
            global_step += 1
            
        avg_train_loss = total_loss / len(train_loader)
        print(f"  Average Training Loss: {avg_train_loss:.4f}")
            
        scheduler.step()
        print(f"  End of Epoch {epoch+1}. LR Head: {optimizer.param_groups[1]['lr']:.1e}, LR Base: {optimizer.param_groups[0]['lr']:.1e}")
            
        # Run evaluation (ONLY if not using full data)
        if not USE_FULL_DATA:
            print(f"  Running evaluation for Epoch {epoch+1}...")
            model.eval()
            all_preds = []
            all_labels = []
            with torch.no_grad():
                for batch in val_loader:
                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    labels = batch['labels'].to(device)
                    outputs = model(input_ids, attention_mask=attention_mask)
                    sigmoid = torch.sigmoid(outputs.logits)
                    predictions = (sigmoid > 0.5).int()
                    all_preds.append(predictions.cpu())
                    all_labels.append(labels.cpu())
                    
            all_preds = torch.cat(all_preds, dim=0).numpy()
            all_labels = torch.cat(all_labels, dim=0).numpy()
            macro_f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0)
            print(f"  Epoch {epoch+1} - Validation Macro F1 (0.5 thresh): {macro_f1:.4f}")

            # Keep the weights of the best-scoring epoch. state_dict() returns
            # references to the live parameters (and dict.copy() is shallow),
            # so each tensor is cloned to the CPU to freeze the snapshot.
            if macro_f1 > best_val_f1:
                best_val_f1 = macro_f1
                best_model_state = {
                    name: tensor.detach().cpu().clone()
                    for name, tensor in model.state_dict().items()
                }
                print(f"  New best validation Macro F1: {best_val_f1:.4f} (epoch {epoch+1})")
        
    print("\n--- Training Finished ---")

    # Restore the best-scoring weights BEFORE tuning thresholds, so the
    # thresholds are computed for the same weights that get saved below.
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
        print(f"Restored best model state (Validation Macro F1: {best_val_f1:.4f}).")

    if not USE_FULL_DATA :
        print("\n--- Calculating Optimal Thresholds on Validation Set ---")
        model.eval()
        val_preds_list = []
        val_labels_list = []
        
        # Get raw probabilities for validation set
        with torch.no_grad():
            for batch in tqdm(val_loader, desc="Validating"):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                
                outputs = model(input_ids, attention_mask=attention_mask)
                sigmoid = torch.sigmoid(outputs.logits)
                
                val_preds_list.append(sigmoid.cpu().numpy())
                val_labels_list.append(labels.cpu().numpy())
                
        val_preds_arr = np.vstack(val_preds_list)
        val_labels_arr = np.vstack(val_labels_list)
    
        # Find best threshold for EACH of the 5 labels
        optimal_thresholds = []
        print("\nOptimization Results:")
        for i, label in enumerate(emotion_labels):
            best_f1 = 0
            best_thresh = 0.5
            
            # Sweep thresholds from 0.01 to 0.99
            for thresh in np.arange(0.01, 1.0, 0.01):
                pred_binary = (val_preds_arr[:, i] > thresh).astype(int)
                # zero_division=0 matches the per-epoch metric above: a
                # threshold that predicts no positives simply scores 0.
                score = f1_score(val_labels_arr[:, i], pred_binary, zero_division=0)
                
                if score > best_f1:
                    best_f1 = score
                    best_thresh = thresh
                    
            optimal_thresholds.append(best_thresh)
            print(f"  {label.ljust(10)}: Best Threshold={best_thresh:.3f}, F1-Score={best_f1:.4f}")
    
        print(f"\n>>> FINAL OPTIMAL THRESHOLDS: {optimal_thresholds}")
    
    # 4. Save
    # The model now holds the best validation weights, or the final-epoch
    # weights when training on the full data.
    save_path = "./final_model"
    print(f"Saving model locally to {save_path}...")
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    

    # 5. WandB Finish
    if run is not None:
        wandb.finish()
        print("WandB run finished.")

else:
    print("SKIPPING TRAINING CELL")

SKIPPING TRAINING CELL


In [13]:
# Publish the locally saved checkpoint directory (`save_path`, set by the
# training cell) as a Kaggle model. Runs only on training runs.
if DO_TRAIN_AND_UPLOAD :
    KAGGLE_USERNAME = 'gaurangnigam'
    MODEL_SLUG = 'bert-emotion-classifier'
    # Handle layout: <owner>/<model slug>/<framework>/<variation>
    handle = "/".join((KAGGLE_USERNAME, MODEL_SLUG, 'pytorch', 'v5'))
    print(f"Uploading to {handle}...")
    kagglehub.model_upload(
        handle,
        save_path,
        version_notes='Using yangheng/deberta-v3-base-absa-v1.1',
    )
    print("Model saved. Ready for upload.")

# 4. Inference & Submission

In [14]:
# Ensemble inference: a weighted sum of per-model sigmoid probabilities,
# thresholded per label with MANUAL_THRESHOLDS and written to submission.csv.
# Runs only when ENSEMBLE_MODELS is True.
if ENSEMBLE_MODELS :
    print("\n--- Starting Ensemble Inference ---")

    # The best score came from using a high weight for DeBERTa (0.9) vs BERT (0.1) 
    # Checkpoint directories of the ensemble members. These were previously
    # only present as commented-out lines, so the configs below raised
    # NameError. Each model version must be attached as a notebook input.
    MODEL_PATH_0 = "/kaggle/input/bert-emotion-classifier/pytorch/v4/3"
    MODEL_PATH_1 = "/kaggle/input/bert-emotion-classifier/pytorch/v4/5"
    MODEL_PATH_2 = "/kaggle/input/bert-emotion-classifier/pytorch/v3/1"

    ENSEMBLE_CONFIGS = [
        {"name": "DeBERTa", "path": MODEL_PATH_0, "weight": 0.8, "model_name_id": "microsoft/deberta-v3-base"},
        {"name": "DeBERTa", "path": MODEL_PATH_1, "weight": 0.1, "model_name_id": "microsoft/deberta-v3-base"},
        {"name": "BERT", "path": MODEL_PATH_2, "weight": 0.1, "model_name_id": "bert-base-uncased"},
    ]

    # Fail fast, before any model is loaded: with unusable thresholds the
    # submission would otherwise silently be written as all zeros.
    if not MANUAL_THRESHOLDS or len(MANUAL_THRESHOLDS) != num_labels:
        raise ValueError(
            f"MANUAL_THRESHOLDS must contain exactly {num_labels} values, got: {MANUAL_THRESHOLDS!r}"
        )

    # Total predictions initialization
    ensemble_probs = np.zeros((len(test_df), num_labels), dtype=np.float32)

    # collate_fn_test reads the module-level `tokenizer`, so rebinding that
    # name inside the loop is what makes test_loader tokenize with each
    # model's own tokenizer. Remember the original binding and restore it
    # afterwards instead of deleting the global.
    base_tokenizer = tokenizer

    # --- 2. Sequential Prediction Loop ---
    try:
        for model_info in ENSEMBLE_CONFIGS:
            model_name = model_info['name']
            model_path = model_info['path']
            weight = model_info['weight']
            
            print(f"\n-> Loading and predicting with {model_name} (Weight: {weight:.2f}) from: {model_path}")
            
            # Load the appropriate model class based on the path/config
            model_instance = AutoModelForSequenceClassification.from_pretrained(model_path)
            tokenizer = AutoTokenizer.from_pretrained(model_path)
            model_instance.to(device)
            model_instance.eval()

            # Predict
            model_probs = []
            with torch.no_grad():
                for batch in tqdm(test_loader, desc=f"Predicting ({model_name})"):
                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    outputs = model_instance(input_ids, attention_mask=attention_mask)
                    sigmoid = torch.sigmoid(outputs.logits)
                    model_probs.append(sigmoid.cpu())
            
            current_probs = torch.cat(model_probs, dim=0).numpy()
            
            # Accumulate Ensemble Probabilities (Weighted Addition)
            ensemble_probs += current_probs * weight

            # Clean up: free the model before the next one is loaded
            del model_instance
            torch.cuda.empty_cache()
            print(f"Cleanup complete for {model_name}.")
    finally:
        # Leave the global tokenizer as it was before this cell ran.
        tokenizer = base_tokenizer

    # 3. Apply Manual Thresholds
    final_preds = np.zeros(ensemble_probs.shape, dtype=int)

    print("\n-> Applying MANUAL thresholds to Ensemble Predictions...")
    for i, thresh in enumerate(MANUAL_THRESHOLDS):
        final_preds[:, i] = (ensemble_probs[:, i] > thresh).astype(int)
        print(f"  {emotion_labels[i]}: {thresh}")

    # 4. Save Submission
    submission_df = pd.DataFrame(final_preds, columns=emotion_labels)
    submission_df.insert(0, 'id', test_df['id'])
    submission_df.to_csv("submission.csv", index=False)
    print("\nSubmission file created: submission.csv")

In [15]:
# Single-model inference: load the saved checkpoint, predict probabilities for
# the test set, apply per-label thresholds and write submission.csv.
# Runs only when ENSEMBLE_MODELS is False.
if not ENSEMBLE_MODELS :
    print("\n--- Starting Inference ---")
    
    # 1. Load Model
    # bert-emotion-classifier
    MODEL_PATH_FOR_INFERENCE = "/kaggle/input/bert-emotion-classifier/pytorch/v5/5"
    # MODEL_PATH_FOR_INFERENCE = "./final_model" 
    
    print(f"Loading model from {MODEL_PATH_FOR_INFERENCE}...")
    
    try:
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH_FOR_INFERENCE)
        # Rebinding the global `tokenizer` makes collate_fn_test (and thus
        # test_loader) tokenize with the tokenizer saved next to the model.
        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH_FOR_INFERENCE)
        model.to(device)
        model.eval()
        print("Model loaded successfully.")
    except Exception as e:
        print(f"Error loading model: {e}.")
        # Re-raise: continuing would either hit a NameError on `model` or,
        # worse, silently predict with a stale model left in the kernel.
        raise
    
    # 2. Get Thresholds
    # Prefer thresholds tuned by the training cell in this session; otherwise
    # fall back to the manually configured ones.
    try:
        final_thresholds = optimal_thresholds 
        print(f"Using calculated optimal thresholds: {final_thresholds}")
    except NameError:
        print("Warning: 'optimal_thresholds' not found. Using MANUAL_THRESHOLDS.")
        final_thresholds = MANUAL_THRESHOLDS
    
    
    # 3. Get Test Predictions
    print("Predicting on test set")
    test_probs = []
    with torch.no_grad():
        for batch in tqdm(test_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            sigmoid = torch.sigmoid(outputs.logits)
            test_probs.append(sigmoid.cpu())
    
    test_probs = torch.cat(test_probs, dim=0).numpy()
    final_preds = np.zeros(test_probs.shape, dtype=int)
    
    # 4. Apply Thresholds
    for i in range(num_labels):
        # Use the specific threshold for this emotion
        thresh = final_thresholds[i]
        final_preds[:, i] = (test_probs[:, i] > thresh).astype(int)
    
    # 5. Save Submission
    submission_df = pd.DataFrame(final_preds, columns=emotion_labels)
    submission_df.insert(0, 'id', test_df['id'])
    
    submission_df.to_csv("submission.csv", index=False)
    
    print("Submission file created: submission.csv")
    
    # A bare expression inside an `if` body is never displayed by the
    # notebook, so print the preview explicitly.
    print(submission_df.head())


--- Starting Inference ---
Loading model from /kaggle/input/bert-emotion-classifier/pytorch/v5/5...


2025-11-21 17:12:16.432213: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763745136.619374      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763745136.674002      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Model loaded successfully.
Predicting on test set


100%|██████████| 54/54 [00:12<00:00,  4.19it/s]

Submission file created: submission.csv



