In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import wandb

import string # for milestone 1
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score

from datasets import Dataset, DatasetDict
from transformers import (
        AutoTokenizer, 
        AutoModelForSequenceClassification,
        TrainingArguments,
        Trainer,
        EarlyStoppingCallback
    )

from kaggle_secrets import UserSecretsClient # For secure API key access

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# Switch off tokenizer-level parallelism so Hugging Face stops emitting its parallelism warnings.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
        
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

2025-10-28 19:23:23.892869: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761679404.112104      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761679404.173755      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


/kaggle/input/2025-sep-dl-gen-ai-project/sample_submission.csv
/kaggle/input/2025-sep-dl-gen-ai-project/train.csv
/kaggle/input/2025-sep-dl-gen-ai-project/test.csv


# 1. WandB Configuration and Setup

In [2]:

# Central run configuration. The whole dict is handed to wandb.init(config=...),
# so every entry is recorded as a hyperparameter of the run.
config = {
    # --- model ---
    "MODEL_NAME": "bert-base-uncased",       # checkpoint used for both tokenizer and model
    "CLASSIFIER_DROPOUT": 0.1,               # dropout applied in the classification head
    "MAX_TOKEN_LENGTH": 128,                 # texts are truncated/padded to this many tokens

    # --- data ---
    "TRAIN_FILE": "/kaggle/input/2025-sep-dl-gen-ai-project/train.csv",
    "TEST_FILE": "/kaggle/input/2025-sep-dl-gen-ai-project/test.csv",
    "VALIDATION_SPLIT_SIZE": 0.1,            # fraction of the training rows held out for validation
    "RANDOM_SEED": 42,                       # shared by the data split and the Trainer

    # --- optimisation ---
    "EPOCHS": 4,                             # upper bound; early stopping may end training sooner
    "LEARNING_RATE": 2e-5,
    "TRAIN_BATCH_SIZE": 16,                  # per-device batch size while training
    "EVAL_BATCH_SIZE": 32,                   # per-device batch size while evaluating

    # --- model selection ---
    "METRIC_FOR_BEST_MODEL": "macro_f1",     # must equal the key returned by compute_metrics
}


# The target emotions; a label's position in this list is its class index.
emotion_labels = ['anger', 'fear', 'joy', 'sadness', 'surprise']
id2label = dict(enumerate(emotion_labels))                      # class index -> label name
label2id = {name: index for index, name in id2label.items()}    # label name -> class index


# Authenticate with W&B using the key stored under "wandb_api" in Kaggle Secrets.
# Any failure is reported in the cell output but does not stop the notebook.
try:
    api_key = UserSecretsClient().get_secret("wandb_api")
    wandb.login(key=api_key)
    print("W&B login successful using Kaggle Secrets.")
except Exception as e:
    print(f"W&B login failed: {e}")


# Open the run and attach the full config dict as its hyperparameters.
wandb_run_settings = dict(
    project="multi-label-emotion-bert",
    job_type="fine-tuning",
    config=config,
)
run = wandb.init(**wandb_run_settings)

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33m22f3001059[0m ([33m22f3001059-iit-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


W&B login successful using Kaggle Secrets.


[34m[1mwandb[0m: Tracking run with wandb version 0.20.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20251028_192337-fl8mamrh[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mserene-wind-2[0m
[34m[1mwandb[0m: ‚≠êÔ∏è View project at [34m[4mhttps://wandb.ai/22f3001059-iit-madras/multi-label-emotion-bert[0m
[34m[1mwandb[0m: üöÄ View run at [34m[4mhttps://wandb.ai/22f3001059-iit-madras/multi-label-emotion-bert/runs/fl8mamrh[0m


# 2. Load and Prepare Data

In [3]:
# Read the two CSV files named in the config into DataFrames.
train_df, test_df = (
    pd.read_csv(config[path_key]) for path_key in ("TRAIN_FILE", "TEST_FILE")
)

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Hold out part of the labelled data for validation; the fixed seed keeps the split repeatable.
df_train, df_val = train_test_split(
    train_df,
    test_size=config["VALIDATION_SPLIT_SIZE"],
    random_state=config["RANDOM_SEED"],
)

# Print the (rows, columns) of every frame as a quick sanity check.
print(
    f"Full training data shape: {train_df.shape}",
    f"Training split shape: {df_train.shape}",
    f"Validation split shape: {df_val.shape}",
    f"Test data shape: {test_df.shape}",
    sep="\n",
)

Full training data shape: (6827, 8)
Training split shape: (6144, 8)
Validation split shape: (683, 8)
Test data shape: (1707, 2)


In [6]:
# Convert the pandas frames to Hugging Face Dataset objects.
#
# preserve_index=False: after train_test_split the frames carry a shuffled,
# non-contiguous pandas index, which from_pandas would otherwise store as an
# extra '__index_level_0__' column. Nothing downstream uses that column, so it
# is dropped here to keep the dataset schema limited to the real CSV columns.

train_dataset = Dataset.from_pandas(df_train, preserve_index=False)
val_dataset = Dataset.from_pandas(df_val, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# 3. Tokenization & Dataset Preprocessing

In [7]:
# Load the tokenizer that belongs to the checkpoint named in config["MODEL_NAME"],
# so the text is split into the vocabulary the model was pretrained with.
tokenizer = AutoTokenizer.from_pretrained(config["MODEL_NAME"])

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [8]:
# `tokenizer` was already loaded in the previous cell; the functions below reuse that object.

# tokenizing text, and formatting labels for multi-label classification

def preprocess_data(examples):
    """Tokenize a batch of labelled rows and attach its multi-hot label matrix.

    Args:
        examples: a batch indexed by column name (this function is applied with
            ``Dataset.map(..., batched=True)``). It must provide ``"text"`` and
            one column for every entry of ``emotion_labels``.

    Returns:
        The tokenizer output for the batch, with an added ``"labels"`` entry: a
        float NumPy array of shape ``(batch_size, len(emotion_labels))`` whose
        columns follow the order of ``emotion_labels``.
    """
    texts = examples["text"]
    batch_size = len(texts)

    # Every text is truncated or padded to exactly MAX_TOKEN_LENGTH tokens.
    encoded_batch = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=config["MAX_TOKEN_LENGTH"],
    )

    # One row per example, one column per emotion. Values are cast to float
    # because the loss used for multi-label training works on float targets.
    label_rows = [
        [float(examples[emotion][row]) for emotion in emotion_labels]
        for row in range(batch_size)
    ]
    # reshape keeps the (batch_size, n_labels) shape even for an empty batch.
    encoded_batch["labels"] = np.array(label_rows, dtype=float).reshape(
        batch_size, len(emotion_labels)
    )
    return encoded_batch

def preprocess_test_data(examples):
    """Tokenize a batch of unlabelled rows.

    Uses the same truncation, padding and maximum length as ``preprocess_data``
    but does not build a ``"labels"`` entry.

    Args:
        examples: a batch indexed by column name that provides ``"text"``.

    Returns:
        The tokenizer output for the batch.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": config["MAX_TOKEN_LENGTH"],
    }
    return tokenizer(examples["text"], **tokenizer_options)

In [9]:
# Tokenize the labelled splits batch by batch; this also builds their 'labels' column.
train_dataset = train_dataset.map(function=preprocess_data, batched=True)
val_dataset = val_dataset.map(function=preprocess_data, batched=True)

# The test split is only tokenized; no 'labels' column is created for it.
test_dataset = test_dataset.map(function=preprocess_test_data, batched=True)

Map:   0%|          | 0/6144 [00:00<?, ? examples/s]

Map:   0%|          | 0/683 [00:00<?, ? examples/s]

Map:   0%|          | 0/1707 [00:00<?, ? examples/s]

In [10]:
# Return only the listed columns, as PyTorch tensors, when the datasets are indexed.
columns_to_keep = ['input_ids', 'attention_mask', 'labels']
for labelled_split in (train_dataset, val_dataset):
    labelled_split.set_format("torch", columns=columns_to_keep)

# The test split has no 'labels' column, so only the model inputs are exposed.
test_columns = ['input_ids', 'attention_mask']
test_dataset.set_format("torch", columns=test_columns)

# 4. Model, Metrics, and Trainer Setup

In [11]:
# Load the pretrained checkpoint with a sequence-classification head that has
# one output per emotion (the head's weights are newly initialised, see the cell output).
model = AutoModelForSequenceClassification.from_pretrained(
    config["MODEL_NAME"],
    num_labels=len(emotion_labels),
    
    # Tells the model this is multi-label classification, i.e. to train with
    # BCEWithLogitsLoss (an independent sigmoid per label) instead of
    # CrossEntropyLoss (a softmax over mutually exclusive classes).
    problem_type="multi_label_classification",
    
    # This sets the dropout rate for the final classification layer
    classifier_dropout=config["CLASSIFIER_DROPOUT"],

    # Store both directions of the label mapping in the model config:
    id2label=id2label,  # class index -> human-readable label name
    label2id=label2id  # label name -> class index
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the macro-averaged F1 over all labels.

    Args:
        eval_pred: a pair ``(logits, labels)``; both are arrays with one row per
            example and one column per label.

    Returns:
        A dict with the single key ``'macro_f1'``. The key has to match
        ``config["METRIC_FOR_BEST_MODEL"]`` for best-model selection to find it.
    """
    raw_scores, references = eval_pred

    # The logistic function turns each logit into an independent probability.
    probabilities = 1 / (1 + np.exp(-raw_scores))

    # A label counts as predicted when its probability is strictly above 0.5.
    hard_predictions = (probabilities > 0.5).astype(int)

    # zero_division=0: a label with no positive prediction/target scores 0
    # instead of raising a warning.
    return {
        'macro_f1': f1_score(
            y_true=references,
            y_pred=hard_predictions,
            average='macro',
            zero_division=0,
        )
    }

In [13]:
# Training configuration. Values that come from `config` are logged with the
# W&B run; the remaining literals are fixed for this notebook.

training_args = TrainingArguments(
    # --- bookkeeping ---
    output_dir="./results",               # checkpoints are written here
    report_to="wandb",                    # send training/eval logs to W&B
    logging_steps=50,                     # log training metrics every 50 steps
    push_to_hub=False,                    # never publish to the Hugging Face Hub
    seed=config["RANDOM_SEED"],

    # --- optimisation ---
    num_train_epochs=config["EPOCHS"],
    learning_rate=config["LEARNING_RATE"],
    per_device_train_batch_size=config["TRAIN_BATCH_SIZE"],
    per_device_eval_batch_size=config["EVAL_BATCH_SIZE"],
    weight_decay=0.01,
    warmup_steps=500,
    fp16=torch.cuda.is_available(),       # mixed precision only when a GPU is present

    # --- evaluation, checkpointing and model selection ---
    eval_strategy="epoch",                # evaluate at the end of every epoch
    save_strategy="epoch",                # checkpoint at the end of every epoch
    load_best_model_at_end=True,          # restore the best checkpoint after training
    metric_for_best_model=config["METRIC_FOR_BEST_MODEL"],  # 'macro_f1' here
    greater_is_better=True,               # a higher macro F1 is better
)


In [14]:
# Build the Trainer that ties together the model, the data splits, the metric
# function and the training configuration.

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `tokenizer=` is deprecated in Trainer.__init__ in recent transformers
    # releases; `processing_class=` is the replacement keyword for passing it.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    # Evaluation runs once per epoch, so a patience of 2 stops training when the
    # validation metric has not improved for 2 consecutive epochs.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

  trainer = Trainer(


# 5. Model Training

In [15]:
print("Starting model training.")
trainer.train()
print("Training finished")

# Record the best validation score and the checkpoint it came from in the W&B run summary.
final_state = trainer.state
wandb.summary["best_val_macro_f1"] = final_state.best_metric
wandb.summary["best_model_checkpoint"] = final_state.best_model_checkpoint




Starting model training.




Epoch,Training Loss,Validation Loss,Macro F1
1,0.5881,0.517715,0.168687
2,0.4158,0.350625,0.698653
3,0.2828,0.282074,0.783032
4,0.2059,0.261031,0.815976




Training finished


# 6. Generate Submission File

In [16]:
# Run the trained model over the test split (the Trainer was configured to
# reload the best checkpoint once training ends).
test_predictions = trainer.predict(test_dataset)
test_logits = test_predictions.predictions

# Turn logits into per-label probabilities, then into 0/1 decisions using the
# same 0.5 threshold as compute_metrics.
test_sigmoid = 1 / (1 + np.exp(-test_logits))
final_preds = (test_sigmoid > 0.5).astype(int)


In [17]:
# Assemble the submission table: the test ids first, then one 0/1 column per emotion.
prediction_columns = pd.DataFrame(final_preds, columns=emotion_labels)
submission_df = prediction_columns.assign(id=test_df['id'])[['id'] + emotion_labels]

# Write it without the pandas index.
submission_df.to_csv("submission.csv", index=False)
print("Submission file created: submission.csv")

Submission file created: submission.csv


# 7. Finishing W&B Run

In [18]:
# Mark the W&B run as finished, then confirm completion in the cell output.
wandb.finish()
print("Run complete. All data synced to W&B.")

[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:               eval/loss ‚ñà‚ñÉ‚ñÇ‚ñÅ
[34m[1mwandb[0m:           eval/macro_f1 ‚ñÅ‚ñá‚ñà‚ñà
[34m[1mwandb[0m:            eval/runtime ‚ñÅ‚ñÜ‚ñà‚ñà
[34m[1mwandb[0m: eval/samples_per_second ‚ñà‚ñÇ‚ñÅ‚ñÅ
[34m[1mwandb[0m:   eval/steps_per_second ‚ñà‚ñÇ‚ñÅ‚ñÅ
[34m[1mwandb[0m:            test/runtime ‚ñÅ
[34m[1mwandb[0m: test/samples_per_second ‚ñÅ
[34m[1mwandb[0m:   test/steps_per_second ‚ñÅ
[34m[1mwandb[0m:             train/epoch ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñà‚ñà‚ñà
[34m[1mwandb[0m:       train/global_step ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñà‚ñà‚ñà‚ñà
[34m[1mwandb[0m:         train/grad_norm ‚ñÑ‚ñÑ‚ñÑ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñà‚ñÖ‚ñÖ‚ñÖ‚ñÇ‚ñÖ‚ñá‚ñá
[34m[1mwandb[0m:     train/learning_rate ‚ñÅ‚ñÇ‚ñÉ‚ñÉ‚ñÑ‚ñÖ‚ñÜ‚ñÜ‚ñá‚ñà‚ñá‚ñÖ‚ñÑ‚ñÇ‚ñÅ
[34m[1mwandb[0m:     

Run complete. All data synced to W&B.
