# A) Problem Statement

Problem statement: This project addresses multi-label emotion detection from short English text: given a sentence, predict which of five emotions ‚Äî anger, fear, joy, sadness, and surprise ‚Äî are present. Unlike multi-class classification, a sentence can express multiple emotions simultaneously, so the model must perform multi-label prediction and be robust to class imbalance and subtle linguistic cues (sarcasm, negation, emoticons). The goal is to develop a reproducible pipeline that achieves high Macro F1 on the provided dataset by iteratively building from classical baselines (TF-IDF + linear models) to deep learning models (CNN/LSTM) and fine-tuned transformer models, while rigorously logging experiments and hyperparameters with Weights & Biases and tuning per-label decision thresholds for optimal performance.

# B) Importing

In [1]:
import os, random
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import torch, torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer

In [2]:
import wandb 
from kaggle_secrets import UserSecretsClient

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("wand")
wandb.login(key=secret_value_0)

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhariswarsamasi[0m ([33mhariswarsamasi-indian-institute-of-technology-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
# Fixing the random seed to 42 for reproducibility to ensure it will be everytime

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

#make device to CUDE else GPU
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# HuggingFace model name to load both model and tokenizer.
# 'distilbert-base-uncased' is a small & fast version of BERT.
MODEL_TOKENIZER = "distilbert-base-uncased" 

MAX_LEN = 200
BATCH = 32
EPOCHS = 2
LR = 1e-3

#label s i got from the Train 
LABELS = ["anger","fear","joy","sadness","surprise"]

#Per-label classification thresholds for converting predicted probabilities
po = 0.60
best_thresholds = [0.45, 0.55, 0.4, 0.5, 0.48]

# D) Importing Files and Splitting

In [4]:
df = pd.read_csv("/kaggle/input/2025-sep-dl-gen-ai-project/train.csv")

# create a new column 'sum_labels' which counts how many emotions
df['sum_labels'] = df[LABELS].sum(axis=1)

# split into train (90%) and validation (10%).
tr, df_val = train_test_split(df, test_size=0.1, random_state=SEED, stratify=df['sum_labels'])
#df is now train
df = tr

#Replace missing text with empty string and change to string 
df['text'] = df['text'].fillna('').astype(str)
df_val['text'] = df_val['text'].fillna('').astype(str)

#for every label (anger, fear, joy, sadness, surprise) fill empty with 0 whi
#is default
for c in LABELS:
    df[c] = df[c].fillna(0).astype(int)
    df_val[c] = df_val[c].fillna(0).astype(int)

# E) Tokenizer 

In [5]:
#Load the tokenizer for DistilBERT ("distilbert-base-uncased")
tok = AutoTokenizer.from_pretrained(MODEL_TOKENIZER)

#converts a list of strings into numerical token IDs of size
def texts_to_ids(texts):
    out = tok(texts, 
              truncation=True, # Cut off text longer than MAX_LEN in global
              padding='max_length', 
              max_length=MAX_LEN, 
              return_tensors='pt'# Return PyTorch tensor ( Important )
             )
    return out['input_ids']


train_ids = texts_to_ids(df['text'].tolist())
val_ids   = texts_to_ids(df_val['text'].tolist())


# Convert the multi-label columns (0/1 for each emotion)
# into PyTorch float tensors for training.
train_labels = torch.tensor(df[LABELS].values, dtype=torch.float)
val_labels   = torch.tensor(df_val[LABELS].values, dtype=torch.float)

#Wrap input_ids and labels together

train_ds = TensorDataset(train_ids, train_labels)
val_ds   = TensorDataset(val_ids, val_labels)

# Create iterable loaders that
#batch the data (size BATCH) &
# shuffle training data for randomness
train_loader = DataLoader(train_ds, batch_size=BATCH, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=BATCH, shuffle=False)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

# E) Simple Models

## 0. Deffrentiation into X and Y 

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

#max_features=20000 ‚Üí Keep the top 20,000 most important tokens and use
#1-grams = single words   &    2-grams = two-word phrases
tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1,2))

#Learn vocabulary + IDF weights from training data
X_train = tfidf.fit_transform(df['text'].tolist())

# Convert validation text using **same** vocabulary.
X_val   = tfidf.transform(df_val['text'].tolist())

y_train = df[LABELS].values
y_val   = df_val[LABELS].values

print("Prepared TF-IDF and labels. Train shape:", X_train.shape)
print("Prepared TF-IDF and labels. Validation shape:", X_val.shape)

Prepared TF-IDF and labels. Train shape: (6144, 20000)
Prepared TF-IDF and labels. Validation shape: (683, 20000)


## 1. Logistic Regression

In [7]:
from sklearn.linear_model import LogisticRegression

preds = np.zeros_like(y_val)
log_dic = {}

for i,label in enumerate(LABELS):
    
    m = LogisticRegression(
        max_iter=400, 
        solver='saga',
        n_jobs=-1, 
        random_state=SEED
    )
    
    m.fit(X_train, y_train[:, i])
    
    log_dic[label] = m
    preds[:, i] = m.predict(X_val)
    
score = f1_score(y_val, preds, average='macro')
print("LogisticRegression Macro F1:", round(score,4))

LogisticRegression Macro F1: 0.3777


## 2. Multinomial Naive Bayes

In [8]:
from sklearn.naive_bayes import MultinomialNB

mnb_dict = {}

preds = np.zeros_like(y_val)

for i,label in enumerate(LABELS):
    
    m = MultinomialNB()
    
    m.fit(X_train, y_train[:, i])
    
    mnb_dict[label] = m
    preds[:, i] = m.predict(X_val)

score = f1_score(y_val, preds, average='macro')
print("MultinomialNB Macro F1:", round(score,4))


MultinomialNB Macro F1: 0.2573


## 3. XGboost

In [9]:
from xgboost import XGBClassifier

xgb_dict = {}
preds = np.zeros_like(y_val)
for i,label in enumerate(LABELS):
    m = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_jobs=4,
                      n_estimators=150, random_state=SEED, verbosity=0)
    m.fit(X_train, y_train[:, i])
    preds[:, i] = m.predict(X_val)
    xgb_dict[label]=m

score = f1_score(y_val, preds, average='macro')
print("XGBoost Macro F1:", round(score,4))

XGBoost Macro F1: 0.5794


## 4. LightGBM

In [10]:
import lightgbm as lgb

lgb_dict = {}
preds = np.zeros_like(y_val)
for i,label in enumerate(LABELS):
    m = lgb.LGBMClassifier(n_jobs=4, n_estimators=200, random_state=SEED)
    m.fit(X_train, y_train[:, i])
    preds[:, i] = m.predict(X_val)
    lgb_dict[label]=m


score = f1_score(y_val, preds, average='macro')
print("LightGBM Macro F1:", round(score,4))

[LightGBM] [Info] Number of positive: 734, number of negative: 5410
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017846 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21187
[LightGBM] [Info] Number of data points in the train set: 6144, number of used features: 831
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.119466 -> initscore=-1.997495
[LightGBM] [Info] Start training from score -1.997495
[LightGBM] [Info] Number of positive: 3469, number of negative: 2675
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017230 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 21187
[LightGBM] [Info] Number of data points in the train set: 6144, number of used features: 831
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.564616 -> initscore=0.259917
[Light

# F) Testing the Model and Submitting

## 0) Global Submit Function

In [11]:
import os, numpy as np, pandas as pd

def predict_now_linear(models_dict, vectorizer=tfidf,out_csv="submission.csv", thresholds=None, label_order=None):
    
    df_test = pd.read_csv("/kaggle/input/2025-sep-dl-gen-ai-project/test.csv")
    texts = df_test['text'].fillna('').astype(str).tolist()

    X_test = vectorizer.transform(texts)

    if label_order is None:
        labels = list(models_dict.keys())
    else:
        labels = list(label_order)

    probs_list = []
    for lab in labels:
        m = models_dict[lab]
        if hasattr(m, "predict_proba"):
            p = m.predict_proba(X_test)[:, 1]   
        else:
            p = m.predict(X_test).astype(float)
        probs_list.append(p)
    probs = np.vstack(probs_list).T  

    if thresholds is None:
        thr = np.array([0.5] * probs.shape[1])
    else:
        thr = np.array(thresholds)
        if thr.shape[0] != probs.shape[1]:
            raise ValueError("thresholds length must match number of labels/models")

    preds = (probs >= thr).astype(int)

    sub = pd.DataFrame(preds, columns=labels)
    if 'id' in df_test.columns:
        sub.insert(0, 'id', df_test['id'])
    else:
        sub.insert(0, 'id', range(1, len(sub)+1))

    sub.to_csv(out_csv, index=False)
    print(f"Saved submission filed for this model {out_csv}")

## 1) Saving the Simple Models

In [12]:
best_thresholds = [0.45, 0.55, 0.4, 0.5, 0.48]

predict_now_linear(log_dic,tfidf,"logistic.csv",best_thresholds)

predict_now_linear(mnb_dict,tfidf,"naive.csv",best_thresholds)

Saved submission filed for this model logistic.csv
Saved submission filed for this model naive.csv


## 2. Saving the Boosting Models

In [13]:
predict_now_linear(xgb_dict,tfidf,"xgb.csv",best_thresholds)
predict_now_linear(lgb_dict,tfidf,"lgb.csv",best_thresholds)


Saved submission filed for this model xgb.csv
Saved submission filed for this model lgb.csv


# F) Deep Learning Model Training

## 0) Global Evaluation 

In [14]:
def eval_val(trained_model):
    trained_model.eval()
    all_probs, all_labels = [], []

    with torch.no_grad():
        for batch in val_loader:

            # BERT batch: (ids, mask, labels)
            if len(batch) == 3:
                ids, mask, labels = batch
                ids = ids.to(DEVICE)
                mask = mask.to(DEVICE)
                labels = labels.to(DEVICE)
                logits = trained_model(ids, mask)

            # Scratch models: (ids, labels)
            else:
                ids, labels = batch
                ids = ids.to(DEVICE)
                labels = labels.to(DEVICE)
                logits = trained_model(ids)

            # Convert logits ‚Üí probabilities
            probs = torch.sigmoid(logits).cpu().numpy()
            all_probs.append(probs)
            all_labels.append(labels.cpu().numpy())

    # Stack results
    probs = np.vstack(all_probs)
    labels = np.vstack(all_labels)

    # Multi-label thresholding
    preds = (probs >= 0.5).astype(int)

    return f1_score(labels, preds, average='macro')


## 1) TextCNN Model

In [15]:
class TextCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, kernel_sizes=(3,4,5), num_filters=64, num_labels=5, pad_idx=0):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.convs = nn.ModuleList([nn.Conv2d(1, num_filters, (k, embed_dim)) for k in kernel_sizes])
        self.fc = nn.Linear(len(kernel_sizes)*num_filters, num_labels)
        self.dropout = nn.Dropout(0.2)

    
    def forward(self, input_ids):
        x = self.embed(input_ids)          # (B, L, E)
        x = x.unsqueeze(1)                 # (B, 1, L, E)
        convs = [torch.relu(conv(x)).squeeze(3) for conv in self.convs]  # each: (B, F, L-k+1)
        pools = [torch.max(c, dim=2)[0] for c in convs]                  # each: (B, F)
        cat = torch.cat(pools, dim=1)       # (B, F*len(kernels))
        cat = self.dropout(cat)
        logits = self.fc(cat)
        return logits

vocab_size = tok.vocab_size
model = TextCNN(vocab_size=vocab_size, embed_dim=128, kernel_sizes=(3,4,5), num_filters=64, num_labels=len(LABELS), pad_idx=tok.pad_token_id or 0)
model.to(DEVICE)

TextCNN(
  (embed): Embedding(30522, 128, padding_idx=0)
  (convs): ModuleList(
    (0): Conv2d(1, 64, kernel_size=(3, 128), stride=(1, 1))
    (1): Conv2d(1, 64, kernel_size=(4, 128), stride=(1, 1))
    (2): Conv2d(1, 64, kernel_size=(5, 128), stride=(1, 1))
  )
  (fc): Linear(in_features=192, out_features=5, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [16]:
opt = torch.optim.AdamW(model.parameters(), lr=LR)
crit = nn.BCEWithLogitsLoss()

wandb.init(project="22f3001994-t32025", name="text_cnn_model")

for epoch in range(EPOCHS):
    model.train()
    running_loss = 0.0
    for ids, labels in train_loader:
        ids = ids.to(DEVICE)
        labels = labels.to(DEVICE)
        opt.zero_grad()
        logits = model(ids)
        loss = crit(logits, labels)
        loss.backward()
        opt.step()
        running_loss += loss.item()
    avg_loss = running_loss / len(train_loader)
    val_f1 = eval_val(model)
    wandb.log({
            "epoch": epoch,
            "train_loss": avg_loss,
            "Val _ Macro f1": val_f1
        })
    if(epoch%5==0):
        print(f"Epoch {epoch+1}/{EPOCHS} ‚Äî train_loss: {avg_loss:.4f} ‚Äî val_macro_f1: {val_f1:.4f}")

wandb.finish()   


[34m[1mwandb[0m: Tracking run with wandb version 0.20.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20251129_114511-zbkf3b2q[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mtext_cnn_model[0m
[34m[1mwandb[0m: ‚≠êÔ∏è View project at [34m[4mhttps://wandb.ai/hariswarsamasi-indian-institute-of-technology-madras/22f3001994-t32025[0m
[34m[1mwandb[0m: üöÄ View run at [34m[4mhttps://wandb.ai/hariswarsamasi-indian-institute-of-technology-madras/22f3001994-t32025/runs/zbkf3b2q[0m


Epoch 1/2 ‚Äî train_loss: 0.5531 ‚Äî val_macro_f1: 0.2564


[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m: Val _ Macro f1 ‚ñÅ‚ñà
[34m[1mwandb[0m:          epoch ‚ñÅ‚ñà
[34m[1mwandb[0m:     train_loss ‚ñà‚ñÅ
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m: Val _ Macro f1 0.29258
[34m[1mwandb[0m:          epoch 1
[34m[1mwandb[0m:     train_loss 0.46418
[34m[1mwandb[0m: 
[34m[1mwandb[0m: üöÄ View run [33mtext_cnn_model[0m at: [34m[4mhttps://wandb.ai/hariswarsamasi-indian-institute-of-technology-madras/22f3001994-t32025/runs/zbkf3b2q[0m
[34m[1mwandb[0m: ‚≠êÔ∏è View project at: [34m[4mhttps://wandb.ai/hariswarsamasi-indian-institute-of-technology-madras/22f3001994-t32025[0m
[34m[1mwandb[0m: Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)
[34m[1mwandb[0m: Find logs at: [35m[1m./wandb/run-20251129_114511-zbkf3b2q/logs[0m


## 2) BLISTN Model

In [17]:
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_size=128, num_layers=1, num_labels=5, pad_idx=0, dropout=0.2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embed_dim, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Sequential(
            nn.Linear(hidden_size * 2, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, num_labels)
        )
    def forward(self, input_ids):
        emb = self.embed(input_ids)             # (B, L, E)
        out, _ = self.lstm(emb)                 # out: (B, L, 2*H)
        pooled = out.mean(dim=1)                # (B, 2*H)
        dropped = self.dropout(pooled)
        logits = self.fc(dropped)               # (B, num_labels)
        return logits

vocab_size = tok.vocab_size
model_b = BiLSTM(vocab_size=vocab_size, embed_dim=128, hidden_size=128, num_layers=1, num_labels=len(LABELS), pad_idx=tok.pad_token_id or 0, dropout=0.2)
model_b.to(DEVICE)

opt = torch.optim.AdamW(model_b.parameters(), lr=LR)
crit = nn.BCEWithLogitsLoss()

In [18]:
opt = torch.optim.AdamW(model_b.parameters(), lr=LR)
crit = nn.BCEWithLogitsLoss()

wandb.init(project="22f3001994-t32025", name="blistin_model")
for epoch in range(EPOCHS):
    model_b.train()
    running_loss = 0.0
    for ids, labels in train_loader:
        ids = ids.to(DEVICE)
        labels = labels.to(DEVICE)
        opt.zero_grad()
        logits = model_b(ids)
        loss = crit(logits, labels)
        loss.backward()
        opt.step()
        running_loss += loss.item()
    avg_loss = running_loss / len(train_loader)
    val_f1 = eval_val(model_b)
    wandb.log({
            "epoch": epoch,
            "train_loss": avg_loss,
            "Val _ Macro f1": val_f1
        })
    if(epoch%10==0):
        print(f"Epoch {epoch+1}/{EPOCHS} ‚Äî train_loss: {avg_loss:.4f} ‚Äî val_macro_f1: {val_f1:.4f}")

wandb.finish()   


[34m[1mwandb[0m: Tracking run with wandb version 0.20.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20251129_114535-ty7snv8n[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mblistin_model[0m
[34m[1mwandb[0m: ‚≠êÔ∏è View project at [34m[4mhttps://wandb.ai/hariswarsamasi-indian-institute-of-technology-madras/22f3001994-t32025[0m
[34m[1mwandb[0m: üöÄ View run at [34m[4mhttps://wandb.ai/hariswarsamasi-indian-institute-of-technology-madras/22f3001994-t32025/runs/ty7snv8n[0m


Epoch 1/2 ‚Äî train_loss: 0.5757 ‚Äî val_macro_f1: 0.1456


[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m: Val _ Macro f1 ‚ñÅ‚ñà
[34m[1mwandb[0m:          epoch ‚ñÅ‚ñà
[34m[1mwandb[0m:     train_loss ‚ñà‚ñÅ
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m: Val _ Macro f1 0.24828
[34m[1mwandb[0m:          epoch 1
[34m[1mwandb[0m:     train_loss 0.55748
[34m[1mwandb[0m: 
[34m[1mwandb[0m: üöÄ View run [33mblistin_model[0m at: [34m[4mhttps://wandb.ai/hariswarsamasi-indian-institute-of-technology-madras/22f3001994-t32025/runs/ty7snv8n[0m
[34m[1mwandb[0m: ‚≠êÔ∏è View project at: [34m[4mhttps://wandb.ai/hariswarsamasi-indian-institute-of-technology-madras/22f3001994-t32025[0m
[34m[1mwandb[0m: Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)
[34m[1mwandb[0m: Find logs at: [35m[1m./wandb/run-20251129_114535-ty7snv8n/logs[0m


## 3. GRU Model

In [19]:
class GRUNet(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_size=128, num_layers=1, num_labels=5, pad_idx=0):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.gru = nn.GRU(embed_dim, hidden_size, num_layers=num_layers,
                          batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, num_labels)
        self.dropout = nn.Dropout(0.2)

    def forward(self, input_ids):
        x = self.embed(input_ids)        # (B, L, E)
        out, _ = self.gru(x)             # (B, L, 2H)
        pooled = out.mean(dim=1)         # (B, 2H)
        pooled = self.dropout(pooled)
        logits = self.fc(pooled)         # (B, num_labels)
        return logits

model_gru = GRUNet(
    vocab_size=tok.vocab_size,
    embed_dim=128,
    hidden_size=128,
    num_labels=len(LABELS),
    pad_idx=tok.pad_token_id
).to(DEVICE)

In [20]:
crit = nn.BCEWithLogitsLoss()
opt = torch.optim.AdamW(model_gru.parameters(), lr=LR)

for epoch in range(EPOCHS):
    model_gru.train()
    running_loss = 0.0

    for ids, labels in train_loader:
        ids = ids.to(DEVICE)
        labels = labels.to(DEVICE).float()    # IMPORTANT: float, not long

        opt.zero_grad()
        logits = model_gru(ids)               # (B, 5)
        loss = crit(logits, labels)           # BCE loss
        loss.backward()
        opt.step()

        running_loss += loss.item()

    avg_loss = running_loss / len(train_loader)
    val_f1 = eval_val(model_gru)
    if(epoch%10==0):
        print(f"Epoch {epoch+1}/{EPOCHS} ‚Äî train_loss: {avg_loss:.4f} ‚Äî val_f1: {val_f1:.4f}")

Epoch 1/2 ‚Äî train_loss: 0.5685 ‚Äî val_f1: 0.1475


## 4. DistilBERT

In [21]:
bert_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(bert_name)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [22]:
from transformers import AutoModel

class BERTMultiLabel(nn.Module):
    def __init__(self, model_name="microsoft/deberta-v3-base", num_labels=5):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        hidden = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(hidden, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        
        cls = outputs.last_hidden_state[:, 0]

        cls = self.dropout(cls)
        logits = self.classifier(cls)
        return logits
        

model_bert = BERTMultiLabel(
    model_name=bert_name,
    num_labels=len(LABELS)
).to(DEVICE)

2025-11-29 11:45:50.902390: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764416751.067929      21 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764416751.117543      21 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

In [23]:
class BERTDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len=128):
        self.texts = df["text"].tolist()
        self.labels = df[LABELS].values   
        self.tokenizer = tokenizer
        self.max_len = max_len
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        text = self.texts[idx]
        labels = torch.tensor(self.labels[idx], dtype=torch.float)
        enc = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        return (
            enc["input_ids"].squeeze(0),
            enc["attention_mask"].squeeze(0),
            labels
        )
train_ds = BERTDataset(df, tokenizer)
val_ds   = BERTDataset(df_val, tokenizer)
train_loader = torch.utils.data.DataLoader(train_ds, batch_size=32, shuffle=True)
val_loader   = torch.utils.data.DataLoader(val_ds, batch_size=32)
from transformers import AutoTokenizer, AutoModel

In [24]:
crit = nn.BCEWithLogitsLoss()

opt = torch.optim.AdamW(model_bert.parameters(), lr=1e-5)
wandb.init(project="22f3001994-t32025", name="text_doberta_model")

for epoch in range(EPOCHS):
    model_bert.train()
    total_loss = 0

    for ids, mask, labels in train_loader:
        ids = ids.to(DEVICE)
        mask = mask.to(DEVICE)
        labels = labels.to(DEVICE)

        opt.zero_grad()
        logits = model_bert(ids, mask)
        loss = crit(logits, labels)
        loss.backward()
        opt.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    val_f1 = eval_val(model_bert) 
    wandb.log({
            "epoch": epoch,
            "train_loss": avg_loss,
            "Val _ Macro f1": val_f1
        })
    if(epoch%5==0):
        print(f"Epoch {epoch+1}: loss={avg_loss:.4f}, val_f1={val_f1:.4f}")

wandb.finish()   

[34m[1mwandb[0m: Tracking run with wandb version 0.20.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20251129_114604-mperhxok[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mtext_doberta_model[0m
[34m[1mwandb[0m: ‚≠êÔ∏è View project at [34m[4mhttps://wandb.ai/hariswarsamasi-indian-institute-of-technology-madras/22f3001994-t32025[0m
[34m[1mwandb[0m: üöÄ View run at [34m[4mhttps://wandb.ai/hariswarsamasi-indian-institute-of-technology-madras/22f3001994-t32025/runs/mperhxok[0m


Epoch 1: loss=0.5192, val_f1=0.6520


[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m: Val _ Macro f1 ‚ñÅ‚ñà
[34m[1mwandb[0m:          epoch ‚ñÅ‚ñà
[34m[1mwandb[0m:     train_loss ‚ñà‚ñÅ
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m: Val _ Macro f1 0.77599
[34m[1mwandb[0m:          epoch 1
[34m[1mwandb[0m:     train_loss 0.34104
[34m[1mwandb[0m: 
[34m[1mwandb[0m: üöÄ View run [33mtext_doberta_model[0m at: [34m[4mhttps://wandb.ai/hariswarsamasi-indian-institute-of-technology-madras/22f3001994-t32025/runs/mperhxok[0m
[34m[1mwandb[0m: ‚≠êÔ∏è View project at: [34m[4mhttps://wandb.ai/hariswarsamasi-indian-institute-of-technology-madras/22f3001994-t32025[0m
[34m[1mwandb[0m: Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)
[34m[1mwandb[0m: Find logs at: [35m[1m./wandb/run-20251129_114604-mperhxok/logs[0m


# H) Final Submission

## 1. Global Submit Function

In [25]:
def predict_and_save_submission(trained_model, out_csv="submission.csv", thresholds=None):
    # thresholds: list/array of length num_labels or None -> default 0.5
    # Don't forget !!!!!!!!!!!!!!!!!!!!!!!!!

    test_df = pd.read_csv("/kaggle/input/2025-sep-dl-gen-ai-project/test.csv")
    test_texts = test_df['text'].fillna('').astype(str).tolist()

    enc = tok(test_texts, truncation=True, padding='max_length', max_length=MAX_LEN, return_tensors='pt')
    input_ids = enc['input_ids']

    test_ds = TensorDataset(input_ids)
    test_loader = DataLoader(test_ds, batch_size=BATCH, shuffle=False)

    trained_model.eval()
    all_probs = []
    with torch.no_grad():
        for (ids_batch,) in test_loader:
            ids_batch = ids_batch.to(DEVICE)
            logits = trained_model(ids_batch)
            probs = torch.sigmoid(logits).cpu().numpy()
            all_probs.append(probs)
    all_probs = np.vstack(all_probs)

    if thresholds is None:
        thr = np.array([0.5]*len(LABELS))
    else:
        thr = np.array(thresholds)

    preds_bin = (all_probs >= thr).astype(int)

    sub = pd.DataFrame(preds_bin, columns=LABELS)
    if 'id' in test_df.columns:
        sub.insert(0, 'id', test_df['id'])
    else:
        sub.insert(0, 'id', range(1, len(sub)+1))

    sub.to_csv(out_csv, index=False)
    print(f"Saved submission to {out_csv}  ‚Äî shape: {sub.shape}")
    return out_csv


## 2. Submit the CNN Model

In [26]:
torch.save(model, "textcnn_full.pt")
predict_and_save_submission(model, out_csv="submission_text.csv", thresholds=best_thresholds)

Saved submission to submission_text.csv  ‚Äî shape: (1707, 6)


'submission_text.csv'

In [27]:
torch.save(model_b, "bi-directional_full.pt")
predict_and_save_submission(model_b, out_csv="submission_ki.csv", thresholds=best_thresholds)

Saved submission to submission_ki.csv  ‚Äî shape: (1707, 6)


'submission_ki.csv'

In [28]:
torch.save(model, "gru_full.pt")
predict_and_save_submission(model_gru, out_csv="submission_gru.csv", thresholds=best_thresholds)

Saved submission to submission_gru.csv  ‚Äî shape: (1707, 6)


'submission_gru.csv'

In [29]:
def predict_and_save_submission(trained_model, out_csv="submission.csv", thresholds=None):
    test_df = pd.read_csv("/kaggle/input/2025-sep-dl-gen-ai-project/test.csv")
    test_texts = test_df['text'].fillna('').astype(str).tolist()

    enc = tokenizer(
        test_texts,
        truncation=True,
        padding='max_length',
        max_length=MAX_LEN,
        return_tensors='pt'
    )

    input_ids = enc['input_ids']
    attention_mask = enc['attention_mask']

    test_ds = TensorDataset(input_ids, attention_mask)
    test_loader = DataLoader(test_ds, batch_size=BATCH, shuffle=False)

    trained_model.eval()
    all_probs = []

    with torch.no_grad():
        for ids_batch, mask_batch in test_loader:
            ids_batch = ids_batch.to(DEVICE)
            mask_batch = mask_batch.to(DEVICE)

            # BERT forward pass
            logits = trained_model(ids_batch, mask_batch)
            probs = torch.sigmoid(logits).cpu().numpy()
            all_probs.append(probs)

    all_probs = np.vstack(all_probs)

    # Thresholds
    if thresholds is None:
        thr = np.array([0.5] * len(LABELS))
    else:
        thr = np.array(thresholds)

    preds_bin = (all_probs >= thr).astype(int)

    # build submission
    sub = pd.DataFrame(preds_bin, columns=LABELS)
    if 'id' in test_df.columns:
        sub.insert(0, 'id', test_df['id'])
    else:
        sub.insert(0, 'id', range(1, len(sub)+1))

    sub.to_csv(out_csv, index=False)
    print(f"Saved submission to {out_csv} ‚Äî shape: {sub.shape}")
    return out_csv

predict_and_save_submission(model_bert, out_csv="submission.csv", thresholds=best_thresholds)
tokenizer.save_pretrained("saved_bert_model")

Saved submission to submission.csv ‚Äî shape: (1707, 6)


('saved_bert_model/tokenizer_config.json',
 'saved_bert_model/special_tokens_map.json',
 'saved_bert_model/spm.model',
 'saved_bert_model/added_tokens.json',
 'saved_bert_model/tokenizer.json')

In [30]:
torch.save(model_bert.state_dict(), "saved_bert_model/model_bert_state_dict.pt")
tokenizer.save_pretrained("saved_bert_model")

print("Model + tokenizer saved successfully!")


Model + tokenizer saved successfully!


In [31]:
import shutil
shutil.copy(
    "saved_bert_model/model_bert_state_dict.pt",
    "saved_bert_model/pytorch_model.bin"
)


'saved_bert_model/pytorch_model.bin'

# I) Take Majority Voting and Submit

In [32]:
p1 = pd.read_csv("/kaggle/working/submission.csv")
p2 = pd.read_csv("/kaggle/working/submission_text.csv")
p4 = pd.read_csv("/kaggle/working/submission_ki.csv")
p3 = pd.read_csv("/kaggle/working/submission_gru.csv")  

final = pd.DataFrame()
final["id"] = p1["id"]

for label in LABELS:
    votes = np.vstack([p1[label], p2[label], p3[label]]).T
    
    final[label] = (votes.sum(axis=1) >= 2).astype(int)

final.to_csv("submission_bert_textcnn_gru.csv", index=False)
print("Saved submission Final Ensemble (3-model)")

Saved submission Final Ensemble (3-model)
