In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import wandb
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from torch.utils.data import DataLoader, Dataset as TorchDataset
from transformers import AutoTokenizer
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
from kaggle_secrets import UserSecretsClient
import os
import warnings

warnings.filterwarnings("ignore")



# 1. Configuration & Globals


In [None]:

# Set to True for the first run - training and model upload
DO_TRAIN_AND_UPLOAD = True

# Set this to True to train on 100% of data (no validation split).
USE_FULL_DATA = False

# Per-label decision thresholds found in previous tuning runs.
# They are indexed by position, in the same order as `emotion_labels` below.
MANUAL_THRESHOLDS = [0.99, 0.54, 0.88, 0.71, 0.52]

config = {
    "MODEL_NAME": "bert-base-uncased", # Used ONLY for Tokenizer
    "TRAIN_FILE": "/kaggle/input/2025-sep-dl-gen-ai-project/train.csv",
    "TEST_FILE": "/kaggle/input/2025-sep-dl-gen-ai-project/test.csv",
    "VALIDATION_SPLIT_SIZE": 0.1,
    "EPOCHS": 21, # Increased epochs for scratch training
    "STARTING_LR": 1e-3, # Increased LR for scratch training
    "TRAIN_BATCH_SIZE": 32, # Increased batch size
    "EVAL_BATCH_SIZE": 32,
    "MAX_TOKEN_LENGTH": 128,
    "EMBEDDING_DIM": 300, # Dimension of custom embeddings
    "HIDDEN_DIM": 256,    # LSTM Hidden dimension
    "N_LAYERS": 2,        # Number of LSTM layers
    "DROPOUT": 0.3,
    "RANDOM_SEED": 42
}

# Seed the NumPy and PyTorch global RNGs so that weight initialisation, dropout
# masks and DataLoader shuffling are repeatable across "Restart & Run All".
# Previously RANDOM_SEED was only applied to the train/validation split.
np.random.seed(config["RANDOM_SEED"])
torch.manual_seed(config["RANDOM_SEED"])

# Defining labels for the task
emotion_labels = ['anger', 'fear', 'joy', 'sadness', 'surprise']
num_labels = len(emotion_labels)

# Prefer the GPU when one is visible to PyTorch, otherwise run on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 2. Data Loading & Preparation

In [None]:
# Loading data
# Fail fast with an explicit message when the training CSV is missing. The old
# `if os.path.exists(...)` guard silently skipped loading, and the cell then
# crashed a few lines later with a NameError on `all_train_df`.
if not os.path.exists(config["TRAIN_FILE"]):
    raise FileNotFoundError(f"Training file not found: {config['TRAIN_FILE']}")

all_train_df = pd.read_csv(config["TRAIN_FILE"])
test_df = pd.read_csv(config["TEST_FILE"])


if USE_FULL_DATA:
    # Train on everything; the empty frame keeps `df_val` defined for the prints below.
    df_train = all_train_df
    df_val = pd.DataFrame()
else:
    # Hold out a fixed, seeded fraction of the labelled data for validation.
    df_train, df_val = train_test_split(
        all_train_df,
        test_size=config["VALIDATION_SPLIT_SIZE"],
        random_state=config["RANDOM_SEED"]
    )

print(f"Training split shape: {df_train.shape}")
print(f"Validation split shape: {df_val.shape}")

In [None]:
# Per-label positive-class weights for the loss: (#negatives / #positives) on the
# training split. The 1e-6 in the denominator guards against a label with no positives.
total_train_samples = len(df_train)
positives_per_label = [df_train[name].sum() for name in emotion_labels]
pos_weights_list = [
    (total_train_samples - n_pos) / (n_pos + 1e-6)
    for n_pos in positives_per_label
]

pos_weight_tensor = torch.tensor(pos_weights_list, dtype=torch.float).to(device)
print(f"pos_weight vector: {pos_weights_list}")


In [None]:
# The tokenizer of the configured checkpoint is used only to map text to token IDs;
# this cell loads no model weights.
tokenizer_checkpoint = config["MODEL_NAME"]
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

In [None]:
def preprocess_function(batch_texts, batch_labels):
    """Tokenize a batch of texts and pack the labels into a float tensor.

    Args:
        batch_texts: list of raw strings.
        batch_labels: sequence of per-sample label vectors (e.g. NumPy rows).

    Returns:
        dict with "input_ids" and "attention_mask" (as returned by the tokenizer,
        padded/truncated to config["MAX_TOKEN_LENGTH"]) and "labels" (float32 tensor).
    """
    tokenized_inputs = tokenizer(
        batch_texts,
        truncation=True,
        padding="max_length",
        max_length=config["MAX_TOKEN_LENGTH"],
        return_tensors="pt"
    )
    # Stack the per-sample label vectors into one ndarray first: calling
    # torch.tensor() directly on a list of ndarrays converts element by element,
    # which is slow and raises a UserWarning (hidden here by the global filter).
    labels_array = np.asarray(batch_labels, dtype=np.float32)
    labels_tensor = torch.from_numpy(labels_array)
    return {
        "input_ids": tokenized_inputs["input_ids"],
        "attention_mask": tokenized_inputs["attention_mask"],
        "labels": labels_tensor
    }

In [None]:
class EmotionDataset(TorchDataset):
    """Dataset of raw texts, paired with label rows unless built in test mode.

    Items are `(text, labels)` tuples; in test mode `labels` is an empty list so
    that both modes yield two-element tuples.
    """

    def __init__(self, df, is_test=False):
        self.is_test = is_test
        self.texts = list(df['text'])
        # Label columns are only read for labelled (train/validation) frames.
        if not is_test:
            self.labels = df[emotion_labels].to_numpy()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        return (text, []) if self.is_test else (text, self.labels[idx])

In [None]:
def collate_fn(batch):
    """Collate `(text, labels)` samples into a tokenized batch via `preprocess_function`."""
    text_column, label_column = zip(*batch)
    return preprocess_function([*text_column], [*label_column])
def collate_fn_test(batch):
    """Collate unlabeled `(text, _)` samples into a tokenized batch without a "labels" entry."""
    batch_texts = [*next(zip(*batch))]
    tokenizer_kwargs = {
        "truncation": True,
        "padding": "max_length",
        "max_length": config["MAX_TOKEN_LENGTH"],
        "return_tensors": "pt",
    }
    encoded = tokenizer(batch_texts, **tokenizer_kwargs)
    # Forward only the two tensors the model's forward() accepts.
    return {key: encoded[key] for key in ("input_ids", "attention_mask")}

In [None]:
# Create DataLoaders
# Training batches are reshuffled every epoch.
train_loader = DataLoader(
    EmotionDataset(df_train),
    collate_fn=collate_fn,
    shuffle=True,
    batch_size=config["TRAIN_BATCH_SIZE"],
)

# A validation loader only exists when a hold-out split was made; the dataset is
# deliberately not constructed in full-data mode.
val_loader = None if USE_FULL_DATA else DataLoader(
    EmotionDataset(df_val),
    collate_fn=collate_fn,
    shuffle=False,
    batch_size=config["EVAL_BATCH_SIZE"],
)

# Test batches carry no labels, hence the dedicated collate function.
test_loader = DataLoader(
    EmotionDataset(test_df, is_test=True),
    collate_fn=collate_fn_test,
    shuffle=False,
    batch_size=config["EVAL_BATCH_SIZE"],
)

# 3. Custom Model Definition (Built from Scratch)

In [None]:
class CustomEmotionRNN(nn.Module):
    """Bidirectional LSTM classifier over token IDs, trained from scratch.

    Pipeline: embedding -> (bi)LSTM -> concat of the last layer's final forward
    and backward hidden states -> dropout -> Linear -> ReLU -> dropout -> Linear.
    The output is one raw logit per label (no sigmoid applied).

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output logits.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability (between LSTM layers and in the dense head).
        pad_idx: token ID of the padding token (its embedding is kept at zero).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout, pad_idx):
        super().__init__()

        # 1. Embedding Layer: Converts integer inputs to dense vectors
        # We train this from scratch
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        # 2. LSTM Layer: Processes the sequence
        # Bidirectional=True allows the model to look at past and future context
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=True,
            batch_first=True,
            # nn.LSTM only applies dropout between layers, so it is meaningless for 1 layer
            dropout=dropout if n_layers > 1 else 0
        )

        # 3. Fully Connected Layers
        # Input dim is hidden_dim * 2 because of bidirectional (forward + backward states)
        self.fc = nn.Linear(hidden_dim * 2, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(hidden_dim, output_dim)

    def forward(self, input_ids, attention_mask=None):
        """Compute logits for a batch of token-ID sequences.

        Args:
            input_ids: integer tensor [batch size, sent len].
            attention_mask: optional tensor [batch size, sent len] with 1 for real
                tokens and 0 for padding. Assumes right-padding (real tokens first),
                so the number of ones in a row is that row's true length. When
                omitted, every position is treated as a real token.

        Returns:
            Tensor [batch size, output_dim] of raw logits.
        """
        # embedded = [batch size, sent len, emb dim]
        embedded = self.embedding(input_ids)

        if attention_mask is not None:
            # Pack the batch so the LSTM stops at each sequence's true end. Without
            # this, inputs padded to max_length make the "final" hidden state the
            # state after running over all the padding positions.
            # pack_padded_sequence needs CPU int64 lengths >= 1.
            lengths = attention_mask.sum(dim=1).clamp(min=1).to(device="cpu", dtype=torch.int64)
            lstm_input = nn.utils.rnn.pack_padded_sequence(
                embedded, lengths, batch_first=True, enforce_sorted=False
            )
        else:
            lstm_input = embedded

        # hidden: [num layers * num directions, batch size, hid dim], in original batch order
        _, (hidden, _) = self.lstm(lstm_input)

        # We concatenate the final forward and backward hidden states
        # hidden[-2,:,:] is the last of the forward LSTM
        # hidden[-1,:,:] is the last of the backward LSTM
        hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))

        # Dense layers
        x = self.fc(hidden)
        x = self.relu(x)
        x = self.dropout(x)
        logits = self.out(x)

        return logits

# 4. Training Loop


### 1. WandB Init

In [None]:

if DO_TRAIN_AND_UPLOAD:
    # Experiment tracking is best-effort: on any failure (missing secret, no
    # network, ...) `run` stays None and the training loop skips its wandb.log calls.
    run = None
    try:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        api_key = user_secrets.get_secret("wandb_api")
        wandb.login(key=api_key)
        run = wandb.init(
            project="multi-label-emotion-bert",
            config=config,
            name="Custom-LSTM",
        )
    except Exception as init_error:
        print(f"WandB not initialized: {init_error}")

    print("\n--- Starting Model Training (From Scratch) ---")


### 2. Model Initialization

In [None]:
if DO_TRAIN_AND_UPLOAD:
    # The embedding table is sized by the tokenizer vocabulary, and the tokenizer's
    # pad token ID is used as the embedding's padding index.
    INPUT_DIM, PAD_IDX = tokenizer.vocab_size, tokenizer.pad_token_id

    # Positional order: vocab_size, embedding_dim, hidden_dim, output_dim,
    # n_layers, dropout, pad_idx.
    model = CustomEmotionRNN(
        INPUT_DIM,
        config["EMBEDDING_DIM"],
        config["HIDDEN_DIM"],
        num_labels,
        config["N_LAYERS"],
        config["DROPOUT"],
        PAD_IDX,
    )
    model = model.to(device)

### 3. Initialize optimizer, scheduler, and loss function

In [None]:
if DO_TRAIN_AND_UPLOAD:
    # Multi-label loss on raw logits, with positives re-weighted per label.
    loss_fct = nn.BCEWithLogitsLoss(pos_weight=pos_weight_tensor)

    optimizer = Adam(model.parameters(), lr=config["STARTING_LR"])

    # Halve the learning rate every 5 scheduler steps.
    scheduler = StepLR(optimizer, gamma=0.5, step_size=5)

    # Book-keeping for checkpoint selection and step-wise logging.
    best_val_f1, best_model_state, global_step = -1.0, None, 0

    # The per-label thresholds live on the same device as the probabilities they
    # are compared against.
    if MANUAL_THRESHOLDS:
        thresholds_tensor = torch.tensor(MANUAL_THRESHOLDS).to(device)

## Training Loop

In [None]:
if DO_TRAIN_AND_UPLOAD:

    def _snapshot_state(module):
        """Return a copy of `module`'s state dict whose tensors are detached clones.

        `state_dict().copy()` is only a shallow dict copy: its tensors still alias
        the live parameters, so later optimizer steps would silently overwrite the
        "best" checkpoint. Cloning each tensor freezes the weights as they are now.
        """
        return {name: tensor.detach().clone() for name, tensor in module.state_dict().items()}

    for epoch in range(config["EPOCHS"]):
        print(f"\n--- Starting Epoch {epoch+1}/{config['EPOCHS']} ---")
        model.train()

        train_preds_list = []
        train_labels_list = []
        total_loss = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            logits = model(input_ids, attention_mask=attention_mask)

            loss = loss_fct(logits, labels)

            # Backward pass + update; gradients are cleared right after the step
            # so the next iteration starts from zero.
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            total_loss += loss.item()

            probs = torch.sigmoid(logits)

            # Training metrics use the manual per-label thresholds when provided.
            if MANUAL_THRESHOLDS:
                preds = (probs > thresholds_tensor).int()
            else:
                preds = (probs > 0.5).int()

            train_preds_list.append(preds.detach().cpu().numpy())
            train_labels_list.append(labels.detach().cpu().numpy())

            # WandB Log Step
            if run is not None:
                current_lr = optimizer.param_groups[0]['lr']
                wandb.log({
                    "train/step_loss": loss.item(),
                    "train/learning_rate": current_lr,
                    "global_step": global_step
                })
            global_step += 1

        avg_train_loss = total_loss / len(train_loader)
        all_train_preds = np.concatenate(train_preds_list, axis=0)
        all_train_labels = np.concatenate(train_labels_list, axis=0)
        train_f1 = f1_score(all_train_labels, all_train_preds, average='macro', zero_division=0)
        train_acc = accuracy_score(all_train_labels, all_train_preds)

        print(f"  Average Training Loss: {avg_train_loss:.4f}")
        print(f"  Training Macro F1: {train_f1:.4f}")

        # WandB Log Epoch
        if run is not None:
            wandb.log({
                "train/epoch_loss": avg_train_loss,
                "train/macro_f1": train_f1,
                "train/accuracy": train_acc,
                "epoch": epoch + 1
            })

        # One scheduler step per epoch.
        scheduler.step()

        # Evaluation
        if not USE_FULL_DATA:
            model.eval()
            val_preds_list = []
            val_labels_list = []
            with torch.no_grad():
                for batch in val_loader:
                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    labels = batch['labels'].to(device)

                    logits = model(input_ids, attention_mask=attention_mask)
                    probs = torch.sigmoid(logits)
                    # NOTE(review): validation uses a fixed 0.5 cut-off while the training
                    # metrics and inference use MANUAL_THRESHOLDS - confirm this is intended.
                    preds = (probs > 0.5).int()

                    val_preds_list.append(preds.cpu().numpy())
                    val_labels_list.append(labels.cpu().numpy())

            val_preds = np.concatenate(val_preds_list, axis=0)
            val_labels = np.concatenate(val_labels_list, axis=0)
            val_f1 = f1_score(val_labels, val_preds, average='macro', zero_division=0)
            print(f"  Validation Macro F1: {val_f1:.4f}")

            # Keep an independent copy of the weights of the best epoch so far.
            if val_f1 > best_val_f1:
                best_val_f1 = val_f1
                best_model_state = _snapshot_state(model)
        else:
            # No validation signal: the most recent epoch is the one that gets saved.
            best_model_state = _snapshot_state(model)

    # Save Best Model
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    save_path = "./final_model.pt"
    torch.save(model.state_dict(), save_path)
    print(f"Model weights saved to {save_path}")

    if run is not None:
        wandb.finish()

# 5. Inference & Submission

In [None]:
print("\n--- Starting Inference ---")

INPUT_DIM = tokenizer.vocab_size

PAD_IDX = tokenizer.pad_token_id

# Rebuild the architecture so this cell also works when training was skipped
# (DO_TRAIN_AND_UPLOAD = False) and only the saved weights are available.
model = CustomEmotionRNN(
    vocab_size=INPUT_DIM,
    embedding_dim=config["EMBEDDING_DIM"],
    hidden_dim=config["HIDDEN_DIM"],
    output_dim=num_labels,
    n_layers=config["N_LAYERS"],
    dropout=config["DROPOUT"],
    pad_idx=PAD_IDX
).to(device)

try:
    model.load_state_dict(torch.load("./final_model.pt", map_location=device))
    print("Model loaded successfully.")
except Exception as e:
    # NOTE(review): execution continues with randomly initialised weights in this
    # case, so the resulting submission is not meaningful.
    print(f"Model load failed (likely no training run): {e}")

# Always switch to eval mode, even if loading failed; it used to sit inside the
# `try`, which left dropout active during prediction on the failure path.
model.eval()

test_probs = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # NOTE: Custom model returns logits directly as a tensor, not a dictionary object.
        logits = model(input_ids, attention_mask=attention_mask)
        sigmoid = torch.sigmoid(logits)
        test_probs.append(sigmoid.cpu())

test_probs = torch.cat(test_probs, dim=0).numpy()
final_preds = np.zeros(test_probs.shape, dtype=int)

# Apply Thresholds
# Fall back to 0.5 for every label when no manual thresholds are configured,
# mirroring the fallback the training loop already applies.
final_thresholds = MANUAL_THRESHOLDS if MANUAL_THRESHOLDS else [0.5] * num_labels
for i in range(num_labels):
    thresh = final_thresholds[i]
    final_preds[:, i] = (test_probs[:, i] > thresh).astype(int)

# Save Submission
submission_df = pd.DataFrame(final_preds, columns=emotion_labels)
submission_df.insert(0, 'id', test_df['id'])
submission_df.to_csv("submission.csv", index=False)
print("Submission file created: submission.csv")
submission_df.head()


--- Starting Inference ---
Model loaded successfully.


Predicting: 100%|██████████| 54/54 [00:00<00:00, 58.60it/s]


Submission file created: submission.csv
   id  anger  fear  joy  sadness  surprise
0   0      0     0    0        0         0
1   1      0     0    0        0         0
2   2      1     1    0        0         0
3   3      0     1    0        0         0
4   4      0     1    0        0         1
