In [6]:
# ============================================
# STEP 0: SETUP & GPU CHECK
# ============================================

import os
import re
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

# Pick the compute device: CUDA when a GPU is visible, otherwise CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"✅ Using device: {device}")

# Seed Python, NumPy and PyTorch with one shared value for reproducibility
SEED = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)


✅ Using device: cuda


In [7]:
# ============================================
# STEP 0–2: LOAD & CLEAN DATASET (CUSTOMIZED FOR EMPATHETIC DIALOGUES)
# ============================================

import os
import pandas as pd
import re
import glob

print("🔍 DEBUGGING: Searching for dataset files...\n")

# --------------------------------------------
# Locate all CSVs under /kaggle/input
# --------------------------------------------
input_path = "/kaggle/input"
csv_files = glob.glob(f"{input_path}/**/*.csv", recursive=True)

# Guard clause: nothing below can run without at least one CSV file
if not csv_files:
    raise FileNotFoundError("❌ No CSV files found in /kaggle/input! Please add dataset.")

# List each discovered file together with its on-disk size in MB
print(f"✅ Found {len(csv_files)} CSV file(s):")
for csv_file in csv_files:
    size_mb = os.path.getsize(csv_file) / 2 ** 20
    print(f"  📊 {csv_file} ({size_mb:.2f} MB)")
print()

# --------------------------------------------
# Load dataset(s)
# --------------------------------------------
# Read every discovered CSV; a file that fails to parse is reported and skipped
dfs = []
for csv_file in csv_files:
    try:
        temp_df = pd.read_csv(csv_file)
        loaded_name = os.path.basename(csv_file)
        print(f"✅ Loaded: {loaded_name} → shape {temp_df.shape}")
        dfs.append(temp_df)
    except Exception as e:
        print(f"⚠️ Skipping {csv_file} due to error: {e}")

if len(dfs) == 0:
    raise ValueError("❌ No valid CSV files could be loaded.")

# Stack all loaded frames into one frame with a fresh 0..n-1 index
df = pd.concat(dfs, ignore_index=True)
print(f"\n📦 Final combined dataset shape: {df.shape}")

print("\n🔎 Columns detected:")
print(df.columns.tolist())

# --------------------------------------------
# Manual column mapping (your dataset structure)
# --------------------------------------------
context_col, emotion_col = "Situation", "emotion"
customer_col, agent_col = "empathetic_dialogues", "labels"

# Safety check: abort on the first expected column that the frame lacks
for col in [context_col, emotion_col, customer_col, agent_col]:
    if col in df.columns:
        continue
    raise ValueError(f"❌ Column '{col}' not found in dataset! Check column names.")

print("\n🧭 Column mapping:")
for role_label, mapped_col in (
    ("Situation/Context", context_col),
    ("Customer", customer_col),
    ("Agent", agent_col),
    ("Emotion", emotion_col),
):
    print(f"  - {role_label}: {mapped_col}")

# --------------------------------------------
# Clean text safely (preserve important chars)
# --------------------------------------------
def normalize_text(text):
    """Lower-case and clean a piece of text for whitespace tokenization.

    Steps, in order:
      1. cast to ``str``, lower-case and strip the ends;
      2. surround each of ``? . ! ,`` with spaces so they become separate tokens;
      3. replace every run of characters other than ``a-z``, ``0-9``,
         ``? . ! ,``, apostrophes (``'`` and ``’``) and space with one space;
      4. collapse runs of whitespace into a single space.

    Whitespace is collapsed last because steps 2 and 3 both insert spaces;
    collapsing first left doubled spaces in the result.

    Args:
        text: Any value; it is converted with ``str()`` first.

    Returns:
        The cleaned string with single-space separators and no leading or
        trailing whitespace.
    """
    text = str(text).lower().strip()
    text = re.sub(r"([?.!,])", r" \1 ", text)      # space punctuation
    text = re.sub(r"[^a-z0-9?.!,’' ]+", " ", text) # keep numbers & apostrophes
    text = re.sub(r"\s+", " ", text)               # normalize spaces (last, see docstring)
    return text.strip()

text_cols = [context_col, customer_col, agent_col, emotion_col]

# --------------------------------------------
# Basic sanity checks
# --------------------------------------------
# Missing values must be handled BEFORE the string cast below: astype(str)
# turns NaN into the literal text "nan", which would hide the gaps from this
# check and put a bogus "nan" token into the data.
missing = int(df[text_cols].isnull().sum().sum())
if missing > 0:
    print(f"⚠️ Warning: Found {missing} missing text values. Filling with empty strings.")
    # Only the text columns are filled; other columns keep their own dtypes.
    df[text_cols] = df[text_cols].fillna("")

# Normalize the text columns in place
for col in text_cols:
    df[col] = df[col].astype(str).apply(normalize_text)

print("\n✅ Text normalization completed.")
print(f"🧾 Sample rows:\n{df[[emotion_col, context_col, customer_col, agent_col]].sample(2, random_state=42)}")

print("\n✅ Dataset is ready for tokenization.")
print(f"Total records: {len(df)}")

# Save variable names for later steps
COLUMNS = {
    "emotion": emotion_col,
    "context": context_col,
    "customer": customer_col,
    "agent": agent_col
}


🔍 DEBUGGING: Searching for dataset files...

✅ Found 1 CSV file(s):
  📊 /kaggle/input/empathetic-dialogues-facebook-ai/emotion-emotion_69k.csv (15.95 MB)

✅ Loaded: emotion-emotion_69k.csv → shape (64636, 7)

📦 Final combined dataset shape: (64636, 7)

🔎 Columns detected:
['Unnamed: 0', 'Situation', 'emotion', 'empathetic_dialogues', 'labels', 'Unnamed: 5', 'Unnamed: 6']

🧭 Column mapping:
  - Situation/Context: Situation
  - Customer: empathetic_dialogues
  - Agent: labels
  - Emotion: emotion

✅ Text normalization completed.
🧾 Sample rows:
            emotion                                          Situation  \
37054       content  my husband and i have been married 22 years . ...   
15618  anticipating                 i can't wait to get to the ocean !   

                                    empathetic_dialogues  \
37054  customer  no ,  just married kinda young ,  bu...   
15618  customer  no ,  i've never done that but my fa...   

                                                

# STEP 3: TOKENIZATION + VOCABULARY

In [8]:
# ============================================
# STEP 3: TOKENIZATION & VOCABULARY CREATION
# ============================================

from collections import Counter
from tqdm.notebook import tqdm

print("🔡 Building vocabulary from dataset...\n")

# Recover the column names recorded in COLUMNS by the loading step
emotion_col, context_col, customer_col, agent_col = (
    COLUMNS[role] for role in ("emotion", "context", "customer", "agent")
)

# Reserved tokens: padding, sequence start, sequence end, unknown word
SPECIAL_TOKENS = ["<pad>", "<bos>", "<eos>", "<unk>"]

def tokenize(sentence):
    """Split ``sentence`` on runs of whitespace and return the list of pieces."""
    tokens = sentence.split()
    return tokens

word_counts = Counter()

# Count the tokens of the full "prompt + reply" string, one dataset row at a time
row_fields = zip(df[emotion_col], df[context_col], df[customer_col], df[agent_col])
for emotion, context, customer, agent in tqdm(row_fields, total=len(df)):
    text = f"emotion: {emotion} | situation: {context} | customer: {customer} agent: {agent}"
    word_counts.update(tokenize(text))

# Minimum frequency filter (to avoid rare typos/noise)
min_freq = 2
vocab = list(SPECIAL_TOKENS)
vocab.extend(word for word, count in word_counts.items() if count >= min_freq)

# Forward and reverse lookup tables; an id is the word's position in `vocab`
word2idx = dict(zip(vocab, range(len(vocab))))
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

print(f"✅ Vocabulary built successfully!")
print(f"📊 Total unique words (min_freq ≥ {min_freq}): {len(vocab)}")

# Show sample tokens
print("\n🔤 Sample tokens:", vocab[:25])

# Basic stats: every counted word is either kept in `vocab` or filtered as rare
rare_words = len(word_counts) - (len(vocab) - len(SPECIAL_TOKENS))
print(f"⚙️ Rare words filtered out (<{min_freq}): {rare_words}")
print(f"📚 Final vocabulary size: {len(vocab)}")

# Save for later use
VOCAB = dict(vocab=vocab, word2idx=word2idx, idx2word=idx2word)


🔡 Building vocabulary from dataset...



  0%|          | 0/64636 [00:00<?, ?it/s]

✅ Vocabulary built successfully!
📊 Total unique words (min_freq ≥ 2): 17800

🔤 Sample tokens: ['<pad>', '<bos>', '<eos>', '<unk>', 'emotion:', 'sentimental', '|', 'situation:', 'i', 'remember', 'going', 'to', 'the', 'fireworks', 'with', 'my', 'best', 'friend', '.', 'there', 'was', 'a', 'lot', 'of', 'people']
⚙️ Rare words filtered out (<2): 2655
📚 Final vocabulary size: 17800


In [10]:
# ============================================================
# SAVE VOCABULARY TO FILE (for Streamlit app)
# ============================================================
import pickle

# Only the two lookup tables are written to disk. They go into a separate
# dict so that the global VOCAB is not rebound here: rebinding it with just
# two keys would drop any other key VOCAB already holds.
vocab_payload = {
    "word2idx": word2idx,
    "idx2word": idx2word
}

with open("/kaggle/working/vocab.pkl", "wb") as f:
    pickle.dump(vocab_payload, f)

print("✅ Vocabulary saved successfully at /kaggle/working/vocab.pkl")
print(f"🔤 Total vocab size: {len(word2idx)}")


✅ Vocabulary saved successfully at /kaggle/working/vocab.pkl
🔤 Total vocab size: 17800


# STEP 4: SPLITTING + ENCODING + PADDING

In [None]:
# ============================================
# STEP 4: DATA SPLIT + ENCODING + PADDING
# ============================================

import torch
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

print("🔀 Splitting dataset and encoding sequences...\n")

# ----------------------------------------------------
# Prepare Input (X) and Target (Y)
# ----------------------------------------------------
emotion_col, context_col, customer_col, agent_col = (
    COLUMNS[role] for role in ("emotion", "context", "customer", "agent")
)

inputs, targets = [], []

# One prompt string (ending in "agent:") and one reply string per dataset row
row_fields = zip(df[emotion_col], df[context_col], df[customer_col], df[agent_col])
for emotion, context, customer, agent in tqdm(row_fields, total=len(df)):
    inputs.append(f"emotion: {emotion} | situation: {context} | customer: {customer} agent:")
    targets.append(agent)

# ----------------------------------------------------
# Train/Val/Test Split (80/10/10)
# ----------------------------------------------------
# Hold out 20% first, then cut that hold-out in half for validation and test
X_train, X_temp, y_train, y_temp = train_test_split(
    inputs, targets, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

print(f"✅ Split complete:")
for split_label, split_items in (("Train: ", X_train), ("Val:   ", X_val), ("Test:  ", X_test)):
    print(f"  • {split_label}{len(split_items)}")

# ----------------------------------------------------
# Convert text → token IDs
# ----------------------------------------------------
def encode_sentence(sentence, word2idx, max_len=64):
    """Convert a sentence into a fixed-length list of token ids.

    The sentence is split on whitespace, unknown words map to ``<unk>``, and
    the ids are wrapped in ``<bos>`` ... ``<eos>``. Short sequences are
    right-padded with ``<pad>``. Long sentences are truncated *before* the
    special tokens are added, so ``<eos>`` is kept.

    Args:
        sentence: Whitespace-tokenizable text.
        word2idx: Mapping word -> id containing ``<pad>``, ``<bos>``,
            ``<eos>`` and ``<unk>``.
        max_len: Length of the returned list.

    Returns:
        A list of ``max_len`` ints.
    """
    unk_id = word2idx["<unk>"]
    # Reserve two slots for <bos>/<eos>; cutting after wrapping would drop <eos>.
    body_budget = max(max_len - 2, 0)
    body = [word2idx.get(tok, unk_id) for tok in sentence.split()][:body_budget]
    ids = [word2idx["<bos>"]] + body + [word2idx["<eos>"]]
    ids += [word2idx["<pad>"]] * (max_len - len(ids))  # no-op when already full
    return ids[:max_len]  # only trims when max_len < 2

max_len_input = max_len_output = 64

print("\n🔢 Encoding sequences...")

def encode_dataset(X, Y):
    """Encode parallel lists of prompt and reply strings as two id tensors.

    Each string goes through ``encode_sentence`` with the global vocabulary,
    using ``max_len_input`` for ``X`` and ``max_len_output`` for ``Y``.

    Returns:
        ``(X_tensor, Y_tensor)`` built with ``torch.tensor`` from the id lists.
    """
    lookup = VOCAB["word2idx"]
    X_encoded = [encode_sentence(src_text, lookup, max_len_input) for src_text in X]
    Y_encoded = [encode_sentence(tgt_text, lookup, max_len_output) for tgt_text in Y]
    return torch.tensor(X_encoded), torch.tensor(Y_encoded)

X_train_enc, y_train_enc = encode_dataset(X_train, y_train)
X_val_enc, y_val_enc = encode_dataset(X_val, y_val)
X_test_enc, y_test_enc = encode_dataset(X_test, y_test)

print("\n✅ Encoding complete!")
print(f"Train set tensor shapes: X={X_train_enc.shape}, Y={y_train_enc.shape}")

# ----------------------------------------------------
# Sample decoded check
# ----------------------------------------------------
def decode(ids):
    """Turn a list of token ids back into a space-joined string, skipping ``<pad>``."""
    pad_id = VOCAB["word2idx"]["<pad>"]
    lookup = VOCAB["idx2word"]
    return " ".join(lookup[token_id] for token_id in ids if token_id != pad_id)

# Round-trip the first training pair as a visual sanity check
print("\n🧾 Example check:")
first_input_ids = X_train_enc[0].tolist()
first_target_ids = y_train_enc[0].tolist()
print("Input :", decode(first_input_ids))
print("Target:", decode(first_target_ids))


# STEP 5: TRANSFORMER MODEL (Encoder–Decoder) From Scratch

In [None]:
# ============================================
# STEP 5: TRANSFORMER ENCODER–DECODER FROM SCRATCH
# ============================================

import torch
import torch.nn as nn
import math

# --------------------------------------------------------
# ✅ GPU SETUP
# --------------------------------------------------------
# Prefer the GPU when CUDA is available; fall back to the CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"⚙️ Using device: {device}")

# --------------------------------------------------------
# 🔹 Positional Encoding
# --------------------------------------------------------
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position signals to a batch of embeddings.

    A ``(1, max_len, d_model)`` table is precomputed once: even feature
    indices hold ``sin(pos * w)`` and odd indices hold ``cos(pos * w)``, with
    ``w = exp(-ln(10000) * 2i / d_model)``. The table is registered as a
    buffer, so it moves with the module but is not a trainable parameter.

    Args:
        d_model: Embedding width. Odd values are supported; the last sine
            column then has no matching cosine column.
        max_len: Number of positions precomputed.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With odd d_model there is one fewer odd column than even column,
        # so the frequency vector is trimmed to d_model // 2 entries.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
        self.register_buffer("pe", pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        ``x`` is indexed as (batch, seq_len, d_model): the table is sliced on
        dimension 1 and broadcast over dimension 0.
        """
        x = x + self.pe[:, :x.size(1), :]
        return x

# --------------------------------------------------------
# 🔹 Multi-Head Attention
# --------------------------------------------------------
class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention computed in ``num_heads`` parallel heads.

    ``d_model`` must be divisible by ``num_heads``; each head works on
    ``d_k = d_model // num_heads`` features.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_model, self.num_heads = d_model, num_heads
        self.d_k = d_model // num_heads

        # Input projections for queries, keys and values, plus the output mix
        self.linear_q = nn.Linear(d_model, d_model)
        self.linear_k = nn.Linear(d_model, d_model)
        self.linear_v = nn.Linear(d_model, d_model)
        self.linear_out = nn.Linear(d_model, d_model)

        self.scale = math.sqrt(self.d_k)

    def _split_heads(self, projected, batch_size):
        """Reshape (batch, seq, d_model) into (batch, heads, seq, d_k)."""
        per_head = projected.view(batch_size, -1, self.num_heads, self.d_k)
        return per_head.transpose(1, 2)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` over ``key``/``value``.

        Positions where ``mask == 0`` receive a score of ``-1e9`` before the
        softmax. Returns ``(output, attn)``: the projected context and the
        per-head attention weights.
        """
        batch_size = query.size(0)

        Q = self._split_heads(self.linear_q(query), batch_size)
        K = self._split_heads(self.linear_k(key), batch_size)
        V = self._split_heads(self.linear_v(value), batch_size)

        scores = torch.matmul(Q, K.transpose(-2, -1)) / self.scale
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attn = torch.softmax(scores, dim=-1)

        # Merge the heads back into one d_model-wide vector per position
        merged = torch.matmul(attn, V).transpose(1, 2).contiguous()
        merged = merged.view(batch_size, -1, self.d_model)
        return self.linear_out(merged), attn

# --------------------------------------------------------
# 🔹 Feed-Forward Network
# --------------------------------------------------------
class FeedForward(nn.Module):
    """Position-wise MLP: Linear(d_model -> d_ff), ReLU, dropout, Linear(d_ff -> d_model)."""

    def __init__(self, d_model, d_ff=512, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the two-layer network to the last dimension of ``x``."""
        hidden = torch.relu(self.fc1(x))
        hidden = self.dropout(hidden)
        return self.fc2(hidden)

# --------------------------------------------------------
# 🔹 Encoder Layer
# --------------------------------------------------------
class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward.

    Each sub-layer is wrapped as ``LayerNorm(x + dropout(sublayer(x)))``.
    """

    def __init__(self, d_model, num_heads, d_ff=512, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run ``x`` through the block; ``mask`` is passed to self-attention."""
        attended, _ = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attended))
        return self.norm2(x + self.dropout(self.ff(x)))

# --------------------------------------------------------
# 🔹 Decoder Layer
# --------------------------------------------------------
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention over the encoder
    output, then feed-forward.

    Each sub-layer is wrapped as ``LayerNorm(x + dropout(sublayer(x)))``.
    """

    def __init__(self, d_model, num_heads, d_ff=512, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask=None, tgt_mask=None):
        """Return ``(x, attn_weights)``; the weights come from cross-attention.

        ``tgt_mask`` restricts the self-attention and ``src_mask`` restricts
        the attention over ``enc_out``.
        """
        self_ctx, _ = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(self_ctx))

        cross_ctx, attn_weights = self.cross_attn(x, enc_out, enc_out, src_mask)
        x = self.norm2(x + self.dropout(cross_ctx))

        x = self.norm3(x + self.dropout(self.ff(x)))
        return x, attn_weights

# --------------------------------------------------------
# 🔹 Full Transformer
# --------------------------------------------------------
class TransformerChatbot(nn.Module):
    """Encoder-decoder Transformer that maps source token ids to target logits.

    One embedding table and one positional encoding are shared by the source
    and target sequences. The feed-forward width of every layer is
    ``2 * d_model``.

    Args:
        vocab_size: Number of rows in the embedding and size of the output layer.
        d_model: Embedding / hidden width.
        num_heads: Attention heads per layer.
        num_layers: Number of encoder layers and of decoder layers.
        dropout: Dropout probability used throughout.
        pad_idx: Id of the padding token; it is masked out of the source and
            used as the embedding ``padding_idx``.
    """

    def __init__(self, vocab_size, d_model=256, num_heads=2, num_layers=2, dropout=0.1, pad_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=pad_idx)
        self.pos_encoder = PositionalEncoding(d_model)
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_model*2, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_model*2, dropout) for _ in range(num_layers)])
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.d_model = d_model
        self.pad_idx = pad_idx

    def make_subsequent_mask(self, size, device=None):
        """Build a ``(size, size)`` lower-triangular boolean (causal) mask.

        Args:
            size: Target sequence length.
            device: Where to create the mask. Defaults to the device holding
                the embedding weights, so the mask follows the model instead
                of a notebook-level global.
        """
        if device is None:
            device = self.embedding.weight.device
        mask = torch.tril(torch.ones(size, size, device=device)).bool()
        return mask

    def forward(self, src, tgt):
        """Compute next-token logits for every target position.

        Args:
            src: Source token ids, indexed (batch, src_len).
            tgt: Decoder-input token ids, indexed (batch, tgt_len).

        Returns:
            The output of the final linear layer applied to the decoder states.
        """
        # True where the source holds a real token; broadcast over heads/queries
        src_mask = (src != self.pad_idx).unsqueeze(1).unsqueeze(2)
        # Causal mask only: padded target positions are not masked here
        tgt_mask = self.make_subsequent_mask(tgt.size(1), device=tgt.device)

        src_embed = self.dropout(self.pos_encoder(self.embedding(src)))
        tgt_embed = self.dropout(self.pos_encoder(self.embedding(tgt)))

        enc_out = src_embed
        for layer in self.encoder_layers:
            enc_out = layer(enc_out, src_mask)

        dec_out = tgt_embed
        for layer in self.decoder_layers:
            dec_out, _ = layer(dec_out, enc_out, src_mask, tgt_mask)

        logits = self.fc_out(dec_out)
        return logits

# --------------------------------------------------------
# 🔹 Model Initialization
# --------------------------------------------------------
# --------------------------------------------------------
# ✅ Ensure vocabulary variables are defined
# --------------------------------------------------------

# If you already have vocab built in memory, skip this block.
# Otherwise, reconstruct from your tokenizer step.

import os
import pickle

# Location the vocabulary-saving cell writes to
VOCAB_PATH = "/kaggle/working/vocab.pkl"

try:
    vocab
    word2idx
    idx2word
    print("✅ Vocabulary variables already exist in memory.")
except NameError:
    if os.path.exists(VOCAB_PATH):
        # Prefer the real vocabulary saved earlier over a placeholder.
        # NOTE: pickle.load can execute arbitrary code; only load files that
        # this notebook wrote itself.
        with open(VOCAB_PATH, "rb") as f:
            saved_vocab = pickle.load(f)
        word2idx = saved_vocab["word2idx"]
        idx2word = saved_vocab["idx2word"]
        # Rebuild the ordered word list from the id -> word table
        vocab = [idx2word[i] for i in range(len(idx2word))]
        print(f"✅ Vocabulary reloaded from {VOCAB_PATH} ({len(vocab)} tokens).")
    else:
        print("⚠️ vocab not found — reconstructing minimal example...")

        # Temporary fallback: special tokens only. A model built on this
        # vocabulary maps every word to <unk>, so it is a smoke test only.
        vocab = ["<pad>", "<bos>", "<eos>", "<unk>"]
        word2idx = {w: i for i, w in enumerate(vocab)}
        idx2word = {i: w for w, i in word2idx.items()}
VOCAB = {
    "vocab": vocab,
    "word2idx": word2idx,
    "idx2word": idx2word
}
vocab_size = len(VOCAB["word2idx"])
pad_idx = VOCAB["word2idx"]["<pad>"]

model = TransformerChatbot(
    vocab_size=vocab_size,
    d_model=256,
    num_heads=2,
    num_layers=2,
    dropout=0.1,
    pad_idx=pad_idx
).to(device)

print("\n✅ Model initialized successfully!")
print(model)


In [None]:
import torch
# Report whether CUDA is usable and how many devices PyTorch can see
cuda_ready = torch.cuda.is_available()
gpu_count = torch.cuda.device_count()
print("CUDA available:", cuda_ready)
print("Device count:", gpu_count)


# STEP 6: Testing the training code on a small subset of the dataset

In [None]:
# # ============================================
# # STEP 6 (FIXED): MINI TRAINING TEST (10% DATA)
# # ============================================

# import torch
# from torch.utils.data import Dataset, DataLoader
# import torch.nn as nn
# import torch.optim as optim
# import pandas as pd
# from tqdm.notebook import tqdm
# import gc

# # --------------------------------------------------------
# # ⚙️ CONFIGURATION
# # --------------------------------------------------------
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"🚀 Using device: {device}")

# BATCH_SIZE = 16
# EPOCHS = 1
# LR = 3e-4
# MAX_LEN = 64

# # --------------------------------------------------------
# # 🧾 LOAD MAIN DATAFRAME (already used earlier)
# # --------------------------------------------------------
# # if df not loaded in memory, reload from your dataset file:
# try:
#     df
# except NameError:
#     df = pd.read_csv("/kaggle/input/empathetic-dialogues-facebook-ai/emotion-emotion_69k.csv")

# print(f"📊 Loaded dataset shape: {df.shape}")

# # --------------------------------------------------------
# # 🔧 Define column mapping (based on your earlier debug)
# # --------------------------------------------------------
# COLUMNS = {
#     "emotion": "emotion",
#     "context": "Situation",
#     "customer": "empathetic_dialogues",
#     "agent": "labels",
# }

# # --------------------------------------------------------
# # 🪓 Split dataset: 80% train, 10% val, 10% test
# # --------------------------------------------------------
# from sklearn.model_selection import train_test_split

# train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
# val_df, test_df   = train_test_split(temp_df, test_size=0.5, random_state=42)

# print(f"✅ Splits created — Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")

# # --------------------------------------------------------
# # ✅ Take only 10% of the data for testing
# # --------------------------------------------------------
# small_train_df = train_df.sample(frac=0.1, random_state=42)
# small_val_df   = val_df.sample(frac=0.1, random_state=42)

# print(f"📦 Using subset — Train: {len(small_train_df)} | Val: {len(small_val_df)}")

# # --------------------------------------------------------
# # 🧩 Dataset Definition
# # --------------------------------------------------------
# class EmpatheticDataset(Dataset):
#     def __init__(self, df, word2idx, columns):
#         self.df = df.reset_index(drop=True)
#         self.word2idx = word2idx
#         self.columns = columns

#     def encode_text(self, text):
#         tokens = text.split()
#         ids = [self.word2idx.get(w, self.word2idx["<unk>"]) for w in tokens]
#         ids = [self.word2idx["<bos>"]] + ids[:MAX_LEN-2] + [self.word2idx["<eos>"]]
#         return torch.tensor(ids, dtype=torch.long)

#     def __getitem__(self, idx):
#         row = self.df.iloc[idx]
#         src_text = f"emotion: {row[self.columns['emotion']]} | situation: {row[self.columns['context']]} | customer: {row[self.columns['customer']]}"
#         tgt_text = f"agent: {row[self.columns['agent']]}"
#         return self.encode_text(src_text), self.encode_text(tgt_text)

#     def __len__(self):
#         return len(self.df)

# # --------------------------------------------------------
# # 🧰 Collate Function (for padding)
# # --------------------------------------------------------
# def collate_fn(batch):
#     src_batch, tgt_batch = zip(*batch)
#     src_batch = nn.utils.rnn.pad_sequence(src_batch, batch_first=True, padding_value=pad_idx)
#     tgt_batch = nn.utils.rnn.pad_sequence(tgt_batch, batch_first=True, padding_value=pad_idx)
#     return src_batch, tgt_batch

# # --------------------------------------------------------
# # 📦 DataLoaders
# # --------------------------------------------------------
# train_loader = DataLoader(EmpatheticDataset(small_train_df, word2idx, COLUMNS),
#                           batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

# val_loader = DataLoader(EmpatheticDataset(small_val_df, word2idx, COLUMNS),
#                         batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# print("✅ DataLoaders ready!")

# # --------------------------------------------------------
# # 🎯 Loss & Optimizer
# # --------------------------------------------------------
# criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
# optimizer = optim.Adam(model.parameters(), lr=LR)

# # --------------------------------------------------------
# # 🔥 Training Loop
# # --------------------------------------------------------
# def run_epoch(model, loader, optimizer=None):
#     total_loss = 0
#     train_mode = optimizer is not None
#     model.train() if train_mode else model.eval()

#     loop = tqdm(loader, desc="🧠 Training" if train_mode else "🧪 Validating", leave=False)
#     for src, tgt in loop:
#         src, tgt = src.to(device), tgt.to(device)
#         if train_mode:
#             optimizer.zero_grad()

#         output = model(src, tgt[:, :-1])
#         loss = criterion(output.reshape(-1, output.size(-1)), tgt[:, 1:].reshape(-1))
#         if train_mode:
#             loss.backward()
#             torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
#             optimizer.step()

#         total_loss += loss.item()
#         loop.set_postfix(loss=loss.item())

#     return total_loss / len(loader)

# # --------------------------------------------------------
# # 🚀 Run Mini Training
# # --------------------------------------------------------
# torch.cuda.empty_cache()
# gc.collect()

# print("\n===== 🚀 MINI TRAINING START =====")
# train_loss = run_epoch(model, train_loader, optimizer)
# val_loss = run_epoch(model, val_loader)
# print(f"\n✅ Done! Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")


# STEP 7 – Full GPU Training + Checkpoint Save

In [None]:
# ===============================================================
# STEP 7: Full GPU Training + Checkpoint Save (Deep Training + ETA)
# ===============================================================

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from tqdm.auto import tqdm
import time
from sklearn.model_selection import train_test_split
import os

# ---------------------------------------------------------------
# ⚙️ Device Setup
# ---------------------------------------------------------------
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"🚀 Using device: {device}")

# ---------------------------------------------------------------
# ✅ Check essential variables
# ---------------------------------------------------------------
# Each earlier step leaves one global behind; fail fast if any is missing
for required_name, failure_msg in (
    ("df", "❌ Dataset 'df' not found in memory!"),
    ("word2idx", "❌ Vocabulary not found — please rebuild vocabulary first!"),
    ("model", "❌ Model not found — please run Step 5 first!"),
):
    assert required_name in globals(), failure_msg

# ---------------------------------------------------------------
# 🔹 1. Dataset Split (80/10/10)
# ---------------------------------------------------------------
# 20% is held out first, then halved into validation and test
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
split_sizes = (len(train_df), len(val_df), len(test_df))
print("✅ Splits — Train: {} | Val: {} | Test: {}".format(*split_sizes))

# ---------------------------------------------------------------
# 🔹 2. Dataset Class
# ---------------------------------------------------------------
class EmpatheticDataset(Dataset):
    """Serves (source ids, target ids) pairs built from dialogue rows.

    The source text is ``"emotion: … | situation: … | customer: …"`` and the
    target text is the agent reply. Both are whitespace-tokenized, wrapped in
    ``<bos>``/``<eos>``, truncated and right-padded to ``max_len``.

    Args:
        dataframe: Frame holding the four text columns.
        word2idx: Mapping word -> id containing ``<pad>``, ``<bos>``,
            ``<eos>`` and ``<unk>``.
        max_len: Length of every returned tensor.
        columns: Optional mapping with any of the keys ``"emotion"``,
            ``"context"``, ``"customer"``, ``"agent"`` naming the frame's
            columns. Keys left out fall back to ``DEFAULT_COLUMNS``.
    """

    # Column names used when no explicit mapping is supplied
    DEFAULT_COLUMNS = {
        "emotion": "emotion",
        "context": "Situation",
        "customer": "empathetic_dialogues",
        "agent": "labels",
    }

    def __init__(self, dataframe, word2idx, max_len=64, columns=None):
        self.data = dataframe
        self.word2idx = word2idx
        self.max_len = max_len
        self.columns = dict(self.DEFAULT_COLUMNS)
        if columns:
            self.columns.update(columns)

    def encode_text(self, text):
        """Return ``<bos>`` + up to ``max_len - 2`` token ids + ``<eos>`` as a long tensor."""
        unk_id = self.word2idx["<unk>"]
        ids = [self.word2idx.get(tok, unk_id) for tok in text.split()]
        # max(..., 0) stops a tiny max_len from becoming a negative slice bound
        ids = ids[:max(self.max_len - 2, 0)]
        return torch.tensor(
            [self.word2idx["<bos>"]] + ids + [self.word2idx["<eos>"]],
            dtype=torch.long,
        )

    def pad_sequence(self, seq):
        """Right-pad ``seq`` with ``<pad>`` ids until it has ``max_len`` entries."""
        pad_len = self.max_len - len(seq)
        if pad_len > 0:
            # Explicit dtype keeps the padding integer so it matches `seq`
            padding = torch.full((pad_len,), self.word2idx["<pad>"], dtype=torch.long)
            seq = torch.cat([seq, padding])
        return seq

    def __getitem__(self, idx):
        row = self.data.iloc[idx]  # positional lookup, independent of the index labels
        cols = self.columns
        src = f"emotion: {row[cols['emotion']]} | situation: {row[cols['context']]} | customer: {row[cols['customer']]}"
        tgt = f"{row[cols['agent']]}"
        src_ids = self.pad_sequence(self.encode_text(src))
        tgt_ids = self.pad_sequence(self.encode_text(tgt))
        return src_ids, tgt_ids

    def __len__(self):
        return len(self.data)

# ---------------------------------------------------------------
# 🔹 3. DataLoaders
# ---------------------------------------------------------------
# Wrap the train/validation frames; only the training loader shuffles
train_dataset, val_dataset = (
    EmpatheticDataset(split_frame, word2idx) for split_frame in (train_df, val_df)
)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
print("✅ DataLoaders ready!")

# ---------------------------------------------------------------
# 🔹 4. Training Setup
# ---------------------------------------------------------------
pad_token_id = word2idx["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_token_id)  # padded targets add no loss
optimizer = optim.Adam(model.parameters(), lr=3e-4)
num_epochs = 10  # 🔥 deep training
save_path = "/kaggle/working/best_model.pt"
best_val_loss = float("inf")  # any finite validation loss beats this

# ---------------------------------------------------------------
# 🔹 5. Training Loop with ETA + Progress
# ---------------------------------------------------------------
print("\n🚀 Starting Deep Training...\n")
total_start_time = time.time()

# Upper bound on the global gradient norm; limits the size of any one update
max_grad_norm = 1.0

for epoch in range(1, num_epochs + 1):
    model.train()
    train_loss = 0.0
    epoch_start = time.time()

    with tqdm(total=len(train_loader), desc=f"🧠 Epoch {epoch}/{num_epochs}", unit="batch") as pbar:
        for src, tgt in train_loader:
            src, tgt = src.to(device), tgt.to(device)
            optimizer.zero_grad()
            # Teacher forcing: feed tgt[:, :-1], predict tgt[:, 1:]
            output = model(src, tgt[:, :-1])
            loss = criterion(output.reshape(-1, output.size(-1)), tgt[:, 1:].reshape(-1))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            train_loss += loss.item()
            pbar.set_postfix({"Train Loss": f"{loss.item():.4f}"})
            pbar.update(1)

    train_loss /= len(train_loader)

    # 🔹 Validation Phase (no gradients, dropout disabled by eval())
    model.eval()
    val_loss = 0.0
    with torch.no_grad(), tqdm(total=len(val_loader), desc=f"🔍 Validation {epoch}", unit="batch") as vbar:
        for src, tgt in val_loader:
            src, tgt = src.to(device), tgt.to(device)
            output = model(src, tgt[:, :-1])
            loss = criterion(output.reshape(-1, output.size(-1)), tgt[:, 1:].reshape(-1))
            val_loss += loss.item()
            vbar.set_postfix({"Val Loss": f"{loss.item():.4f}"})
            vbar.update(1)

    val_loss /= len(val_loader)
    # ETA assumes every remaining epoch takes as long as the one just finished
    epoch_time = time.time() - epoch_start
    remaining_time = epoch_time * (num_epochs - epoch)
    eta_min = remaining_time / 60

    print(f"📉 Epoch {epoch}/{num_epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | ⏱️ ETA: {eta_min:.1f} min")

    # 🔹 Save best model (lowest mean validation loss so far)
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), save_path)
        print(f"💾 Best model saved → {save_path}")

total_time = (time.time() - total_start_time) / 60
print(f"\n✅ Deep training completed! Total time: {total_time:.2f} min")
print(f"🏆 Best Validation Loss: {best_val_loss:.4f}")

# ---------------------------------------------------------------
# 🔹 6. Load Best Model for Inference
# ---------------------------------------------------------------
# map_location lets the checkpoint load even if it was written on another device
model.load_state_dict(torch.load(save_path, map_location=device))
model.eval()
print("✅ Best model loaded and ready for inference!")


# STEP 8: Inference / Response Generation

In [None]:
# ===============================================================
# STEP 8: Inference + Evaluation + Interactive Chat (Fixed)
# ===============================================================

import torch
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import math

# ---------------------------------------------------------------
# ⚙️ Setup
# ---------------------------------------------------------------
# Choose the device, then put the model in evaluation mode for inference
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.eval()
print(f"🚀 Using device: {device}")

# ---------------------------------------------------------------
# 🔹 Helper Functions
# ---------------------------------------------------------------
def encode_text(text, word2idx, max_len=64):
    """Convert a whitespace-separated string into a padded id tensor.

    Tokens missing from ``word2idx`` map to ``<unk>``. The token ids are cut
    to ``max_len - 2`` entries, wrapped in ``<bos>`` / ``<eos>`` and
    right-padded with ``<pad>``, giving a ``[1, max_len]`` tensor for
    ``max_len >= 2``. The result is moved to the notebook-level ``device``.
    """
    body = []
    for word in text.split():
        body.append(word2idx.get(word, word2idx["<unk>"]))
    body = body[:max_len - 2]

    sequence = [word2idx["<bos>"], *body, word2idx["<eos>"]]
    # A non-positive pad count multiplies to an empty list, i.e. no padding.
    sequence.extend([word2idx["<pad>"]] * (max_len - len(sequence)))
    return torch.tensor(sequence).unsqueeze(0).to(device)

def decode_ids(ids, idx2word):
    """Join the words for ``ids`` into one string, dropping special tokens.

    Special tokens (``<pad>``, ``<bos>``, ``<eos>``) are recognised by their
    string form through the ``idx2word`` mapping that is passed in, so the
    function no longer depends on a notebook-global ``word2idx``.

    Args:
        ids: iterable of integer token ids (Python ints or NumPy integers).
        idx2word: mapping from token id to token string.

    Returns:
        The remaining tokens joined by single spaces ("" when none remain).

    Raises:
        KeyError: if an id is not present in ``idx2word``.
    """
    special_tokens = {"<pad>", "<bos>", "<eos>"}
    # int() normalises NumPy integer scalars to plain ints before the lookup.
    words = (idx2word[int(token_id)] for token_id in ids)
    return " ".join(word for word in words if word not in special_tokens)

# ---------------------------------------------------------------
# 🔹 Improved Reply Generation (with Temperature + Top-k Sampling)
# ---------------------------------------------------------------
def generate_reply(emotion, situation, customer_text, max_len=50, temperature=0.8, top_k=20):
    """Generate a reply with temperature-scaled top-k sampling.

    Builds the source prompt from the three text fields, then decodes one
    token at a time starting from ``<bos>`` until ``<eos>`` is sampled or
    ``max_len`` tokens have been produced.

    Args:
        emotion, situation, customer_text: text fields placed in the prompt.
        max_len: maximum number of tokens to generate.
        temperature: divisor applied to the logits; must be > 0.
        top_k: number of highest-probability tokens to sample from. Clamped
            to the range ``[1, vocab_size]``.

    Returns:
        The decoded reply string with special tokens removed.
    """
    model.eval()
    idx2word = {v: k for k, v in word2idx.items()}
    eos_id = word2idx["<eos>"]

    with torch.no_grad():
        src = f"emotion: {emotion} | situation: {situation} | customer: {customer_text}"
        src_tensor = encode_text(src, word2idx)
        tgt_input = torch.tensor([[word2idx["<bos>"]]], device=device)

        for _ in range(max_len):
            output = model(src_tensor, tgt_input)        # [1, seq_len, vocab]
            logits = output[:, -1, :] / temperature      # take last token
            probs = F.softmax(logits, dim=-1)

            # top-k sampling; k is clamped so torch.topk never asks for more
            # candidates than the vocabulary holds (or fewer than one).
            k = max(1, min(top_k, probs.size(-1)))
            topk_probs, topk_indices = torch.topk(probs, k=k)
            choice = torch.multinomial(topk_probs, 1)       # [1, 1] position within the top-k
            next_token = topk_indices.gather(-1, choice)    # [1, 1] vocabulary id

            # next_token is already 2-D; adding another axis here would make
            # torch.cat fail against the 2-D tgt_input.
            tgt_input = torch.cat([tgt_input, next_token], dim=1)
            if next_token.item() == eos_id:
                break

        reply = decode_ids(tgt_input[0].tolist(), idx2word)
        return reply

# ---------------------------------------------------------------
# 🔹 Evaluation Metrics (BLEU + Perplexity)
# ---------------------------------------------------------------
def evaluate_model(model, data_loader, word2idx, idx2word):
    """Teacher-forced evaluation returning loss, perplexity and mean BLEU.

    For every batch the model receives ``tgt[:, :-1]`` and is scored against
    ``tgt[:, 1:]`` with ``<pad>`` positions ignored. BLEU compares the
    per-position argmax predictions with the reference sentence.

    Args:
        model: the sequence-to-sequence model, called as ``model(src, tgt_in)``.
        data_loader: iterable of ``(src, tgt)`` id-tensor batches.
        word2idx: token -> id mapping (used for the ``<pad>`` id).
        idx2word: id -> token mapping used for decoding.

    Returns:
        ``(avg_loss, perplexity, avg_bleu)`` where ``avg_loss`` is the mean of
        the per-batch losses and ``perplexity`` is ``exp(avg_loss)``.
    """
    model.eval()
    pad_id = word2idx["<pad>"]
    criterion = torch.nn.CrossEntropyLoss(ignore_index=pad_id)
    smoothie = SmoothingFunction().method4  # loop-invariant, build once
    total_loss = 0
    total_bleu = []

    with torch.no_grad():
        for src, tgt in tqdm(data_loader, desc="📊 Evaluating"):
            src, tgt = src.to(device), tgt.to(device)
            tgt_out = tgt[:, 1:]
            output = model(src, tgt[:, :-1])
            loss = criterion(output.reshape(-1, output.size(-1)), tgt_out.reshape(-1))
            total_loss += loss.item()

            # BLEU score
            pred_tokens = output.argmax(dim=-1)
            # The loss ignores padded target positions, so predictions there
            # are untrained noise; drop them from the hypothesis.
            keep = tgt_out != pad_id
            for i in range(pred_tokens.size(0)):
                ref = decode_ids(tgt[i].tolist(), idx2word).split()
                hyp = decode_ids(pred_tokens[i][keep[i]].tolist(), idx2word).split()
                bleu = sentence_bleu([ref], hyp, smoothing_function=smoothie)
                total_bleu.append(bleu)

    avg_loss = total_loss / len(data_loader)
    perplexity = math.exp(avg_loss)
    avg_bleu = np.mean(total_bleu)
    return avg_loss, perplexity, avg_bleu

# ---------------------------------------------------------------
# 🔹 Run Evaluation
# ---------------------------------------------------------------
print("\n📊 Running evaluation on validation set...")
# Invert the vocabulary once so predicted ids can be turned back into words.
idx2word = {index: token for token, index in word2idx.items()}
metrics = evaluate_model(model, val_loader, word2idx, idx2word)
val_loss, val_ppl, val_bleu = metrics
print("✅ Evaluation Complete!")
print(f"📉 Val Loss: {val_loss:.4f} | 🤯 Perplexity: {val_ppl:.2f} | 🏆 BLEU: {val_bleu:.4f}")

# # ---------------------------------------------------------------
# # 🔹 Interactive Chat Loop
# # ---------------------------------------------------------------
# print("\n======================")
# print("💬 Empathetic Chatbot Ready!")
# print("======================")
# print("Type 'exit' to stop chatting.\n")

# while True:
#     emo = input("Emotion: ").strip()
#     if emo.lower() == "exit":
#         break
#     sit = input("Situation: ").strip()
#     if sit.lower() == "exit":
#         break
#     cust = input("You: ").strip()
#     if cust.lower() == "exit":
#         break

#     reply = generate_reply(emo, sit, cust)
#     print(f"🤖 Chatbot: {reply}\n")


In [None]:
# ===============================================================
# STEP 8: Chatbot Inference / Response Generation
# ===============================================================

import torch
import torch.nn.functional as F

# Pick the GPU when CUDA is available, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Put the network into evaluation mode before any inference call.
model.eval()
print(f"💬 Using device: {device}")

# ---------------------------------------------------------------
# 🔹 Helper: Convert text → tensor
# ---------------------------------------------------------------
def encode_text(text, word2idx, max_len=64):
    """Convert a whitespace-separated string into a padded id tensor.

    Tokens missing from ``word2idx`` map to ``<unk>``. The token ids are cut
    to ``max_len - 2`` entries, wrapped in ``<bos>`` / ``<eos>`` and
    right-padded with ``<pad>``, giving a ``[1, max_len]`` tensor for
    ``max_len >= 2``. The result is moved to the notebook-level ``device``.
    """
    body = []
    for word in text.split():
        body.append(word2idx.get(word, word2idx["<unk>"]))
    body = body[:max_len - 2]

    sequence = [word2idx["<bos>"], *body, word2idx["<eos>"]]
    # A non-positive pad count multiplies to an empty list, i.e. no padding.
    sequence.extend([word2idx["<pad>"]] * (max_len - len(sequence)))
    return torch.tensor(sequence).unsqueeze(0).to(device)

# ---------------------------------------------------------------
# 🔹 Helper: Decode tensor → text
# ---------------------------------------------------------------
def decode_ids(ids, idx2word):
    """Join the words for ``ids`` into one string, dropping special tokens.

    Special tokens (``<pad>``, ``<bos>``, ``<eos>``) are recognised by their
    string form through the ``idx2word`` mapping that is passed in, so the
    function no longer depends on a notebook-global ``word2idx``.

    Args:
        ids: iterable of integer token ids (Python ints or NumPy integers).
        idx2word: mapping from token id to token string.

    Returns:
        The remaining tokens joined by single spaces ("" when none remain).

    Raises:
        KeyError: if an id is not present in ``idx2word``.
    """
    special_tokens = {"<pad>", "<bos>", "<eos>"}
    # int() normalises NumPy integer scalars to plain ints before the lookup.
    words = (idx2word[int(token_id)] for token_id in ids)
    return " ".join(word for word in words if word not in special_tokens)

# ---------------------------------------------------------------
# 🔹 Generate Response
# ---------------------------------------------------------------
def generate_reply(emotion, situation, customer_text, max_len=50):
    """Produce a reply by greedy (argmax) decoding.

    The prompt is assembled from the three text fields and encoded once;
    tokens are then appended one at a time, starting from ``<bos>``, until
    ``<eos>`` is chosen or ``max_len`` tokens have been generated. Special
    tokens are stripped from the returned string.
    """
    model.eval()
    with torch.no_grad():
        prompt = f"emotion: {emotion} | situation: {situation} | customer: {customer_text}"
        encoder_input = encode_text(prompt, word2idx)
        generated = torch.tensor([[word2idx["<bos>"]]]).to(device)

        steps_taken = 0
        while steps_taken < max_len:
            logits = model(encoder_input, generated)
            # Highest-scoring token at the last position, shaped [1, 1].
            best = logits[:, -1, :].argmax(dim=-1).unsqueeze(0)
            generated = torch.cat([generated, best], dim=1)
            if best.item() == word2idx["<eos>"]:
                break
            steps_taken += 1

        inverse_vocab = {v: k for k, v in word2idx.items()}
        return decode_ids(generated[0].tolist(), inverse_vocab)

# ---------------------------------------------------------------
# 🔹 Test the chatbot
# ---------------------------------------------------------------
# One hand-written prompt used as a quick smoke test of the decoder.
example_emotion, example_situation = "sad", "lost my favorite item"
example_customer = "I can't find it anywhere, I feel terrible."

response = generate_reply(
    example_emotion,
    example_situation,
    example_customer,
)
print(f"\n🧍‍♂️ User: {example_customer}")
print(f"🤖 Chatbot: {response}")


# 11 Examples

In [None]:
# ===============================================================
# STEP 8: Chatbot Inference / Response Generation (with 11 examples)
# ===============================================================

import torch
import torch.nn.functional as F

# Pick the GPU when CUDA is available, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Put the network into evaluation mode before any inference call.
model.eval()
print(f"💬 Using device: {device}")

# ---------------------------------------------------------------
# 🔹 Helper: Convert text → tensor
# ---------------------------------------------------------------
def encode_text(text, word2idx, max_len=64):
    """Convert a whitespace-separated string into a padded id tensor.

    Tokens missing from ``word2idx`` map to ``<unk>``. The token ids are cut
    to ``max_len - 2`` entries, wrapped in ``<bos>`` / ``<eos>`` and
    right-padded with ``<pad>``, giving a ``[1, max_len]`` tensor for
    ``max_len >= 2``. The result is moved to the notebook-level ``device``.
    """
    body = []
    for word in text.split():
        body.append(word2idx.get(word, word2idx["<unk>"]))
    body = body[:max_len - 2]

    sequence = [word2idx["<bos>"], *body, word2idx["<eos>"]]
    # A non-positive pad count multiplies to an empty list, i.e. no padding.
    sequence.extend([word2idx["<pad>"]] * (max_len - len(sequence)))
    return torch.tensor(sequence).unsqueeze(0).to(device)

# ---------------------------------------------------------------
# 🔹 Helper: Decode tensor → text
# ---------------------------------------------------------------
def decode_ids(ids, idx2word):
    """Join the words for ``ids`` into one string, dropping special tokens.

    Special tokens (``<pad>``, ``<bos>``, ``<eos>``) are recognised by their
    string form through the ``idx2word`` mapping that is passed in, so the
    function no longer depends on a notebook-global ``word2idx``.

    Args:
        ids: iterable of integer token ids (Python ints or NumPy integers).
        idx2word: mapping from token id to token string.

    Returns:
        The remaining tokens joined by single spaces ("" when none remain).

    Raises:
        KeyError: if an id is not present in ``idx2word``.
    """
    special_tokens = {"<pad>", "<bos>", "<eos>"}
    # int() normalises NumPy integer scalars to plain ints before the lookup.
    words = (idx2word[int(token_id)] for token_id in ids)
    return " ".join(word for word in words if word not in special_tokens)

# ---------------------------------------------------------------
# 🔹 Generate Response
# ---------------------------------------------------------------
def generate_reply(emotion, situation, customer_text, max_len=50):
    """Produce a reply by greedy (argmax) decoding.

    The prompt is assembled from the three text fields and encoded once;
    tokens are then appended one at a time, starting from ``<bos>``, until
    ``<eos>`` is chosen or ``max_len`` tokens have been generated. Special
    tokens are stripped from the returned string.
    """
    model.eval()
    with torch.no_grad():
        prompt = f"emotion: {emotion} | situation: {situation} | customer: {customer_text}"
        encoder_input = encode_text(prompt, word2idx)
        decoded = torch.tensor([[word2idx["<bos>"]]]).to(device)

        for _step in range(max_len):
            scores = model(encoder_input, decoded)
            # Highest-scoring token at the last position, shaped [1, 1].
            chosen = scores[:, -1, :].argmax(dim=-1).unsqueeze(0)
            decoded = torch.cat([decoded, chosen], dim=1)
            reached_end = chosen.item() == word2idx["<eos>"]
            if reached_end:
                break

        id_to_token = {v: k for k, v in word2idx.items()}
        return decode_ids(decoded[0].tolist(), id_to_token)


# ---------------------------------------------------------------
# 🔹 Test the chatbot with 11 diverse examples
# ---------------------------------------------------------------
# (emotion, situation, customer utterance) triples fed to the chatbot below.
test_examples = [
    ("sad", "lost my favorite item", "I can't find it anywhere, I feel terrible."),
    ("angry", "argument with a friend", "He was so rude to me, I can’t believe it."),
    ("happy", "got a promotion at work", "I just got promoted today!"),
    ("afraid", "going for a medical test", "I’m scared of what the doctor will say."),
    ("disappointed", "failed an exam", "I studied so hard but still didn’t pass."),
    ("surprised", "unexpected gift", "My friend sent me a present out of nowhere!"),
    ("lonely", "moved to a new city", "I don’t know anyone here yet, it feels empty."),
    ("grateful", "someone helped me", "She stayed up late just to help me finish my work."),
    ("embarrassed", "made a mistake in public", "Everyone saw me trip over the stairs."),
    ("stressed", "too much workload", "I have so many deadlines, I can’t keep up."),
    ("hopeful", "starting a new job", "I really want to do well in this new role."),
]

print("\n==============================")
print("💬 Testing Chatbot Responses")
print("==============================")

# Generate one reply per example and print it next to its prompt fields.
for i, (emo, sit, cust) in enumerate(test_examples, start=1):
    response = generate_reply(emo, sit, cust)
    print(
        f"\n🧍‍♂️ Example {i}\n"
        f"Emotion: {emo}\n"
        f"Situation: {sit}\n"
        f"You: {cust}\n"
        f"🤖 Chatbot: {response}"
    )
