In [1]:
# Task 1
import pandas as pd
import re
import os
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, normalizers
from tokenizers.normalizers import Lowercase, NFD, StripAccents
from sklearn.model_selection import train_test_split

# Load Dataset

data_path = "/kaggle/input/empathetic-dialogues-facebook-ai/emotion-emotion_69k.csv"
df = pd.read_csv(data_path)

print("Columns:", df.columns.tolist())
print("Total rows:", len(df))
df = df.rename(columns=str.strip)  


def normalize_text(text):
    text = str(text).lower().strip()
    text = re.sub(r"\s+", " ", text)  # normalize whitespace
    text = re.sub(r"([?.!,¿])", r" \1 ", text)  # space around punctuation
    text = re.sub(r"[^a-zA-Z?.!,¿']+", " ", text)  # remove non-alphabetic
    return text.strip()

for col in ["Situation", "empathetic_dialogues", "labels"]:
    df[col] = df[col].apply(normalize_text)

texts = []
for _, row in df.iterrows():
    input_text = f"<emotion_{row['emotion']}> <bos> {row['Situation']} <sep> {row['empathetic_dialogues']}"
    target_text = f"<bos> {row['labels']} <eos>"
    texts.append(input_text)
    texts.append(target_text)

tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

special_tokens = ["<pad>", "<bos>", "<eos>", "<unk>", "<sep>"] + [
    f"<emotion_{emo}>" for emo in df["emotion"].unique()
]

trainer = trainers.BpeTrainer(vocab_size=2000, special_tokens=special_tokens)
tokenizer.train_from_iterator(texts, trainer=trainer)

os.makedirs("tokenizer", exist_ok=True)
tokenizer.save("tokenizer/empathetic_tokenizer.json")
print("✅ Tokenizer trained and saved at tokenizer/empathetic_tokenizer.json")
print("Vocab size:", tokenizer.get_vocab_size())

train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

print(f"Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")

train_df.to_csv("train.csv", index=False)
val_df.to_csv("val.csv", index=False)
test_df.to_csv("test.csv", index=False)

print("✅ Data splits saved: train.csv, val.csv, test.csv")


Columns: ['Unnamed: 0', 'Situation', 'emotion', 'empathetic_dialogues', 'labels', 'Unnamed: 5', 'Unnamed: 6']
Total rows: 64636



✅ Tokenizer trained and saved at tokenizer/empathetic_tokenizer.json
Vocab size: 2000
Train: 51708 | Val: 6464 | Test: 6464
✅ Data splits saved: train.csv, val.csv, test.csv


In [2]:
# Task 2
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from tokenizers import Tokenizer

train_df = pd.read_csv("/kaggle/working/train.csv")
val_df = pd.read_csv("/kaggle/working/val.csv")
test_df = pd.read_csv("/kaggle/working/test.csv")

for df in [train_df, val_df, test_df]:
    df.columns = df.columns.str.strip()
    df.fillna("", inplace=True)

print(f"✅ Data Loaded — Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")


tokenizer = Tokenizer.from_file("/kaggle/working/tokenizer/empathetic_tokenizer.json")
pad_id = tokenizer.token_to_id("<pad>")
bos_id = tokenizer.token_to_id("<bos>")
eos_id = tokenizer.token_to_id("<eos>")
sep_token = "<sep>"

print(f"✅ Tokenizer loaded — Vocab size: {tokenizer.get_vocab_size()}")

def build_input(row):
    """Builds the input (X) sequence in a clear empathetic structure."""
    return (
        f"Emotion: {row['emotion']} | "
        f"Situation: {row['Situation']} | "
        f"Customer: {row['empathetic_dialogues']} Agent:"
    )

def build_target(row):
    """Builds the target (Y) sequence (Agent response)."""
    return str(row["labels"]).strip()

for df in [train_df, val_df, test_df]:
    df["X"] = df.apply(build_input, axis=1)
    df["Y"] = df.apply(build_target, axis=1)

MAX_LEN = 128  

def encode(text, add_special_tokens=True):
    """Tokenizes and pads text sequences to fixed MAX_LEN."""
    text = str(text)
    if add_special_tokens:
        text = f"<bos> {text} <eos>"
    tokens = tokenizer.encode(text)
    ids = tokens.ids[:MAX_LEN]
    if len(ids) < MAX_LEN:
        ids += [pad_id] * (MAX_LEN - len(ids))
    return torch.tensor(ids, dtype=torch.long)

class EmpatheticDataset(Dataset):
    def __init__(self, df):
        self.inputs = df["X"].tolist()
        self.targets = df["Y"].tolist()

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        x_ids = encode(self.inputs[idx])
        y_ids = encode(self.targets[idx])
        return {"input_ids": x_ids, "target_ids": y_ids}

train_dataset = EmpatheticDataset(train_df)
val_dataset = EmpatheticDataset(val_df)
test_dataset = EmpatheticDataset(test_df)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=32, drop_last=False)
test_loader = DataLoader(test_dataset, batch_size=32, drop_last=False)

print(f"✅ Dataloaders ready — Train: {len(train_loader)} | Val: {len(val_loader)} | Test: {len(test_loader)}")

sample = next(iter(train_loader))
idx = 0  # pick first sample in batch
print("\n✅ Example:")
print("Input:", train_df['X'].iloc[idx])
print("Target:", train_df['Y'].iloc[idx])
print("input_ids shape:", sample["input_ids"].shape)
print("target_ids shape:", sample["target_ids"].shape)
print("✅ Encoding verified and meaningful!")

  df.fillna("", inplace=True)


✅ Data Loaded — Train: 51708 | Val: 6464 | Test: 6464
✅ Tokenizer loaded — Vocab size: 2000
✅ Dataloaders ready — Train: 1615 | Val: 202 | Test: 202

✅ Example:
Input: Emotion: nostalgic | Situation: i had to go buy legos for my nephew the other day . makes me miss the days when my girls were young enough to play with them . | Customer: customer were you embarrassed or what happend ? agent Agent:
Target: no just this feeling overcame me that my kids just have outgrown this time .
input_ids shape: torch.Size([32, 128])
target_ids shape: torch.Size([32, 128])
✅ Encoding verified and meaningful!


In [3]:
# Task 3
import math
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from tokenizers import Tokenizer

def create_padding_mask(seq, pad_id):
    return (seq == pad_id).unsqueeze(1).unsqueeze(2)

def create_look_ahead_mask(size):
    return torch.triu(torch.ones((size, size), dtype=torch.bool), diagonal=1)

def scaled_dot_product_attention(q, k, v, mask=None, dropout=None):
    dk = q.size(-1)
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(dk)
    if mask is not None:
        scores = scores.masked_fill(mask, float("-inf"))
    attn = F.softmax(scores, dim=-1)
    if dropout is not None:
        attn = dropout(attn)
    out = torch.matmul(attn, v)
    return out, attn

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        self.d_model = d_model
        self.max_len = max_len

        # Precompute default positions up to max_len
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))  # (1, max_len, d_model)

    def forward(self, x):
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            # 🔧 Dynamically expand positional encodings if sequence is longer
            pe = torch.zeros(seq_len, self.d_model, device=x.device)
            position = torch.arange(0, seq_len, device=x.device).unsqueeze(1)
            div_term = torch.exp(torch.arange(0, self.d_model, 2, device=x.device) * (-math.log(10000.0) / self.d_model))
            pe[:, 0::2] = torch.sin(position * div_term)
            pe[:, 1::2] = torch.cos(position * div_term)
            pe = pe.unsqueeze(0)
            return x + pe
        else:
            return x + self.pe[:, :seq_len, :].to(x.device)

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        assert d_model % num_heads == 0
        self.num_heads = num_heads
        self.depth = d_model // num_heads
        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)
        self.fc = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def split_heads(self, x):
        B, T, C = x.size()
        return x.view(B, T, self.num_heads, self.depth).permute(0, 2, 1, 3)

    def combine_heads(self, x):
        x = x.permute(0, 2, 1, 3).contiguous()
        B, T, _, _ = x.size()
        return x.view(B, T, self.num_heads * self.depth)

    def forward(self, q, k, v, mask=None):
        q = self.split_heads(self.wq(q))
        k = self.split_heads(self.wk(k))
        v = self.split_heads(self.wv(v))
        if mask is not None and mask.dtype != torch.bool:
            mask = mask.bool()
        out, attn = scaled_dot_product_attention(q, k, v, mask, dropout=self.dropout)
        out = self.combine_heads(out)
        return self.fc(out), attn

class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        return self.linear2(self.dropout(F.relu(self.linear1(x))))


class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, num_heads, dropout)
        self.ffn = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        attn_out, _ = self.mha(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_out))
        ffn_out = self.ffn(x)
        x = self.norm2(x + self.dropout(ffn_out))
        return x

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_mha = MultiHeadAttention(d_model, num_heads, dropout)
        self.cross_mha = MultiHeadAttention(d_model, num_heads, dropout)
        self.ffn = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, tgt_mask, src_mask):
        attn1, _ = self.self_mha(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn1))
        attn2, _ = self.cross_mha(x, enc_out, enc_out, src_mask)
        x = self.norm2(x + self.dropout(attn2))
        ffn_out = self.ffn(x)
        x = self.norm3(x + self.dropout(ffn_out))
        return x

class Encoder(nn.Module):
    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_len):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model, max_len)
        self.layers = nn.ModuleList([
            EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)
        ])
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, mask):
        x = self.emb(src) * math.sqrt(self.emb.embedding_dim)
        x = self.dropout(self.pos_enc(x))
        for layer in self.layers:
            x = layer(x, mask)
        return x

class Decoder(nn.Module):
    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_len):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model, max_len)
        self.layers = nn.ModuleList([
            DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)
        ])
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, tgt, enc_out, tgt_mask, src_mask):
        x = self.emb(tgt) * math.sqrt(self.emb.embedding_dim)
        x = self.dropout(self.pos_enc(x))
        for layer in self.layers:
            x = layer(x, enc_out, tgt_mask, src_mask)
        return self.fc_out(x)


class Transformer(nn.Module):
    def __init__(self, vocab_size, d_model=256, num_heads=2, num_encoder_layers=2,
                 num_decoder_layers=2, d_ff=1024, dropout=0.2, max_len=128, pad_id=0):
        super().__init__()
        self.encoder = Encoder(vocab_size, d_model, num_encoder_layers, num_heads, d_ff, dropout, max_len)
        self.decoder = Decoder(vocab_size, d_model, num_decoder_layers, num_heads, d_ff, dropout, max_len)
        self.pad_id = pad_id

    def make_src_mask(self, src):
        return create_padding_mask(src, self.pad_id)

    def make_tgt_mask(self, tgt):
        B, T = tgt.size()
        pad_mask = create_padding_mask(tgt, self.pad_id)
        look_ahead = create_look_ahead_mask(T).to(tgt.device)
        look_ahead = look_ahead.unsqueeze(0).unsqueeze(1)
        return pad_mask | look_ahead

    def forward(self, src, tgt):
        src_mask = self.make_src_mask(src)
        tgt_mask = self.make_tgt_mask(tgt)
        enc_out = self.encoder(src, src_mask)
        logits = self.decoder(tgt, enc_out, tgt_mask, src_mask)
        return logits

if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    vocab_size = tokenizer.get_vocab_size()

    print(f"✅ Device: {device}")
    print(f"✅ Vocab Size: {vocab_size} | pad_id: {pad_id}")

    model = Transformer(
        vocab_size=vocab_size,
        d_model=256,
        num_heads=2,
        num_encoder_layers=2,
        num_decoder_layers=2,
        d_ff=1024,
        dropout=0.2,
        max_len=128,
        pad_id=pad_id
    ).to(device)

    # Test one batch
    sample = next(iter(train_loader))
    src, tgt = sample["input_ids"].to(device), sample["target_ids"].to(device)
    out = model(src, tgt)
    print("✅ Forward pass successful — output shape:", out.shape)
    # Expected: (batch_size, seq_len, vocab_size)

✅ Device: cuda
✅ Vocab Size: 2000 | pad_id: 0
✅ Forward pass successful — output shape: torch.Size([32, 128, 2000])


In [4]:
pip install sacrebleu rouge-score --quiet

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
Note: you may need to restart the kernel to use updated packages.


In [5]:
# Task 4

import torch
from torch.optim import Adam
from tqdm import tqdm
import math
import sacrebleu
from rouge_score import rouge_scorer
import torch.nn.functional as F

def compute_bleu(preds, refs):
    return sacrebleu.corpus_bleu(preds, [refs]).score / 100

def compute_rougeL(preds, refs):
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    scores = [scorer.score(r, p)['rougeL'].fmeasure for p, r in zip(preds, refs)]
    return sum(scores)/len(scores) if scores else 0.0

def compute_chrf(preds, refs):
    return sacrebleu.corpus_chrf(preds, [refs]).score


def generate(model, input_ids, tokenizer, max_len=60, device="cuda"):
    model.eval()
    generated = input_ids.clone()
    for _ in range(max_len):
        with torch.no_grad():
            outputs = model(generated, generated)
            logits = outputs[:, -1, :]  # last token logits
            next_token = logits.argmax(-1, keepdim=True)
        generated = torch.cat((generated, next_token), dim=1)
    return generated


def evaluate_model(model, val_loader, tokenizer, device, pad_id):
    model.eval()
    preds, refs = [], []
    total_loss = 0
    loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_id)

    print("🔍 Starting full evaluation...")
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            target_ids = batch["target_ids"].to(device)

            logits = model(input_ids, target_ids[:, :-1])  # (B, T, V)
            loss = loss_fn(
                logits.reshape(-1, logits.size(-1)),
                target_ids[:, 1:].reshape(-1)
            )
            total_loss += loss.item()

            # Generate predictions
            generated = generate(model, input_ids, tokenizer, max_len=60, device=device)
            preds.extend([tokenizer.decode(g.tolist(), skip_special_tokens=True) for g in generated])
            refs.extend([tokenizer.decode(r.tolist(), skip_special_tokens=True) for r in target_ids])

    bleu = compute_bleu(preds, refs)
    rouge = compute_rougeL(preds, refs)
    chrf = compute_chrf(preds, refs)
    ppl = math.exp(total_loss / len(val_loader))

    print(f"\n📊 Validation — BLEU: {bleu:.4f} | ROUGE-L: {rouge:.4f} | chrF: {chrf:.4f} | PPL: {ppl:.2f}")
    return bleu, rouge, chrf, ppl

def train_loop(model, train_loader, val_loader, tokenizer, device, pad_id, epochs=10, lr=3e-4):
    optimizer = Adam(model.parameters(), lr=lr, betas=(0.9, 0.98))
    loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_id)
    best_bleu = 0

    print(f"✅ GPU Device: {device}\n")
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")

        for batch in pbar:
            input_ids = batch["input_ids"].to(device)
            target_ids = batch["target_ids"].to(device)

            logits = model(input_ids, target_ids[:, :-1])
            loss = loss_fn(
                logits.reshape(-1, logits.size(-1)),
                target_ids[:, 1:].reshape(-1)
            )

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            pbar.set_postfix({"Loss": f"{loss.item():.4f}"})

        avg_loss = total_loss / len(train_loader)
        print(f"\nEpoch {epoch+1} finished — Avg Train Loss: {avg_loss:.4f}")

        # Evaluate after each epoch
        bleu, rouge, chrf, ppl = evaluate_model(model, val_loader, tokenizer, device, pad_id)

        # Save best model
        if bleu > best_bleu:
            best_bleu = bleu
            torch.save(model.state_dict(), "best_model.pt")
            print("💾 Saved new best model!\n")

    print("✅ Training complete! Best BLEU:", best_bleu)

# ===============================
# 🚀 Run Training
# ===============================
device = "cuda" if torch.cuda.is_available() else "cpu"
pad_id = tokenizer.token_to_id("<pad>")

train_loop(model, train_loader, val_loader, tokenizer, device, pad_id, epochs=20, lr=3e-4)


✅ GPU Device: cuda



Epoch 1/20: 100%|██████████| 1615/1615 [01:22<00:00, 19.48it/s, Loss=4.0902]



Epoch 1 finished — Avg Train Loss: 4.6082
🔍 Starting full evaluation...


Evaluating: 100%|██████████| 202/202 [04:50<00:00,  1.44s/it]



📊 Validation — BLEU: 0.0138 | ROUGE-L: 0.0911 | chrF: 18.2817 | PPL: 62.36
💾 Saved new best model!



Epoch 2/20: 100%|██████████| 1615/1615 [01:27<00:00, 18.35it/s, Loss=3.8686]



Epoch 2 finished — Avg Train Loss: 4.1109
🔍 Starting full evaluation...


Evaluating: 100%|██████████| 202/202 [04:48<00:00,  1.43s/it]



📊 Validation — BLEU: 0.0141 | ROUGE-L: 0.0951 | chrF: 18.2842 | PPL: 52.12
💾 Saved new best model!



Epoch 3/20: 100%|██████████| 1615/1615 [01:27<00:00, 18.44it/s, Loss=4.1763]



Epoch 3 finished — Avg Train Loss: 3.9631
🔍 Starting full evaluation...


Evaluating: 100%|██████████| 202/202 [04:48<00:00,  1.43s/it]



📊 Validation — BLEU: 0.0141 | ROUGE-L: 0.0938 | chrF: 18.7262 | PPL: 47.45
💾 Saved new best model!



Epoch 4/20: 100%|██████████| 1615/1615 [01:27<00:00, 18.44it/s, Loss=3.9047]



Epoch 4 finished — Avg Train Loss: 3.8758
🔍 Starting full evaluation...


Evaluating: 100%|██████████| 202/202 [04:48<00:00,  1.43s/it]



📊 Validation — BLEU: 0.0145 | ROUGE-L: 0.0978 | chrF: 18.8150 | PPL: 44.65
💾 Saved new best model!



Epoch 5/20: 100%|██████████| 1615/1615 [01:27<00:00, 18.46it/s, Loss=3.8241]



Epoch 5 finished — Avg Train Loss: 3.8137
🔍 Starting full evaluation...


Evaluating: 100%|██████████| 202/202 [04:48<00:00,  1.43s/it]



📊 Validation — BLEU: 0.0145 | ROUGE-L: 0.0977 | chrF: 18.8217 | PPL: 42.47
💾 Saved new best model!



Epoch 6/20: 100%|██████████| 1615/1615 [01:27<00:00, 18.36it/s, Loss=4.0270]



Epoch 6 finished — Avg Train Loss: 3.7657
🔍 Starting full evaluation...


Evaluating: 100%|██████████| 202/202 [04:47<00:00,  1.42s/it]



📊 Validation — BLEU: 0.0144 | ROUGE-L: 0.0966 | chrF: 18.7836 | PPL: 41.00


Epoch 7/20: 100%|██████████| 1615/1615 [01:27<00:00, 18.36it/s, Loss=3.7344]



Epoch 7 finished — Avg Train Loss: 3.7256
🔍 Starting full evaluation...


Evaluating: 100%|██████████| 202/202 [04:47<00:00,  1.42s/it]



📊 Validation — BLEU: 0.0143 | ROUGE-L: 0.0950 | chrF: 18.7327 | PPL: 39.98


Epoch 8/20: 100%|██████████| 1615/1615 [01:27<00:00, 18.38it/s, Loss=3.6357]



Epoch 8 finished — Avg Train Loss: 3.6937
🔍 Starting full evaluation...


Evaluating: 100%|██████████| 202/202 [04:48<00:00,  1.43s/it]



📊 Validation — BLEU: 0.0143 | ROUGE-L: 0.0948 | chrF: 18.6489 | PPL: 38.90


Epoch 9/20: 100%|██████████| 1615/1615 [01:27<00:00, 18.37it/s, Loss=3.8944]



Epoch 9 finished — Avg Train Loss: 3.6633
🔍 Starting full evaluation...


Evaluating: 100%|██████████| 202/202 [04:48<00:00,  1.43s/it]



📊 Validation — BLEU: 0.0145 | ROUGE-L: 0.0970 | chrF: 18.7969 | PPL: 38.25


Epoch 10/20: 100%|██████████| 1615/1615 [01:27<00:00, 18.43it/s, Loss=3.7251]



Epoch 10 finished — Avg Train Loss: 3.6382
🔍 Starting full evaluation...


Evaluating: 100%|██████████| 202/202 [04:47<00:00,  1.42s/it]



📊 Validation — BLEU: 0.0141 | ROUGE-L: 0.0924 | chrF: 18.5947 | PPL: 37.76


Epoch 11/20: 100%|██████████| 1615/1615 [01:27<00:00, 18.41it/s, Loss=3.7140]



Epoch 11 finished — Avg Train Loss: 3.6138
🔍 Starting full evaluation...


Evaluating: 100%|██████████| 202/202 [04:47<00:00,  1.42s/it]



📊 Validation — BLEU: 0.0139 | ROUGE-L: 0.0893 | chrF: 18.5948 | PPL: 37.49


Epoch 12/20: 100%|██████████| 1615/1615 [01:27<00:00, 18.42it/s, Loss=3.7937]



Epoch 12 finished — Avg Train Loss: 3.5944
🔍 Starting full evaluation...


Evaluating: 100%|██████████| 202/202 [04:47<00:00,  1.42s/it]



📊 Validation — BLEU: 0.0142 | ROUGE-L: 0.0929 | chrF: 18.6881 | PPL: 36.72


Epoch 13/20: 100%|██████████| 1615/1615 [01:27<00:00, 18.45it/s, Loss=3.5537]



Epoch 13 finished — Avg Train Loss: 3.5776
🔍 Starting full evaluation...


Evaluating: 100%|██████████| 202/202 [04:47<00:00,  1.42s/it]



📊 Validation — BLEU: 0.0144 | ROUGE-L: 0.0959 | chrF: 18.6792 | PPL: 36.24


Epoch 14/20: 100%|██████████| 1615/1615 [01:27<00:00, 18.46it/s, Loss=3.6006]



Epoch 14 finished — Avg Train Loss: 3.5590
🔍 Starting full evaluation...


Evaluating:  58%|█████▊    | 117/202 [02:47<02:01,  1.43s/it]


KeyboardInterrupt: 

In [8]:
# Task 5
import random

model.load_state_dict(torch.load("best_model.pt", map_location=device))
model.to(device)
model.eval()

pad_id = tokenizer.token_to_id("<pad>")
bleu, rouge, chrf, ppl = evaluate_model(model, val_loader, tokenizer, device, pad_id)


print("\n📈 Automatic Evaluation Results:")
print(f"BLEU: {bleu:.4f}")
print(f"ROUGE-L: {rouge:.4f}")
print(f"chrF: {chrf:.4f}")
print(f"Perplexity: {ppl:.2f}")

print("\n🔍 Sample Qualitative Outputs (Human Evaluation):")
num_examples = 3
batches = [next(iter(val_loader)) for _ in range(num_examples)]

for i, batch in enumerate(batches, 1):
    input_ids = batch["input_ids"].to(device)
    target_ids = batch["target_ids"].to(device)

    with torch.no_grad():
        generated = generate(model, input_ids, tokenizer, max_len=60, device=device)

    input_text = tokenizer.decode(input_ids[0].tolist(), skip_special_tokens=True)
    pred_text = tokenizer.decode(generated[0].tolist(), skip_special_tokens=True)
    ref_text = tokenizer.decode(target_ids[0].tolist(), skip_special_tokens=True)

    print(f"\nExample {i}")
    print(f"📝 Input:     {input_text[:200]}...")
    print(f"🤖 Predicted: {pred_text[:200]}...")
    print(f"🎯 Reference: {ref_text[:200]}...")
    print("💬 Human Ratings — Fluency: __ | Relevance: __ | Adequacy: __")


🔍 Starting full evaluation...


Evaluating: 100%|██████████| 202/202 [04:50<00:00,  1.44s/it]



📊 Validation — BLEU: 0.0145 | ROUGE-L: 0.0977 | chrF: 18.8217 | PPL: 42.47

📈 Automatic Evaluation Results:
BLEU: 0.0145
ROUGE-L: 0.0977
chrF: 18.8217
Perplexity: 42.47

🔍 Sample Qualitative Outputs (Human Evaluation):

Example 1
📝 Input:     emotion excited situation i got off work at am today ! time to head to the beach . customer customer i got off work at am today ! time to head to the beach . agent agent...
🤖 Predicted: emotion excited situation i got off work at am today ! time to head to the beach . customer customer i got off work at am today ! time to head to the beach . agent agent . . . . . . . . . . . . . . . ...
🎯 Reference: ah h h h h h h h h h ! that is amazing . i am so jealous of you . where are you going ?...
💬 Human Ratings — Fluency: __ | Relevance: __ | Adequacy: __

Example 2
📝 Input:     emotion excited situation i got off work at am today ! time to head to the beach . customer customer i got off work at am today ! time to head to the beach . agent agent...
🤖 Pr