# Project 02: Empathetic Conversational Chatbot
## Transformer with Multi-Head Attention (Built from Scratch)

**Objective:** Build a Transformer encoder-decoder chatbot that generates empathetic agent replies given a situation, emotion, and customer utterance.

**Key Requirements:**
- No pretrained model weights - all weights randomly initialized
- Train end-to-end from scratch
- Track BLEU, ROUGE-L, chrF, and Perplexity metrics
- Save all intermediate models and data as .pkl files

**Project Structure:**
1. Preprocessing (normalization, tokenization, vocabulary building)
2. Input/Output (X/Y) Definition
3. Model Architecture (Transformer from scratch)
4. Training (with teacher forcing)
5. Evaluation (automatic metrics + qualitative analysis)

## Section 1: Import Required Libraries

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence  # For padding variable length sequences
from collections import Counter
from tqdm import tqdm
import random
import pickle
import os
import re  # For text normalization
import math
import time
from typing import List, Tuple, Dict

# Choose the compute device up front; the CUDA-only steps below key off it
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed every random number generator in use so runs are repeatable
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if device.type == 'cuda':
    # Disable cuDNN autotuning and request deterministic kernels
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)

print(
    "📚 Libraries imported successfully!",
    f"🎲 Random seed set to: {SEED}",
    f"🖥️  Device: {device}",
    sep="\n",
)
if device.type == 'cuda':
    print(f"🎮 GPU: {torch.cuda.get_device_name(0)}")
    
# ROBUST PLATFORM DETECTION & PATH SETUP
def detect_platform():
    """
    Report whether the notebook is running on Kaggle or locally.

    Kaggle is reported only when at least one Kaggle environment variable
    is set AND both standard Kaggle directories exist. Every other
    combination counts as a local run.

    Returns:
        'kaggle' or 'local'
    """
    kaggle_env_vars = ('KAGGLE_KERNEL_RUN_TYPE', 'KAGGLE_WORKING_DIR')
    if not any(os.getenv(name) is not None for name in kaggle_env_vars):
        return 'local'

    # Environment variables alone are not trusted; the directories must exist too
    kaggle_dirs = ('/kaggle/working', '/kaggle/input')
    if not all(os.path.exists(path) for path in kaggle_dirs):
        return 'local'

    return 'kaggle'

# Resolve the platform once; every output path hangs off BASE_DIR
PLATFORM = detect_platform()
if PLATFORM == 'kaggle':
    BASE_DIR = '/kaggle/working'
else:
    BASE_DIR = '.'

def get_platform_path(relative_path):
    """Join *relative_path* onto the platform's BASE_DIR and return the result."""
    platform_path = os.path.join(BASE_DIR, relative_path)
    return platform_path

# Make sure every output folder exists under the base directory
for directory in ('saved_models', 'saved_data', 'saved_vocab'):
    os.makedirs(get_platform_path(directory), exist_ok=True)

print(
    f"\n🌍 Platform: {PLATFORM.upper()}",
    f"📁 Base directory: {os.path.abspath(BASE_DIR)}",
    "✅ Directories created: saved_models, saved_data, saved_vocab",
    sep="\n",
)

## Section 2: Dataset Loading and Exploration

Load the empathetic dialogue dataset and understand its structure.

In [None]:
# PLATFORM-AWARE DATASET LOADING
# Progress message shown before the dataset search defined below is run.
print("🔍 Loading dataset...")

def find_dataset():
    """
    Locate the dataset CSV for the current platform.

    On Kaggle, walk '/kaggle/input' and return the first '.csv' file whose
    name contains 'emotion' (case-insensitive). If the walk finds nothing,
    fall back to a short list of well-known Kaggle paths. Locally, probe a
    fixed list of candidate paths, printing each one as it is checked.

    Returns:
        Path to the dataset file, or None if no candidate exists.
    """
    if PLATFORM == 'kaggle':
        # Search in Kaggle input directories
        if os.path.exists('/kaggle/input'):
            for root, dirs, files in os.walk('/kaggle/input'):
                for file in files:
                    if 'emotion' in file.lower() and file.endswith('.csv'):
                        return os.path.join(root, file)
        # Common Kaggle paths
        common_paths = [
            '/kaggle/input/empathetic-dialogues/emotion-emotion_69k.csv',
            '/kaggle/input/empathetic-dialogues-dataset/emotion-emotion_69k.csv',
            '/kaggle/input/emotion-emotion_69k.csv',
        ]
        for path in common_paths:
            if os.path.exists(path):
                return path
    else:
        # Local paths - search in current directory and common locations
        local_paths = [
            'emotion-emotion_69k.csv',  # Current directory (no prefix)
            os.path.join(os.getcwd(), 'emotion-emotion_69k.csv'),  # Absolute path
            os.path.abspath('emotion-emotion_69k.csv'),  # Absolute resolve
            os.path.join('..', 'emotion-emotion_69k.csv'),  # Parent directory
            os.path.join('data', 'emotion-emotion_69k.csv'),  # data folder
        ]
        print("🔍 Searching for dataset in local paths:")
        for path in local_paths:
            print(f"   Checking: {path}")
            if os.path.exists(path):
                print("   ✅ Found!")
                return path

    return None

# Find and load dataset
dataset_path = find_dataset()

if dataset_path is None:
    # Tailor the hint to where the notebook is running
    if PLATFORM == 'kaggle':
        hint = "💡 Please add the dataset to your Kaggle notebook inputs"
    else:
        hint = f"💡 Please place 'emotion-emotion_69k.csv' in: {os.getcwd()}"
    raise FileNotFoundError("❌ Dataset 'emotion-emotion_69k.csv' not found!\n" + hint)

print(f"\n✅ Found dataset at: {dataset_path}")
df = pd.read_csv(dataset_path)

print(
    "\n📊 Dataset loaded successfully!",
    f"   Shape: {df.shape}",
    f"   Columns: {list(df.columns)}",
    "\n👀 First few rows:",
    sep="\n",
)
df.head(10)

In [None]:
# Analyze the structure of the dataset
print("Dataset Information:", f"Total rows: {len(df)}", "\nColumn details:", sep="\n")
# df.info() is evaluated first; print() then shows its return value
print(df.info())

print("\nMissing values:", df.isna().sum(), sep="\n")

print("\nUnique emotions:", df['emotion'].unique(), sep="\n")
print(f"\nTotal unique emotions: {df['emotion'].nunique()}")

print("\nEmotion distribution:", df['emotion'].value_counts(), sep="\n")

In [None]:
# Examine the empathetic_dialogues column structure
print("Sample empathetic_dialogues entries:\n")
for i in range(3):
    print(
        f"Example {i+1}:",
        f"Situation: {df.iloc[i]['Situation']}",
        f"Emotion: {df.iloc[i]['emotion']}",
        f"Dialogue: {df.iloc[i]['empathetic_dialogues']}",
        "-" * 80,
        sep="\n",
    )

## Section 3: Text Preprocessing and Dialogue Parsing

Parse the empathetic_dialogues column to extract Customer utterances and Agent replies.

In [None]:
def parse_dialogue(dialogue_text, labels_text):
    """
    Pull the customer utterance and the agent reply out of one dataset row.

    Expected layout:
    - empathetic_dialogues: "Customer :{customer_text}\nAgent :"
    - labels: "{agent_text}"

    Args:
        dialogue_text: Value of the empathetic_dialogues column
        labels_text: Value of the labels column

    Returns:
        (customer_utterance, agent_reply), or (None, None) when either value
        is missing or the dialogue has no "Customer :" marker
    """
    customer_marker = "Customer :"
    agent_marker = "Agent :"

    # A missing value in either column makes the row unusable
    if pd.isna(dialogue_text) or pd.isna(labels_text):
        return None, None

    if customer_marker not in dialogue_text:
        return None, None

    # Text between the first and second customer markers; splitting on the
    # agent marker is a no-op when that marker is absent
    after_customer = dialogue_text.split(customer_marker)[1]
    customer_utterance = after_customer.split(agent_marker)[0].strip()

    # The reply lives in the labels column, not in the dialogue text
    return customer_utterance, str(labels_text).strip()


# Test the parsing function
print("Testing dialogue parsing:\n")
for i in range(5):
    dialogue = df['empathetic_dialogues'].iloc[i]
    labels = df['labels'].iloc[i]
    customer, agent = parse_dialogue(dialogue, labels)
    print(
        f"Example {i+1}:",
        f"Customer: {customer}",
        f"Agent: {agent}",
        "-" * 80,
        sep="\n",
    )

In [None]:
# Parse all dialogues and create structured dataset
parsed_data = []

for idx, row in df.iterrows():
    customer_utterance, agent_reply = parse_dialogue(row['empathetic_dialogues'], row['labels'])

    # Keep only rows where both sides of the exchange are non-empty
    if not (customer_utterance and agent_reply):
        continue

    parsed_data.append(dict(
        situation=row['Situation'],
        emotion=row['emotion'],
        customer_utterance=customer_utterance,
        agent_reply=agent_reply,
    ))

# Create new dataframe
df_parsed = pd.DataFrame(parsed_data)
print(f"Parsed dataset shape: {df_parsed.shape}", "\nFirst few rows:", sep="\n")
df_parsed.head()

## Section 4: Text Normalization Functions

Implement text preprocessing: lowercase, clean whitespace, normalize punctuation.

In [None]:
def normalize_text(text):
    """
    Prepare raw text for whitespace tokenization.

    The text is lowercased, runs of whitespace are collapsed to one space,
    and each of . , ! ? ; : is padded with spaces so that it becomes its
    own token. Missing values become the empty string.

    Args:
        text: Raw value (any type; converted with str())

    Returns:
        Normalized string
    """
    if pd.isna(text):
        return ""

    lowered = str(text).lower()

    # Collapse whitespace, then isolate punctuation marks with spaces
    collapsed = re.sub(r'\s+', ' ', lowered)
    spaced = re.sub(r'([.,!?;:])', r' \1 ', collapsed)

    # The padding above can create double spaces; collapse again and trim
    return re.sub(r'\s+', ' ', spaced).strip()


# Test normalization
test_texts = [
    "I remember going to see the fireworks with my best friend.",
    "it feels like hitting to blank wall when i see the darkness",
    "Oh ya? I don't really see how",
]

print("Testing text normalization:\n")
for text in test_texts:
    normalized = normalize_text(text)
    # The trailing empty item yields the blank line between examples
    print(f"Original : {text}", f"Normalized: {normalized}", "", sep="\n")

In [None]:
# Apply normalization to all text fields
for raw_column, normalized_column in (
    ('situation', 'situation_normalized'),
    ('customer_utterance', 'customer_utterance_normalized'),
    ('agent_reply', 'agent_reply_normalized'),
):
    df_parsed[normalized_column] = df_parsed[raw_column].apply(normalize_text)

preview_columns = ['emotion', 'situation_normalized', 'customer_utterance_normalized', 'agent_reply_normalized']
print("Normalized dataset sample:")
print(df_parsed[preview_columns].head())

## Section 5: Dataset Splitting (Train 80%, Val 10%, Test 10%)

Split the dataset with fixed random seed for reproducibility.

In [None]:
# Shuffle and split dataset
df_shuffled = df_parsed.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Calculate split sizes (test takes whatever the 80/10 rounding leaves over)
total_size = len(df_shuffled)
train_size = int(0.8 * total_size)
val_size = int(0.1 * total_size)
test_size = total_size - train_size - val_size

# Split data by position
train_df = df_shuffled.iloc[:train_size]
val_df = df_shuffled.iloc[train_size:train_size + val_size]
test_df = df_shuffled.iloc[train_size + val_size:]

print(
    "Dataset split:",
    f"Total samples: {total_size}",
    f"Training samples: {len(train_df)} ({len(train_df)/total_size*100:.1f}%)",
    f"Validation samples: {len(val_df)} ({len(val_df)/total_size*100:.1f}%)",
    f"Test samples: {len(test_df)} ({len(test_df)/total_size*100:.1f}%)",
    sep="\n",
)

# Save splits to pickle files.
# Paths go through get_platform_path() so the files land in the same
# 'saved_data' directory that was created under BASE_DIR, regardless of the
# current working directory.
with open(get_platform_path('saved_data/train_df.pkl'), 'wb') as f:
    pickle.dump(train_df, f)

with open(get_platform_path('saved_data/val_df.pkl'), 'wb') as f:
    pickle.dump(val_df, f)

with open(get_platform_path('saved_data/test_df.pkl'), 'wb') as f:
    pickle.dump(test_df, f)

print("\n✓ Dataset splits saved to 'saved_data/' directory")

## Section 6: Tokenization and Vocabulary Building

Build vocabulary from training split only with special tokens: `<pad>`, `<bos>`, `<eos>`, `<unk>`.

In [None]:
def simple_tokenize(text):
    """
    Split already-normalized text into tokens on whitespace.

    Args:
        text: Normalized text string

    Returns:
        List of tokens (empty list for empty or whitespace-only text)
    """
    tokens = text.split()
    return tokens


class Vocabulary:
    """
    Bidirectional word <-> index mapping with per-word occurrence counts.

    The four special tokens are registered at construction time, in the
    order <pad>, <bos>, <eos>, <unk>, so they occupy indices 0-3.
    """
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.word_count = Counter()

        # Special tokens
        self.PAD_TOKEN = '<pad>'
        self.BOS_TOKEN = '<bos>'
        self.EOS_TOKEN = '<eos>'
        self.UNK_TOKEN = '<unk>'

        # Registration order fixes the indices: pad=0, bos=1, eos=2, unk=3
        for special_token in (self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN):
            self.add_word(special_token)

    def add_word(self, word):
        """Register a word (if new) and bump its occurrence count."""
        already_known = word in self.word2idx
        if not already_known:
            next_index = len(self.word2idx)
            self.word2idx[word] = next_index
            self.idx2word[next_index] = word
        self.word_count[word] += 1

    def add_sentence(self, sentence):
        """Tokenize a sentence and register each of its tokens."""
        for token in simple_tokenize(sentence):
            self.add_word(token)

    def __len__(self):
        return len(self.word2idx)

    def get_idx(self, word):
        """Return the index of a word, or the <unk> index if it is unknown."""
        unk_index = self.word2idx[self.UNK_TOKEN]
        return self.word2idx.get(word, unk_index)

    def get_word(self, idx):
        """Return the word stored at an index, or the <unk> token if none is."""
        return self.idx2word.get(idx, self.UNK_TOKEN)


# Confirmation printed once the definitions above have been executed.
print("Vocabulary class created successfully!")

In [None]:
# Build vocabulary from TRAINING DATA ONLY
vocab = Vocabulary()

print("Building vocabulary from training data...")

# Add all text from training set to vocabulary
for idx, row in train_df.iterrows():
    vocab.add_sentence(row['situation_normalized'])
    vocab.add_sentence(row['customer_utterance_normalized'])
    vocab.add_sentence(row['agent_reply_normalized'])
    # The emotion label is spliced into the encoder prompt and whitespace-
    # tokenized there, so register it token by token. Adding the raw label as
    # a single entry would never match a multi-word label at encoding time.
    vocab.add_sentence(str(row['emotion']))

# The prompt template built by create_input_output_pair() contributes fixed
# scaffold tokens. Register any that the training text did not already supply;
# otherwise the '|' separator in particular would be encoded as <unk>.
for scaffold_token in ('emotion', ':', '|', 'situation', 'customer', 'agent'):
    if scaffold_token not in vocab.word2idx:
        vocab.add_word(scaffold_token)

print("\nVocabulary built successfully!")
print(f"Vocabulary size: {len(vocab)}")
print("\nSpecial token indices:")
print(f"  PAD: {vocab.word2idx[vocab.PAD_TOKEN]}")
print(f"  BOS: {vocab.word2idx[vocab.BOS_TOKEN]}")
print(f"  EOS: {vocab.word2idx[vocab.EOS_TOKEN]}")
print(f"  UNK: {vocab.word2idx[vocab.UNK_TOKEN]}")

# Show most common words
print("\nTop 20 most common words:")
for word, count in vocab.word_count.most_common(20):
    print(f"  {word}: {count}")

In [None]:
# Save vocabulary to pickle file
vocab_path = get_platform_path('saved_vocab/vocabulary.pkl')
with open(vocab_path, 'wb') as f:
    pickle.dump(vocab, f)

print(f"✓ Vocabulary saved to '{vocab_path}'")

# Round-trip a sample sentence through the vocabulary
test_sentence = "i remember going to the fireworks ."
tokens = simple_tokenize(test_sentence)
indices = list(map(vocab.get_idx, tokens))

print(
    "\nTest tokenization:",
    f"Sentence: {test_sentence}",
    f"Tokens: {tokens}",
    f"Indices: {indices}",
    f"Reconstructed: {[vocab.get_word(idx) for idx in indices]}",
    sep="\n",
)

## Section 7: Input/Output (X and Y) Definition

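Format inputs as: `"emotion : {emotion} | situation : {situation} | customer : {customer_utterance} agent :"` (field labels are lowercase and the colons are space-separated, matching the normalized, whitespace-tokenized text)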

Format targets as: `"{agent_reply}"`

In [None]:
def create_input_output_pair(row):
    """
    Build the encoder prompt (X) and decoder target (Y) for one data row.

    Input format: "emotion : {emotion} | situation : {situation} | customer : {customer_utterance} agent :"
    Output format: "{agent_reply}"

    Args:
        row: Mapping with the keys 'emotion', 'situation_normalized',
            'customer_utterance_normalized' and 'agent_reply_normalized'

    Returns:
        (input_text, output_text)
    """
    label = row['emotion']
    context = row['situation_normalized']
    utterance = row['customer_utterance_normalized']
    # The target is the agent reply, unchanged
    target_text = row['agent_reply_normalized']

    source_text = f"emotion : {label} | situation : {context} | customer : {utterance} agent :"
    return source_text, target_text


# Test the function
print("Testing input/output creation:\n")
for i in range(3):
    row = train_df.iloc[i]
    input_text, output_text = create_input_output_pair(row)
    print(
        f"Example {i+1}:",
        f"Input (X): {input_text}",
        f"Output (Y): {output_text}",
        "-" * 80,
        sep="\n",
    )

In [None]:
def encode_sequence(text, vocab, add_special_tokens=True):
    """
    Turn a text string into a list of vocabulary indices.

    Args:
        text: Input text string
        vocab: Vocabulary object
        add_special_tokens: If True, wrap the sequence in <bos> ... <eos>

    Returns:
        List of token indices
    """
    token_ids = list(map(vocab.get_idx, simple_tokenize(text)))

    if not add_special_tokens:
        return token_ids

    bos_id = vocab.word2idx[vocab.BOS_TOKEN]
    eos_id = vocab.word2idx[vocab.EOS_TOKEN]
    return [bos_id, *token_ids, eos_id]


# Test encoding
test_input, test_output = create_input_output_pair(train_df.iloc[0])
input_indices = encode_sequence(test_input, vocab, add_special_tokens=False)
output_indices = encode_sequence(test_output, vocab, add_special_tokens=True)

print("Testing sequence encoding:\n")
print(f"Input text: {test_input[:100]}...")
print(f"Input length: {len(input_indices)} tokens")
print(f"\nOutput text: {test_output}")
print(f"Output indices (with BOS/EOS): {output_indices}")
print(f"Output length: {len(output_indices)} tokens")

## Section 8: Transformer Architecture - Positional Encoding

Implement sinusoidal positional encoding:
- PE(pos, 2i) = sin(pos / 10000^(2i/d_model))
- PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))

In [None]:
class PositionalEncoding(nn.Module):
    """
    Implement sinusoidal positional encoding for Transformer.

    PE(pos, 2i) = sin(pos / 10000^(2i/d_model))
    PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))

    Args:
        d_model: Embedding dimension (even or odd)
        max_len: Number of positions to precompute
        dropout: Dropout probability applied after adding the encoding
    """
    def __init__(self, d_model, max_len=5000, dropout=0.1):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Create positional encoding matrix
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)

        # One frequency per even column index: 1 / 10000^(2i/d_model)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        # Apply sine to even indices
        pe[:, 0::2] = torch.sin(angles)

        # Apply cosine to odd indices. With an odd d_model there is one fewer
        # odd column than even columns, so drop the surplus frequency.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])

        # Add batch dimension
        pe = pe.unsqueeze(0)  # Shape: (1, max_len, d_model)

        # Register as buffer (not a parameter, but part of the model state)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: Input embeddings of shape (batch_size, seq_len, d_model)

        Returns:
            Positional encoded embeddings, same shape as x
        """
        # Add the encoding for the first seq_len positions, then apply dropout
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)


# Test positional encoding
print("Testing Positional Encoding:")
d_model = 512
pe = PositionalEncoding(d_model=d_model)

# Dummy batch: batch_size=2, seq_len=10, d_model=512
dummy_input = torch.randn(2, 10, d_model)
output = pe(dummy_input)

print(
    f"Input shape: {dummy_input.shape}",
    f"Output shape: {output.shape}",
    "✓ Positional Encoding implemented successfully!",
    sep="\n",
)

## Section 9: Multi-Head Attention Mechanism

Implement Multi-Head Attention from scratch with:
- Query, Key, Value projections
- Scaled dot-product attention
- Multiple attention heads
- Concatenation and final projection

In [None]:
class MultiHeadAttention(nn.Module):
    """
    Multi-Head Attention mechanism from scratch.

    Attention(Q, K, V) = softmax(Q @ K^T / sqrt(d_k)) @ V

    Args:
        d_model: Model dimension; must be divisible by num_heads
        num_heads: Number of attention heads
        dropout: Dropout probability applied to the attention weights
    """
    def __init__(self, d_model, num_heads, dropout=0.1):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # Dimension per head

        # Linear projections for Q, K, V
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)

        # Final output projection
        self.W_o = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """
        Compute scaled dot-product attention.

        Args:
            Q: Queries (batch_size, num_heads, seq_len_q, d_k)
            K: Keys (batch_size, num_heads, seq_len_k, d_k)
            V: Values (batch_size, num_heads, seq_len_k, d_k)
            mask: Optional mask broadcastable to the score tensor; positions
                where the mask equals 0 are excluded from attention

        Returns:
            attention_output, attention_weights
        """
        # Compute attention scores
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        # Apply mask if provided. The fill value is the most negative finite
        # number of the score dtype: a literal -1e9 is not representable in
        # float16 and makes masked_fill raise under half precision.
        if mask is not None:
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        # Apply softmax to get attention weights
        attention_weights = F.softmax(scores, dim=-1)
        attention_weights = self.dropout(attention_weights)

        # Apply attention weights to values
        attention_output = torch.matmul(attention_weights, V)

        return attention_output, attention_weights

    def split_heads(self, x):
        """
        Split the last dimension into (num_heads, d_k).
        Transpose to get shape (batch_size, num_heads, seq_len, d_k)
        """
        batch_size, seq_len, _ = x.size()
        x = x.view(batch_size, seq_len, self.num_heads, self.d_k)
        return x.transpose(1, 2)

    def combine_heads(self, x):
        """
        Combine heads back to shape (batch_size, seq_len, d_model).
        """
        batch_size, _, seq_len, _ = x.size()
        # transpose() leaves the tensor non-contiguous; view() needs contiguous memory
        x = x.transpose(1, 2).contiguous()
        return x.view(batch_size, seq_len, self.d_model)

    def forward(self, query, key, value, mask=None):
        """
        Args:
            query: Query tensor (batch_size, seq_len_q, d_model)
            key: Key tensor (batch_size, seq_len_k, d_model)
            value: Value tensor (batch_size, seq_len_v, d_model)
            mask: Optional mask tensor

        Returns:
            output (batch_size, seq_len_q, d_model),
            attention_weights (batch_size, num_heads, seq_len_q, seq_len_k)
        """
        # Linear projections
        Q = self.W_q(query)  # (batch_size, seq_len_q, d_model)
        K = self.W_k(key)    # (batch_size, seq_len_k, d_model)
        V = self.W_v(value)  # (batch_size, seq_len_v, d_model)

        # Split into multiple heads
        Q = self.split_heads(Q)  # (batch_size, num_heads, seq_len_q, d_k)
        K = self.split_heads(K)  # (batch_size, num_heads, seq_len_k, d_k)
        V = self.split_heads(V)  # (batch_size, num_heads, seq_len_v, d_k)

        # Apply scaled dot-product attention
        attention_output, attention_weights = self.scaled_dot_product_attention(Q, K, V, mask)

        # Combine heads
        attention_output = self.combine_heads(attention_output)

        # Final linear projection
        output = self.W_o(attention_output)

        return output, attention_weights


# Test Multi-Head Attention
print("Testing Multi-Head Attention:")
d_model, num_heads = 512, 2
mha = MultiHeadAttention(d_model, num_heads)

# Create dummy inputs (sampled in query, key, value order)
batch_size, seq_len = 2, 10
dummy_query = torch.randn(batch_size, seq_len, d_model)
dummy_key = torch.randn(batch_size, seq_len, d_model)
dummy_value = torch.randn(batch_size, seq_len, d_model)

output, attn_weights = mha(dummy_query, dummy_key, dummy_value)

print(
    f"Query shape: {dummy_query.shape}",
    f"Output shape: {output.shape}",
    f"Attention weights shape: {attn_weights.shape}",
    "✓ Multi-Head Attention implemented successfully!",
    sep="\n",
)

## Section 10: Position-wise Feed-Forward Network

Implement FFN(x) = max(0, xW1 + b1)W2 + b2

In [None]:
class PositionwiseFeedForward(nn.Module):
    """
    Two-layer feed-forward network applied to the last dimension.

    FFN(x) = max(0, xW1 + b1)W2 + b2, with dropout after the ReLU.
    """
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        # Expand to d_ff, then project back to d_model
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        Args:
            x: Input tensor (batch_size, seq_len, d_model)

        Returns:
            Output tensor (batch_size, seq_len, d_model)
        """
        hidden = self.dropout(F.relu(self.linear1(x)))
        return self.linear2(hidden)


# Test Feed-Forward Network
print("Testing Position-wise Feed-Forward Network:")
d_model, d_ff = 512, 2048
ffn = PositionwiseFeedForward(d_model, d_ff)

dummy_input = torch.randn(2, 10, d_model)
output = ffn(dummy_input)

print(
    f"Input shape: {dummy_input.shape}",
    f"Output shape: {output.shape}",
    "✓ Feed-Forward Network implemented successfully!",
    sep="\n",
)

## Section 11: Transformer Encoder Layer

Build Encoder layer with:
- Multi-Head Self-Attention
- Add & Norm (Residual connection + Layer Normalization)
- Feed-Forward Network
- Add & Norm

In [None]:
class EncoderLayer(nn.Module):
    """
    One Transformer encoder layer (post-norm).

    Data flow:
    1. Multi-head self-attention
    2. Residual add + LayerNorm
    3. Position-wise feed-forward network
    4. Residual add + LayerNorm
    """
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()

        # Sub-layers
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)

        # One LayerNorm and one dropout per residual branch
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """
        Args:
            x: Input tensor (batch_size, seq_len, d_model)
            mask: Optional mask for padding

        Returns:
            Output tensor (batch_size, seq_len, d_model)
        """
        # Self-attention branch (attention weights are discarded)
        attended, _ = self.self_attn(x, x, x, mask)
        hidden = self.norm1(x + self.dropout1(attended))

        # Feed-forward branch
        transformed = self.feed_forward(hidden)
        return self.norm2(hidden + self.dropout2(transformed))


class TransformerEncoder(nn.Module):
    """
    A stack of num_layers EncoderLayer modules applied in sequence.
    """
    def __init__(self, num_layers, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()

        # Stack encoder layers
        self.layers = nn.ModuleList(
            EncoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        )

    def forward(self, x, mask=None):
        """
        Args:
            x: Input tensor (batch_size, seq_len, d_model)
            mask: Optional mask for padding

        Returns:
            Output tensor (batch_size, seq_len, d_model)
        """
        hidden = x
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, mask)
        return hidden


# Test Encoder
print("Testing Transformer Encoder:")
num_layers, d_model, num_heads, d_ff = 2, 512, 2, 2048

encoder = TransformerEncoder(num_layers, d_model, num_heads, d_ff)

dummy_input = torch.randn(2, 10, d_model)
output = encoder(dummy_input)

print(
    f"Input shape: {dummy_input.shape}",
    f"Output shape: {output.shape}",
    f"Number of encoder layers: {num_layers}",
    "✓ Transformer Encoder implemented successfully!",
    sep="\n",
)

## Section 12: Transformer Decoder Layer

Build Decoder layer with:
- Masked Multi-Head Self-Attention
- Add & Norm
- Multi-Head Cross-Attention (with Encoder output)
- Add & Norm
- Feed-Forward Network
- Add & Norm

In [None]:
class DecoderLayer(nn.Module):
    """
    One Transformer decoder layer (post-norm).

    Data flow:
    1. Masked multi-head self-attention
    2. Residual add + LayerNorm
    3. Multi-head cross-attention over the encoder output
    4. Residual add + LayerNorm
    5. Position-wise feed-forward network
    6. Residual add + LayerNorm
    """
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()

        # Sub-layers
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.cross_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)

        # One LayerNorm per residual branch
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)

        # One dropout per residual branch
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

    def forward(self, x, encoder_output, src_mask=None, tgt_mask=None):
        """
        Args:
            x: Decoder input (batch_size, tgt_seq_len, d_model)
            encoder_output: Encoder output (batch_size, src_seq_len, d_model)
            src_mask: Source mask for padding
            tgt_mask: Target mask for padding and future positions

        Returns:
            Output tensor (batch_size, tgt_seq_len, d_model)
        """
        # Self-attention over the target sequence, restricted by tgt_mask
        self_attended, _ = self.self_attn(x, x, x, tgt_mask)
        hidden = self.norm1(x + self.dropout1(self_attended))

        # Queries come from the decoder; keys and values from the encoder
        cross_attended, _ = self.cross_attn(hidden, encoder_output, encoder_output, src_mask)
        hidden = self.norm2(hidden + self.dropout2(cross_attended))

        # Feed-forward branch
        transformed = self.feed_forward(hidden)
        return self.norm3(hidden + self.dropout3(transformed))


class TransformerDecoder(nn.Module):
    """
    A stack of num_layers DecoderLayer modules applied in sequence.
    """
    def __init__(self, num_layers, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()

        # Stack decoder layers
        self.layers = nn.ModuleList(
            DecoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        )

    def forward(self, x, encoder_output, src_mask=None, tgt_mask=None):
        """
        Args:
            x: Decoder input (batch_size, tgt_seq_len, d_model)
            encoder_output: Encoder output (batch_size, src_seq_len, d_model)
            src_mask: Source mask
            tgt_mask: Target mask

        Returns:
            Output tensor (batch_size, tgt_seq_len, d_model)
        """
        hidden = x
        # Every layer receives the same encoder output and masks
        for decoder_layer in self.layers:
            hidden = decoder_layer(hidden, encoder_output, src_mask, tgt_mask)
        return hidden


# Test Decoder
print("Testing Transformer Decoder:")
num_layers, d_model, num_heads, d_ff = 2, 512, 2, 2048

decoder = TransformerDecoder(num_layers, d_model, num_heads, d_ff)

# Dummy inputs: target length 8, source length 10
dummy_decoder_input = torch.randn(2, 8, d_model)
dummy_encoder_output = torch.randn(2, 10, d_model)

output = decoder(dummy_decoder_input, dummy_encoder_output)

print(
    f"Decoder input shape: {dummy_decoder_input.shape}",
    f"Encoder output shape: {dummy_encoder_output.shape}",
    f"Decoder output shape: {output.shape}",
    f"Number of decoder layers: {num_layers}",
    "✓ Transformer Decoder implemented successfully!",
    sep="\n",
)

## Section 13: Complete Transformer Encoder-Decoder Model

Assemble the full Transformer with:
- Embedding layers
- Positional encoding
- Encoder stack
- Decoder stack
- Final linear projection to vocabulary

In [None]:
class Transformer(nn.Module):
    """
    Complete Transformer Encoder-Decoder model built from scratch.

    Source and target tokens use separate embedding tables of the same
    vocabulary size and share one positional-encoding module. The decoder
    output is projected to vocabulary logits by a final linear layer.

    Args:
        vocab_size: Number of rows in both embedding tables and number of
            output logits
        d_model: Embedding / hidden dimension
        num_heads: Attention heads, forwarded to the encoder and decoder
        num_encoder_layers: Number of encoder layers
        num_decoder_layers: Number of decoder layers
        d_ff: Feed-forward dimension, forwarded to the encoder and decoder
        max_seq_len: Forwarded to PositionalEncoding
        dropout: Dropout probability forwarded to all submodules
        pad_idx: Token index treated as padding by the embeddings and masks
    """
    def __init__(self, vocab_size, d_model=512, num_heads=2,
                 num_encoder_layers=2, num_decoder_layers=2,
                 d_ff=2048, max_seq_len=5000, dropout=0.1, pad_idx=0):
        super().__init__()

        self.d_model = d_model
        self.pad_idx = pad_idx

        # Embedding layers
        self.src_embedding = nn.Embedding(vocab_size, d_model, padding_idx=pad_idx)
        self.tgt_embedding = nn.Embedding(vocab_size, d_model, padding_idx=pad_idx)

        # Positional encoding (one module shared by source and target)
        self.pos_encoding = PositionalEncoding(d_model, max_seq_len, dropout)

        # Encoder and Decoder
        self.encoder = TransformerEncoder(num_encoder_layers, d_model, num_heads, d_ff, dropout)
        self.decoder = TransformerDecoder(num_decoder_layers, d_model, num_heads, d_ff, dropout)

        # Final projection to vocabulary
        self.fc_out = nn.Linear(d_model, vocab_size)

        # Dropout. Not referenced by encode/decode/forward in this class;
        # kept so the module structure stays the same for existing code.
        self.dropout = nn.Dropout(dropout)

        # Initialize weights
        self._init_weights()

    def _init_weights(self):
        """
        Initialize weights randomly.

        Every parameter with more than one dimension gets Xavier-uniform
        initialization. That pass also overwrites the all-zero padding row
        that nn.Embedding creates for ``padding_idx``, so the padding rows
        are zeroed again afterwards.
        """
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

        # Restore the zero vector for padding tokens. The row receives no
        # gradient updates, so without this it would keep its random value.
        with torch.no_grad():
            for module in self.modules():
                if isinstance(module, nn.Embedding) and module.padding_idx is not None:
                    module.weight[module.padding_idx].zero_()

    def make_src_mask(self, src):
        """
        Create mask for source padding.

        Args:
            src: Source tensor (batch_size, src_seq_len)

        Returns:
            Boolean mask (batch_size, 1, 1, src_seq_len), True at positions
            whose token is not ``pad_idx``
        """
        src_mask = (src != self.pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask

    def make_tgt_mask(self, tgt):
        """
        Create mask for target padding and future positions.

        Args:
            tgt: Target tensor (batch_size, tgt_seq_len)

        Returns:
            Boolean mask (batch_size, 1, tgt_seq_len, tgt_seq_len). Entry
            [b, 0, i, j] is True when j <= i and token j of sample b is not
            padding.
        """
        tgt_len = tgt.size(1)

        # Padding mask over key positions: (batch_size, 1, 1, tgt_seq_len)
        tgt_pad_mask = (tgt != self.pad_idx).unsqueeze(1).unsqueeze(2)

        # Future mask (lower triangular matrix): (tgt_seq_len, tgt_seq_len)
        tgt_sub_mask = torch.tril(torch.ones((tgt_len, tgt_len), device=tgt.device)).bool()

        # Combine masks; broadcasting expands both to the 4-D result
        tgt_mask = tgt_pad_mask & tgt_sub_mask

        return tgt_mask

    def encode(self, src, src_mask):
        """
        Encode source sequence.

        Args:
            src: Source indices (batch_size, src_seq_len)
            src_mask: Source mask

        Returns:
            Encoder output (batch_size, src_seq_len, d_model)
        """
        # Embed, scale by sqrt(d_model), then add positional encoding
        src_emb = self.src_embedding(src) * math.sqrt(self.d_model)
        src_emb = self.pos_encoding(src_emb)

        # Encode
        encoder_output = self.encoder(src_emb, src_mask)

        return encoder_output

    def decode(self, tgt, encoder_output, src_mask, tgt_mask):
        """
        Decode target sequence.

        Args:
            tgt: Target indices (batch_size, tgt_seq_len)
            encoder_output: Encoder output (batch_size, src_seq_len, d_model)
            src_mask: Source mask
            tgt_mask: Target mask

        Returns:
            Decoder output (batch_size, tgt_seq_len, d_model)
        """
        # Embed, scale by sqrt(d_model), then add positional encoding
        tgt_emb = self.tgt_embedding(tgt) * math.sqrt(self.d_model)
        tgt_emb = self.pos_encoding(tgt_emb)

        # Decode
        decoder_output = self.decoder(tgt_emb, encoder_output, src_mask, tgt_mask)

        return decoder_output

    def forward(self, src, tgt):
        """
        Forward pass.

        Args:
            src: Source indices (batch_size, src_seq_len)
            tgt: Target indices (batch_size, tgt_seq_len)

        Returns:
            Output logits (batch_size, tgt_seq_len, vocab_size)
        """
        # Create masks from the raw token indices
        src_mask = self.make_src_mask(src)
        tgt_mask = self.make_tgt_mask(tgt)

        # Encode
        encoder_output = self.encode(src, src_mask)

        # Decode
        decoder_output = self.decode(tgt, encoder_output, src_mask, tgt_mask)

        # Project to vocabulary
        output = self.fc_out(decoder_output)

        return output


print("✓ Complete Transformer model implemented successfully!")

## Section 14: PyTorch Dataset and DataLoader

Create custom Dataset for handling input/output pairs with proper padding.

In [None]:
class EmpatheticDialogueDataset(Dataset):
    """
    Dataset that turns dataframe rows into encoded source/target tensors.
    """
    def __init__(self, dataframe, vocab):
        self.vocab = vocab
        # Work on a copy with a fresh 0..N-1 index; the caller's frame is untouched
        self.data = dataframe.reset_index(drop=True)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data.iloc[idx]

        # Build the text pair for this row
        source_text, target_text = create_input_output_pair(record)

        # Only the target side is encoded with special tokens
        source_ids = encode_sequence(source_text, self.vocab, add_special_tokens=False)
        target_ids = encode_sequence(target_text, self.vocab, add_special_tokens=True)

        sample = {}
        sample['src'] = torch.tensor(source_ids, dtype=torch.long)
        sample['tgt'] = torch.tensor(target_ids, dtype=torch.long)
        return sample


def collate_fn(batch, pad_idx=0):
    """
    Merge a list of samples into one batch, padding to the longest sequence.

    Args:
        batch: List of dicts, each holding 1-D 'src' and 'tgt' tensors
        pad_idx: Value used to fill the padded positions

    Returns:
        Dict with 'src' and 'tgt' tensors of shape (batch_size, max_len),
        where max_len is computed separately for each key
    """
    padded = {}
    for key in ('src', 'tgt'):
        sequences = [sample[key] for sample in batch]
        padded[key] = pad_sequence(sequences, batch_first=True, padding_value=pad_idx)
    return padded


# Create datasets
# Wrap each dataframe split in an EmpatheticDialogueDataset. All three share
# the same vocab object. train_dataset / val_dataset / test_dataset are
# notebook-level globals used by the DataLoader cell below.
print("Creating datasets...")
train_dataset = EmpatheticDialogueDataset(train_df, vocab)
val_dataset = EmpatheticDialogueDataset(val_df, vocab)
test_dataset = EmpatheticDialogueDataset(test_df, vocab)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

# Test dataset
# Fetch the first training sample to check that encoding works end to end;
# each entry is a 1-D LongTensor whose length depends on that row's text.
sample = train_dataset[0]
print(f"\nSample from dataset:")
print(f"Source shape: {sample['src'].shape}")
print(f"Target shape: {sample['tgt'].shape}")

In [None]:
# Create DataLoaders with platform-specific batch size for memory optimization
# NOTE: any PLATFORM value other than "local" takes the else branch and is
# reported as Kaggle.
if PLATFORM == "local":
    BATCH_SIZE = 8   # Reduced batch size for local memory constraints (4GB GPU)
    print(f"🏠 LOCAL platform detected: Using reduced batch size {BATCH_SIZE} for memory optimization")
else:
    BATCH_SIZE = 64  # Full batch size for Kaggle/Cloud platforms
    print(f"☁️ KAGGLE platform detected: Using full batch size {BATCH_SIZE}")

# Padding index looked up from the vocabulary; also used by the model config
# and the loss function in later cells.
PAD_IDX = vocab.word2idx[vocab.PAD_TOKEN]

# Only the training loader shuffles. Each loader wraps collate_fn in a lambda
# so batches are padded with PAD_IDX instead of collate_fn's default of 0.
# NOTE(review): lambdas are generally not picklable, so they may need replacing
# (e.g. with functools.partial) if num_workers is ever raised above 0 on a
# platform that spawns worker processes — confirm before changing num_workers.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=lambda batch: collate_fn(batch, PAD_IDX),
    num_workers=0
)

val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=lambda batch: collate_fn(batch, PAD_IDX),
    num_workers=0
)

test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=lambda batch: collate_fn(batch, PAD_IDX),
    num_workers=0
)

print(f"DataLoaders created successfully!")
print(f"Batch size: {BATCH_SIZE}")
print(f"Number of training batches: {len(train_loader)}")
print(f"Number of validation batches: {len(val_loader)}")
print(f"Number of test batches: {len(test_loader)}")

# Test a batch
# Pull a single batch (then stop) to confirm collation and padding work.
# The loop variable `batch` stays defined as a notebook global afterwards.
for batch in train_loader:
    print(f"\nSample batch:")
    print(f"Source batch shape: {batch['src'].shape}")
    print(f"Target batch shape: {batch['tgt'].shape}")
    break

In [None]:
# MEMORY OPTIMIZATION SUMMARY:
# 
# ✅ KAGGLE (Cloud): BATCH_SIZE = 64  → Works with 16GB+ GPU memory
# ✅ LOCAL (4GB GPU): BATCH_SIZE = 8  → Prevents kernel crashes due to memory constraints
# 
# The model architecture (47M parameters) remains IDENTICAL on both platforms.
# Only batch size is reduced for local execution to fit within 4GB GPU memory.
# This ensures:
# - No kernel crashes on local machine
# - Same model performance and accuracy
# - Compatible with your existing Kaggle-trained models
# - Seamless switching between platforms
#
# Expected behavior:
# - Local training: ~8x longer but stable (no crashes)  
# - Local inference: Works perfectly with reduced batch size
# - Kaggle: Full performance with batch size 64

## Section 15: Model Initialization and Training Setup

Initialize the Transformer model with specified hyperparameters and set up training components.

In [None]:
# Model hyperparameters (as specified in requirements)
# The keys match the keyword arguments of Transformer.__init__, so the dict
# can be unpacked straight into the constructor.
MODEL_CONFIG = {
    'vocab_size': len(vocab),
    'd_model': 512,           # Embedding dimension: 256 or 512
    'num_heads': 2,           # Number of attention heads: 2
    'num_encoder_layers': 2,  # Encoder layers: 2
    'num_decoder_layers': 2,  # Decoder layers: 2
    'd_ff': 2048,            # Feed-forward dimension (4 * d_model)
    'max_seq_len': 1000,     # Maximum sequence length
    'dropout': 0.1,          # Dropout: 0.1-0.3
    'pad_idx': PAD_IDX
}

# Training hyperparameters (as specified in requirements)
# NOTE: 'learning_rate' only seeds the optimizer. WarmupScheduler.step()
# overwrites every param group's lr on each training step with a value
# computed from d_model and warmup_steps, so the configured rate applies
# only until the scheduler's first step.
TRAIN_CONFIG = {
    'batch_size': BATCH_SIZE,
    'learning_rate': 1e-4,    # Learning rate: 1e-4 to 5e-4
    'betas': (0.9, 0.98),     # Adam betas: (0.9, 0.98)
    'num_epochs': 10,         # Number of training epochs
    'warmup_steps': 4000,     # Learning rate warmup
    'clip_grad_norm': 1.0,    # Gradient clipping
    'save_every': 5,          # Save checkpoint every N epochs
    'eval_every': 1,          # Evaluate every N epochs
}

print("Model Configuration:")
for key, value in MODEL_CONFIG.items():
    print(f"  {key}: {value}")

print(f"\nTraining Configuration:")
for key, value in TRAIN_CONFIG.items():
    print(f"  {key}: {value}")

print(f"\nDevice: {device}")
# Rough back-of-the-envelope figure only: counts two vocab_size x d_model
# embedding tables plus 12 * d_model^2. It does not include the fc_out
# projection (d_model x vocab_size); the exact count is printed after the
# model is built.
print(f"Total parameters will be: ~{(MODEL_CONFIG['vocab_size'] * MODEL_CONFIG['d_model'] * 2 + MODEL_CONFIG['d_model'] ** 2 * 12) // 1000}K")

In [None]:
# Initialize the Transformer model
# MODEL_CONFIG keys map one-to-one onto Transformer.__init__ keyword arguments.
print("Initializing Transformer model...")
model = Transformer(**MODEL_CONFIG).to(device)

# Count total parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"\nModel Architecture:")
print(f"  Total parameters: {total_params:,}")
print(f"  Trainable parameters: {trainable_params:,}")
# Size estimate assumes 4 bytes per parameter (float32)
print(f"  Model size: ~{total_params * 4 / (1024**2):.1f} MB")

# Initialize optimizer (Adam with specified betas)
# The lr given here is the starting value; the warmup scheduler replaces it
# on each scheduler.step() call during training.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=TRAIN_CONFIG['learning_rate'],
    betas=TRAIN_CONFIG['betas']
)

# Loss function (Cross Entropy with padding ignored)
# Targets equal to PAD_IDX contribute nothing to the loss or its gradient.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
# Learning rate scheduler with warmup (FIXED: Added state_dict methods)
class WarmupScheduler:
    """
    Learning-rate schedule with linear warmup followed by inverse-sqrt decay.

    After each call to step() the rate written to every optimizer param
    group is:

        d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
    """
    def __init__(self, optimizer, d_model, warmup_steps):
        self.optimizer = optimizer
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        self.step_count = 0

    def step(self):
        """Advance one step and push the new rate into the optimizer."""
        self.step_count += 1
        step_no = self.step_count

        decay_term = step_no ** (-0.5)
        warmup_term = step_no * self.warmup_steps ** (-1.5)
        new_lr = self.d_model ** (-0.5) * min(decay_term, warmup_term)

        for group in self.optimizer.param_groups:
            group['lr'] = new_lr

    def state_dict(self):
        """Return the state of the scheduler."""
        return dict(
            step_count=self.step_count,
            d_model=self.d_model,
            warmup_steps=self.warmup_steps,
        )

    def load_state_dict(self, state_dict):
        """Load the state of the scheduler."""
        # Missing keys fall back to 0 for the step counter and to the
        # current values for the two schedule constants.
        fallbacks = (
            ('step_count', 0),
            ('d_model', self.d_model),
            ('warmup_steps', self.warmup_steps),
        )
        for attr, fallback in fallbacks:
            setattr(self, attr, state_dict.get(attr, fallback))

# Build the scheduler around the optimizer created above. It does not touch
# the optimizer's lr until its first step() call.
scheduler = WarmupScheduler(optimizer, MODEL_CONFIG['d_model'], TRAIN_CONFIG['warmup_steps'])

print(f"\n✓ Model, optimizer, loss function, and scheduler initialized!")
print(f"✓ All weights randomly initialized (no pretrained weights used)")
print(f"🔧 FIXED: WarmupScheduler now has state_dict() and load_state_dict() methods")

## Section 16: Evaluation Metrics Implementation

Implement BLEU, ROUGE-L, chrF, and Perplexity metrics from scratch for evaluation.

In [None]:
from collections import Counter

def calculate_bleu(candidate, reference, n=4):
    """
    Calculate BLEU score (BiLingual Evaluation Understudy).

    Sentence-level BLEU against a single reference, without smoothing: the
    score is 0 whenever any n-gram order from 1 to ``n`` has no match (this
    includes candidates shorter than ``n`` tokens).

    Args:
        candidate: Generated text (list of tokens)
        reference: Ground truth text (list of tokens)
        n: Maximum n-gram order (default: 4)

    Returns:
        BLEU score (0-1)

    Raises:
        ValueError: If ``n`` is smaller than 1 and the candidate is not empty
    """
    if len(candidate) == 0:
        return 0.0
    if n < 1:
        raise ValueError("n must be a positive integer")

    # Product of the clipped n-gram precisions for orders 1..n
    precision_product = 1.0
    for order in range(1, n + 1):
        cand_ngrams = Counter(
            tuple(candidate[j:j + order]) for j in range(len(candidate) - order + 1)
        )
        total_count = sum(cand_ngrams.values())
        if total_count == 0:
            # Candidate is shorter than this order: precision is 0, so BLEU is 0
            return 0.0

        ref_ngrams = Counter(
            tuple(reference[j:j + order]) for j in range(len(reference) - order + 1)
        )
        # Clip each candidate n-gram count by its count in the reference
        match_count = sum(
            min(count, ref_ngrams[ngram]) for ngram, count in cand_ngrams.items()
        )
        if match_count == 0:
            # One zero precision makes the geometric mean zero
            return 0.0

        precision_product *= match_count / total_count

    # Geometric mean over all n orders (not a fixed four)
    geo_mean = precision_product ** (1.0 / n)

    # Brevity penalty: only candidates no longer than the reference are penalised
    c_len = len(candidate)
    r_len = len(reference)
    bp = 1.0 if c_len > r_len else math.exp(1 - r_len / c_len)

    return bp * geo_mean


def lcs_length(X, Y):
    """Return the length of the longest common subsequence of X and Y."""
    # Row-by-row dynamic programming; only the previous row is kept.
    previous_row = [0] * (len(Y) + 1)

    for x_item in X:
        current_row = [0]
        for col, y_item in enumerate(Y, start=1):
            if x_item == y_item:
                # Extend the subsequence that ended just before both items
                current_row.append(previous_row[col - 1] + 1)
            else:
                # Best of skipping the current item of X or of Y
                current_row.append(max(previous_row[col], current_row[col - 1]))
        previous_row = current_row

    return previous_row[len(Y)]


def calculate_rouge_l(candidate, reference):
    """
    Calculate ROUGE-L: the F1 score built on the longest common subsequence.

    Args:
        candidate: Generated text (list of tokens)
        reference: Ground truth text (list of tokens)

    Returns:
        ROUGE-L F1 score (0-1); 0.0 when either sequence is empty
    """
    cand_len, ref_len = len(candidate), len(reference)
    if cand_len == 0 or ref_len == 0:
        return 0.0

    overlap = lcs_length(candidate, reference)

    # Both lengths are non-zero here, so the divisions are safe
    recall = overlap / ref_len
    precision = overlap / cand_len

    if recall + precision == 0:
        return 0.0
    return 2 * recall * precision / (recall + precision)


def calculate_chrf(candidate, reference, n=6, beta=2):
    """
    Calculate chrF score (Character n-gram F-score).

    Precision and recall are computed per character n-gram order and then
    averaged over the orders for which BOTH texts contain at least one
    n-gram. Orders longer than either text are skipped rather than counted
    as zero, so two identical short strings score 100. For texts of at
    least ``n`` characters every order is used.

    Args:
        candidate: Generated text (string)
        reference: Ground truth text (string)
        n: Maximum character n-gram order (default: 6)
        beta: Beta parameter for F-score (default: 2); recall is weighted
            beta**2 times as much as precision

    Returns:
        chrF score (0-100); 0.0 when the candidate is empty or no n-gram
        order is usable
    """
    if len(candidate) == 0:
        return 0.0

    # Convert to character level
    candidate_chars = list(candidate)
    reference_chars = list(reference)

    total_precision = 0.0
    total_recall = 0.0
    effective_orders = 0  # orders where both sides have at least one n-gram

    for order in range(1, n + 1):
        cand_ngrams = Counter(
            tuple(candidate_chars[j:j + order])
            for j in range(len(candidate_chars) - order + 1)
        )
        ref_ngrams = Counter(
            tuple(reference_chars[j:j + order])
            for j in range(len(reference_chars) - order + 1)
        )

        cand_total = sum(cand_ngrams.values())
        ref_total = sum(ref_ngrams.values())
        if cand_total == 0 or ref_total == 0:
            # One of the texts is shorter than this order: skip it
            continue

        # Clipped matches: each candidate n-gram counts at most as often
        # as it occurs in the reference
        match_count = sum(
            min(count, ref_ngrams[ngram]) for ngram, count in cand_ngrams.items()
        )

        total_precision += match_count / cand_total
        total_recall += match_count / ref_total
        effective_orders += 1

    if effective_orders == 0:
        return 0.0

    # Average precision and recall over the usable orders
    avg_precision = total_precision / effective_orders
    avg_recall = total_recall / effective_orders

    # F-score with beta
    if avg_precision + avg_recall == 0:
        f_score = 0.0
    else:
        f_score = (1 + beta**2) * avg_precision * avg_recall / (beta**2 * avg_precision + avg_recall)

    return f_score * 100  # Return as percentage


def calculate_perplexity(loss):
    """
    Calculate perplexity from cross-entropy loss.

    Args:
        loss: Cross-entropy loss value (natural-log based)

    Returns:
        Perplexity score, i.e. exp(loss). Returns infinity when the loss is
        too large for exp() to represent as a float, instead of raising.
    """
    try:
        return math.exp(loss)
    except OverflowError:
        # math.exp overflows for very large losses (diverged training);
        # report that as infinite perplexity instead of aborting evaluation.
        return float('inf')


print("✓ Evaluation metrics implemented:")
print("  - BLEU score (n-gram precision with brevity penalty)")
print("  - ROUGE-L score (longest common subsequence F1)")
print("  - chrF score (character n-gram F-score)")
print("  - Perplexity (exponential of cross-entropy loss)")

In [None]:
# Training Resumption Functions
def find_latest_checkpoint(checkpoint_dir='saved_models'):
    """
    Locate the checkpoint with the highest epoch number in a directory.

    Only files named 'checkpoint_epoch_<N>.pkl' are considered; files whose
    <N> part is not an integer are ignored.

    Returns:
        str or None: Path to latest checkpoint file, None if no checkpoints found
    """
    import os
    import glob

    if not os.path.exists(checkpoint_dir):
        return None

    pattern = os.path.join(checkpoint_dir, 'checkpoint_epoch_*.pkl')

    # (epoch, path) of the newest checkpoint seen so far
    latest = None
    for candidate_path in glob.glob(pattern):
        # 'checkpoint_epoch_5.pkl' -> '5'
        stem = os.path.basename(candidate_path)
        stem = stem.replace('checkpoint_epoch_', '').replace('.pkl', '')
        try:
            epoch = int(stem)
        except ValueError:
            continue
        if latest is None or epoch > latest[0]:
            latest = (epoch, candidate_path)

    if latest is None:
        return None

    latest_epoch, latest_path = latest
    print(f"📁 Found latest checkpoint: Epoch {latest_epoch} at {latest_path}")
    return latest_path


def _empty_training_history():
    """Return a fresh metrics-history dict with one empty list per metric."""
    return {
        'train_loss': [],
        'val_loss': [],
        'val_perplexity': [],
        'val_bleu': [],
        'val_rouge_l': [],
        'val_chrf': []
    }


def load_checkpoint_for_resume(checkpoint_path, model, optimizer, scheduler, device):
    """
    Load checkpoint and restore training state.

    Loading is best effort: if anything fails, the error is printed and the
    values for a fresh run are returned. In that case the model or optimizer
    may already have been partially updated from the checkpoint.

    Args:
        checkpoint_path: Path to checkpoint file
        model: Model to load state into
        optimizer: Optimizer to load state into
        scheduler: Scheduler to load state into (may be None)
        device: Device to load tensors to

    Returns:
        tuple: (start_epoch, training_history, best_bleu, best_epoch).
        On failure this is (1, empty history, 0.0, 0).
    """
    print(f"🔄 Loading checkpoint from: {checkpoint_path}")

    try:
        # SECURITY: torch.load unpickles the file. Only load checkpoints
        # that were written by this project or come from a trusted source.
        checkpoint = torch.load(checkpoint_path, map_location=device)

        # Load model state
        model.load_state_dict(checkpoint['model_state_dict'])
        print(f"   ✓ Model state loaded")

        # Load optimizer state
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        print(f"   ✓ Optimizer state loaded")

        # Load scheduler state. The saved entry is None when the checkpoint
        # was written without a scheduler; skip it instead of failing the
        # whole resume.
        scheduler_state = checkpoint.get('scheduler_state_dict')
        if scheduler_state is not None and scheduler is not None:
            scheduler.load_state_dict(scheduler_state)
            print(f"   ✓ Scheduler state loaded")

        # Load training progress
        start_epoch = checkpoint['epoch'] + 1  # Start from next epoch
        training_history = checkpoint.get('training_history')
        if training_history is None:
            training_history = _empty_training_history()
        else:
            # Fill in any metric list an older checkpoint does not have
            for metric, empty in _empty_training_history().items():
                training_history.setdefault(metric, empty)

        best_bleu = checkpoint.get('best_bleu', 0.0)
        best_epoch = checkpoint.get('best_epoch', 0)

        print(f"   ✓ Resuming from epoch {start_epoch}")
        print(f"   ✓ Previous best BLEU: {best_bleu:.4f} (Epoch {best_epoch})")
        print(f"   ✓ Training history loaded: {len(training_history['train_loss'])} previous epochs")

        return start_epoch, training_history, best_bleu, best_epoch

    except Exception as e:
        # Deliberate catch-all: a broken checkpoint must not block training
        print(f"   ❌ Error loading checkpoint: {str(e)}")
        print(f"   🔄 Starting fresh training instead...")
        return 1, _empty_training_history(), 0.0, 0


def save_training_checkpoint(epoch, model, optimizer, scheduler, training_history,
                           best_bleu, best_epoch, checkpoint_path):
    """
    Save comprehensive training checkpoint.

    Besides the arguments, the checkpoint also records the module-level
    MODEL_CONFIG, TRAIN_CONFIG and the size of the global ``vocab``, plus a
    timestamp.

    Args:
        epoch: Current epoch number
        model: Current model state
        optimizer: Current optimizer state
        scheduler: Current scheduler state (None is stored as None)
        training_history: Training metrics history
        best_bleu: Best BLEU score so far
        best_epoch: Epoch with best BLEU score
        checkpoint_path: Path to save checkpoint
    """
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict() if scheduler is not None else None,
        'training_history': training_history,
        'best_bleu': best_bleu,
        'best_epoch': best_epoch,
        'model_config': MODEL_CONFIG,
        'train_config': TRAIN_CONFIG,
        'vocab_size': len(vocab),
        'timestamp': time.time()
    }

    # Ensure directory exists. A bare filename has an empty dirname, and
    # os.makedirs('') raises, so only create a directory when one is named.
    checkpoint_dir = os.path.dirname(checkpoint_path)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    # Save checkpoint
    torch.save(checkpoint, checkpoint_path)
    print(f"   ✅ Checkpoint saved: {checkpoint_path}")


print("✅ Training resumption functions implemented!")
print("   🔄 find_latest_checkpoint(): Finds most recent checkpoint")
print("   📂 load_checkpoint_for_resume(): Loads complete training state")  
print("   💾 save_training_checkpoint(): Saves comprehensive checkpoints")
print("   🛡️ Handles interruptions and automatic resumption")

## Section 17A: Training Resumption Guide

### 🔄 **Automatic Training Resumption**

The training system now supports automatic resumption from interruptions:

#### **Features:**
- ✅ **Automatic checkpoint detection** - Finds latest checkpoint on restart
- ✅ **Complete state restoration** - Model, optimizer, scheduler, training history
- ✅ **Interruption handling** - Saves emergency checkpoints on Ctrl+C
- ✅ **Error recovery** - Saves checkpoints even on unexpected errors
- ✅ **Progress preservation** - Maintains best BLEU scores and training metrics

#### **How It Works:**
1. **On Training Start**: Automatically checks for existing checkpoints
2. **If Found**: Loads complete training state and resumes from next epoch
3. **If Not Found**: Starts fresh training from epoch 1
4. **During Training**: Saves checkpoints every N epochs + after validation
5. **On Interruption**: Saves emergency checkpoint for safe resumption

#### **Checkpoint Files:**
- `checkpoint_epoch_N.pkl` - Regular training checkpoints
- `emergency_checkpoint_epoch_N.pkl` - Saved on Ctrl+C interruption  
- `error_checkpoint_epoch_N.pkl` - Saved on unexpected errors
- `best_model.pkl` - Best performing model (highest BLEU score)

#### **Manual Resumption:**
If needed, you can manually specify which checkpoint to resume from by modifying the checkpoint loading logic.

## Section 17: Training Loop with Teacher Forcing

Implement the complete training loop that will take several hours to complete. This includes model checkpointing and metric tracking.

In [None]:
import time
from tqdm import tqdm

def train_one_epoch(model, train_loader, criterion, optimizer, scheduler, device, epoch):
    """
    Run one full pass over train_loader with teacher forcing.

    For each batch the decoder is fed the target without its last token and
    is trained to predict the target without its first token. Gradients are
    clipped to TRAIN_CONFIG['clip_grad_norm'], and the scheduler is stepped
    once per batch, right after the optimizer.

    Returns:
        Mean of the per-batch losses for this epoch.
    """
    model.train()
    running_loss = 0
    batches_seen = 0

    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch}', leave=False)

    for batch in progress_bar:
        src = batch['src'].to(device)  # (batch_size, src_seq_len)
        tgt = batch['tgt'].to(device)  # (batch_size, tgt_seq_len)

        # Teacher forcing: shift the target by one position
        decoder_input = tgt[:, :-1]    # everything except the last token
        expected = tgt[:, 1:]          # everything except the first token

        optimizer.zero_grad()

        # logits: (batch_size, tgt_seq_len-1, vocab_size)
        logits = model(src, decoder_input)

        # Flatten batch and time dimensions for the loss
        vocab_dim = logits.size(-1)
        loss = criterion(logits.reshape(-1, vocab_dim), expected.reshape(-1))

        loss.backward()

        # Clip gradients before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), TRAIN_CONFIG['clip_grad_norm'])

        optimizer.step()
        scheduler.step()

        # Bookkeeping
        batch_loss = loss.item()
        running_loss += batch_loss
        batches_seen += 1

        progress_bar.set_postfix({
            'loss': f'{batch_loss:.4f}',
            'avg_loss': f'{running_loss/batches_seen:.4f}',
            'lr': f'{optimizer.param_groups[0]["lr"]:.2e}'
        })

    return running_loss / batches_seen


def evaluate_model(model, val_loader, criterion, device, vocab, max_samples_per_batch=5):
    """
    Evaluate the model on validation set and calculate metrics.

    The loss is computed with teacher forcing over every batch. BLEU,
    ROUGE-L and chrF are computed on the argmax of those teacher-forced
    logits (not on free-running generation) for the first
    ``max_samples_per_batch`` samples of each batch.

    Predictions are compared only at positions where the reference holds a
    real token. Positions that are padding in the reference are ignored by
    the loss, so the model's output there is arbitrary and would otherwise
    lengthen the candidate with junk tokens.

    Args:
        model: Model to evaluate
        val_loader: DataLoader yielding dicts with 'src' and 'tgt' tensors
        criterion: Loss function applied to flattened logits and targets
        device: Device the batches are moved to
        vocab: Vocabulary providing get_word() and PAD_TOKEN
        max_samples_per_batch: How many samples per batch feed the text
            metrics (default: 5)

    Returns:
        Dict with 'loss' (mean of per-batch losses), 'perplexity', 'bleu',
        'rouge_l' and 'chrf'.
    """
    model.eval()
    total_loss = 0
    num_batches = 0
    all_predictions = []
    all_references = []
    pad_token = vocab.PAD_TOKEN

    with torch.no_grad():
        for batch in tqdm(val_loader, desc='Evaluating', leave=False):
            src = batch['src'].to(device)
            tgt = batch['tgt'].to(device)

            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]

            # Forward pass
            output = model(src, tgt_input)

            # Calculate loss
            output_flat = output.reshape(-1, output.size(-1))
            tgt_output_flat = tgt_output.reshape(-1)
            loss = criterion(output_flat, tgt_output_flat)

            total_loss += loss.item()
            num_batches += 1

            # Generate predictions for metrics
            predictions = output.argmax(dim=-1)  # (batch_size, seq_len)

            # Move the sampled rows to Python lists in one transfer instead
            # of calling .item() per token
            sample_count = max(0, max_samples_per_batch)
            pred_rows = predictions[:sample_count].tolist()
            ref_rows = tgt_output[:sample_count].tolist()

            for pred_row, ref_row in zip(pred_rows, ref_rows):
                pred_tokens = []
                ref_tokens = []
                for pred_idx, ref_idx in zip(pred_row, ref_row):
                    ref_token = vocab.get_word(ref_idx)
                    if ref_token == pad_token:
                        # Padded target position: drop the prediction too
                        continue
                    ref_tokens.append(ref_token)

                    pred_token = vocab.get_word(pred_idx)
                    if pred_token != pad_token:
                        pred_tokens.append(pred_token)

                all_predictions.append(pred_tokens)
                all_references.append(ref_tokens)

    # Calculate metrics
    avg_loss = total_loss / num_batches
    perplexity = calculate_perplexity(avg_loss)

    # Calculate BLEU, ROUGE-L, chrF on sample predictions
    pairs = list(zip(all_predictions, all_references))
    bleu_scores = [calculate_bleu(pred, ref) for pred, ref in pairs]
    rouge_scores = [calculate_rouge_l(pred, ref) for pred, ref in pairs]
    chrf_scores = [calculate_chrf(' '.join(pred), ' '.join(ref)) for pred, ref in pairs]

    avg_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
    avg_rouge = sum(rouge_scores) / len(rouge_scores) if rouge_scores else 0
    avg_chrf = sum(chrf_scores) / len(chrf_scores) if chrf_scores else 0

    return {
        'loss': avg_loss,
        'perplexity': perplexity,
        'bleu': avg_bleu,
        'rouge_l': avg_rouge,
        'chrf': avg_chrf
    }


print("✓ Training and evaluation functions defined!")
print("  - train_one_epoch(): Implements teacher forcing training")
print("  - evaluate_model(): Calculates all metrics (BLEU, ROUGE-L, chrF, Perplexity)")
print("  - Includes gradient clipping and learning rate scheduling")
print("  - Ready for long training process with checkpointing")

## ✅ **Platform-Aware Full Dataset Training**

### 🎯 **Key Changes Made:**

1. **✅ REMOVED Memory Optimizations** - Training now uses FULL model capacity:
   - Original model: **512 d_model**, **2048 d_ff**, **47M parameters**
   - Original batch size: **64** on Kaggle/cloud (the DataLoader cell still lowers it to **8** when `PLATFORM == "local"`)
   - Original sequence length: **1000**

2. **✅ ADDED Platform Detection** - Works on both Local & Kaggle:
   - 🏠 **Local**: Searches current directory for `emotion-emotion_69k.csv`
   - ☁️ **Kaggle**: Auto-detects dataset in `/kaggle/input/` directories
   - 📁 **Outputs**: Platform-aware saving (local dir vs `/kaggle/working/`)

3. **✅ FULL Dataset Training** - No compromises:
   - Complete **51,672 training samples** per epoch
   - Full **6,459 validation samples** 
   - All **808 batches** per epoch processed at batch size 64 (no early breaks)
   - Complete model architecture with resumption support

### 🚀 **Training Features:**
- ✅ **Automatic checkpoint resumption** from interruptions
- ✅ **Platform-aware file paths** (local vs Kaggle)
- ✅ **Complete dataset processing** (no memory reduction)
- ✅ **Full model capacity** (47M parameters)
- ✅ **Robust error handling** with emergency saves

**Ready for full-scale training on complete dataset!** 🎯

In [None]:
# FULL DATASET TRAINING
# Check for existing checkpoints and resume if possible
# NOTE(review): get_platform_path is defined elsewhere in the notebook;
# presumably it maps 'saved_models' to the platform's output directory —
# confirm against its definition.
print("🔍 Checking for existing checkpoints...")
checkpoint_dir = get_platform_path('saved_models')
latest_checkpoint = find_latest_checkpoint(checkpoint_dir)

if latest_checkpoint:
    print("✅ Found existing checkpoint - resuming training...")
    # Restores model/optimizer/scheduler in place. If loading fails, the
    # helper returns the fresh-start values (start_epoch == 1).
    start_epoch, training_history, best_bleu, best_epoch = load_checkpoint_for_resume(
        latest_checkpoint, model, optimizer, scheduler, device
    )
    total_epochs = TRAIN_CONFIG['num_epochs']
    # start_epoch is the next epoch to run, so start_epoch - 1 are already done
    remaining_epochs = total_epochs - (start_epoch - 1)
    print(f"📊 Resuming: {remaining_epochs} epochs remaining out of {total_epochs} total")
else:
    print("🆕 No existing checkpoints found - starting fresh training...")
    # Training tracking variables
    # One list per metric; the training loop appends to them each epoch.
    training_history = {
        'train_loss': [],
        'val_loss': [],
        'val_perplexity': [],
        'val_bleu': [],
        'val_rouge_l': [],
        'val_chrf': []
    }
    best_bleu = 0.0
    best_epoch = 0
    start_epoch = 1

# Summary banner.
# NOTE: the learning rate shown is the configured starting value; the warmup
# scheduler overwrites the optimizer's lr at every training step.
print("Starting/Resuming Training Process...")
print("="*80)
print(f"Training Configuration:")
print(f"  - Total Epochs: {TRAIN_CONFIG['num_epochs']}")
print(f"  - Starting from Epoch: {start_epoch}")
print(f"  - Batch Size: {TRAIN_CONFIG['batch_size']}")
print(f"  - Learning Rate: {TRAIN_CONFIG['learning_rate']}")
print(f"  - Model Parameters: {trainable_params:,}")
print(f"  - Training Samples: {len(train_dataset):,}")
print(f"  - Validation Samples: {len(val_dataset):,}")
print(f"  - Current Best BLEU: {best_bleu:.4f}")
print(f"  - 💾 AUTOMATIC SAVE: After every epoch completion!")
print("="*80)

# Main training loop with interruption handling
start_time = time.time()

try:
    for epoch in range(start_epoch, TRAIN_CONFIG['num_epochs'] + 1):
        print(f"\n📍 Epoch {epoch}/{TRAIN_CONFIG['num_epochs']}")
        
        # Training phase
        train_loss = train_one_epoch(
            model, train_loader, criterion, optimizer, scheduler, device, epoch
        )
        
        training_history['train_loss'].append(train_loss)
        
        print(f"  Training Loss: {train_loss:.4f}")
        
        # Validation phase
        if epoch % TRAIN_CONFIG['eval_every'] == 0:
            print("  Evaluating on validation set...")
            val_metrics = evaluate_model(model, val_loader, criterion, device, vocab)
            
            # Store metrics
            training_history['val_loss'].append(val_metrics['loss'])
            training_history['val_perplexity'].append(val_metrics['perplexity'])
            training_history['val_bleu'].append(val_metrics['bleu'])
            training_history['val_rouge_l'].append(val_metrics['rouge_l'])
            training_history['val_chrf'].append(val_metrics['chrf'])
            
            print(f"  Validation Loss: {val_metrics['loss']:.4f}")
            print(f"  Perplexity: {val_metrics['perplexity']:.2f}")
            print(f"  BLEU: {val_metrics['bleu']:.4f}")
            print(f"  ROUGE-L: {val_metrics['rouge_l']:.4f}")
            print(f"  chrF: {val_metrics['chrf']:.2f}")
            
            # Save best model based on BLEU score
            if val_metrics['bleu'] > best_bleu:
                best_bleu = val_metrics['bleu']
                best_epoch = epoch
            
                # Save best model checkpoint
                checkpoint = {
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'scheduler_state_dict': scheduler.state_dict(),
                    'best_bleu': best_bleu,
                    'best_epoch': best_epoch,
                    'val_metrics': val_metrics,
                    'model_config': MODEL_CONFIG,
                    'train_config': TRAIN_CONFIG,
                    'training_history': training_history,
                    'vocab': vocab
                }
                
                best_model_path = get_platform_path('saved_models/best_model.pkl')
                os.makedirs(os.path.dirname(best_model_path), exist_ok=True)
                torch.save(checkpoint, best_model_path)
                print(f"  ✓ New best model saved! (BLEU: {best_bleu:.4f})")
        
        # 💾 SAVE CHECKPOINT AFTER EVERY EPOCH COMPLETION (as requested by user)
        print(f"  💾 Saving checkpoint for epoch {epoch}...")
        checkpoint_path = get_platform_path(f'saved_models/checkpoint_epoch_{epoch}.pkl')
        save_training_checkpoint(
            epoch, model, optimizer, scheduler, training_history,
            best_bleu, best_epoch, checkpoint_path
        )
        print(f"  ✅ Epoch {epoch} checkpoint saved successfully!")

except KeyboardInterrupt:
    print(f"\n🛑 Training interrupted by user at epoch {epoch}")
    print(f"💾 Saving emergency checkpoint...")
    emergency_path = get_platform_path(f'saved_models/emergency_checkpoint_epoch_{epoch}.pkl')
    save_training_checkpoint(
        epoch, model, optimizer, scheduler, training_history,
        best_bleu, best_epoch, emergency_path
    )
    print(f"✅ Emergency checkpoint saved! Can resume from epoch {epoch + 1}")
    raise
    
except Exception as e:
    print(f"\n❌ Training failed with error: {str(e)}")
    print(f"💾 Saving error checkpoint...")
    error_path = get_platform_path(f'saved_models/error_checkpoint_epoch_{epoch}.pkl')
    save_training_checkpoint(
        epoch, model, optimizer, scheduler, training_history,
        best_bleu, best_epoch, error_path
    )
    print(f"✅ Error checkpoint saved! Can investigate and resume from epoch {epoch + 1}")
    raise

# Training completed successfully
total_time = time.time() - start_time
print(f"\n🎉 Training Completed Successfully!")
print(f"  Total Time: {total_time/3600:.2f} hours")
print(f"  Best BLEU Score: {best_bleu:.4f} (Epoch {best_epoch})")
print(f"  Total Epochs Completed: {TRAIN_CONFIG['num_epochs']}")

# Save final training history
os.makedirs('saved_data', exist_ok=True)
history_path = get_platform_path('saved_data/training_history.pkl')
with open(history_path, 'wb') as f:
    pickle.dump(training_history, f)
print(f"  ✓ Final training history saved to 'saved_data/training_history.pkl'")

## Section 18: Inference and Response Generation

Load the best trained model and implement response generation functions.

In [None]:
def load_best_model(model_path='saved_models/best_model.pkl', device='cpu'):
    """
    Load the best trained model from a checkpoint file.

    The checkpoint is expected to contain the keys 'vocab', 'model_config',
    'model_state_dict', 'epoch', 'best_bleu' and 'val_metrics'.

    Args:
        model_path: Path of the checkpoint file to read with torch.load().
        device: Device the tensors are mapped to and the model is moved to.

    Returns:
        Tuple (model, vocab, val_metrics): the rebuilt Transformer in eval
        mode, the vocabulary stored in the checkpoint, and the validation
        metrics stored in the checkpoint.

    Raises:
        Any exception from torch.load() that is unrelated to the
        `weights_only` argument (e.g. FileNotFoundError) is re-raised as is.
    """
    try:
        # weights_only=False: the checkpoint stores pickled Python objects
        # (e.g. the vocabulary), not just tensors (PyTorch 2.6+ compatibility).
        # SECURITY: full unpickling can execute arbitrary code - only load
        # checkpoint files that come from a trusted source.
        checkpoint = torch.load(model_path, map_location=device, weights_only=False)
    except Exception as e:
        if "weights_only" not in str(e):
            # Unrelated failure: bare raise keeps the original traceback.
            raise
        # The installed PyTorch complained about `weights_only` itself, so
        # retry without that argument. add_safe_globals is not available in
        # every PyTorch release, so only register the custom classes when
        # the helper exists instead of failing with AttributeError.
        if hasattr(torch.serialization, 'add_safe_globals'):
            torch.serialization.add_safe_globals([Vocabulary, Transformer])
        checkpoint = torch.load(model_path, map_location=device)

    # Recreate model with saved config
    loaded_vocab = checkpoint['vocab']
    model_config = checkpoint['model_config']

    model = Transformer(**model_config).to(device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    print(f"✓ Best model loaded from epoch {checkpoint['epoch']}")
    print(f"  BLEU Score: {checkpoint['best_bleu']:.4f}")

    return model, loaded_vocab, checkpoint['val_metrics']


def generate_response(model, emotion, situation, customer_utterance, vocab, device, max_length=50):
    """
    Greedily decode an agent reply for one emotion/situation/utterance triple.

    Args:
        model: Trained Transformer model, called as model(src, tgt)
        emotion: Emotion category (string); lower-cased before use
        situation: Situation description (string); passed through normalize_text
        customer_utterance: Customer's message (string); passed through normalize_text
        vocab: Vocabulary object (word2idx, get_word and the special-token names)
        device: Device (cpu/cuda) the input tensors are created on
        max_length: Maximum number of decoding steps

    Returns:
        Generated agent response (string): the decoded words joined by single
        spaces, with EOS, PAD and UNK tokens removed.
    """
    model.eval()

    with torch.no_grad():
        # Normalise the three context fields and assemble the source prompt
        emotion_label = emotion.lower()
        situation_text = normalize_text(situation)
        utterance_text = normalize_text(customer_utterance)
        prompt = f"emotion : {emotion_label} | situation : {situation_text} | customer : {utterance_text} agent :"

        # Source batch of size one, encoded without special tokens
        prompt_ids = encode_sequence(prompt, vocab, add_special_tokens=False)
        src = torch.tensor([prompt_ids], dtype=torch.long).to(device)

        # The decoder input starts as a single BOS token and grows by one
        # token per step.
        bos_id = vocab.word2idx[vocab.BOS_TOKEN]
        decoded = torch.tensor([[bos_id]], dtype=torch.long).to(device)

        for _step in range(max_length):
            logits = model(src, decoded)

            # Greedy choice: highest-scoring entry at the last position
            best_id = logits[0, -1, :].argmax().item()

            # EOS ends generation and is never appended
            if best_id == vocab.word2idx[vocab.EOS_TOKEN]:
                break

            step_token = torch.tensor([[best_id]], device=device)
            decoded = torch.cat([decoded, step_token], dim=1)

        # Skip the leading BOS, map ids back to words, drop special tokens
        words = [vocab.get_word(token_id) for token_id in decoded[0][1:].tolist()]
        specials = (vocab.EOS_TOKEN, vocab.PAD_TOKEN, vocab.UNK_TOKEN)
        return ' '.join(word for word in words if word not in specials)


# Summary of the inference helpers defined in this cell, emitted in one call
print("\n".join((
    "✓ Inference functions implemented!",
    "  - load_best_model(): Loads trained model from .pkl file (FIXED: PyTorch 2.6+ compatibility)",
    "  - generate_response(): Generates empathetic responses",
    "  - Uses greedy decoding with early stopping on EOS token",
    "  - Alternative fix: Safe globals for custom classes added",
)))

## Section 19: Qualitative Examples and Analysis

Generate sample conversations and compare with ground truth responses.

In [None]:
# Prefer the saved best checkpoint; when the file is missing, fall back to
# the model and vocabulary that are already in memory (no validation metrics).
try:
    # Platform-aware location of the best checkpoint
    model_path = get_platform_path('saved_models/best_model.pkl')
    trained_model, model_vocab, val_metrics = load_best_model(model_path, device)
except FileNotFoundError:
    print("No trained model found. Using current initialized model for demo...")
    trained_model, model_vocab, val_metrics = model, vocab, None
else:
    print("Using trained model for inference...")

# Qualitative evaluation fixtures (REAL DATASET EXAMPLES).
# Each entry holds the model inputs (emotion, situation, customer) and the
# reference reply (ground_truth) the generated response is compared against.
test_cases = [
    dict(
        emotion="sentimental",
        situation="I remember going to the fireworks with my best friend. There was a lot of people, but it only felt like us in the world.",
        customer="I remember going to see the fireworks with my best friend. It was the first time we ever spent time alone together. Although there was a lot of people, we felt like the only people in the world.",
        ground_truth="Was this a friend you were in love with, or just a best friend?",
    ),
    dict(
        emotion="afraid",
        situation="i used to scare for darkness",
        customer="it feels like hitting to blank wall when i see the darkness",
        ground_truth="Oh ya? I don't really see how",
    ),
    dict(
        emotion="proud",
        situation="I showed a guy how to run a good bead in welding class and he caught on quick.",
        customer="Hi how are you doing today",
        ground_truth="doing good.. how about you",
    ),
    dict(
        emotion="joyful",
        situation="I am very happy to have been first over 300 students during this years at my enginering school",
        customer="Hi, this year, I was the first over 300 students at my enginering school",
        ground_truth="Sounds great! So what's your major?",
    ),
    dict(
        emotion="lonely",
        situation="A few years ago, my marriage broke up, and I found myself living alone for the first time in my life. Though I eventually grew accustomed to the solitude, it took a while to get used to it.",
        customer="I found myself divorced a few years ago, and for the first time in my life, I was living alone.",
        ground_truth="I felt sad and depressed due to my insecurities and felt rejected from society",
    ),
]

# For every test case: print the context, generate a reply with the loaded
# model, and score it against the reference with BLEU, ROUGE-L and chrF.
print("Generating Qualitative Examples:")
print("="*100)

for i, test_case in enumerate(test_cases, 1):
    print(f"\n📝 Example {i}: {test_case['emotion'].title()} Conversation")
    print("-" * 60)

    print(f"Emotion: {test_case['emotion']}")
    print(f"Situation: {test_case['situation']}")
    print(f"Customer: {test_case['customer']}")
    print()

    # Generate model response
    generated_response = generate_response(
        trained_model,
        test_case['emotion'],
        test_case['situation'],
        test_case['customer'],
        model_vocab,
        device
    )

    print(f"Ground Truth: {test_case['ground_truth']}")
    print(f"Generated   : {generated_response}")

    # Calculate metrics for this example.
    # The reference is normalized once and that same text feeds all three
    # metrics, so chrF is no longer scored against the raw, un-normalized
    # ground truth while BLEU and ROUGE-L use the normalized one.
    reference_text = normalize_text(test_case['ground_truth'])
    gt_tokens = simple_tokenize(reference_text)
    gen_tokens = simple_tokenize(generated_response)

    bleu = calculate_bleu(gen_tokens, gt_tokens)
    rouge_l = calculate_rouge_l(gen_tokens, gt_tokens)
    chrf = calculate_chrf(generated_response, reference_text)

    print(f"\nMetrics:")
    print(f"  BLEU: {bleu:.3f} | ROUGE-L: {rouge_l:.3f} | chrF: {chrf:.1f}")
    print("="*100)

print(f"\n🎯 Qualitative Analysis Complete!")
# val_metrics is None when no saved checkpoint could be loaded
if val_metrics:
    print(f"Overall Model Performance (Validation Set):")
    print(f"  BLEU: {val_metrics['bleu']:.4f}")
    print(f"  ROUGE-L: {val_metrics['rouge_l']:.4f}")
    print(f"  chrF: {val_metrics['chrf']:.2f}")
    print(f"  Perplexity: {val_metrics['perplexity']:.2f}")
else:
    print(f"Note: Model not yet trained. Run training cells first for accurate metrics.")

## Section 20: Training Progress Visualization

Visualize training metrics and model performance over epochs.

In [None]:
# Section 20: Training Progress Visualization (Local-Safe Version)
# Loads the pickled training history and prints a text summary: final
# metrics, a per-row table and an ASCII bar chart of validation BLEU.
print("📊 Starting training visualization...")

# Load training history with minimal overhead
try:
    history_path = get_platform_path('saved_data/training_history.pkl')
    with open(history_path, 'rb') as f:
        saved_history = pickle.load(f)
    print("✅ Training history loaded")
except FileNotFoundError:
    saved_history = None
    print("⚠️ No training history found")
except Exception as load_error:
    # Still best-effort (the cell keeps running), but unlike a bare `except:`
    # this lets KeyboardInterrupt/SystemExit through and says why the file
    # could not be used instead of claiming it is missing.
    saved_history = None
    print(f"⚠️ Training history could not be loaded ({type(load_error).__name__}: {load_error})")

# Display results in text format (guaranteed to work)
if saved_history and saved_history.get('val_bleu'):
    # NOTE: this counts validation entries. The training loop only records
    # validation metrics on epochs that are a multiple of eval_every, so the
    # "Epoch" labels below match real epoch numbers only when eval_every is 1.
    epochs = len(saved_history['val_bleu'])

    print("\n" + "🎯" + "="*60 + "🎯")
    print("          EMPATHETIC CHATBOT TRAINING RESULTS")
    print("🎯" + "="*60 + "🎯")

    print(f"\n📈 TRAINING OVERVIEW:")
    print(f"   Total Epochs: {epochs}")
    print(f"   Model Size: 47M parameters")
    print(f"   Dataset: 64,591 dialogue pairs")

    print(f"\n🏆 FINAL PERFORMANCE:")
    final_bleu = saved_history['val_bleu'][-1]
    final_rouge = saved_history['val_rouge_l'][-1]
    final_chrf = saved_history['val_chrf'][-1]
    final_perp = saved_history['val_perplexity'][-1]

    print(f"   BLEU Score:    {final_bleu:.4f}")
    print(f"   ROUGE-L Score: {final_rouge:.4f}")
    print(f"   chrF Score:    {final_chrf:.2f}%")
    print(f"   Perplexity:    {final_perp:.2f}")

    print(f"\n📊 TRAINING PROGRESS:")
    print("   Epoch | Train Loss | Val Loss | BLEU   | ROUGE-L")
    print("   ------|------------|----------|--------|--------")

    # Loss lists may be missing or shorter than the BLEU list; show 0 then.
    train_losses = saved_history.get('train_loss', [])
    val_losses = saved_history.get('val_loss', [])

    for i in range(min(epochs, 10)):  # Show max 10 epochs to avoid clutter
        tl = train_losses[i] if i < len(train_losses) else 0
        vl = val_losses[i] if i < len(val_losses) else 0
        bleu = saved_history['val_bleu'][i]
        rouge = saved_history['val_rouge_l'][i]
        print(f"   {i+1:5d} | {tl:10.4f} | {vl:8.4f} | {bleu:.4f} | {rouge:.4f}")

    if epochs > 10:
        print(f"   ... ({epochs-10} more epochs)")

    print("🎯" + "="*60 + "🎯")

    # Create simple ASCII chart for key metric
    print(f"\n📈 BLEU Score Progress (ASCII Chart):")
    max_bleu = max(saved_history['val_bleu'])
    min_bleu = min(saved_history['val_bleu'])

    for i, bleu in enumerate(saved_history['val_bleu'][:10], 1):  # Show first 10 epochs
        # Min-max normalize to a 0-20 character bar; when all scores are
        # equal the range is zero, so draw a half-full bar instead.
        if max_bleu > min_bleu:
            bar_length = int(((bleu - min_bleu) / (max_bleu - min_bleu)) * 20)
        else:
            bar_length = 10
        bar = "█" * bar_length + "░" * (20 - bar_length)
        print(f"   Epoch {i:2d}: |{bar}| {bleu:.4f}")

else:
    print("\n❌ No training history available")
    print("💡 Please run the training loop first (Section 17)")
    print("📁 Expected: saved_data/training_history.pkl")

print(f"\n✅ Training visualization completed!")
print(f"🔒 Safe for all platforms (no matplotlib dependencies)")
print(f"📊 Results displayed in text format")

## Section 21: Human Evaluation Framework

Framework for manual assessment of model outputs on Fluency, Relevance, and Adequacy (1-5 scale).

In [None]:
# Human Evaluation Framework
# Prints the 1-5 scoring rubric, generates a response for the first three
# test cases, builds one blank scoring record per sample and pickles the
# records so human evaluators can fill in the scores later.
print("🧑‍💼 Human Evaluation Framework")
print("="*80)

print("""
EVALUATION CRITERIA (1-5 Scale):

🗣️ FLUENCY (1-5):
   1 = Completely ungrammatical, incomprehensible
   2 = Mostly ungrammatical, hard to understand  
   3 = Some grammatical errors, but understandable
   4 = Mostly grammatical, minor errors
   5 = Perfect grammar, natural flow

🎯 RELEVANCE (1-5):
   1 = Completely irrelevant to context
   2 = Somewhat relevant but misses key points
   3 = Moderately relevant, addresses some context
   4 = Highly relevant, addresses most context
   5 = Perfectly relevant, fully contextual

💝 ADEQUACY (1-5):
   1 = Completely inadequate empathetic response
   2 = Minimal empathy, inappropriate tone
   3 = Some empathy shown, neutral response
   4 = Good empathetic understanding and response
   5 = Excellent empathy, highly appropriate response

""")

# Generate evaluation samples
evaluation_samples = []
sample_conversations = test_cases[:3]  # Use first 3 test cases

print("SAMPLE EVALUATION:")
print("="*80)

for i, conversation in enumerate(sample_conversations, 1):
    print(f"\n📋 Sample {i}:")
    print(f"Emotion: {conversation['emotion']}")
    print(f"Situation: {conversation['situation']}")
    print(f"Customer: {conversation['customer']}")

    # Generate response
    generated = generate_response(
        trained_model,
        conversation['emotion'],
        conversation['situation'],
        conversation['customer'],
        model_vocab,
        device
    )

    print(f"Generated Response: '{generated}'")
    print(f"Ground Truth: '{conversation['ground_truth']}'")

    # Create evaluation template
    evaluation_sample = {
        'sample_id': i,
        'context': {
            'emotion': conversation['emotion'],
            'situation': conversation['situation'],
            'customer': conversation['customer']
        },
        'generated_response': generated,
        'ground_truth': conversation['ground_truth'],
        'scores': {
            'fluency': None,      # To be filled by human evaluator
            'relevance': None,    # To be filled by human evaluator
            'adequacy': None      # To be filled by human evaluator
        }
    }

    evaluation_samples.append(evaluation_sample)

    print(f"\n📝 EVALUATION TEMPLATE FOR SAMPLE {i}:")
    print(f"   Fluency (1-5): _____")
    print(f"   Relevance (1-5): _____")
    print(f"   Adequacy (1-5): _____")
    print(f"   Comments: ________________")
    print("-" * 60)

# Save evaluation framework
# Use the same platform-aware path helper as the other saved artifacts and
# make sure the target directory exists, so this cell also works when the
# training cell (which creates 'saved_data') was not run in this session.
samples_path = get_platform_path('saved_data/human_evaluation_samples.pkl')
samples_dir = os.path.dirname(samples_path)
if samples_dir:
    os.makedirs(samples_dir, exist_ok=True)
with open(samples_path, 'wb') as f:
    pickle.dump(evaluation_samples, f)

print(f"\n✅ Human Evaluation Framework Setup Complete!")
print(f"   📁 Evaluation samples saved to 'saved_data/human_evaluation_samples.pkl'")
print(f"   📊 {len(evaluation_samples)} samples prepared for human evaluation")
print(f"   📋 Manual scoring on 1-5 scale for Fluency, Relevance, Adequacy")

# Example scoring function for future use
def calculate_human_scores(evaluation_results):
    """
    Average the human ratings per criterion over completed evaluation samples.

    Args:
        evaluation_results: List of evaluation samples; each sample has a
            'scores' dict with 'fluency', 'relevance' and 'adequacy' entries
            (a value of None means "not rated yet" and is ignored).

    Returns:
        None when evaluation_results is empty or None; otherwise a dictionary
        with 'avg_fluency', 'avg_relevance', 'avg_adequacy' (0 when no sample
        has a rating for that criterion) and 'total_samples'.
    """
    if not evaluation_results:
        return None

    summary = {}
    for criterion in ('fluency', 'relevance', 'adequacy'):
        # Only samples that actually carry a rating count toward the mean
        ratings = [
            entry['scores'][criterion]
            for entry in evaluation_results
            if entry['scores'][criterion] is not None
        ]
        if ratings:
            summary[f'avg_{criterion}'] = sum(ratings) / len(ratings)
        else:
            summary[f'avg_{criterion}'] = 0

    # Counts every sample passed in, rated or not
    summary['total_samples'] = len(evaluation_results)
    return summary

# Pointer to the helper above for whoever fills in the scores later
print("\n📈 Use calculate_human_scores() function to analyze completed evaluations")