# Next-Word Prediction using MLP - Complete Training

## Both Category I (Shakespeare) and Category II (Linux Kernel)

### Instructions:
1. Upload both `shakespeare_processed.pkl` and `linux_kernel_processed.pkl` to a Kaggle dataset
2. Update `SHAKESPEARE_DATA_PATH` and `LINUX_DATA_PATH` below if needed
3. Enable GPU (Settings → Accelerator → GPU T4 x2)
4. Run all cells

### What This Notebook Does:
- Trains Shakespeare model (Category I - Natural Language)
- Trains Linux Kernel model (Category II - Structured Text)
- Generates all visualizations and reports
- Saves all models with checkpointing

### Expected Time:
- Total: 4-8 hours with GPU (both models)
- Shakespeare: 2-4 hours
- Linux Kernel: 2-4 hours

---
# Part 1: Setup and Imports

In [None]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
import warnings
warnings.filterwarnings('ignore')

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

# Fix the NumPy and PyTorch seeds so repeated runs start from the same RNG state.
np.random.seed(42)
torch.manual_seed(42)

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    torch.cuda.manual_seed(42)

print(f"Using device: {device}")
if device.type == 'cuda':
    # Report which GPU was picked up and how much memory it exposes.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

---
# Part 2: Define Model Architecture and Helper Functions

In [None]:
class TextDataset(Dataset):
    """Map-style dataset pairing each context row of ``X`` with its target in ``y``.

    Both inputs are converted to ``torch.LongTensor`` once, at construction
    time, so indexing only slices tensors that already exist.
    """

    def __init__(self, X, y):
        self.X, self.y = torch.LongTensor(X), torch.LongTensor(y)

    def __len__(self):
        """Return the number of samples (length of ``X``)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(context, target)`` pair stored at ``idx``."""
        context = self.X[idx]
        target = self.y[idx]
        return context, target

class NextWordMLP(nn.Module):
    """Fixed-context MLP language model.

    The token ids of a context window are embedded, concatenated into one
    vector per sample, passed through two hidden layers (each followed by the
    chosen activation and dropout) and projected to one logit per vocabulary
    entry.

    Args:
        vocab_size: number of embedding rows and of output logits.
        embedding_dim: width of each token embedding.
        context_length: number of tokens in one input window.
        hidden_dim: width of both hidden layers.
        activation: ``'relu'`` or ``'tanh'``.
        dropout: drop probability applied after each hidden layer.

    Raises:
        ValueError: if ``activation`` is neither ``'relu'`` nor ``'tanh'``.
    """

    def __init__(self, vocab_size, embedding_dim, context_length, hidden_dim, activation='relu', dropout=0.5):
        super().__init__()

        # Sub-modules are created in a fixed order so seeded initialisation
        # and the state_dict key order stay stable.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = nn.Linear(context_length * embedding_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, vocab_size)

        supported = {'relu': nn.ReLU, 'tanh': nn.Tanh}
        if activation not in supported:
            raise ValueError(f"Unknown activation: {activation}")
        self.activation = supported[activation]()

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return unnormalised next-token logits for a batch of context windows."""
        hidden = self.embedding(x)
        # Concatenate the per-token embeddings of each sample into one row.
        hidden = hidden.view(hidden.size(0), -1)

        for layer in (self.fc1, self.fc2):
            hidden = self.dropout(self.activation(layer(hidden)))

        return self.fc3(hidden)

    def get_embeddings(self):
        """Return the embedding table as a NumPy array on the CPU."""
        table = self.embedding.weight.data
        return table.cpu().numpy()

print("Model architecture defined")

In [None]:
def train_epoch(model, loader, criterion, optimizer, device, max_grad_norm=1.0):
    """Run one optimisation pass over ``loader`` and return epoch metrics.

    Args:
        model: module mapping a batch of inputs to per-class logits.
        loader: iterable of ``(inputs, targets)`` tensor batches.
        criterion: loss called as ``criterion(logits, targets)``. The epoch
            average assumes it returns the batch *mean* (PyTorch's default
            ``reduction='mean'``).
        optimizer: optimiser that updates ``model``'s parameters.
        device: device every batch is moved to before the forward pass.
        max_grad_norm: total gradient norm to clip to before each optimiser
            step; ``None`` disables clipping.

    Returns:
        ``(mean_loss, accuracy_percent)``, both averaged over samples.

    Raises:
        ValueError: if ``loader`` yields no samples.
    """
    model.train()
    loss_sum = 0.0
    correct = 0
    total = 0

    for inputs, targets in loader:
        inputs, targets = inputs.to(device), targets.to(device)

        outputs = model(inputs)
        loss = criterion(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        if max_grad_norm is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()

        # Weight each batch's mean loss by its size so a short final batch
        # does not count as much as a full one.
        n_samples = targets.size(0)
        loss_sum += loss.item() * n_samples
        total += n_samples
        _, predicted = outputs.max(1)
        correct += predicted.eq(targets).sum().item()

    if total == 0:
        raise ValueError("train_epoch received a loader that yielded no samples")

    return loss_sum / total, 100. * correct / total

def validate(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` without updating any parameters.

    Args:
        model: module mapping a batch of inputs to per-class logits.
        loader: iterable of ``(inputs, targets)`` tensor batches.
        criterion: loss called as ``criterion(logits, targets)``. The average
            assumes it returns the batch *mean* (PyTorch's default
            ``reduction='mean'``).
        device: device every batch is moved to before the forward pass.

    Returns:
        ``(mean_loss, accuracy_percent)``, both averaged over samples.

    Raises:
        ValueError: if ``loader`` yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            # Weight each batch's mean loss by its size so a short final
            # batch does not count as much as a full one.
            n_samples = targets.size(0)
            loss_sum += loss.item() * n_samples
            total += n_samples
            _, predicted = outputs.max(1)
            correct += predicted.eq(targets).sum().item()

    if total == 0:
        raise ValueError("validate received a loader that yielded no samples")

    return loss_sum / total, 100. * correct / total

def generate_text(model, seed_text, num_words, word_to_idx, idx_to_word, context_length, temperature=1.0, is_code=False):
    """Extend ``seed_text`` by sampling up to ``num_words`` tokens from ``model``.

    Args:
        model: module mapping a ``(1, context_length)`` tensor of token ids to
            a ``(1, vocab)`` tensor of logits.
        seed_text: starting text. A string is split on whitespace (and
            lower-cased unless ``is_code``); with ``is_code=True`` a list of
            tokens is used as given.
        num_words: maximum number of tokens to generate.
        word_to_idx: token -> id mapping; must contain ``'<UNK>'``, and
            ``'<START>'`` whenever the seed is shorter than the context.
        idx_to_word: id -> token lookup.
        context_length: number of trailing tokens fed to the model.
        temperature: logits are divided by this before sampling; ``0`` selects
            the most likely token deterministically.
        is_code: keep the seed's case and accept a pre-tokenised list.

    Returns:
        The seed tokens followed by the generated tokens, joined by spaces.
        Generation stops early when ``'<END>'`` is produced.

    Raises:
        ValueError: if ``temperature`` is negative.
    """
    if temperature < 0:
        raise ValueError(f"temperature must be non-negative, got {temperature}")

    model.eval()

    if is_code:
        words = seed_text if isinstance(seed_text, list) else seed_text.split()
    else:
        words = seed_text.lower().split()

    # Copy so a caller-supplied token list is never mutated.
    generated = list(words)

    # Build inputs on the model's own device instead of relying on a
    # module-level ``device`` global; parameter-free models run on the CPU.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    for _ in range(num_words):
        context = generated[-context_length:] if len(generated) >= context_length else generated
        context_indices = [word_to_idx.get(w, word_to_idx['<UNK>']) for w in context]

        # Left-pad short contexts so the model always sees context_length ids.
        if len(context_indices) < context_length:
            context_indices = [word_to_idx['<START>']] * (context_length - len(context_indices)) + context_indices

        context_tensor = torch.tensor([context_indices], dtype=torch.long, device=model_device)

        with torch.no_grad():
            logits = model(context_tensor)
            if temperature == 0:
                # Dividing by zero is undefined; treat it as greedy decoding.
                predicted_idx = logits.argmax(dim=1).item()
            else:
                probs = F.softmax(logits / temperature, dim=1)
                predicted_idx = torch.multinomial(probs, 1).item()

        next_word = idx_to_word[predicted_idx]
        if next_word == '<END>':
            break
        generated.append(next_word)

    return ' '.join(generated)

print("Training and generation functions defined")

---
# Part 3: Shakespeare Model (Category I - Natural Language)

## Task 1.1 & 1.2 - Shakespeare Dataset

In [None]:
SHAKESPEARE_DATA_PATH = '/kaggle/input/shakespeare-processed/shakespeare_processed.pkl'

print("="*70)
print("LOADING SHAKESPEARE DATASET (Category I)")
print("="*70)

# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# point this path at a dataset you produced yourself.
with open(SHAKESPEARE_DATA_PATH, 'rb') as f:
    shakespeare_data = pickle.load(f)

# Unpack the preprocessed artefacts stored in the pickle.
shak_X = shakespeare_data['X']
shak_y = shakespeare_data['y']
shak_vocab = shakespeare_data['vocab']
shak_word_to_idx = shakespeare_data['word_to_idx']
shak_idx_to_word = shakespeare_data['idx_to_word']
shak_context_length = shakespeare_data['context_length']
shak_vocab_stats = shakespeare_data['vocab_stats']

print("\nSHAKESPEARE DATASET STATISTICS (Task 1.1)")
print("="*70)
print(f"Vocabulary size: {shak_vocab_stats['vocab_size']:,}")
print(f"Total training samples: {len(shak_X):,}")
print(f"Context length: {shak_context_length}")
print(f"Total words in corpus: {shak_vocab_stats['total_words']:,}")
print(f"<UNK> percentage: {shak_vocab_stats.get('unk_percentage', 0):.2f}%")

# The headings promise ten entries, so cap the listings at ten even when the
# stored statistics hold more (the embedding cell reads up to 50 of them).
print("\n10 Most frequent words:")
for word, count in list(shak_vocab_stats['most_common'])[:10]:
    print(f"  {word}: {count:,}")

print("\n10 Least frequent words:")
for word, count in list(shak_vocab_stats['least_common'])[:10]:
    print(f"  {word}: {count:,}")

In [None]:
# Hold out 20% of the (context, target) pairs for validation; the fixed
# random_state keeps the split identical between runs.
shak_X_train, shak_X_val, shak_y_train, shak_y_val = train_test_split(
    shak_X,
    shak_y,
    test_size=0.2,
    random_state=42,
)

print("\nData Split:")
print(f"Training samples: {len(shak_X_train):,}")
print(f"Validation samples: {len(shak_X_val):,}")

# batch_size is also read by the Linux Kernel loaders later in the notebook.
batch_size = 256

# Only the training loader shuffles.
shak_train_dataset = TextDataset(shak_X_train, shak_y_train)
shak_train_loader = DataLoader(shak_train_dataset, batch_size=batch_size, shuffle=True)

shak_val_dataset = TextDataset(shak_X_val, shak_y_val)
shak_val_loader = DataLoader(shak_val_dataset, batch_size=batch_size)

print(f"\nBatch size: {batch_size}")
print(f"Training batches: {len(shak_train_loader)}")
print(f"Validation batches: {len(shak_val_loader)}")

In [None]:
# Hyperparameters; the Linux Kernel cells later in the notebook reuse them.
EMBEDDING_DIM = 64
HIDDEN_DIM = 512
ACTIVATION = 'relu'
LEARNING_RATE = 0.001
NUM_EPOCHS = 200

shak_model = NextWordMLP(
    vocab_size=len(shak_vocab),
    embedding_dim=EMBEDDING_DIM,
    context_length=shak_context_length,
    hidden_dim=HIDDEN_DIM,
    activation=ACTIVATION,
    dropout=0.5,
).to(device)

# Label smoothing and weight decay act as regularisers for this model.
shak_criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
shak_optimizer = optim.Adam(shak_model.parameters(), lr=LEARNING_RATE, weight_decay=1e-5)
# Halve the learning rate when the validation loss plateaus.
# NOTE(review): `verbose` is deprecated in recent PyTorch releases; confirm
# it is still accepted by the installed version.
shak_scheduler = optim.lr_scheduler.ReduceLROnPlateau(shak_optimizer, mode='min', factor=0.5, patience=10, verbose=True)

print("\nSHAKESPEARE MODEL ARCHITECTURE (Task 1.2)")
print("="*70)
print(shak_model)
print(f"\nTotal parameters: {sum(p.numel() for p in shak_model.parameters()):,}")

In [None]:
print("\nTRAINING SHAKESPEARE MODEL")
print("="*70)

# Early-stopping state. PATIENCE is also read by the Linux Kernel training
# loop later in the notebook.
# NOTE(review): the LR scheduler is configured with the same patience (10),
# so it may rarely get to act before early stopping ends the run; confirm
# this is intended.
PATIENCE = 10
patience_counter = 0
shak_best_val_loss = float('inf')
shak_history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

for epoch in range(NUM_EPOCHS):
    train_loss, train_acc = train_epoch(shak_model, shak_train_loader, shak_criterion, shak_optimizer, device)
    val_loss, val_acc = validate(shak_model, shak_val_loader, shak_criterion, device)
    shak_scheduler.step(val_loss)

    # Record this epoch's metrics for the plots and the final report.
    shak_history['train_loss'] += [train_loss]
    shak_history['train_acc'] += [train_acc]
    shak_history['val_loss'] += [val_loss]
    shak_history['val_acc'] += [val_acc]

    # Log the first epoch and every tenth one.
    if epoch == 0 or (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{NUM_EPOCHS}]")
        print(f"  Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%")
        print(f"  Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%")

    if val_loss < shak_best_val_loss:
        # New best: reset the patience counter and store everything needed
        # to rebuild the model and its vocabulary.
        shak_best_val_loss = val_loss
        patience_counter = 0
        torch.save(
            dict(
                epoch=epoch,
                model_state_dict=shak_model.state_dict(),
                optimizer_state_dict=shak_optimizer.state_dict(),
                val_loss=val_loss,
                vocab=shak_vocab,
                word_to_idx=shak_word_to_idx,
                idx_to_word=shak_idx_to_word,
                context_length=shak_context_length,
                embedding_dim=EMBEDDING_DIM,
                hidden_dim=HIDDEN_DIM,
                activation=ACTIVATION,
            ),
            'shakespeare_best_model.pth',
        )
        print(f"  Saved best model")
    else:
        patience_counter += 1

    # Stop once the validation loss has failed to improve PATIENCE times in a row.
    if patience_counter >= PATIENCE:
        print(f"\nEarly stopping at epoch {epoch+1}")
        break

    # Periodic snapshot every 50 epochs, independent of the best-model file.
    if (epoch + 1) % 50 == 0:
        torch.save(
            dict(
                epoch=epoch,
                model_state_dict=shak_model.state_dict(),
                optimizer_state_dict=shak_optimizer.state_dict(),
                history=shak_history,
            ),
            f'shakespeare_checkpoint_epoch_{epoch+1}.pth',
        )
        print(f"  Checkpoint saved")

print("\nShakespeare training completed!")
print(f"Best Validation Loss: {shak_best_val_loss:.4f}")
print(f"Final Validation Accuracy: {shak_history['val_acc'][-1]:.2f}%")

In [None]:
# Shakespeare training curves: loss on the left, accuracy on the right.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

# Left panel: per-epoch train and validation loss.
ax1.plot(shak_history['train_loss'], label='Train Loss')
ax1.plot(shak_history['val_loss'], label='Validation Loss')
ax1.set(xlabel='Epoch', ylabel='Loss', title='Shakespeare - Training and Validation Loss')
ax1.legend()
ax1.grid(True)

# Right panel: per-epoch train and validation accuracy.
ax2.plot(shak_history['train_acc'], label='Train Accuracy')
ax2.plot(shak_history['val_acc'], label='Validation Accuracy')
ax2.set(xlabel='Epoch', ylabel='Accuracy (%)', title='Shakespeare - Training and Validation Accuracy')
ax2.legend()
ax2.grid(True)

plt.tight_layout()
plt.savefig('shakespeare_training_curves.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Shakespeare text generation
# Reload the best checkpoint written by the training loop. map_location keeps
# the load working when the file was saved from a GPU session but this cell
# runs on a CPU-only one.
checkpoint = torch.load('shakespeare_best_model.pth', map_location=device)
shak_model.load_state_dict(checkpoint['model_state_dict'])
shak_model.eval()

print("\n SHAKESPEARE TEXT GENERATION")
print("="*70)

test_seeds = ["to be or not to", "what is the", "i am", "the king of"]

# Sample 20 words per seed at three temperatures.
for seed in test_seeds:
    print(f"\nSeed: '{seed}'")
    for temp in [0.5, 1.0, 1.5]:
        generated = generate_text(shak_model, seed, 20, shak_word_to_idx, shak_idx_to_word,
                                 shak_context_length, temp, is_code=False)
        print(f"  T={temp}: {generated}")

In [None]:
# Shakespeare embedding visualization (Task 1.3)
print("\n SHAKESPEARE EMBEDDING VISUALIZATION (Task 1.3)")
print("="*70)

shak_embeddings = shak_model.get_embeddings()

# Select words to visualize: up to 50 of the stored most-frequent words plus
# a hand-picked list.
words_to_viz = [w for w, _ in shak_vocab_stats['most_common'][:50]]
interesting = ['king', 'queen', 'love', 'hate', 'good', 'evil', 'man', 'woman', 'life', 'death']
words_to_viz.extend(interesting)
# dict.fromkeys removes duplicates while keeping a deterministic order (a set
# would reorder the words between runs); words missing from the vocabulary
# are dropped so the index lookup below cannot fail.
words_to_viz = [w for w in dict.fromkeys(words_to_viz) if w in shak_word_to_idx][:100]

word_indices = [shak_word_to_idx[w] for w in words_to_viz]
word_embeddings = shak_embeddings[word_indices]

print("Applying t-SNE...")
# scikit-learn requires perplexity < n_samples, so cap it for small selections.
tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, len(words_to_viz) - 1))
embeddings_2d = tsne.fit_transform(word_embeddings)

plt.figure(figsize=(20, 15))
plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], alpha=0.5)
for i, word in enumerate(words_to_viz):
    plt.annotate(word, (embeddings_2d[i, 0], embeddings_2d[i, 1]), fontsize=8, alpha=0.7)

plt.title('t-SNE Visualization of Word Embeddings (Shakespeare)', fontsize=16)
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.grid(True, alpha=0.3)
plt.savefig('shakespeare_embeddings_tsne.png', dpi=300, bbox_inches='tight')
plt.show()
print(" Visualization saved")

In [None]:
# Save Shakespeare artifacts
# Persist the per-epoch metrics so the curves can be re-plotted without retraining.
with open('shakespeare_training_history.pkl', 'wb') as f:
    pickle.dump(shak_history, f)

print("\n SHAKESPEARE MODEL COMPLETE!")
print("="*70)
print("Saved files:")
print("  • shakespeare_best_model.pth")
print("  • shakespeare_training_curves.png")
print("  • shakespeare_embeddings_tsne.png")
print("  • shakespeare_training_history.pkl")

---
# Part 4: Linux Kernel Model (Category II - Structured Text)

## Task 1.1 & 1.2 - Linux Kernel Code Dataset

In [None]:

LINUX_DATA_PATH = '/kaggle/input/linux-kernel-processed/linux_kernel_processed.pkl'

print("="*70)
print("LOADING LINUX KERNEL DATASET (Category II)")
print("="*70)

# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# point this path at a dataset you produced yourself.
with open(LINUX_DATA_PATH, 'rb') as f:
    linux_data = pickle.load(f)

# Unpack the preprocessed artefacts stored in the pickle.
linux_X = linux_data['X']
linux_y = linux_data['y']
linux_vocab = linux_data['vocab']
linux_word_to_idx = linux_data['word_to_idx']
linux_idx_to_word = linux_data['idx_to_word']
linux_context_length = linux_data['context_length']
linux_vocab_stats = linux_data['vocab_stats']

print("\n LINUX KERNEL DATASET STATISTICS")
print("="*70)
print(f"Vocabulary size: {linux_vocab_stats['vocab_size']:,}")
print(f"Total training samples: {len(linux_X):,}")
print(f"Context length: {linux_context_length}")
print(f"Total tokens in corpus: {linux_vocab_stats['total_tokens']:,}")
print(f"<UNK> percentage: {linux_vocab_stats.get('unk_percentage', 0):.2f}%")

# The headings promise ten entries, so cap the listings at ten even when the
# stored statistics hold more (the embedding cell reads up to 50 of them).
# repr() keeps whitespace-only or punctuation tokens readable.
print("\n10 Most frequent tokens:")
for token, count in list(linux_vocab_stats['most_common'])[:10]:
    print(f"  {repr(token)}: {count:,}")

print("\n10 Least frequent tokens:")
for token, count in list(linux_vocab_stats['least_common'])[:10]:
    print(f"  {repr(token)}: {count:,}")

In [None]:
# Hold out 20% of the (context, target) pairs for validation; the fixed
# random_state keeps the split identical between runs.
linux_X_train, linux_X_val, linux_y_train, linux_y_val = train_test_split(
    linux_X,
    linux_y,
    test_size=0.2,
    random_state=42,
)

print("\n Data Split:")
print(f"Training samples: {len(linux_X_train):,}")
print(f"Validation samples: {len(linux_X_val):,}")

# Reuses batch_size from the Shakespeare loader cell; only the training
# loader shuffles.
linux_train_dataset = TextDataset(linux_X_train, linux_y_train)
linux_train_loader = DataLoader(linux_train_dataset, batch_size=batch_size, shuffle=True)

linux_val_dataset = TextDataset(linux_X_val, linux_y_val)
linux_val_loader = DataLoader(linux_val_dataset, batch_size=batch_size)

print(f"\nBatch size: {batch_size}")
print(f"Training batches: {len(linux_train_loader)}")
print(f"Validation batches: {len(linux_val_loader)}")

In [None]:
# Same architecture hyperparameters as the Shakespeare model; dropout is left
# at the class default.
linux_model = NextWordMLP(
    vocab_size=len(linux_vocab),
    embedding_dim=EMBEDDING_DIM,
    context_length=linux_context_length,
    hidden_dim=HIDDEN_DIM,
    activation=ACTIVATION,
)
linux_model = linux_model.to(device)

# NOTE(review): unlike the Shakespeare setup, this uses plain cross-entropy
# (no label smoothing) and Adam without weight decay, so the two models'
# loss values are not measured on the same objective; confirm this is intended.
linux_optimizer = optim.Adam(linux_model.parameters(), lr=LEARNING_RATE)
linux_criterion = nn.CrossEntropyLoss()
linux_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    linux_optimizer,
    mode='min',
    factor=0.5,
    patience=10,
    verbose=True,
)

print("\n LINUX KERNEL MODEL ARCHITECTURE (Task 1.2)")
print("="*70)
print(linux_model)
print(f"\nTotal parameters: {sum(p.numel() for p in linux_model.parameters()):,}")

In [None]:
print("\n TRAINING LINUX KERNEL MODEL")
print("="*70)

# Fresh metric history and early-stopping state for this model. PATIENCE is
# the constant defined in the Shakespeare training cell.
linux_history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}
linux_best_val_loss = float('inf')
patience_counter = 0

for epoch in range(NUM_EPOCHS):
    train_loss, train_acc = train_epoch(linux_model, linux_train_loader, linux_criterion, linux_optimizer, device)
    val_loss, val_acc = validate(linux_model, linux_val_loader, linux_criterion, device)
    linux_scheduler.step(val_loss)

    # Record this epoch's metrics for the plots and the final report.
    linux_history['train_loss'].append(train_loss)
    linux_history['train_acc'].append(train_acc)
    linux_history['val_loss'].append(val_loss)
    linux_history['val_acc'].append(val_acc)

    # Log the first epoch and every tenth one.
    if (epoch + 1) % 10 == 0 or epoch == 0:
        print(f"Epoch [{epoch+1}/{NUM_EPOCHS}]")
        print(f"  Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%")
        print(f"  Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%")

    if val_loss < linux_best_val_loss:
        # New best: reset the patience counter and store everything needed
        # to rebuild the model and its vocabulary.
        linux_best_val_loss = val_loss
        patience_counter = 0
        torch.save({
            'epoch': epoch,
            'model_state_dict': linux_model.state_dict(),
            'optimizer_state_dict': linux_optimizer.state_dict(),
            'val_loss': val_loss,
            'vocab': linux_vocab,
            'word_to_idx': linux_word_to_idx,
            'idx_to_word': linux_idx_to_word,
            'context_length': linux_context_length,
            'embedding_dim': EMBEDDING_DIM,
            'hidden_dim': HIDDEN_DIM,
            'activation': ACTIVATION
        }, 'linux_kernel_best_model.pth')
        print("  ✓ Saved best model")
    else:
        patience_counter += 1

    # Stop once the validation loss has failed to improve PATIENCE times in a row.
    if patience_counter >= PATIENCE:
        print(f"\n Early stopping at epoch {epoch+1}")
        break

    # Periodic snapshot every 50 epochs, independent of the best-model file.
    if (epoch + 1) % 50 == 0:
        torch.save({
            'epoch': epoch,
            'model_state_dict': linux_model.state_dict(),
            'optimizer_state_dict': linux_optimizer.state_dict(),
            'history': linux_history
        }, f'linux_kernel_checkpoint_epoch_{epoch+1}.pth')
        print("  ✓ Checkpoint saved")

print("\n Linux Kernel training completed!")
print(f"Best Validation Loss: {linux_best_val_loss:.4f}")
print(f"Final Validation Accuracy: {linux_history['val_acc'][-1]:.2f}%")

In [None]:
# Linux Kernel training curves: loss on the left, accuracy on the right.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

# Left panel: per-epoch train and validation loss.
ax1.plot(linux_history['train_loss'], label='Train Loss')
ax1.plot(linux_history['val_loss'], label='Validation Loss')
ax1.set(xlabel='Epoch', ylabel='Loss', title='Linux Kernel - Training and Validation Loss')
ax1.legend()
ax1.grid(True)

# Right panel: per-epoch train and validation accuracy.
ax2.plot(linux_history['train_acc'], label='Train Accuracy')
ax2.plot(linux_history['val_acc'], label='Validation Accuracy')
ax2.set(xlabel='Epoch', ylabel='Accuracy (%)', title='Linux Kernel - Training and Validation Accuracy')
ax2.legend()
ax2.grid(True)

plt.tight_layout()
plt.savefig('linux_kernel_training_curves.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Reload the best checkpoint written by the training loop. map_location keeps
# the load working when the file was saved from a GPU session but this cell
# runs on a CPU-only one.
checkpoint = torch.load('linux_kernel_best_model.pth', map_location=device)
linux_model.load_state_dict(checkpoint['model_state_dict'])
linux_model.eval()

print("\n LINUX KERNEL CODE GENERATION")
print("="*70)

# Seeds are pre-tokenised lists, so generate_text is called with is_code=True
# and uses the tokens as given.
test_seeds = [
    ['struct', 'task_struct', '*'],
    ['if', '('],
    ['int', 'ret', '='],
    ['static', 'void']
]

# Sample 20 tokens per seed at three temperatures.
for seed in test_seeds:
    print(f"\nSeed: {' '.join(seed)}")
    for temp in [0.5, 1.0, 1.5]:
        generated = generate_text(linux_model, seed, 20, linux_word_to_idx, linux_idx_to_word,
                                 linux_context_length, temp, is_code=True)
        print(f"  T={temp}: {generated}")

In [None]:
print("\n LINUX KERNEL EMBEDDING VISUALIZATION (Task 1.3)")
print("="*70)

linux_embeddings = linux_model.get_embeddings()

# Select tokens to visualize: up to 50 of the stored most-frequent tokens
# plus a hand-picked list of keywords and punctuation.
tokens_to_viz = [t for t, _ in linux_vocab_stats['most_common'][:50]]
interesting = ['if', 'else', 'for', 'while', 'return', 'struct', 'int', 'void',
               'static', 'const', 'unsigned', 'char', '*', '{', '}', '(', ')']
tokens_to_viz.extend(interesting)
# dict.fromkeys removes duplicates while keeping a deterministic order (a set
# would reorder the tokens between runs); tokens missing from the vocabulary
# are dropped so the index lookup below cannot fail.
tokens_to_viz = [t for t in dict.fromkeys(tokens_to_viz) if t in linux_word_to_idx][:100]

token_indices = [linux_word_to_idx[t] for t in tokens_to_viz]
token_embeddings = linux_embeddings[token_indices]

print("Applying t-SNE...")
# scikit-learn requires perplexity < n_samples, so cap it for small selections.
tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, len(tokens_to_viz) - 1))
embeddings_2d = tsne.fit_transform(token_embeddings)

plt.figure(figsize=(20, 15))
plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], alpha=0.5)
for i, token in enumerate(tokens_to_viz):
    plt.annotate(token, (embeddings_2d[i, 0], embeddings_2d[i, 1]), fontsize=8, alpha=0.7)

plt.title('t-SNE Visualization of Token Embeddings (Linux Kernel Code)', fontsize=16)
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.grid(True, alpha=0.3)
plt.savefig('linux_kernel_embeddings_tsne.png', dpi=300, bbox_inches='tight')
plt.show()
print("✓ Visualization saved")

In [None]:

# Persist the per-epoch metrics so the curves can be re-plotted without retraining.
with open('linux_kernel_training_history.pkl', 'wb') as f:
    pickle.dump(linux_history, f)

print("\n LINUX KERNEL MODEL COMPLETE!")
print("="*70)
print("Saved files:")
print("  • linux_kernel_best_model.pth")
print("  • linux_kernel_training_curves.png")
print("  • linux_kernel_embeddings_tsne.png")
print("  • linux_kernel_training_history.pkl")

---
# Part 5: Comparative Analysis (Task 1.5)

## Compare Both Models

In [None]:
# Banner for the side-by-side report.
print(
    "\n" + "="*70,
    "COMPARATIVE ANALYSIS - SHAKESPEARE VS LINUX KERNEL",
    "="*70,
    sep="\n",
)

# Dataset-level figures: one row per metric, one column per corpus.
print(
    "\n Dataset Comparison:",
    "-" * 70,
    f"{'Metric':<30} {'Shakespeare':<20} {'Linux Kernel':<20}",
    "-" * 70,
    f"{'Vocabulary Size':<30} {len(shak_vocab):>19,} {len(linux_vocab):>19,}",
    f"{'Training Samples':<30} {len(shak_X):>19,} {len(linux_X):>19,}",
    f"{'Context Length':<30} {shak_context_length:>19} {linux_context_length:>19}",
    f"{'<UNK> Percentage':<30} {shak_vocab_stats.get('unk_percentage', 0):>18.2f}% {linux_vocab_stats.get('unk_percentage', 0):>18.2f}%",
    sep="\n",
)

# Training outcomes taken from each model's recorded history.
print(
    "\n Model Performance:",
    "-" * 70,
    f"{'Metric':<30} {'Shakespeare':<20} {'Linux Kernel':<20}",
    "-" * 70,
    f"{'Best Val Loss':<30} {shak_best_val_loss:>19.4f} {linux_best_val_loss:>19.4f}",
    f"{'Final Val Accuracy':<30} {shak_history['val_acc'][-1]:>18.2f}% {linux_history['val_acc'][-1]:>18.2f}%",
    f"{'Epochs Trained':<30} {len(shak_history['train_loss']):>19} {len(linux_history['train_loss']):>19}",
    f"{'Parameters':<30} {sum(p.numel() for p in shak_model.parameters()):>19,} {sum(p.numel() for p in linux_model.parameters()):>19,}",
    sep="\n",
)

In [None]:
# 2x2 grid: loss on the top row, accuracy on the bottom row; Shakespeare in
# the left column, Linux Kernel in the right column.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))

# Shakespeare loss
ax1.plot(shak_history['train_loss'], label='Train', alpha=0.7)
ax1.plot(shak_history['val_loss'], label='Val', alpha=0.7)
ax1.set_title('Shakespeare - Loss')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Linux loss
ax2.plot(linux_history['train_loss'], label='Train', alpha=0.7, color='orange')
ax2.plot(linux_history['val_loss'], label='Val', alpha=0.7, color='red')
ax2.set_title('Linux Kernel - Loss')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Loss')
ax2.legend()
ax2.grid(True, alpha=0.3)

# Shakespeare accuracy
ax3.plot(shak_history['train_acc'], label='Train', alpha=0.7)
ax3.plot(shak_history['val_acc'], label='Val', alpha=0.7)
ax3.set_title('Shakespeare - Accuracy')
ax3.set_xlabel('Epoch')
ax3.set_ylabel('Accuracy (%)')
ax3.legend()
ax3.grid(True, alpha=0.3)

# Linux accuracy
ax4.plot(linux_history['train_acc'], label='Train', alpha=0.7, color='orange')
ax4.plot(linux_history['val_acc'], label='Val', alpha=0.7, color='red')
ax4.set_title('Linux Kernel - Accuracy')
ax4.set_xlabel('Epoch')
ax4.set_ylabel('Accuracy (%)')
ax4.legend()
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('comparison_training_curves.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n✓ Comparison plot saved as 'comparison_training_curves.png'")

---
# Part 6: Final Summary

In [None]:
# Closing report: lists the files written by the earlier cells and the
# assignment tasks this notebook covers.
print("\n" + "="*70)
print("🎉 ALL TRAINING COMPLETE!")
print("="*70)

print("\n✅ Shakespeare Model (Category I):")
print("   • shakespeare_best_model.pth")
print("   • shakespeare_training_curves.png")
print("   • shakespeare_embeddings_tsne.png")
print("   • shakespeare_training_history.pkl")

print("\n✅ Linux Kernel Model (Category II):")
print("   • linux_kernel_best_model.pth")
print("   • linux_kernel_training_curves.png")
print("   • linux_kernel_embeddings_tsne.png")
print("   • linux_kernel_training_history.pkl")

print("\n✅ Comparison:")
print("   • comparison_training_curves.png")

print("\n📋 Tasks Completed:")
print("   ✓ Task 1.1: Preprocessing (both datasets)")
print("   ✓ Task 1.2: Model training (both models)")
print("   ✓ Task 1.3: Embedding visualization (both models)")
print("   ✓ Task 1.5: Comparative analysis")

print("\n🚀 Next Steps:")
print("   • Download all saved files from Kaggle")
print("   • Use models for Task 1.4 (Streamlit app)")
print("   • Write detailed analysis for assignment report")

print("\n" + "="*70)
print("Good luck with your assignment! 🍀")
print("="*70)