# 20M Parameter Text Generation Model - Kaggle Training
## Simple Version - Uses Pre-installed Packages

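This notebook trains a small GPT-2-style transformer language model from scratch on WikiText-103, on Kaggle. It relies on Kaggle's pre-installed packages; the only package change is the protobuf version pin in step 1.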

### 1. Fix Protobuf Issue (Run This First!)

In [None]:
# Fix protobuf version conflict.
# Removes whichever protobuf version is currently installed (-y skips the
# confirmation prompt) and installs the pinned 3.20.3 release (-q keeps the
# pip output short). Run this before the imports cell.
!pip uninstall -y protobuf
!pip install -q protobuf==3.20.3
# NOTE(review): this message is printed unconditionally; it does not verify
# that either pip command actually succeeded.
print("‚úì Protobuf fixed")

### 2. Setup and Imports

In [None]:
# Quiet Python warnings and TensorFlow's C++ logging. These are configured
# before the heavy imports below so they apply while those libraries load.
import warnings
warnings.filterwarnings('ignore')
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# CUDA_LAUNCH_BLOCKING=1 makes every CUDA kernel launch synchronous. That gives
# accurate stack traces when chasing a device-side error, but it serializes the
# GPU and slows training for the whole run, so it is opt-in. It has to be set
# before torch initializes CUDA, which is why the switch lives here.
DEBUG_CUDA_SYNC = False
if DEBUG_CUDA_SYNC:
    os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import transformers
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer
from transformers import get_linear_schedule_with_warmup
from datasets import load_dataset
import json
from tqdm.auto import tqdm
import gc

# Report library versions so a run can be reproduced later
print("‚úì All imports successful")
print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")

In [None]:
# Select the compute device and describe the hardware that was found
has_cuda = torch.cuda.is_available()
device = torch.device('cuda' if has_cuda else 'cpu')
print(f"\nUsing device: {device}")
if not has_cuda:
    print("‚ö†Ô∏è WARNING: No GPU detected! Training will be very slow.")
else:
    gpu_properties = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {gpu_properties.total_memory / 1e9:.2f} GB")
    print(f"CUDA version: {torch.version.cuda}")

### 3. Configuration

In [None]:
# Hyperparameters read by the data, optimizer and training cells below
CONFIG = dict(
    batch_size=8,
    learning_rate=5e-4,
    epochs=3,
    warmup_steps=500,
    gradient_accumulation_steps=8,
    max_grad_norm=1.0,
    save_steps=1000,
    eval_steps=500,
    max_length=512,
)

# Echo the settings so they are recorded in the notebook output
banner = "=" * 40
print("Training Configuration:")
print(banner)
for name in CONFIG:
    print(f"  {name:30s}: {CONFIG[name]}")
print(banner)

### 4. Model Configuration

In [None]:
# Model architecture: a small GPT-2 style decoder. With these sizes the model
# has roughly 19M parameters (the exact count is printed after initialization).
architecture = {
    'vocab_size': 50257,
    'n_positions': 512,
    'n_embd': 256,
    'n_layer': 8,
    'n_head': 8,
    'n_inner': 1024,
    'activation_function': 'gelu_new',
    'resid_pdrop': 0.1,
    'embd_pdrop': 0.1,
    'attn_pdrop': 0.1,
}
model_config = GPT2Config(**architecture)

# Label/value pairs for the summary table, in display order
config_summary = (
    ("Vocabulary size", f"{model_config.vocab_size:,}"),
    ("Max sequence length", model_config.n_positions),
    ("Embedding dimension", model_config.n_embd),
    ("Number of layers", model_config.n_layer),
    ("Number of heads", model_config.n_head),
    ("FFN dimension", model_config.n_inner),
)

divider = "=" * 40
print("Model Configuration:")
print(divider)
for label, shown in config_summary:
    print(f"  {label}: {shown}")
print(divider)

In [None]:
# Build a randomly initialized model from the config and place it on the device
print("Initializing model...")
model = GPT2LMHeadModel(model_config).to(device)

# Single pass over the parameters to get the total and trainable counts
total_params = 0
trainable_params = 0
for parameter in model.parameters():
    element_count = parameter.numel()
    total_params += element_count
    if parameter.requires_grad:
        trainable_params += element_count

fp32_megabytes = total_params * 4 / 1e6  # 4 bytes per FP32 value

print("\n‚úì Model initialized successfully")
print(f"  Total parameters: {total_params:,}")
print(f"  Trainable parameters: {trainable_params:,}")
print(f"  Model size: {fp32_megabytes:.2f} MB (FP32)")
print("\n‚úì Starting fresh training from scratch")

### 5. Data Preparation

In [None]:
# Fetch the pretrained GPT-2 tokenizer and reuse its end-of-text token as the
# padding token, so that padded batches can be built later.
print("Loading tokenizer...")
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
vocab_entries = len(tokenizer)
print(f"‚úì Tokenizer loaded (vocab size: {vocab_entries})")

In [None]:
# Load the WikiText-103 splits and report how many rows each one holds
print("\nLoading dataset...")
dataset = load_dataset('wikitext', 'wikitext-103-v1')

print("‚úì Dataset loaded")
split_headings = (('train', 'Train'), ('validation', 'Validation'), ('test', 'Test'))
for split_name, heading in split_headings:
    print(f"  {heading} samples: {len(dataset[split_name]):,}")

In [None]:
# Tokenization helpers
def tokenize_function(examples):
    """Tokenize a batch of raw text rows into fixed-length model inputs.

    Args:
        examples: Batch dict supplied by ``Dataset.map(batched=True)``; its
            ``'text'`` entry is passed to the tokenizer.

    Returns:
        The tokenizer output, truncated and padded to ``CONFIG['max_length']``.
        Plain Python lists are returned: ``map`` stores its results itself, and
        ``set_format('torch')`` below takes care of handing out tensors, so
        building tensors here would only add conversion overhead.
    """
    return tokenizer(
        examples['text'],
        truncation=True,
        max_length=CONFIG['max_length'],
        padding='max_length',
    )


def has_text(example):
    """Return True when a row's ``'text'`` is not empty or whitespace-only."""
    return bool(example['text'].strip())


print("Tokenizing datasets...")
print("  This may take a few minutes...")

# WikiText contains blank separator rows. With padding='max_length' each of
# them would turn into a sequence made entirely of padding, which wastes
# compute and gives the model nothing to learn from, so drop them first.
train_rows = dataset['train'].filter(has_text)
val_rows = dataset['validation'].filter(has_text)

tokenized_train = train_rows.map(
    tokenize_function,
    batched=True,
    remove_columns=train_rows.column_names,
    desc="Tokenizing train"
)

tokenized_val = val_rows.map(
    tokenize_function,
    batched=True,
    remove_columns=val_rows.column_names,
    desc="Tokenizing validation"
)

# Have the datasets return torch tensors when indexed by the DataLoaders
tokenized_train.set_format('torch')
tokenized_val.set_format('torch')

print("‚úì Tokenization complete")

In [None]:
# Wrap the tokenized splits in DataLoaders; only the training split is shuffled
shared_loader_args = {
    'batch_size': CONFIG['batch_size'],
    'num_workers': 2,
    'pin_memory': True,
}
train_loader = DataLoader(tokenized_train, shuffle=True, **shared_loader_args)
val_loader = DataLoader(tokenized_val, **shared_loader_args)

# Number of samples that contribute to each optimizer update
effective_batch = CONFIG['batch_size'] * CONFIG['gradient_accumulation_steps']

print("‚úì DataLoaders created")
print(f"  Train batches: {len(train_loader):,}")
print(f"  Val batches: {len(val_loader):,}")
print(f"  Effective batch size: {effective_batch}")

### 6. Training Setup

In [None]:
# AdamW optimizer over all model parameters
adamw_settings = {
    'lr': CONFIG['learning_rate'],
    'weight_decay': 0.01,
    'betas': (0.9, 0.999),
    'eps': 1e-8,
}
optimizer = torch.optim.AdamW(model.parameters(), **adamw_settings)

# Linear warmup then linear decay. The schedule is counted in optimizer
# updates, i.e. batches divided by the gradient accumulation factor.
batches_overall = len(train_loader) * CONFIG['epochs']
total_steps = batches_overall // CONFIG['gradient_accumulation_steps']
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=CONFIG['warmup_steps'],
    num_training_steps=total_steps,
)

print("‚úì Optimizer and scheduler configured")
print(f"  Total training steps: {total_steps:,}")
print(f"  Warmup steps: {CONFIG['warmup_steps']:,}")
print(f"  Initial learning rate: {CONFIG['learning_rate']}")

### 7. Training Functions

In [None]:
def mask_padding_labels(input_ids, attention_mask):
    """Build language-modeling labels that leave padded positions out of the loss.

    The tokenizer pads with the end-of-text token, so padding cannot be told
    apart from real tokens by id alone. Positions where ``attention_mask`` is 0
    are set to -100, the label value the model's loss ignores.

    Args:
        input_ids: Integer tensor of token ids.
        attention_mask: Tensor of the same shape; 0 marks padding.

    Returns:
        A new tensor shaped like ``input_ids``; ``input_ids`` is not modified.
    """
    return input_ids.masked_fill(attention_mask == 0, -100)


def count_target_tokens(labels):
    """Count the label positions that contribute to the causal LM loss.

    The model predicts token ``t + 1`` from position ``t``, so the first column
    of ``labels`` is never a target and is excluded from the count.
    """
    return int((labels[:, 1:] != -100).sum().item())


def train_epoch(model, loader, optimizer, scheduler, device, epoch):
    """Train for one epoch with gradient accumulation.

    Padding positions are excluded from the loss, and batches without any
    target token are skipped because their loss would be NaN. Every
    ``CONFIG['save_steps']`` batches a resumable checkpoint is written; only
    the newest step checkpoint of the epoch is kept so the working directory
    does not fill up.

    Args:
        model: Causal LM called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss``.
        loader: Iterable of batches holding ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        optimizer: Optimizer stepped once per accumulation window.
        scheduler: LR scheduler stepped together with the optimizer.
        device: Device the batch tensors are moved to.
        epoch: Epoch number, used for the progress bar and checkpoint names.

    Returns:
        Mean unscaled loss over the batches that contributed to training
        (0.0 if none did).
    """
    model.train()
    accumulation = CONFIG['gradient_accumulation_steps']
    total_loss = 0.0
    counted_batches = 0
    previous_checkpoint = None
    optimizer.zero_grad()

    progress_bar = tqdm(loader, desc=f"Epoch {epoch}")

    for step, batch in enumerate(progress_bar):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = mask_padding_labels(input_ids, attention_mask)

        batch_loss = None
        # Skip forward/backward when nothing in the batch can be predicted;
        # the accumulation and checkpoint counters below still advance.
        if count_target_tokens(labels) > 0:
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            batch_loss = outputs.loss.item()
            # Scale so the accumulated gradient is an average over the window
            (outputs.loss / accumulation).backward()
            total_loss += batch_loss
            counted_batches += 1

        # Update weights at the end of each accumulation window
        if (step + 1) % accumulation == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), CONFIG['max_grad_norm'])
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        # Update progress bar
        if batch_loss is not None:
            progress_bar.set_postfix({
                'loss': f"{batch_loss:.4f}",
                'lr': f"{scheduler.get_last_lr()[0]:.2e}"
            })

        # Save checkpoint periodically
        if (step + 1) % CONFIG['save_steps'] == 0:
            checkpoint_path = f'/kaggle/working/checkpoint_epoch{epoch}_step{step+1}.pt'
            torch.save({
                'epoch': epoch,
                'step': step + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'train_loss': total_loss / max(counted_batches, 1),
                'config': CONFIG,
            }, checkpoint_path)
            print(f"\n‚úì Checkpoint saved: {checkpoint_path}")

            # Each checkpoint holds the model plus optimizer state; keeping one
            # per save interval would exhaust the working directory long before
            # the epoch ends. Remove the older one only after the new save.
            if previous_checkpoint is not None and os.path.exists(previous_checkpoint):
                os.remove(previous_checkpoint)
            previous_checkpoint = checkpoint_path

    return total_loss / max(counted_batches, 1)


def evaluate(model, loader, device):
    """Evaluate the model and return ``(average loss, perplexity)``.

    The loss is averaged per target token across the whole loader, with padded
    positions excluded, so the perplexity reflects real text only and does not
    depend on how rows happen to be grouped into batches.

    Args:
        model: Causal LM called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss``.
        loader: Iterable of batches holding ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple of floats ``(avg_loss, perplexity)``; both are NaN when the
        loader yields no target tokens.
    """
    model.eval()
    total_nll = 0.0
    total_tokens = 0

    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = mask_padding_labels(input_ids, attention_mask)

            target_count = count_target_tokens(labels)
            if target_count == 0:
                continue  # nothing to score; the loss would be NaN

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            # outputs.loss is a mean over this batch's targets; undo the mean
            # so batches are weighted by how many tokens they contain.
            total_nll += outputs.loss.item() * target_count
            total_tokens += target_count

    if total_tokens == 0:
        return float('nan'), float('nan')

    avg_loss = total_nll / total_tokens
    perplexity = torch.exp(torch.tensor(avg_loss))
    return avg_loss, perplexity.item()

print("‚úì Training functions defined")

### 8. Training Loop

In [None]:
# Main loop: train, validate, record metrics and checkpoint once per epoch
best_val_loss = float('inf')
training_history = []
rule = "=" * 60

print("\n" + rule)
print("STARTING TRAINING")
print(rule)

for epoch in range(1, CONFIG['epochs'] + 1):
    print(f"\n{rule}")
    print(f"Epoch {epoch}/{CONFIG['epochs']}")
    print(rule)

    # One pass over the training data, then a pass over the validation data
    train_loss = train_epoch(model, train_loader, optimizer, scheduler, device, epoch)
    val_loss, val_perplexity = evaluate(model, val_loader, device)

    print(f"\n{rule}")
    print(f"Epoch {epoch} Results:")
    print(f"  Train Loss: {train_loss:.4f}")
    print(f"  Val Loss: {val_loss:.4f}")
    print(f"  Val Perplexity: {val_perplexity:.2f}")
    print(rule)

    # Metrics for this epoch, shared by the history entry and the checkpoints
    epoch_metrics = {
        'train_loss': train_loss,
        'val_loss': val_loss,
        'val_perplexity': val_perplexity,
    }
    training_history.append({'epoch': epoch, **epoch_metrics})

    # Full resumable state; the same payload is written for "best" and "epoch"
    snapshot = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        **epoch_metrics,
        'config': CONFIG,
        'model_config': model_config.to_dict(),
    }

    # Keep a separate copy whenever validation loss improves
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(snapshot, '/kaggle/working/best_model.pt')
        print("‚úì Saved best model")

    torch.save(snapshot, f'/kaggle/working/checkpoint_epoch{epoch}.pt')
    print(f"‚úì Saved epoch {epoch} checkpoint")

    # Release cached GPU memory and collect garbage between epochs
    torch.cuda.empty_cache()
    gc.collect()

print("\n" + rule)
print("TRAINING COMPLETED!")
print(f"Best validation loss: {best_val_loss:.4f}")
print(rule)

### 9. Save Training History

In [None]:
# Write the per-epoch metrics to JSON, then echo them as a short summary
history_path = '/kaggle/working/training_history.json'
with open(history_path, 'w') as history_file:
    json.dump(training_history, history_file, indent=2)

print("‚úì Training history saved!")
print("\nTraining Summary:")
print("=" * 60)
for record in training_history:
    summary_line = (
        f"Epoch {record['epoch']}: "
        f"Train Loss={record['train_loss']:.4f}, "
        f"Val Loss={record['val_loss']:.4f}, "
        f"Perplexity={record['val_perplexity']:.2f}"
    )
    print(summary_line)
print("=" * 60)

### 10. Text Generation Test

In [None]:
def generate_text(prompt, max_length=100, temperature=0.8, num_return_sequences=1):
    """Sample text continuations for a prompt from the trained model.

    Args:
        prompt: Text to continue. An empty prompt starts generation from the
            tokenizer's end-of-text token.
        max_length: Maximum total length in tokens (prompt included). Capped at
            the model's ``n_positions``, since no position embeddings exist
            beyond that.
        temperature: Sampling temperature passed to ``generate``.
        num_return_sequences: Number of samples to draw.

    Returns:
        List of decoded strings (special tokens stripped), one per sample.
    """
    model.eval()

    # An empty prompt encodes to zero tokens, leaving generate() nothing to
    # extend, so fall back to the end-of-text token as the starting point.
    encoded = tokenizer(prompt if prompt else tokenizer.eos_token, return_tensors='pt')
    input_ids = encoded['input_ids'].to(device)
    # Pass the mask explicitly: pad and end-of-text share an id, so generate()
    # cannot infer which positions are padding on its own.
    attention_mask = encoded['attention_mask'].to(device)

    length_limit = min(max_length, model.config.n_positions)

    with torch.no_grad():
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=length_limit,
            temperature=temperature,
            top_k=50,
            top_p=0.95,
            do_sample=True,
            num_return_sequences=num_return_sequences,
            pad_token_id=tokenizer.eos_token_id
        )

    return [tokenizer.decode(seq, skip_special_tokens=True) for seq in output]

# Draw one sample per demo prompt and print it under a small header
test_prompts = [
    "The future of artificial intelligence",
    "In a world where technology",
    "Scientists have discovered",
    "Once upon a time"
]

section_rule = "=" * 60
print("\n" + section_rule)
print("TEXT GENERATION EXAMPLES")
print(section_rule)

for prompt in test_prompts:
    print(f"\nPrompt: '{prompt}'")
    print("-" * 60)
    first_sample = generate_text(prompt, max_length=150, temperature=0.8)[0]
    print(first_sample)
    print()

### 11. Save Final Model

In [None]:
# Export the trained weights and the tokenizer in HuggingFace format
final_dir = '/kaggle/working/final_model'
print("Saving final model...")
model.save_pretrained(final_dir)
tokenizer.save_pretrained(final_dir)

# Store the training hyperparameters next to the exported model
with open(f'{final_dir}/training_config.json', 'w') as config_file:
    json.dump(CONFIG, config_file, indent=2)

# Closing report describing what was written to the working directory
report_lines = [
    "\n" + "="*60,
    "‚úì All models and configs saved!",
    "="*60,
    "\nOutput files:",
    "  üìÅ /kaggle/working/",
    "    ‚îú‚îÄ‚îÄ best_model.pt (best checkpoint)",
    "    ‚îú‚îÄ‚îÄ checkpoint_epoch*.pt (epoch checkpoints)",
    "    ‚îú‚îÄ‚îÄ training_history.json (training metrics)",
    "    ‚îî‚îÄ‚îÄ final_model/ (HuggingFace format)",
    "        ‚îú‚îÄ‚îÄ pytorch_model.bin",
    "        ‚îú‚îÄ‚îÄ config.json",
    "        ‚îú‚îÄ‚îÄ training_config.json",
    "        ‚îî‚îÄ‚îÄ tokenizer files",
    "="*60,
]
for report_line in report_lines:
    print(report_line)