# 10M Parameter Text Generation Model
## Training and Benchmarking Pipeline

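This notebook implements a small GPT-2-style transformer model for text generation and NLP tasks. The target size is roughly 10 million parameters; note that the configuration in Section 2 reports 19,315,456 parameters when run, about 12.9M of which are token embeddings for the 50,257-entry GPT-2 vocabulary.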

### 1. Setup and Dependencies

In [None]:
# Install required packages
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install transformers datasets tokenizers accelerate wandb evaluate rouge-score nltk sacrebleu
!pip install sentencepiece protobuf

Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m11

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer
from transformers import get_linear_schedule_with_warmup
from datasets import load_dataset
import numpy as np
import wandb
from tqdm.auto import tqdm
import json
import os

# Pick the compute device: CUDA when a GPU is visible, otherwise CPU
has_cuda = torch.cuda.is_available()
device = torch.device('cuda') if has_cuda else torch.device('cpu')
print(f"Using device: {device}")
if has_cuda:
    # Report which GPU was found and how much memory it exposes (in GB)
    gpu_name = torch.cuda.get_device_name(0)
    gpu_total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU: {gpu_name}")
    print(f"Memory: {gpu_total_bytes / 1e9:.2f} GB")

Using device: cuda
GPU: Tesla T4
Memory: 15.83 GB


### 2. Model Configuration (10M Parameters)

In [None]:
# Model configuration.
# NOTE: the section title targets ~10M parameters, but this configuration is
# larger: the recorded run of this cell reports 19,315,456. The token embedding
# alone is vocab_size * n_embd = 50,257 * 256 = 12,865,792 parameters, so the
# 8 transformer blocks account for only about 6.3M of the total.
config = GPT2Config(
    vocab_size=50257,
    n_positions=512,
    n_embd=256,
    n_layer=8,
    n_head=8,
    n_inner=1024,
    activation_function='gelu_new',
    resid_pdrop=0.1,
    embd_pdrop=0.1,
    attn_pdrop=0.1,
    layer_norm_epsilon=1e-5,
    initializer_range=0.02,
)

# Initialize model with random weights and move it to the selected device
model = GPT2LMHeadModel(config)
model = model.to(device)

# Count parameters in a single pass over the model
param_counts = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(count for count, _ in param_counts)
trainable_params = sum(count for count, trainable in param_counts if trainable)
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Model size: {total_params * 4 / 1e6:.2f} MB (FP32)")

# Make the embedding-vs-rest split visible, since the vocabulary dominates the
# parameter budget at this width.
token_embedding_params = config.vocab_size * config.n_embd
print(f"Token-embedding parameters: {token_embedding_params:,}")
print(f"Parameters excluding token embeddings: {total_params - token_embedding_params:,}")

Total parameters: 19,315,456
Trainable parameters: 19,315,456
Model size: 77.26 MB (FP32)


### 3. Data Preparation

In [None]:
# Load the pretrained GPT-2 tokenizer and reuse its end-of-sequence token as
# the padding token so that batches can be padded.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Load dataset (WikiText-103, used here as the text-generation corpus)
dataset = load_dataset('wikitext', 'wikitext-103-v1')

# Report the number of rows in each split
for split_title, split_key in (("Train", 'train'), ("Validation", 'validation'), ("Test", 'test')):
    print(f"{split_title} samples: {len(dataset[split_key])}")

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

wikitext-103-v1/test-00000-of-00001.parq(…):   0%|          | 0.00/722k [00:00<?, ?B/s]

wikitext-103-v1/train-00000-of-00002.par(…):   0%|          | 0.00/156M [00:00<?, ?B/s]

wikitext-103-v1/train-00001-of-00002.par(…):   0%|          | 0.00/156M [00:00<?, ?B/s]

wikitext-103-v1/validation-00000-of-0000(…):   0%|          | 0.00/655k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Train samples: 1801350
Validation samples: 3760
Test samples: 4358


In [None]:
# Tokenization function
def tokenize_function(examples, max_length=512):
    """Tokenize a batch of raw text rows into fixed-length model inputs.

    Intended for use with `Dataset.map(..., batched=True)`.

    Args:
        examples: Batch mapping with a 'text' entry holding a list of strings.
        max_length: Sequence length every row is truncated/padded to
            (defaults to 512, matching the model's `n_positions`).

    Returns:
        The tokenizer output (`input_ids` and `attention_mask`) as plain
        Python lists. No `return_tensors` is requested: `map` stores results
        in Arrow regardless, so building torch tensors here would only add a
        tensor -> Arrow round trip. Tensor conversion is handled later by
        `set_format('torch')`.
    """
    return tokenizer(
        examples['text'],
        truncation=True,
        max_length=max_length,
        padding='max_length',
    )

# Tokenize datasets
def _tokenize_split(split):
    """Drop blank rows from `split`, tokenize the rest and expose torch tensors.

    A row whose text is empty or whitespace-only tokenizes to nothing but
    padding (512 pad tokens with an all-zero attention mask), so it wastes
    compute without providing any training signal. Such rows are removed
    before tokenization.
    """
    non_empty = split.filter(lambda row: row['text'].strip() != '')
    tokenized = non_empty.map(
        tokenize_function,
        batched=True,
        remove_columns=split.column_names
    )
    # set_format works in place: items are returned as torch tensors on access
    tokenized.set_format('torch')
    return tokenized


tokenized_train = _tokenize_split(dataset['train'])
tokenized_val = _tokenize_split(dataset['validation'])

Map:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

### 4. Training Configuration

In [None]:
# Training hyperparameters
BATCH_SIZE = 16                    # sequences per micro-batch
LEARNING_RATE = 5e-4               # peak learning rate reached after warmup
EPOCHS = 3
WARMUP_STEPS = 500                 # scheduler steps of linear warmup
GRADIENT_ACCUMULATION_STEPS = 4    # micro-batches per optimizer step
MAX_GRAD_NORM = 1.0                # gradient-clipping threshold

# Create dataloaders (only the training data is shuffled)
train_loader = DataLoader(
    tokenized_train,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
val_loader = DataLoader(
    tokenized_val,
    batch_size=BATCH_SIZE,
)

# Optimizer
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=0.01,
)

# Scheduler: linear warmup followed by linear decay over the planned number
# of optimizer steps (micro-batches across all epochs / accumulation factor)
total_micro_batches = len(train_loader) * EPOCHS
total_steps = total_micro_batches // GRADIENT_ACCUMULATION_STEPS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=total_steps,
)

print(f"Total training steps: {total_steps}")

Total training steps: 84438


### 5. Training Loop

In [None]:
# Initialize wandb (optional)
# wandb.init(project='10m-text-generation', config={
#     'batch_size': BATCH_SIZE,
#     'learning_rate': LEARNING_RATE,
#     'epochs': EPOCHS,
#     'model_params': total_params
# })

def train_epoch(model, loader, optimizer, scheduler, device, epoch):
    """Train `model` for one pass over `loader` with gradient accumulation.

    Reads the module-level constants GRADIENT_ACCUMULATION_STEPS and
    MAX_GRAD_NORM.

    Args:
        model: Causal LM whose forward accepts `input_ids`, `attention_mask`
            and `labels` and returns an object with a `.loss` attribute.
        loader: Iterable of batches with 'input_ids' and 'attention_mask'
            tensors; must support `len()`.
        optimizer: Optimizer stepped once per accumulation window.
        scheduler: LR scheduler stepped together with the optimizer.
        device: Device the batch tensors are moved to.
        epoch: Epoch number, used only for the progress-bar label.

    Returns:
        Mean (un-scaled) loss over the batches that contributed a loss,
        or 0.0 if none did.
    """
    model.train()
    total_loss = 0.0
    counted_batches = 0
    num_batches = len(loader)
    optimizer.zero_grad()

    progress_bar = tqdm(loader, desc=f"Epoch {epoch}")

    for step, batch in enumerate(progress_bar):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # The pad token is the EOS token, so padded positions must be excluded
        # from the loss explicitly; -100 is the label value the loss ignores.
        labels = input_ids.masked_fill(attention_mask == 0, -100)

        # The model predicts token t+1 from token t, so only positions 1..
        # can be targets. A batch without any target would yield a NaN loss.
        if bool((labels[:, 1:] != -100).any()):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            batch_loss = outputs.loss.item()
            # Scale so the accumulated gradient is the window average
            (outputs.loss / GRADIENT_ACCUMULATION_STEPS).backward()

            total_loss += batch_loss
            counted_batches += 1
            progress_bar.set_postfix({'loss': batch_loss})

        # Step at the end of every accumulation window, and also on the final
        # batch so a trailing partial window is applied instead of discarded.
        window_complete = (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0
        last_batch = (step + 1) == num_batches
        if window_complete or last_batch:
            torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

    return total_loss / max(counted_batches, 1)

def evaluate(model, loader, device):
    """Compute the per-token loss and perplexity of `model` over `loader`.

    Args:
        model: Causal LM whose forward accepts `input_ids`, `attention_mask`
            and `labels` and returns an object with a mean `.loss`.
        loader: Iterable of batches with 'input_ids' and 'attention_mask'.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple `(avg_loss, perplexity)` of Python floats, where `avg_loss` is
        the mean negative log-likelihood per non-padding target token and
        `perplexity` is `exp(avg_loss)`. Both are NaN when the loader yields
        no target tokens at all.
    """
    model.eval()
    total_nll = 0.0
    total_targets = 0

    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # The pad token is the EOS token, so padding has to be masked out
            # of the labels explicitly (-100 is ignored by the loss).
            labels = input_ids.masked_fill(attention_mask == 0, -100)

            # Targets are the labels shifted by one position
            num_targets = int((labels[:, 1:] != -100).sum())
            if num_targets == 0:
                continue  # nothing to score; the mean loss would be NaN

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            # outputs.loss is a mean over this batch's targets; weight it by
            # the target count so batches with more padding do not count
            # as much as batches full of real text.
            total_nll += outputs.loss.item() * num_targets
            total_targets += num_targets

    if total_targets == 0:
        return float('nan'), float('nan')

    avg_loss = total_nll / total_targets
    perplexity = torch.exp(torch.tensor(avg_loss))
    return avg_loss, perplexity.item()

In [None]:
# Training loop: train, validate, log, and checkpoint whenever validation improves
best_val_loss = float('inf')
training_history = []

for epoch in range(1, EPOCHS + 1):
    print(f"\n{'='*50}")
    print(f"Epoch {epoch}/{EPOCHS}")
    print(f"{'='*50}")

    # One pass over the training data, then a full validation pass
    train_loss = train_epoch(model, train_loader, optimizer, scheduler, device, epoch)
    val_loss, val_perplexity = evaluate(model, val_loader, device)

    print(f"\nTrain Loss: {train_loss:.4f}")
    print(f"Val Loss: {val_loss:.4f}")
    print(f"Val Perplexity: {val_perplexity:.2f}")

    epoch_record = {
        'epoch': epoch,
        'train_loss': train_loss,
        'val_loss': val_loss,
        'val_perplexity': val_perplexity
    }
    training_history.append(epoch_record)

    # Only checkpoint when this epoch beats the best validation loss so far
    if not (val_loss < best_val_loss):
        continue

    best_val_loss = val_loss
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'val_loss': val_loss,
        'val_perplexity': val_perplexity
    }
    torch.save(checkpoint, 'best_model.pt')
    print("✓ Saved best model")

print("\n" + "="*50)
print("Training completed!")
print(f"Best validation loss: {best_val_loss:.4f}")


Epoch 1/3


Epoch 1:   0%|          | 0/112585 [00:00<?, ?it/s]

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


KeyboardInterrupt: 

### 6. Text Generation

In [None]:
def generate_text(prompt, max_length=100, temperature=0.8, top_k=50, top_p=0.95):
    """Sample a continuation of `prompt` from the notebook's global `model`.

    Args:
        prompt: Text to continue. An empty prompt falls back to the
            tokenizer's beginning-of-sequence token so generation still has
            a starting position.
        max_length: Maximum total length in tokens, prompt included.
        temperature: Softmax temperature used for sampling.
        top_k: Keep only the `top_k` most likely tokens at each step.
        top_p: Nucleus-sampling probability mass.

    Returns:
        The decoded text (prompt plus continuation) with special tokens
        removed.
    """
    model.eval()

    # Calling the tokenizer (instead of `encode`) also yields the attention
    # mask. It is passed explicitly because the pad token equals the EOS
    # token, so `generate` cannot infer the mask from the ids.
    encoded = tokenizer(prompt or tokenizer.bos_token, return_tensors='pt').to(device)

    with torch.no_grad():
        output = model.generate(
            input_ids=encoded['input_ids'],
            attention_mask=encoded['attention_mask'],
            max_length=max_length,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    return tokenizer.decode(output[0], skip_special_tokens=True)

# Test generation: sample one continuation for each of a few fixed prompts
prompts = [
    "The future of artificial intelligence",
    "In a world where technology",
    "Scientists have discovered"
]

# Section banner
print("\n" + "="*50, "Text Generation Examples", "="*50, sep="\n")

for prompt in prompts:
    print(f"\nPrompt: {prompt}", "-" * 50, sep="\n")
    generated = generate_text(prompt, max_length=150)
    # Trailing blank line separates consecutive examples
    print(generated, end="\n\n")

### 7. Benchmarking and Evaluation

In [None]:
import time
from evaluate import load

# Load metrics
# NOTE(review): `perplexity_metric` is not referenced anywhere else in the
# visible notebook (perplexity is computed by `evaluate` instead) — confirm
# whether this load is still needed.
perplexity_metric = load('perplexity', module_type='metric')

def benchmark_model(model, test_loader, device, max_batches=100):
    """Measure forward-pass throughput of `model` on batches from `test_loader`.

    Args:
        model: Model whose forward accepts `input_ids` and `attention_mask`.
        test_loader: Iterable of batches with 'input_ids' and
            'attention_mask' tensors.
        device: Device the batches are moved to (and synchronised on when it
            is a CUDA device).
        max_batches: Maximum number of batches to time (default 100).

    Returns:
        Dict with 'tokens_per_second', 'inference_time' (seconds) and
        'total_tokens'. Token counts use every position of `input_ids`,
        padding included. 'tokens_per_second' is 0.0 if no time elapsed.
    """
    model.eval()
    target_device = torch.device(device)
    on_cuda = target_device.type == 'cuda'

    total_tokens = 0

    # CUDA kernels run asynchronously: drain pending work before starting the
    # clock and again before stopping it, otherwise the measurement can miss
    # work that is still queued on the GPU.
    if on_cuda:
        torch.cuda.synchronize(target_device)
    start_time = time.perf_counter()

    with torch.no_grad():
        for i, batch in enumerate(tqdm(test_loader, desc="Benchmarking")):
            if i >= max_batches:
                break

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Only the forward pass is timed; its outputs are not needed
            model(input_ids=input_ids, attention_mask=attention_mask)
            total_tokens += input_ids.numel()

    if on_cuda:
        torch.cuda.synchronize(target_device)
    elapsed_time = time.perf_counter() - start_time

    tokens_per_second = total_tokens / elapsed_time if elapsed_time > 0 else 0.0

    return {
        'tokens_per_second': tokens_per_second,
        'inference_time': elapsed_time,
        'total_tokens': total_tokens
    }

# Run benchmarks
# Note: despite its name, `test_loader` iterates the tokenized validation split.
test_loader = DataLoader(tokenized_val, batch_size=BATCH_SIZE)
benchmark_results = benchmark_model(model, test_loader, device)

# Assemble the whole report first, then emit it in one call
benchmark_report = [
    "\n" + "="*50,
    "Benchmark Results",
    "="*50,
    f"Tokens per second: {benchmark_results['tokens_per_second']:.2f}",
    f"Inference time: {benchmark_results['inference_time']:.2f}s",
    f"Total tokens processed: {benchmark_results['total_tokens']:,}",
]
print("\n".join(benchmark_report))

In [None]:
# Final evaluation metrics: re-score the current weights on the validation loader
final_val_loss, final_perplexity = evaluate(model, val_loader, device)

# Collect the headline numbers of the run in one place
metrics_summary = dict(
    model_parameters=total_params,
    final_val_loss=final_val_loss,
    final_perplexity=final_perplexity,
    tokens_per_second=benchmark_results['tokens_per_second'],
    training_epochs=EPOCHS,
    best_val_loss=best_val_loss,
)

print("\n" + "="*50, "Final Metrics Summary", "="*50, sep="\n")
for metric_name, metric_value in metrics_summary.items():
    print(f"{metric_name}: {metric_value}")

# Save metrics as pretty-printed JSON next to the notebook
with open('metrics_summary.json', 'w') as metrics_file:
    metrics_file.write(json.dumps(metrics_summary, indent=2))

### 8. Save Model

In [None]:
# Save final model: weights/config and tokenizer files go to the same directory
final_model_dir = './final_model'
for artifact in (model, tokenizer):
    artifact.save_pretrained(final_model_dir)

# Save to Google Drive (optional)
# from google.colab import drive
# drive.mount('/content/drive')
# !cp -r ./final_model /content/drive/MyDrive/10m_text_model

print("Model saved successfully!")