In [None]:
import sys
sys.path.append('..')

import torch
import matplotlib.pyplot as plt
import seaborn as sns
from config import Config
from training.train import BertQATrainer
from training.evaluate import BertQAEvaluator
import json
from pathlib import Path

# Plot appearance: seaborn grid theme and a wide default figure size
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Decide the compute device once; GPU details are reported only when CUDA is usable
cuda_available = torch.cuda.is_available()
device = 'cuda' if cuda_available else 'cpu'
print(f"Using device: {device}")
if cuda_available:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {gpu_name}")
    # total_memory is divided by 1e9 so the value is shown in GB
    gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"Memory: {gpu_memory_gb:.2f} GB")

## 1. Configuration Setup

In [None]:
# Instantiate the project Config with no overrides and call its display()
# method so the active settings can be inspected before any training starts.
# `config` is reused by the full-training and evaluation cells below.
config = Config()
config.display()

In [None]:
# Optional overrides for a quick trial run.
# Uncomment any of the lines below to shrink the run (fewer epochs, smaller
# batches, more frequent logging/checkpointing) before committing to a full one.

# config.training.num_epochs = 1
# config.training.batch_size = 8
# config.training.logging_steps = 50
# config.training.save_steps = 500

print("Configuration ready!")

## 2. Test Training on Small Subset (Optional)

Before training on the full dataset, let's test with a small subset to verify everything works.

In [None]:
# Create small train/dev subsets for a quick smoke test
def write_small_subset(src_path, dst_path, num_articles=2):
    """Copy the first `num_articles` articles of a SQuAD-style JSON file.

    Args:
        src_path: JSON file with top-level 'version' and 'data' keys.
        dst_path: Where the reduced JSON file is written (overwritten if present).
        num_articles: Number of leading entries of 'data' to keep.

    Returns:
        The reduced dict ({'version': ..., 'data': [...]}) that was written.

    Raises:
        FileNotFoundError: If `src_path` (or the destination directory) is missing.
        KeyError: If the source JSON lacks 'version' or 'data'.
    """
    # Explicit UTF-8: the platform default encoding is locale-dependent and can
    # fail on non-ASCII text when the dataset is read on Windows.
    with open(src_path, 'r', encoding='utf-8') as f:
        full = json.load(f)

    subset = {
        'version': full['version'],
        'data': full['data'][:num_articles],
    }

    with open(dst_path, 'w', encoding='utf-8') as f:
        json.dump(subset, f)

    return subset


# First 2 articles of each split
small_data = write_small_subset('../archive/train-v1.1.json', '../archive/train_small.json')
small_dev = write_small_subset('../archive/dev-v1.1.json', '../archive/dev_small.json')

print("Small test datasets created!")
print(f"Train articles: {len(small_data['data'])}")
print(f"Dev articles: {len(small_dev['data'])}")

In [None]:
# Smoke test: build a separate Config that points at the two-article subsets
# and uses a single epoch, a small batch size and frequent logging, leaving
# the notebook-level `config` untouched for the full run.
test_config = Config()
# NOTE(review): these paths have no '../' prefix, while the subset files are
# written under '../archive/' earlier in this notebook — presumably Config or
# the trainer resolves data paths against the project root; confirm.
test_config.data.train_file = 'archive/train_small.json'
test_config.data.dev_file = 'archive/dev_small.json'
test_config.training.num_epochs = 1
test_config.training.batch_size = 4
test_config.training.logging_steps = 10

print("Testing with small dataset...")
# Only setup() is called here; the training loop itself is not started.
test_trainer = BertQATrainer(test_config)
test_trainer.setup()

print("\nSmall dataset setup successful!")
print("You can now run: test_trainer.train() to test the training loop")

## 3. Full Training

In [None]:
# Create the trainer for the full run from the notebook-level `config`
# (not the reduced `test_config` used for the smoke test).
trainer = BertQATrainer(config)

# Setup (load data, model, optimizer) — kept in its own cell so that
# preparation problems surface before the long-running train() call.
trainer.setup()

In [None]:
# Start the full training run on the trainer prepared in the previous cell.
# WARNING: This will take several hours on CPU, ~2-3 hours on GPU
# Make sure you have saved your work before running

trainer.train()

## 4. Training Visualization

In [None]:
# Parse training logs
import re

# A decimal number with optional exponent. Unlike the looser `[\d.]+`, this does
# not swallow a sentence-ending period and does not cut "1e-05" down to "1".
_NUMBER = r'((?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)'
TRAIN_LOSS_RE = re.compile(r'Epoch (\d+) - Training Loss: ' + _NUMBER)
EVAL_LOSS_RE = re.compile(r'Evaluation Loss: ' + _NUMBER)


def parse_training_log(log_path):
    """Extract per-epoch losses from a training log file.

    Lines containing 'Epoch <n> - Training Loss: <x>' contribute an epoch number
    and a training loss; lines containing 'Evaluation Loss: <x>' contribute an
    evaluation loss. All other lines are ignored.

    Args:
        log_path: Path to the log file (str or Path).

    Returns:
        (epochs, train_losses, eval_losses) as three lists, in file order.
        `epochs` and `train_losses` always have equal length; `eval_losses`
        may differ if the log holds a different number of evaluation lines.
    """
    epochs, train_losses, eval_losses = [], [], []

    # errors='replace' keeps a stray undecodable byte from aborting the parse
    with open(log_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            train_match = TRAIN_LOSS_RE.search(line)
            if train_match:
                epochs.append(int(train_match.group(1)))
                train_losses.append(float(train_match.group(2)))

            eval_match = EVAL_LOSS_RE.search(line)
            if eval_match:
                eval_losses.append(float(eval_match.group(1)))

    return epochs, train_losses, eval_losses


# Always define the result lists so the plotting cell can test them even when
# no log file exists (previously that path left them undefined -> NameError).
train_losses = []
eval_losses = []
epochs = []

# Find the latest log file (lexicographically last 'training_*.log' name)
log_dir = Path('../logs')
log_files = sorted(log_dir.glob('training_*.log'))

if log_files:
    latest_log = log_files[-1]
    print(f"Reading log: {latest_log}")

    epochs, train_losses, eval_losses = parse_training_log(latest_log)

    print(f"Found {len(train_losses)} training epochs")
else:
    print("No log files found. Run training first.")

In [None]:
# Plot training curves
if train_losses and eval_losses:
    # Both curves share one x-axis, so they must have the same number of points.
    # Assumes one evaluation per epoch; if the log disagrees, plot only the
    # leading entries that can be paired instead of failing inside matplotlib.
    n_points = min(len(epochs), len(train_losses), len(eval_losses))
    if len(eval_losses) != len(epochs):
        print(
            f"Warning: {len(epochs)} training epochs but {len(eval_losses)} "
            f"evaluation losses in the log; plotting the first {n_points} of each."
        )
    plot_epochs = epochs[:n_points]
    plot_train = train_losses[:n_points]
    plot_eval = eval_losses[:n_points]

    plt.figure(figsize=(12, 6))

    plt.plot(plot_epochs, plot_train, marker='o', label='Training Loss', linewidth=2)
    plt.plot(plot_epochs, plot_eval, marker='s', label='Evaluation Loss', linewidth=2)

    plt.xlabel('Epoch', fontsize=12)
    plt.ylabel('Loss', fontsize=12)
    plt.title('Training and Evaluation Loss Over Epochs', fontsize=14, fontweight='bold')
    plt.grid(True, alpha=0.3)

    # Mark the epoch with the lowest evaluation loss
    best_loss = min(plot_eval)
    best_epoch = plot_epochs[plot_eval.index(best_loss)]
    plt.axvline(x=best_epoch, color='red', linestyle='--', alpha=0.5, label=f'Best Model (Epoch {best_epoch})')
    # Single legend call, made after every labelled artist has been added
    plt.legend(fontsize=11)

    plt.tight_layout()
    # savefig does not create missing directories, so make sure '../outputs' exists
    curves_path = Path('../outputs/training_curves.png')
    curves_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(curves_path, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"\nBest model at epoch {best_epoch} with eval loss: {best_loss:.4f}")
else:
    print("No training data to plot")

## 5. Evaluate Trained Model

In [None]:
# Locate the best checkpoint and, when it is present, wrap it in an evaluator
best_model_path = '../checkpoints/best_model.pt'

if not Path(best_model_path).exists():
    # Nothing to load yet: explain what is missing instead of raising
    print(f"Model not found at {best_model_path}")
    print("Please train the model first or specify correct path")
else:
    evaluator = BertQAEvaluator(best_model_path, config)
    print("Best model loaded successfully!")

In [None]:
# Evaluate on dev set
# Start with a small sample for quick testing.
# `evaluator` only exists when the best checkpoint was found by the loading
# cell; report that clearly instead of failing with a NameError.
if locals().get('evaluator') is not None:
    results = evaluator.evaluate_dataset(
        '../archive/dev-v1.1.json',
        max_samples=100  # Remove this to evaluate on full dataset
    )
else:
    print("Evaluator not available. Load the best model first.")

In [None]:
# Evaluate on FULL dev set (this will take time)
# Uncomment to run full evaluation

# full_results = evaluator.evaluate_dataset('../archive/dev-v1.1.json')
# evaluator.save_predictions(full_results['predictions'], '../outputs/predictions.json')

## 6. Test Predictions on Custom Examples

In [None]:
# Test with custom question-context pairs
context = """The Amazon rainforest, also known as Amazonia, is a moist broadleaf tropical rainforest 
in the Amazon biome that covers most of the Amazon basin of South America. This basin encompasses 
7,000,000 km2 (2,700,000 sq mi), of which 5,500,000 km2 (2,100,000 sq mi) are covered by the rainforest. 
The majority of the forest is contained within Brazil, with 60% of the rainforest, followed by Peru with 
13%, and Colombia with 10%."""

questions = [
    "What is another name for the Amazon rainforest?",
    "How much of the Amazon rainforest is in Brazil?",
    "Which country has the second largest portion of the Amazon?"
]

# Predictions need the evaluator created by the model-loading cell; when no
# checkpoint was loaded, print a clear message instead of raising NameError.
if locals().get('evaluator') is None:
    print("Evaluator not available. Load the best model first.")
else:
    print("\n" + "="*80)
    print("Custom Predictions")
    print("="*80)

    # Each prediction is read through its 'text' (answer) and 'score' entries
    for i, question in enumerate(questions, 1):
        result = evaluator.predict(question, context)
        print(f"\nQ{i}: {question}")
        print(f"A:  {result['text']}")
        print(f"Confidence: {result['score']:.2f}")

## 7. Error Analysis

In [None]:
# Error analysis runs only when the sample evaluation has produced `results`
if 'results' in locals():
    predictions = results['predictions']
    references = results['references']

    # NOTE(review): if top_n caps how many errors analyze_errors returns, the
    # totals printed below are capped as well — confirm its return value.
    errors = evaluator.analyze_errors(predictions, references, top_n=10)

    print(f"\nTotal errors: {len(errors)}")
    error_rate = len(errors) / len(predictions) * 100
    print(f"Error rate: {error_rate:.2f}%")

## 8. Save Model for Deployment

In [None]:
# Export the best checkpoint as a Hugging Face model directory (weights plus
# tokenizer files). Skipped silently when the checkpoint file does not exist.
# Relies on `best_model_path` defined in the model-loading cell above.
if Path(best_model_path).exists():
    from transformers import BertForQuestionAnswering, BertTokenizerFast
    
    # Load checkpoint onto the CPU so the export does not need a GPU.
    # NOTE(review): torch.load unpickles the file — only use it on checkpoints
    # produced by your own training runs.
    checkpoint = torch.load(best_model_path, map_location='cpu')
    
    # Build the model skeleton from the pretrained name, then overwrite its
    # weights with the fine-tuned ones stored under 'model_state_dict'.
    # NOTE(review): 'bert-base-uncased' is hardcoded; presumably it must match
    # the model the checkpoint was trained from — confirm against Config.
    model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
    model.load_state_dict(checkpoint['model_state_dict'])
    
    # Save in Hugging Face format (directory is created if missing)
    output_dir = Path('../outputs/bert-qa-squad')
    output_dir.mkdir(parents=True, exist_ok=True)
    
    model.save_pretrained(output_dir)
    
    # Save the matching tokenizer next to the weights so the directory can be
    # loaded on its own.
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
    tokenizer.save_pretrained(output_dir)
    
    # Print copy-pasteable loading instructions for the exported directory
    print(f"Model saved to {output_dir}")
    print("You can now load it with:")
    print(f"  model = BertForQuestionAnswering.from_pretrained('{output_dir}')")
    print(f"  tokenizer = BertTokenizerFast.from_pretrained('{output_dir}')")

## Summary

### Training Setup:
- Model: BERT-base-uncased
- Dataset: SQuAD v1.1
- Training time: ~2-3 hours on GPU

### Expected Performance:
- Exact Match: ~80-85%
- F1 Score: ~88-92%

### Next Steps:
1. Build deployment interface (Gradio/Streamlit)
2. Create interactive demo
3. Document usage and API
4. Deploy to cloud (optional)