# Model Training and Experimentation

This notebook demonstrates interactive model training and hyperparameter experimentation for the text classification pipeline.



In [None]:

import os
import sys
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

# Hugging Face Ecosystem
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding,
    pipeline, set_seed
)
import torch
from torch.utils.data import DataLoader

# Optimization and Evaluation
import optuna
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Plot styling (statement order preserved: matplotlib style sheet, then seaborn palette)
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Shared experiment settings; the cells below read these by key.
CONFIG = dict(
    model_name='distilbert-base-uncased',
    multilingual_model='distilbert-base-multilingual-cased',
    max_length=512,
    batch_size=16,
    learning_rate=2e-5,
    num_epochs=3,
    seed=42,
    # Use the GPU when torch reports one, otherwise fall back to CPU.
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

# Fix the random seed through transformers.set_seed, then echo the setup.
set_seed(CONFIG['seed'])
print(f"🚀 Training Environment: {CONFIG['device']}")
print(f"📊 Configuration: {CONFIG}")


: 

In [None]:

class DatasetManager:
    """Load, optionally subsample, and summarise text-classification datasets.

    Only the dataset names registered in ``supported_datasets`` are accepted.
    Each registry entry records the task type and the display names of the
    labels.
    """

    def __init__(self):
        # Registry: dataset name -> task metadata.
        # NOTE(review): 'labels' is assumed to be listed in label-id order
        # (index 0 is label 0, ...); confirm against each dataset card.
        self.supported_datasets = {
            'imdb': {'task': 'sentiment', 'labels': ['NEGATIVE', 'POSITIVE']},
            'ag_news': {'task': 'topic', 'labels': ['World', 'Sports', 'Business', 'Technology']},
            'yelp_review_full': {'task': 'sentiment', 'labels': ['1', '2', '3', '4', '5']},
            'emotion': {'task': 'emotion', 'labels': ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']}
        }

    def load_dataset(self, dataset_name, sample_size=None, seed=42):
        """Load a supported dataset, optionally subsample it, and print an analysis.

        Args:
            dataset_name: Key of ``supported_datasets``.
            sample_size: When truthy, the train split is cut to at most this
                many rows and the test split to at most ``sample_size // 4``.
            seed: Shuffle seed used before subsampling (default 42, the value
                that used to be hard-coded).

        Returns:
            The loaded dataset mapping, with its 'train' and 'test' splits
            replaced by the subsamples when ``sample_size`` is given.

        Raises:
            ValueError: If ``dataset_name`` is not registered.
        """
        print(f"📁 Loading {dataset_name} dataset...")

        if dataset_name not in self.supported_datasets:
            raise ValueError(f"Dataset {dataset_name} not supported. Available: {list(self.supported_datasets.keys())}")

        # Inside a method body the bare name resolves to the module-level
        # `load_dataset` imported from `datasets`, not to this method. Every
        # supported name is loaded the same way, so no per-dataset branching
        # is needed.
        dataset = load_dataset(dataset_name)

        # Sample for faster experimentation
        if sample_size:
            train_size = min(sample_size, len(dataset['train']))
            test_size = min(sample_size // 4, len(dataset['test']))

            dataset['train'] = dataset['train'].shuffle(seed=seed).select(range(train_size))
            dataset['test'] = dataset['test'].shuffle(seed=seed).select(range(test_size))

        self.analyze_dataset(dataset, dataset_name)
        return dataset

    def analyze_dataset(self, dataset, name):
        """Print split sizes and plot label / length distributions of the train split.

        Args:
            dataset: Mapping with 'train' and 'test' splits; the train split
                must expose 'label' and 'text' columns.
            name: Dataset name, used in the printed header and to look up
                label display names in ``supported_datasets``.

        Returns:
            None. Output goes to stdout and a matplotlib figure. When the
            train split has no rows, plots and statistics are skipped.
        """
        import operator

        train_data = dataset['train']
        test_data = dataset['test']

        print(f"\n📊 Dataset Analysis: {name}")
        print(f"Train samples: {len(train_data):,}")
        print(f"Test samples: {len(test_data):,}")

        # Read each column once; the plots and the sample panel all reuse them.
        train_labels = list(train_data['label'])
        train_texts = list(train_data['text'])

        # max() below would raise on an empty split, so stop early instead.
        if not train_texts:
            print("⚠️ Training split is empty; skipping plots and statistics.")
            return

        label_counts = pd.Series(train_labels).value_counts().sort_index()
        label_names = self.supported_datasets.get(name, {}).get('labels', [])

        def display_name(label):
            """Return the registered name for an integer label id, else str(label)."""
            try:
                position = operator.index(label)
            except TypeError:
                return str(label)
            if 0 <= position < len(label_names):
                return label_names[position]
            return str(label)

        fig, axes = plt.subplots(2, 2, figsize=(15, 10))

        # Label distribution, with ticks showing the actual labels rather than
        # bare bar positions (positions differ from ids when a label is absent).
        positions = list(range(len(label_counts)))
        axes[0, 0].bar(positions, label_counts.values)
        axes[0, 0].set_xticks(positions)
        axes[0, 0].set_xticklabels([display_name(label) for label in label_counts.index])
        axes[0, 0].set_title('Training Label Distribution')
        axes[0, 0].set_xlabel('Label')
        axes[0, 0].set_ylabel('Count')

        # Text length distribution (whitespace-separated words)
        text_lengths = [len(text.split()) for text in train_texts]
        axes[0, 1].hist(text_lengths, bins=50, alpha=0.7)
        axes[0, 1].set_title('Text Length Distribution')
        axes[0, 1].set_xlabel('Number of Words')
        axes[0, 1].set_ylabel('Frequency')

        # Character length distribution
        char_lengths = [len(text) for text in train_texts]
        axes[1, 0].hist(char_lengths, bins=50, alpha=0.7, color='orange')
        axes[1, 0].set_title('Character Length Distribution')
        axes[1, 0].set_xlabel('Number of Characters')
        axes[1, 0].set_ylabel('Frequency')

        # One sample text per label seen among the first 100 rows. A single
        # pass records the first row of each label instead of rescanning the
        # whole label list once per label.
        first_row_for_label = {}
        for row, label in enumerate(train_labels[:100]):
            first_row_for_label.setdefault(label, row)

        axes[1, 1].axis('off')
        sample_parts = ["Sample texts:\n\n"]
        for label in sorted(first_row_for_label):
            snippet = train_texts[first_row_for_label[label]][:100]
            sample_parts.append(f"Label {label}: {snippet}...\n\n")
        axes[1, 1].text(0.1, 0.9, "".join(sample_parts), transform=axes[1, 1].transAxes,
                        verticalalignment='top', fontsize=8)

        plt.tight_layout()
        plt.show()

        print("📈 Statistics:")
        print(f"Average text length: {np.mean(text_lengths):.1f} words")
        print(f"Max text length: {max(text_lengths)} words")
        print(f"Average character length: {np.mean(char_lengths):.1f} characters")

# Build the manager and pull an IMDB subset: sample_size caps the train split,
# and the test split is capped at a quarter of it. Adjust sample_size as needed.
dataset_manager = DatasetManager()
dataset = dataset_manager.load_dataset(dataset_name='imdb', sample_size=5000)


In [None]:

class AdvancedTrainer:
    """Fine-tuning workflow around Hugging Face ``Trainer`` with optional Optuna search.

    Typical order of calls: ``setup_tokenizer_and_model`` -> ``preprocess_data``
    -> (``hyperparameter_optimization``) -> ``train_model``.

    Attributes set by the methods: ``tokenizer``, ``model``, ``trainer`` (last
    Trainer built) and ``study`` (last Optuna study).
    """

    def __init__(self, dataset, config):
        """Store the raw dataset and the configuration mapping.

        Args:
            dataset: Mapping of splits; 'train' and 'test' are read, and a
                'validation' split is used when present.
            config: Mapping with at least 'model_name', 'max_length',
                'learning_rate', 'batch_size' and 'num_epochs'.
        """
        self.dataset = dataset
        self.config = config
        self.tokenizer = None
        self.model = None
        self.trainer = None
        self.study = None
        # Last model set up, so a fresh copy of the *same* model can be rebuilt
        # for each Optuna trial and for the final best-parameter run.
        self._model_name = None
        self._num_labels = None

    @staticmethod
    def _normalize_params(params):
        """Return a copy of ``params`` keyed by ``TrainingArguments`` keyword names.

        Optuna records the batch size under 'batch_size', which
        ``TrainingArguments`` does not accept; it is renamed to
        'per_device_train_batch_size'. An explicit
        'per_device_train_batch_size' wins if both are present. The input
        mapping is not modified.
        """
        normalized = dict(params)
        if 'batch_size' in normalized:
            batch_size = normalized.pop('batch_size')
            normalized.setdefault('per_device_train_batch_size', batch_size)
        return normalized

    def setup_tokenizer_and_model(self, model_name=None, num_labels=2):
        """Load a pretrained tokenizer and sequence-classification model.

        Args:
            model_name: Checkpoint name; falls back to ``config['model_name']``.
            num_labels: Size of the classification head.
        """
        model_name = model_name or self.config['model_name']

        print(f"🔧 Setting up {model_name}...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels
        )
        self._model_name = model_name
        self._num_labels = num_labels

        # Add padding token if not present
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
            # Keep the model config in step with the tokenizer; models without
            # a native pad token otherwise cannot tell padding from content.
            self.model.config.pad_token_id = self.tokenizer.pad_token_id

    def preprocess_data(self):
        """Tokenize every split and make sure a validation split exists.

        Texts are truncated to ``config['max_length']`` but not padded here:
        ``create_trainer`` installs ``DataCollatorWithPadding``, which pads each
        batch, so padding at map time would only store extra pad tokens.

        Returns:
            A dataset mapping with 'train', 'validation' and 'test' splits.
            When no 'validation' split exists, 20% of 'train' is split off.

        Raises:
            RuntimeError: If called before ``setup_tokenizer_and_model``.
        """
        if self.tokenizer is None:
            raise RuntimeError(
                "Tokenizer is not initialised; call setup_tokenizer_and_model() first."
            )

        def tokenize_function(examples):
            return self.tokenizer(
                examples['text'],
                truncation=True,
                max_length=self.config['max_length']
            )

        print("🔄 Preprocessing data...")
        tokenized_dataset = self.dataset.map(tokenize_function, batched=True)

        # Create validation split if not present
        if 'validation' not in tokenized_dataset:
            train_test = tokenized_dataset['train'].train_test_split(
                test_size=0.2, seed=self.config.get('seed', 42)
            )
            tokenized_dataset = DatasetDict({
                'train': train_test['train'],
                'validation': train_test['test'],
                'test': tokenized_dataset['test']
            })

        return tokenized_dataset

    def compute_metrics(self, eval_pred):
        """Compute accuracy and weighted precision / recall / F1.

        Args:
            eval_pred: ``(predictions, labels)`` pair; predictions are class
                scores with classes on axis 1.

        Returns:
            Dict with 'accuracy', 'f1', 'precision' and 'recall'.
        """
        predictions, labels = eval_pred
        # Some models return a tuple of outputs; the scores come first.
        if isinstance(predictions, tuple):
            predictions = predictions[0]
        predictions = np.argmax(predictions, axis=1)

        # zero_division=0 keeps the numeric result of the default while not
        # emitting a warning for classes that were never predicted.
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels, predictions, average='weighted', zero_division=0
        )
        accuracy = accuracy_score(labels, predictions)

        return {
            'accuracy': accuracy,
            'f1': f1,
            'precision': precision,
            'recall': recall
        }

    def create_trainer(self, tokenized_dataset, trial_params=None):
        """Build a ``Trainer`` for the current model and store it on ``self.trainer``.

        Args:
            tokenized_dataset: Mapping with 'train' and 'validation' splits.
            trial_params: Optional ``TrainingArguments`` overrides. The Optuna
                key 'batch_size' is accepted as an alias of
                'per_device_train_batch_size'. When empty or None, learning
                rate, batch size and epochs come from ``config``.

        Returns:
            The configured ``Trainer``.
        """
        import inspect

        if trial_params:
            params = self._normalize_params(trial_params)
        else:
            params = {
                'learning_rate': self.config['learning_rate'],
                'per_device_train_batch_size': self.config['batch_size'],
                'num_train_epochs': self.config['num_epochs']
            }

        training_kwargs = {
            'output_dir': './results',
            'save_strategy': 'epoch',
            'logging_dir': './logs',
            'load_best_model_at_end': True,
            'metric_for_best_model': 'f1',
            'greater_is_better': True,
            'save_total_limit': 2,
            # The string 'none' disables reporting integrations; passing None
            # is treated as "all installed integrations" by Transformers 4.x.
            'report_to': 'none',
        }
        # Newer Transformers releases renamed `evaluation_strategy` to
        # `eval_strategy`; use whichever the installed version accepts.
        accepted_args = inspect.signature(TrainingArguments.__init__).parameters
        if 'eval_strategy' in accepted_args:
            training_kwargs['eval_strategy'] = 'epoch'
        else:
            training_kwargs['evaluation_strategy'] = 'epoch'
        training_kwargs.update(params)
        training_args = TrainingArguments(**training_kwargs)

        trainer_kwargs = {
            'model': self.model,
            'args': training_args,
            'train_dataset': tokenized_dataset['train'],
            'eval_dataset': tokenized_dataset['validation'],
            'data_collator': DataCollatorWithPadding(tokenizer=self.tokenizer),
            'compute_metrics': self.compute_metrics,
        }
        # Same story for Trainer: `tokenizer=` became `processing_class=`.
        accepted_trainer_args = inspect.signature(Trainer.__init__).parameters
        if 'processing_class' in accepted_trainer_args:
            trainer_kwargs['processing_class'] = self.tokenizer
        else:
            trainer_kwargs['tokenizer'] = self.tokenizer
        self.trainer = Trainer(**trainer_kwargs)

        return self.trainer

    def hyperparameter_optimization(self, tokenized_dataset, n_trials=20):
        """Search training hyperparameters with Optuna, maximising validation F1.

        Every trial rebuilds the model from its pretrained checkpoint, trains
        it, and reports ``eval_f1``. The study is kept on ``self.study``.

        Args:
            tokenized_dataset: Mapping with 'train' and 'validation' splits.
            n_trials: Number of Optuna trials to run.

        Returns:
            ``best_trial.params`` of the study (batch size under 'batch_size').
        """
        print(f"🎯 Starting hyperparameter optimization with {n_trials} trials...")

        # Loop-invariant: rebuild the model the data was tokenized for, with a
        # head sized to the training labels.
        model_name = self._model_name or self.config['model_name']
        num_labels = len(set(self.dataset['train']['label']))

        def objective(trial):
            # Define search space
            params = {
                'learning_rate': trial.suggest_float('learning_rate', 1e-6, 1e-4, log=True),
                'per_device_train_batch_size': trial.suggest_categorical('batch_size', [16, 32]),
                'num_train_epochs': trial.suggest_int('num_train_epochs', 2, 5),
                'weight_decay': trial.suggest_float('weight_decay', 0.0, 0.3),
                'warmup_ratio': trial.suggest_float('warmup_ratio', 0.0, 0.2)
            }

            # Reinitialize model for each trial so trials do not share weights
            self.setup_tokenizer_and_model(model_name, num_labels=num_labels)

            trainer = self.create_trainer(tokenized_dataset, params)
            trainer.train()
            eval_results = trainer.evaluate()

            return eval_results['eval_f1']

        # Create and run study
        self.study = optuna.create_study(direction='maximize')
        self.study.optimize(objective, n_trials=n_trials)

        print(f"🏆 Best trial: {self.study.best_trial.value:.4f}")
        print(f"📊 Best parameters: {self.study.best_trial.params}")

        return self.study.best_trial.params

    def train_model(self, tokenized_dataset, use_best_params=False):
        """Train the current model and save it together with the tokenizer.

        Args:
            tokenized_dataset: Mapping with 'train' and 'validation' splits.
            use_best_params: When True and a study exists, train with the
                study's best parameters on a freshly loaded model; otherwise
                train with the defaults from ``config``.

        Returns:
            The ``Trainer`` used for the run. The model is written to
            './models/trained_model' and the tokenizer to './models/tokenizer'.
        """
        if use_best_params and self.study:
            print("🚀 Training with optimized hyperparameters...")
            # self.model currently holds the weights fine-tuned by the last
            # Optuna trial; reload the pretrained checkpoint so the final run
            # starts from the same point every trial did.
            if self._num_labels is not None:
                self.setup_tokenizer_and_model(self._model_name, num_labels=self._num_labels)
            trainer = self.create_trainer(tokenized_dataset, self.study.best_trial.params)
        else:
            print("🚀 Training with default parameters...")
            trainer = self.create_trainer(tokenized_dataset)

        # Train the model
        train_result = trainer.train()

        # Save model and tokenizer
        trainer.save_model('./models/trained_model')
        self.tokenizer.save_pretrained('./models/tokenizer')

        print("✅ Training completed!")
        print(f"📊 Training metrics: {train_result.metrics}")

        return trainer

# Wrap the sampled dataset and the shared CONFIG in a trainer
advanced_trainer = AdvancedTrainer(dataset=dataset, config=CONFIG)

# Size the classification head from the distinct training labels (two for IMDB)
num_labels = len({*dataset['train']['label']})
advanced_trainer.setup_tokenizer_and_model(num_labels=num_labels)

# Tokenize all splits; the result is reused by the training cells below
tokenized_dataset = advanced_trainer.preprocess_data()
print("✅ Data preprocessing completed!")


In [None]:

def interactive_training_session():
    """Prompt for a training mode on stdin and run it.

    Reads the module-level ``advanced_trainer`` and ``tokenized_dataset``.

    Options:
        1. Train with the default parameters.
        2. Run Optuna hyperparameter optimization, then train with the best
           parameters found.
        3. Run ``model_comparison_experiment``.
        4. Run ``multilingual_training_experiment`` if it is defined.
        5. Run ``custom_training_configuration`` if it is defined.

    Any other input, or an option whose experiment function is not defined,
    falls back to default training.

    Returns:
        Whatever the selected option returns: a trainer for options 1 and 2
        and for the fallback, otherwise the experiment's own result.
    """
    default_trials = 10

    print("🚀 Interactive Training Session")
    print("=" * 50)

    options = {
        '1': 'Quick Training (Default Parameters)',
        '2': 'Hyperparameter Optimization',
        '3': 'Model Comparison',
        '4': 'Multilingual Training',
        '5': 'Custom Configuration'
    }

    for key, value in options.items():
        print(f"{key}. {value}")

    choice = input("\nSelect training option (1-5): ").strip()

    if choice == '1':
        print("\n🚀 Starting quick training...")
        return advanced_trainer.train_model(tokenized_dataset)

    if choice == '2':
        raw_trials = input("Number of optimization trials (default 10): ").strip()
        # A typo here should not abort the whole session; fall back to the
        # default for anything that is not a positive whole number.
        try:
            n_trials = int(raw_trials) if raw_trials else default_trials
        except ValueError:
            print(f"❌ '{raw_trials}' is not a whole number. Using {default_trials} trials...")
            n_trials = default_trials
        if n_trials < 1:
            print(f"❌ At least one trial is required. Using {default_trials} trials...")
            n_trials = default_trials

        advanced_trainer.hyperparameter_optimization(tokenized_dataset, n_trials)

        # Train with best parameters (read from the study kept on the trainer)
        return advanced_trainer.train_model(tokenized_dataset, use_best_params=True)

    if choice == '3':
        return model_comparison_experiment(tokenized_dataset)

    if choice in ('4', '5'):
        # These experiments may not be defined yet when this function runs, so
        # look them up at call time instead of failing with a NameError.
        if choice == '4':
            experiment = globals().get('multilingual_training_experiment')
            experiment_args = ()
        else:
            experiment = globals().get('custom_training_configuration')
            experiment_args = (tokenized_dataset,)

        if callable(experiment):
            return experiment(*experiment_args)

        print(f"❌ '{options[choice]}' is not available in this session. Using default training...")
        return advanced_trainer.train_model(tokenized_dataset)

    print("❌ Invalid choice. Using default training...")
    return advanced_trainer.train_model(tokenized_dataset)

def model_comparison_experiment(tokenized_dataset):
    """Train and evaluate several model architectures on the same data.

    Reads the module-level ``advanced_trainer`` and ``num_labels``. For each
    model the raw dataset held by ``advanced_trainer`` is tokenized again with
    that model's own tokenizer, because token ids produced by one tokenizer
    are not valid input for a model with a different vocabulary.

    Args:
        tokenized_dataset: Kept for backward compatibility with existing
            callers. It is not used for training, since it was produced by
            whichever tokenizer was active before this call.

    Returns:
        Dict mapping model name to the metrics dict from ``trainer.evaluate()``.
    """
    models_to_compare = [
        'distilbert-base-uncased',
        'bert-base-uncased',
        'roberta-base'
    ]

    results = {}

    for model_name in models_to_compare:
        print(f"\n🔄 Training {model_name}...")

        # Setup model
        advanced_trainer.setup_tokenizer_and_model(model_name, num_labels)

        # Re-tokenize with the tokenizer that was just loaded for this model
        model_dataset = advanced_trainer.preprocess_data()

        # Train
        trainer = advanced_trainer.train_model(model_dataset)

        # Evaluate on the validation split configured on the trainer
        eval_results = trainer.evaluate()
        results[model_name] = eval_results

        print(f"✅ {model_name} - F1: {eval_results['eval_f1']:.4f}")

    # Display comparison: one row per model, one column per metric
    comparison_df = pd.DataFrame(results).T
    print("\n📊 Model Comparison Results:")
    print(comparison_df[['eval_accuracy', 'eval_f1', 'eval_precision', 'eval_recall']])

    return results

# Launch the menu-driven session. The call blocks on input() until an option is
# entered, and training_results holds whatever the selected option returns.
print("Starting Interactive Training Session...")
training_results = interactive_training_session()
