In [1]:
# ==== IMPORTS ====
!pip install mlflow transformers datasets
import time
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from datasets import load_dataset
from torch.utils.data import DataLoader

import mlflow
import mlflow.pytorch
from mlflow.models import infer_signature

import numpy as np
import psutil
import platform
import subprocess
from datetime import datetime
import random
import string

import logging
import sys
import matplotlib.pyplot as plt

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")



  from .autonotebook import tqdm as notebook_tqdm


PyTorch version: 2.7.1
CUDA available: False


In [2]:
# ==== LOGGING SETUP ====
format_str = '%(asctime)s - %(levelname)s - %(filename)s - PID:%(process)d - TID:%(thread)d - %(message)s'
logger = logging.getLogger(__name__ + str(time.time()))
logger.setLevel(logging.DEBUG)
logger.propagate = False
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(logging.Formatter(format_str))
logger.addHandler(handler)

logger.debug("Logger initialization completed")

2025-07-27 17:27:29,741 - DEBUG - 666343768.py - PID:26758 - TID:8462606080 - Logger initialization completed


In [3]:
# ==== COMPREHENSIVE TRAINING MONITOR ====
class ComprehensiveTrainingMonitor:
    """Advanced training monitor with complete metrics tracking"""
    
    def __init__(self, model, optimizer, criterion, device, model_name='GPT2',
                 dataset_name='Conversational', batch_size=16, epochs=10,
                 input_size='max_length_512', use_mlflow=True,
                 learning_rate=5e-5, use_pretrained=True, train_size=50000,
                 val_size=10000, num_workers=0):

        self.model = model
        self.optimizer = optimizer
        self.criterion = criterion
        self.device = device
        self.model_name = model_name
        self.dataset_name = dataset_name
        self.batch_size = batch_size
        self.epochs = epochs
        self.input_size = input_size
        self.use_mlflow = use_mlflow
        self.learning_rate = learning_rate
        self.use_pretrained = use_pretrained
        self.train_size = train_size
        self.val_size = val_size
        self.num_workers = num_workers

        # Tracking variables
        self.best_metric = float('inf')  # Lower perplexity is better
        self.epoch_times = []
        self.start_time = time.time()
        self.run_started = False
        self.prev_loss = None  # Track loss improvement

        # MLflow configuration
        self.mlflow_uri = "https://neuralripper.com/mlflow/"
        self.gcs_bucket = "gs://neuralripper-mlflow-artifacts"

        if self.use_mlflow:
            self._initialize_mlflow()

    def _initialize_mlflow(self):
        """Initialize MLflow with comprehensive experiment tracking"""
        try:
            mlflow.set_tracking_uri(self.mlflow_uri)
            mlflow.set_experiment(f"{self.model_name}-{self.dataset_name}")

            run_name = f"{self.model_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
            mlflow.start_run(run_name=run_name)
            self.run_started = True

            params = {
                **self._get_model_params(),
                **self._get_system_params(),
                **self._get_environment_params(),
                **self._get_data_params(),
                **self._get_training_params(),
            }

            mlflow.log_params(params)
            logger.info(f"MLflow run started: {run_name}")

        except Exception as e:
            logger.warning(f"Failed to initialize MLflow: {e}")
            self.use_mlflow = False

    def _get_model_params(self):
        """Get model-specific parameters"""
        params = {
            'model_name': self.model_name,
            'model_architecture': 'GPT2',
            'model_size': 'base',
            'use_pretrained': self.use_pretrained,
            'num_parameters': sum(p.numel() for p in self.model.parameters()),
            'trainable_parameters': sum(p.numel() for p in self.model.parameters() if p.requires_grad),
        }
        
        try:
            if hasattr(self.model, 'config'):
                params.update({
                    'vocab_size': self.model.config.vocab_size,
                    'n_positions': self.model.config.n_positions,
                    'n_ctx': self.model.config.n_ctx,
                    'n_embd': self.model.config.n_embd,
                    'n_layer': self.model.config.n_layer,
                    'n_head': self.model.config.n_head,
                })
        except Exception as e:
            logger.warning(f"Could not extract model config: {e}")
            
        return params

    def _get_system_params(self):
        """Get system and hardware parameters"""
        return {
            'device': str(self.device),
            'python_version': platform.python_version(),
            'pytorch_version': torch.__version__,
            'cuda_version': torch.version.cuda if torch.cuda.is_available() else 'N/A',
            'cpu_count': psutil.cpu_count(),
            'memory_gb': round(psutil.virtual_memory().total / (1024**3), 2),
            'platform': platform.platform(),
        }

    def _get_environment_params(self):
        """Get training environment parameters"""
        env_params = {
            'timestamp': datetime.now().isoformat(),
            'timezone': str(datetime.now().astimezone().tzinfo),
            'random_seed': getattr(torch, 'initial_seed', lambda: None)(),
        }
        
        # Add git information
        try:
            commit = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
            branch = subprocess.check_output(['git', 'rev-parse', '--abbrev-ref', 'HEAD']).decode().strip()
            env_params.update({
                'git_commit': commit[:8],
                'git_branch': branch,
            })
        except:
            env_params.update({'git_commit': 'unknown', 'git_branch': 'unknown'})
            
        return env_params

    def _get_data_params(self):
        """Get dataset and data loading parameters"""
        return {
            'dataset_name': self.dataset_name,
            'train_size': self.train_size,
            'val_size': self.val_size,
            'batch_size': self.batch_size,
            'num_workers': self.num_workers,
            'input_size': self.input_size,
        }

    def _get_training_params(self):
        """Get training configuration parameters"""
        return {
            'epochs': self.epochs,
            'learning_rate': self.learning_rate,
            'optimizer': self.optimizer.__class__.__name__,
            'criterion': self.criterion.__class__.__name__,
        }

    def setup_mlflow(self):
        """Setup MLflow tracking if not already initialized"""
        if self.use_mlflow and not self.run_started:
            self._initialize_mlflow()

    def _get_gpu_metrics(self):
        """Get GPU memory usage metrics"""
        if not torch.cuda.is_available():
            return {}
        
        try:
            return {
                'gpu_memory_allocated_mb': torch.cuda.memory_allocated() / 1024**2,
                'gpu_memory_reserved_mb': torch.cuda.memory_reserved() / 1024**2,
                'gpu_memory_max_allocated_mb': torch.cuda.max_memory_allocated() / 1024**2,
            }
        except:
            return {}

    def _get_system_metrics(self):
        """Get system performance metrics"""
        try:
            memory = psutil.virtual_memory()
            return {
                'cpu_percent': psutil.cpu_percent(interval=0.1),
                'memory_used_percent': memory.percent,
                'memory_used_gb': memory.used / (1024**3),
                'memory_available_gb': memory.available / (1024**3),
            }
        except:
            return {}

    def log_epoch_metrics(self, epoch, epoch_loss, epoch_perplexity, epoch_token_accuracy, batch_count=None, epoch_top5_accuracy=None):
        """Comprehensive epoch metrics logging for GPT2"""
        if not self.use_mlflow or not self.run_started:
            return {}

        try:
            current_time = time.time()

            # Timing metrics
            epoch_time = current_time - (self.start_time if epoch == 0 else self.start_time + sum(self.epoch_times))
            self.epoch_times.append(epoch_time)

            # Core metrics - MAP TO FRONTEND EXPECTED NAMES
            metrics = {
                "epoch": epoch,
                "train_loss": epoch_loss,           # Frontend expects 'train_loss'
                "train_accuracy": epoch_token_accuracy,  # Frontend expects 'train_accuracy' 
                "perplexity": epoch_perplexity,     # Text model specific metric
                "learning_rate": self.optimizer.param_groups[0]["lr"],
                "epoch_time_seconds": epoch_time,
                "total_time_seconds": current_time - self.start_time,
                "avg_epoch_time": sum(self.epoch_times) / len(self.epoch_times),
            }

            # Add top-5 accuracy if provided (text models only)
            if epoch_top5_accuracy is not None:
                metrics["top5_accuracy"] = epoch_top5_accuracy

            # Training dynamics (use loss for improvement tracking)
            if self.prev_loss is not None:
                loss_improvement = self.prev_loss - epoch_loss
                metrics.update({
                    "loss_improvement": loss_improvement,
                    "loss_improvement_percent": (loss_improvement / self.prev_loss) * 100 if self.prev_loss != 0 else 0,
                })
            
            self.prev_loss = epoch_loss

            # Performance metrics
            if batch_count and epoch_time > 0:
                samples_per_sec = (self.batch_size * batch_count) / epoch_time
                metrics.update({
                    "batches_per_second": batch_count / epoch_time,
                    "samples_per_second": samples_per_sec,
                })

            # GPU/System metrics
            if torch.cuda.is_available():
                gpu_metrics = self._get_gpu_metrics()
                metrics.update(gpu_metrics)

            system_metrics = self._get_system_metrics()
            metrics.update(system_metrics)

            mlflow.log_metrics(metrics, step=epoch)
            return metrics

        except Exception as e:
            logger.warning(f"Failed to log epoch metrics: {e}")
            return {}

    def should_log_model(self, current_metric, metric_name="token_accuracy"):
        """Enhanced model checkpointing with improvement tracking"""
        if current_metric > self.best_metric:  # Higher token accuracy is better
            improvement = current_metric - self.best_metric
            self.best_metric = current_metric
            
            logger.info(f"New best {metric_name}: {current_metric:.4f} "
                    f"(improvement: +{improvement:.4f})")
            return True
        return False

    def log_model_artifact(self):
        """Log model metadata to MLflow"""
        if not self.use_mlflow:
            return

        try:
            model_info = {
                'model_architecture': self.model_name,
                'best_perplexity': float(self.best_metric),
                'total_parameters': sum(p.numel() for p in self.model.parameters()),
                'model_size_mb': sum(p.numel() * p.element_size() for p in self.model.parameters()) / (1024**2),
            }
            mlflow.log_dict(model_info, "model_metadata.json")
        except Exception as e:
            logger.warning(f"Failed to log model metadata: {e}")

    def end_run(self, status="FINISHED"):
        """End MLflow run and return summary"""
        if not self.use_mlflow:
            return {}

        total_time = time.time() - self.start_time
        summary = {
            'final_total_training_time_minutes': round(total_time / 60, 2),
            'final_best_perplexity': float(self.best_metric),
            'final_epochs_completed': len(self.epoch_times),
        }

        try:
            mlflow.log_params({'training_status': status})
            mlflow.log_metrics(summary)
            mlflow.end_run()
        except Exception as e:
            logger.warning(f"Failed to end MLflow run: {e}")

        return summary

In [4]:
# ==== DATA LOADING ====
def load_and_prepare_data():
    """Load conversational dataset and prepare for training"""
    
    try:
        # Load blended_skill_talk dataset
        dataset = load_dataset("blended_skill_talk")
        conversations = []
        
        # Debug: print first item structure
        print("Dataset structure:", dataset["train"][0].keys())
        print("Sample item:", dataset["train"][0])
        
        for item in dataset["train"]:
            # Extract the guided and free messages
            guided_messages = item.get("guided_messages", [])
            free_messages = item.get("free_messages", [])
            
            # Create conversations from guided messages
            if guided_messages and len(guided_messages) >= 2:
                for i in range(0, len(guided_messages) - 1, 2):
                    if i + 1 < len(guided_messages):
                        user_msg = guided_messages[i].strip()
                        assistant_msg = guided_messages[i + 1].strip()
                        if user_msg and assistant_msg:
                            conversation = f"User: {user_msg}\nAssistant: {assistant_msg}<|endoftext|>"
                            conversations.append(conversation)
            
            # Create conversations from free messages  
            if free_messages and len(free_messages) >= 2:
                for i in range(0, len(free_messages) - 1, 2):
                    if i + 1 < len(free_messages):
                        user_msg = free_messages[i].strip()
                        assistant_msg = free_messages[i + 1].strip()
                        if user_msg and assistant_msg:
                            conversation = f"User: {user_msg}\nAssistant: {assistant_msg}<|endoftext|>"
                            conversations.append(conversation)
            
            if len(conversations) >= 1000:  # Stop early for demo
                break
        
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return []
    
    conversations = conversations[:1000]
    
    print(f"Training conversations: {len(conversations)}")
    if conversations:
        print(f"Sample conversation: {conversations[0][:200]}...")
    else:
        print("No conversations found - check dataset structure")
    
    return conversations

# Load data
train_conversations = load_and_prepare_data()

Dataset structure: dict_keys(['personas', 'additional_context', 'previous_utterance', 'context', 'free_messages', 'guided_messages', 'suggestions', 'guided_chosen_suggestions', 'label_candidates'])
Sample item: {'personas': ["i've 2 kids.", 'i love flowers.'], 'additional_context': '', 'previous_utterance': ["I love live music, that's why I try to go to concerts", 'I do too. Wat do you like?'], 'context': 'empathetic_dialogues', 'free_messages': ['I like acting, I hope to be an actor, what about you?', 'No, but someday.', 'After I am done with school I plan to have a family.', 'I hope so, how old are your kids?', 'I would imagine. I am sure they a great kids.', 'I wish I had more time to do stuff like that. Medical school is exhausting. '], 'guided_messages': ['that is ok.  have any kids?', 'that is good. I have 2', 'that is great! you will be ready', '5 & 7.  they take up a lot of my time', 'luckily, they love flowers just as much as I do.  we spend a lot of time in the garden', 'soun

In [5]:
# ==== DATA LOADER ====
def create_data_loader(conversations, tokenizer, batch_size=16, max_length=512):
    """Create DataLoader with tokenization for conversations"""
    
    def collate_fn(batch):
        # Tokenize all conversations in batch
        encoded = tokenizer(
            batch,
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors='pt'
        )
        
        # For GPT-2, input_ids serve as both input and labels (shifted by 1)
        input_ids = encoded['input_ids']
        attention_mask = encoded['attention_mask']
        
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': input_ids.clone()  # For language modeling
        }
    
    return DataLoader(
        conversations,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn,
        num_workers=0
    )

In [6]:
# ==== ENHANCED GPT-2 CLASS ====
class EnhancedGPT2:
    """GPT-2 with comprehensive monitoring integration"""

    def __init__(self, num_epochs=3, batch_size=16, 
                 learning_rate=5e-5, use_mlflow=True, model_name='gpt2'):
        
        # Core parameters
        self._use_mlflow = use_mlflow
        self._batch_size = batch_size
        self._num_epochs = num_epochs
        self._learning_rate = learning_rate
        self.model_name = model_name

        # Create model and tokenizer
        self._tokenizer = self._create_tokenizer()
        self._model = self._create_model()
        self._device = self._set_device()
        self._model.to(self._device)
        self._criterion = self._set_criterion()
        self._optimizer = self._set_optimizer()

        # Initialize comprehensive monitor
        self.monitor = ComprehensiveTrainingMonitor(
            model=self._model,
            optimizer=self._optimizer,
            criterion=self._criterion,
            device=self._device,
            model_name='GPT2',
            dataset_name='Conversational',
            batch_size=batch_size,
            epochs=num_epochs,
            input_size='max_length_512',
            use_mlflow=use_mlflow,
            use_pretrained=True,
            train_size=10000,
            val_size=2000,
            num_workers=0,
        )

        logger.info(f"Model initialized on device: {self._device}")
        logger.info(f"Model parameters: {sum(p.numel() for p in self._model.parameters()):,}")

    def _create_tokenizer(self):
        """Create GPT-2 tokenizer"""
        tokenizer = GPT2Tokenizer.from_pretrained(self.model_name)
        tokenizer.pad_token = tokenizer.eos_token  # GPT-2 doesn't have pad token
        return tokenizer

    def _create_model(self):
        """Create GPT-2 model for language modeling"""
        return GPT2LMHeadModel.from_pretrained(self.model_name)

    def _set_device(self):
        """Set appropriate device for training"""
        if torch.backends.mps.is_available():
            return torch.device("mps")
        elif torch.cuda.is_available():
            return torch.device("cuda")
        return torch.device("cpu")

    def _set_optimizer(self):
        """Configure AdamW optimizer"""
        return torch.optim.AdamW(self._model.parameters(), lr=self._learning_rate)

    def _set_criterion(self):
        """Set loss function for language modeling"""
        return nn.CrossEntropyLoss(ignore_index=self._tokenizer.pad_token_id)

    def train_epoch(self, data_loader, epoch_idx):
        """Enhanced training epoch with comprehensive monitoring including top-k accuracy"""
        logger.info(f"Starting epoch {epoch_idx+1}, total batches: {len(data_loader)}")

        self._model.train()
        epoch_total_loss = 0.0
        running_loss = 0.0
        running_correct = 0
        running_top5_correct = 0
        running_total = 0
        batch_count = len(data_loader)

        for idx, batch in enumerate(data_loader):
            input_ids = batch['input_ids'].to(self._device)
            attention_mask = batch['attention_mask'].to(self._device)
            labels = batch['labels'].to(self._device)

            self._optimizer.zero_grad()
            
            # GPT-2 forward pass
            outputs = self._model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss
            logits = outputs.logits
            
            loss.backward()
            self._optimizer.step()

            # Calculate token-level accuracy
            predictions = logits.argmax(dim=-1)
            # Mask out padding tokens for accuracy calculation
            mask = (labels != self._tokenizer.pad_token_id)
            correct_predictions = ((predictions == labels) & mask).sum().item()
            total_tokens = mask.sum().item()
            
            # Calculate top-5 accuracy
            top5_preds = torch.topk(logits, 5, dim=-1)[1]  # Shape: [batch, seq_len, 5]
            top5_correct = 0
            for k in range(5):
                top5_correct += ((top5_preds[:, :, k] == labels) & mask).sum().item()
            
            running_correct += correct_predictions
            running_top5_correct += top5_correct
            running_total += total_tokens
            running_loss += loss.item()
            epoch_total_loss += loss.item()

            # Progress logging every 10 batches
            if idx % 10 == 9:
                avg_loss = running_loss / 10
                perplexity = torch.exp(torch.tensor(avg_loss)).item()
                token_accuracy = running_correct / running_total if running_total > 0 else 0
                top5_accuracy = running_top5_correct / running_total if running_total > 0 else 0
                logger.info(f"Epoch {epoch_idx + 1} | Batch {idx + 1} | "
                        f"Loss: {avg_loss:.4f} | Perplexity: {perplexity:.2f} | "
                        f"Token Acc: {token_accuracy:.4f} | Top5 Acc: {top5_accuracy:.4f}")
                running_loss = 0.0

        # Calculate epoch metrics
        epoch_loss = epoch_total_loss / batch_count
        epoch_perplexity = torch.exp(torch.tensor(epoch_loss)).item()
        epoch_token_accuracy = running_correct / running_total if running_total > 0 else 0
        epoch_top5_accuracy = running_top5_correct / running_total if running_total > 0 else 0

        logger.info(f"Epoch {epoch_idx + 1} completed - Loss: {epoch_loss:.4f}, "
                f"Perplexity: {epoch_perplexity:.2f}, Token Accuracy: {epoch_token_accuracy:.4f}, "
                f"Top5 Accuracy: {epoch_top5_accuracy:.4f}")
        
        return epoch_loss, epoch_perplexity, epoch_token_accuracy, epoch_top5_accuracy, batch_count

    def train(self, train_loader):
        """Enhanced training with comprehensive monitoring"""
        try:
            logger.info(f"Starting training for {self._num_epochs} epochs on {self._device}")

            # Setup MLflow
            self.monitor.setup_mlflow()

            for epoch in range(self._num_epochs):
                epoch_loss, epoch_perplexity, epoch_token_accuracy, epoch_top5_accuracy, batch_count = self.train_epoch(train_loader, epoch)

                # Log comprehensive epoch metrics - UPDATE THIS CALL
                metrics = self.monitor.log_epoch_metrics(
                    epoch, epoch_loss, epoch_perplexity, epoch_token_accuracy, 
                    batch_count, epoch_top5_accuracy  # Add top5 parameter
                )

                # Model checkpointing (lower perplexity is better)
                if self.monitor.should_log_model(epoch_perplexity):
                    self.monitor.log_model_artifact()
                    logger.info(f"New best model metadata saved with perplexity: {epoch_perplexity:.2f}")

                # Enhanced progress display
                print(f"Epoch {epoch+1}/{self._num_epochs}: "
                    f"Loss: {epoch_loss:.4f} | Perplexity: {epoch_perplexity:.2f} | "
                    f"Token Acc: {epoch_token_accuracy:.4f} | Top5 Acc: {epoch_top5_accuracy:.4f} | "
                    f"Time: {metrics.get('epoch_time_seconds', 0):.1f}s | "
                    f"LR: {metrics.get('learning_rate', 0):.2e} | "
                    f"Memory: {metrics.get('memory_used_percent', 0):.1f}%")

        except Exception as e:
            logger.error(f"Training failed: {e}")
            self.monitor.end_run(status="FAILED")
            raise
        finally:
            summary = self.monitor.end_run()
            logger.info(f"Training completed. Summary: {summary}")
            print(f"\nTraining Summary:")
            print(f"Total time: {summary.get('final_total_training_time_minutes', 0):.1f} minutes")
            print(f"Best perplexity: {summary.get('final_best_perplexity', 0):.2f}")
            print(f"Epochs completed: {summary.get('final_epochs_completed', 0)}")

In [7]:
# ==== INITIALIZE AND TRAIN ====
logger.info("Initializing Enhanced GPT-2 model")

model = EnhancedGPT2(
    num_epochs=10,
    batch_size=64,
    learning_rate=5e-5,
    use_mlflow=True,
    model_name='gpt2'
)

print("Model initialized successfully!")
print(f"Device: {model._device}")
print(f"Total parameters: {sum(p.numel() for p in model._model.parameters()):,}")
print(f"Trainable parameters: {sum(p.numel() for p in model._model.parameters() if p.requires_grad):,}")

# Create data loader
train_loader = create_data_loader(
    train_conversations, 
    model._tokenizer, 
    batch_size=64,
    max_length=512
)

print(f"Number of batches per epoch: {len(train_loader)}")
print(f"Total samples per epoch: {len(train_loader) * 32}")

# Start training
print("\n" + "="*50)
print(" STARTING ENHANCED GPT-2 TRAINING")
print("="*50)
print(f"Dataset: Daily Dialog (Conversational)")
print(f"Model: GPT-2 (pretrained)")
print(f"Epochs: {model._num_epochs}")
print(f"Batch size: {model._batch_size}")
print(f"Learning rate: {model._learning_rate}")
print(f"MLflow tracking: {'Enabled' if model._use_mlflow else 'Disabled'}")
print(f"MLflow URI: {model.monitor.mlflow_uri}")
print("="*50 + "\n")

# Start training
model.train(train_loader)

2025-07-27 17:27:51,309 - INFO - 268818953.py - PID:26758 - TID:8462606080 - Initializing Enhanced GPT-2 model


2025/07/27 17:27:53 INFO mlflow.tracking.fluent: Experiment with name 'GPT2-Conversational' does not exist. Creating a new experiment.


2025-07-27 17:27:54,140 - INFO - 2929732547.py - PID:26758 - TID:8462606080 - MLflow run started: GPT2_20250727_172753
2025-07-27 17:27:54,141 - INFO - 3216512036.py - PID:26758 - TID:8462606080 - Model initialized on device: mps
2025-07-27 17:27:54,141 - INFO - 3216512036.py - PID:26758 - TID:8462606080 - Model parameters: 124,439,808
Model initialized successfully!
Device: mps
Total parameters: 124,439,808
Trainable parameters: 124,439,808
Number of batches per epoch: 16
Total samples per epoch: 512

 STARTING ENHANCED GPT-2 TRAINING
Dataset: Daily Dialog (Conversational)
Model: GPT-2 (pretrained)
Epochs: 20
Batch size: 64
Learning rate: 5e-05
MLflow tracking: Enabled
MLflow URI: https://neuralripper.com/mlflow/

2025-07-27 17:27:54,142 - INFO - 3216512036.py - PID:26758 - TID:8462606080 - Starting training for 20 epochs on mps
2025-07-27 17:27:54,143 - INFO - 3216512036.py - PID:26758 - TID:8462606080 - Starting epoch 1, total batches: 16


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


2025-07-27 17:30:35,890 - INFO - 3216512036.py - PID:26758 - TID:8462606080 - Epoch 1 | Batch 10 | Loss: 3.5898 | Perplexity: 36.23 | Token Acc: 0.0159 | Top5 Acc: 0.0363
2025-07-27 17:32:38,796 - INFO - 3216512036.py - PID:26758 - TID:8462606080 - Epoch 1 completed - Loss: 2.9995, Perplexity: 20.08, Token Accuracy: 0.0114, Top5 Accuracy: 0.0353
Epoch 1/20: Loss: 2.9995 | Perplexity: 20.08 | Token Acc: 0.0114 | Top5 Acc: 0.0353 | Time: 285.9s | LR: 5.00e-05 | Memory: 90.0%
2025-07-27 17:32:43,302 - INFO - 3216512036.py - PID:26758 - TID:8462606080 - Starting epoch 2, total batches: 16
2025-07-27 17:35:48,643 - INFO - 3216512036.py - PID:26758 - TID:8462606080 - Epoch 2 | Batch 10 | Loss: 1.9218 | Perplexity: 6.83 | Token Acc: 0.0027 | Top5 Acc: 0.0157
2025-07-27 17:37:12,035 - INFO - 3216512036.py - PID:26758 - TID:8462606080 - Epoch 2 completed - Loss: 1.8578, Perplexity: 6.41, Token Accuracy: 0.0023, Top5 Accuracy: 0.0139
Epoch 2/20: Loss: 1.8578 | Perplexity: 6.41 | Token Acc: 0.002

KeyboardInterrupt: 