In [6]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, IterableDataset
from hugging
from torch.utils.tensorboard import SummaryWriter
from transformers import (
    GPT2LMHeadModel, 
    GPT2Tokenizer, 
    GPT2Config,
    DataCollatorForLanguageModeling,
    get_linear_schedule_with_warmup
)
from datasets import load_dataset
import torchmetrics
from torchmetrics.text import Perplexity, BLEUScore
from torchmetrics import MeanMetric
import os
from tqdm import tqdm
import json
from datetime import datetime
import subprocess

# Try to import git, but don't fail if not available
try:
    import git
    GIT_AVAILABLE = True
except ImportError:
    GIT_AVAILABLE = False
    print("‚ö†Ô∏è  GitPython not available. Git operations will be skipped.")


‚ö†Ô∏è  GitPython not available. Git operations will be skipped.


In [8]:
# Training Configuration
class TrainingConfig:
    # Model
    model_name = "gpt2"  # Default Transformer
    vocab_size = 50257
    n_positions = 1024  # Context length
    n_embd = 768
    n_layer = 12
    n_head = 12
    
    # Dataset
    dataset_name = "cerebras/SlimPajama-627B"
    max_seq_length = 1024
    
    # Training
    batch_size = 4
    gradient_accumulation_steps = 8
    learning_rate = 3e-4
    num_epochs = 1
    warmup_steps = 1000
    max_steps = 10000  # Limit for demo
    eval_steps = 500
    save_steps = 1000
    logging_steps = 100
    
    # Paths
    # Vast.ai tip: S·ª≠ d·ª•ng /workspace ho·∫∑c /data cho persistent storage
    # output_dir = "/workspace/checkpoints"  # Uncomment n·∫øu c√≥ persistent volume
    output_dir = "./checkpoints"
    log_dir = "./runs/tensorboard"
    run_name = f"train_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    
    # Device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    mixed_precision = True
    
    def __init__(self):
        os.makedirs(self.output_dir, exist_ok=True)
        os.makedirs(self.log_dir, exist_ok=True)
        print(f"üìÅ Created directories: {self.output_dir}, {self.log_dir}")

config = TrainingConfig()
print(f"üöÄ Training Config:")
print(f"   Device: {config.device}")
print(f"   Output Dir: {config.output_dir}")
print(f"   Log Dir: {config.log_dir}")
print(f"   Run Name: {config.run_name}")
print(f"\nüí° Vast.ai Tip: N·∫øu c√≥ persistent volume, ƒë·ªïi output_dir sang /workspace ho·∫∑c /data")


üìÅ Created directories: ./checkpoints, ./runs/tensorboard
üöÄ Training Config:
   Device: cuda
   Output Dir: ./checkpoints
   Log Dir: ./runs/tensorboard
   Run Name: train_20251228_080806

üí° Vast.ai Tip: N·∫øu c√≥ persistent volume, ƒë·ªïi output_dir sang /workspace ho·∫∑c /data


In [10]:
## 2. HuggingFace Authentication (n·∫øu c·∫ßn)


In [11]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

In [12]:
## 3. Load Dataset v√† Tokenizer
# Load tokenizer
print("üì• Loading tokenizer...")
tokenizer = GPT2Tokenizer.from_pretrained(config.model_name)
tokenizer.pad_token = tokenizer.eos_token
print(f"   Vocab size: {len(tokenizer)}")

# Load dataset in streaming mode
# Vast.ai tip: Streaming mode kh√¥ng c·∫ßn download to√†n b·ªô dataset, ti·∫øt ki·ªám disk space
print("üì• Loading SlimPajama dataset (streaming)...")
try:
    train_dataset = load_dataset(
        config.dataset_name,
        split="train",
        streaming=True
    )
    
    # Take a subset for validation
    eval_dataset = load_dataset(
        config.dataset_name,
        split="train",
        streaming=True
    )
    print("‚úÖ Dataset loaded successfully")
except Exception as e:
    print(f"‚ùå Error loading dataset: {e}")
    print("   Make sure you have internet connection and HuggingFace access")
    raise


üì• Loading tokenizer...


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

   Vocab size: 50257
üì• Loading SlimPajama dataset (streaming)...


Resolving data files:   0%|          | 0/59166 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/31428 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/31411 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/59166 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/31428 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/31411 [00:00<?, ?it/s]

‚úÖ Dataset loaded successfully


In [13]:
# Preprocessing function
def tokenize_function(examples):
    """Tokenize text and truncate to max_seq_length"""
    texts = examples['text'] if isinstance(examples, dict) else [examples['text']]
    
    # Tokenize
    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=config.max_seq_length,
        padding='max_length',
        return_tensors='pt'
    )
    
    return {
        'input_ids': tokenized['input_ids'].squeeze(0),
        'attention_mask': tokenized['attention_mask'].squeeze(0)
    }

# Create iterable dataset wrapper
class SlimPajamaDataset(IterableDataset):
    def __init__(self, dataset, tokenizer, max_length, max_samples=None):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.max_samples = max_samples
        
    def __iter__(self):
        count = 0
        for example in self.dataset:
            if self.max_samples and count >= self.max_samples:
                break
            tokenized = tokenize_function(example)
            yield tokenized
            count += 1

# Create datasets
print("üîÑ Creating train dataset...")
train_iterable = SlimPajamaDataset(
    train_dataset, 
    tokenizer, 
    config.max_seq_length,
    max_samples=config.max_steps * config.batch_size * config.gradient_accumulation_steps
)

print("üîÑ Creating eval dataset...")
eval_iterable = SlimPajamaDataset(
    eval_dataset,
    tokenizer,
    config.max_seq_length,
    max_samples=100  # Small eval set
)


üîÑ Creating train dataset...
üîÑ Creating eval dataset...


In [14]:
## 4. Kh·ªüi t·∫°o Model
# Initialize model
print("ü§ñ Initializing GPT-2 model...")
model_config = GPT2Config(
    vocab_size=config.vocab_size,
    n_positions=config.n_positions,
    n_embd=config.n_embd,
    n_layer=config.n_layer,
    n_head=config.n_head,
    pad_token_id=tokenizer.pad_token_id
)

model = GPT2LMHeadModel.from_pretrained(config.model_name, config=model_config)
model = model.to(config.device)

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"   Total parameters: {total_params:,}")
print(f"   Trainable parameters: {trainable_params:,}")
print(f"   Model size: {total_params * 4 / 1e9:.2f} GB (FP32)")


ü§ñ Initializing GPT-2 model...


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

   Total parameters: 124,439,808
   Trainable parameters: 124,439,808
   Model size: 0.50 GB (FP32)


In [15]:
## 5. Setup Optimizer v√† Scheduler
# Optimizer
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=config.learning_rate,
    betas=(0.9, 0.95),
    weight_decay=0.1
)

# Scheduler will be created after we know total steps
# We'll create it in the training loop


In [20]:
## 6. H√†m ƒë√°nh gi√° Metrics v·ªõi Torchmetrics
# Initialize metrics
class MetricsEvaluator:
    """ƒê√°nh gi√° metrics s·ª≠ d·ª•ng Torchmetrics"""
    
    def __init__(self, device):
        self.device = device
        
        # Loss metric
        self.loss_metric = MeanMetric().to(device)
        
        # Perplexity metric
        self.perplexity_metric = Perplexity(ignore_index=-100).to(device)
        
        # Additional metrics
        self.learning_rate_metric = MeanMetric().to(device)
        
    def reset(self):
        """Reset t·∫•t c·∫£ metrics"""
        self.loss_metric.reset()
        self.perplexity_metric.reset()
        self.learning_rate_metric.reset()
    
    def update(self, logits, labels, loss, lr=None):
        """
        Update metrics v·ªõi batch m·ªõi
        
        Args:
            logits: Model predictions [batch, seq_len, vocab_size]
            labels: Ground truth labels [batch, seq_len]
            loss: Computed loss value
            lr: Current learning rate (optional)
        """
        # Update loss
        self.loss_metric.update(loss.item())
        
        # Update perplexity
        # Perplexity metric expects: logits [batch, seq, vocab] and targets [batch, seq]
        # Shift for next-token prediction
        shift_logits = logits[..., :-1, :].contiguous()  # [batch, seq-1, vocab]
        shift_labels = labels[..., 1:].contiguous()     # [batch, seq-1]
        
        # Set padding tokens to -100 (ignore_index) so Perplexity metric ignores them
        shift_labels_masked = shift_labels.clone()
        padding_mask = (shift_labels == tokenizer.pad_token_id) | (shift_labels < 0)
        shift_labels_masked[padding_mask] = -100
        
        # Update perplexity with proper 3D shape
        self.perplexity_metric.update(shift_logits, shift_labels_masked)
        
        # Update learning rate if provided
        if lr is not None:
            self.learning_rate_metric.update(lr)
    
    def compute(self):
        """Compute v√† return t·∫•t c·∫£ metrics"""
        metrics = {
            'loss': self.loss_metric.compute().item(),
            'perplexity': self.perplexity_metric.compute().item(),
            'learning_rate': self.learning_rate_metric.compute().item() if self.learning_rate_metric._update_count > 0 else 0.0
        }
        return metrics
    
    def compute_and_reset(self):
        """Compute metrics v√† reset cho epoch ti·∫øp theo"""
        metrics = self.compute()
        self.reset()
        return metrics

# Initialize evaluator
metrics_evaluator = MetricsEvaluator(config.device)
print("‚úÖ Metrics evaluator initialized")

‚úÖ Metrics evaluator initialized


In [17]:
## 7. Setup TensorBoard
# Initialize TensorBoard writer
tb_writer = SummaryWriter(log_dir=os.path.join(config.log_dir, config.run_name))
print(f"üìä TensorBoard logging to: {tb_writer.log_dir}")
print(f"   View with: tensorboard --logdir {config.log_dir}")


üìä TensorBoard logging to: ./runs/tensorboard/train_20251228_080806
   View with: tensorboard --logdir ./runs/tensorboard


In [18]:
## 8. Training Loop
# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Causal LM, not masked LM
)

# Create data loaders
train_loader = DataLoader(
    train_iterable,
    batch_size=config.batch_size,
    collate_fn=data_collator,
    num_workers=0  # Streaming dataset doesn't support multiprocessing
)

eval_loader = DataLoader(
    eval_iterable,
    batch_size=config.batch_size,
    collate_fn=data_collator,
    num_workers=0
)

# Mixed precision scaler
scaler = torch.cuda.amp.GradScaler() if config.mixed_precision else None

# Training state
global_step = 0
best_eval_loss = float('inf')

# Create scheduler
total_steps = config.max_steps
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=config.warmup_steps,
    num_training_steps=total_steps
)

print(f"üöÄ Starting training for {total_steps} steps...")
print(f"   Batch size: {config.batch_size}")
print(f"   Gradient accumulation: {config.gradient_accumulation_steps}")
print(f"   Effective batch size: {config.batch_size * config.gradient_accumulation_steps}")


üöÄ Starting training for 10000 steps...
   Batch size: 4
   Gradient accumulation: 8
   Effective batch size: 32


  scaler = torch.cuda.amp.GradScaler() if config.mixed_precision else None


In [21]:
# Training loop
model.train()
metrics_evaluator.reset()

progress_bar = tqdm(total=total_steps, desc="Training")

for step, batch in enumerate(train_loader):
    if global_step >= total_steps:
        break
    
    # Move batch to device
    input_ids = batch['input_ids'].to(config.device)
    labels = batch['labels'].to(config.device)
    
    # Forward pass with mixed precision
    optimizer.zero_grad()
    
    if config.mixed_precision and scaler:
        with torch.cuda.amp.autocast():
            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs.loss / config.gradient_accumulation_steps
        
        scaler.scale(loss).backward()
        
        if (step + 1) % config.gradient_accumulation_steps == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            optimizer.zero_grad()
    else:
        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss / config.gradient_accumulation_steps
        loss.backward()
        
        if (step + 1) % config.gradient_accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
    
    # Update metrics
    current_lr = scheduler.get_last_lr()[0] if (step + 1) % config.gradient_accumulation_steps == 0 else 0.0
    metrics_evaluator.update(
        outputs.logits, 
        labels, 
        outputs.loss * config.gradient_accumulation_steps,
        lr=current_lr if current_lr > 0 else None
    )
    
    # Logging
    if (step + 1) % config.gradient_accumulation_steps == 0:
        global_step += 1
        
        if global_step % config.logging_steps == 0:
            metrics = metrics_evaluator.compute()
            
            # Log to TensorBoard
            tb_writer.add_scalar('train/loss', metrics['loss'], global_step)
            tb_writer.add_scalar('train/perplexity', metrics['perplexity'], global_step)
            tb_writer.add_scalar('train/learning_rate', metrics['learning_rate'], global_step)
            
            # Update progress bar
            progress_bar.set_postfix({
                'loss': f"{metrics['loss']:.4f}",
                'ppl': f"{metrics['perplexity']:.2f}",
                'lr': f"{metrics['learning_rate']:.2e}"
            })
            progress_bar.update(config.logging_steps)
            
            # Reset metrics for next logging period
            metrics_evaluator.reset()
        
        # Evaluation
        if global_step % config.eval_steps == 0:
            print(f"\nüîç Evaluating at step {global_step}...")
            model.eval()
            eval_metrics_evaluator = MetricsEvaluator(config.device)
            
            with torch.no_grad():
                eval_count = 0
                for eval_batch in eval_loader:
                    if eval_count >= 10:  # Limit eval batches
                        break
                    
                    eval_input_ids = eval_batch['input_ids'].to(config.device)
                    eval_labels = eval_batch['labels'].to(config.device)
                    
                    if config.mixed_precision:
                        with torch.cuda.amp.autocast():
                            eval_outputs = model(input_ids=eval_input_ids, labels=eval_labels)
                    else:
                        eval_outputs = model(input_ids=eval_input_ids, labels=eval_labels)
                    
                    eval_metrics_evaluator.update(
                        eval_outputs.logits,
                        eval_labels,
                        eval_outputs.loss
                    )
                    eval_count += 1
            
            eval_metrics = eval_metrics_evaluator.compute_and_reset()
            
            # Log eval metrics to TensorBoard
            tb_writer.add_scalar('eval/loss', eval_metrics['loss'], global_step)
            tb_writer.add_scalar('eval/perplexity', eval_metrics['perplexity'], global_step)
            
            print(f"   Eval Loss: {eval_metrics['loss']:.4f}")
            print(f"   Eval Perplexity: {eval_metrics['perplexity']:.2f}")
            
            # Save best model
            if eval_metrics['loss'] < best_eval_loss:
                best_eval_loss = eval_metrics['loss']
                save_path = os.path.join(config.output_dir, f"best_model_step_{global_step}")
                model.save_pretrained(save_path)
                tokenizer.save_pretrained(save_path)
                print(f"   üíæ Saved best model to {save_path}")
            
            model.train()
        
        # Save checkpoint
        if global_step % config.save_steps == 0:
            save_path = os.path.join(config.output_dir, f"checkpoint_step_{global_step}")
            model.save_pretrained(save_path)
            tokenizer.save_pretrained(save_path)
            print(f"üíæ Saved checkpoint to {save_path}")

progress_bar.close()
tb_writer.close()
print("\n‚úÖ Training completed!")



Training:   0%|          | 0/10000 [07:41<?, ?it/s][A
  with torch.cuda.amp.autocast():

Training:   0%|          | 0/10000 [02:04<?, ?it/s, loss=27.9449, ppl=inf, lr=1.51e-05][A
Training:   1%|          | 100/10000 [02:04<3:24:41,  1.24s/it, loss=27.9449, ppl=inf, lr=1.51e-05][A
Training:   1%|          | 100/10000 [04:05<3:24:41,  1.24s/it, loss=27.2630, ppl=inf, lr=4.51e-05][A
Training:   2%|‚ñè         | 200/10000 [04:05<3:20:08,  1.23s/it, loss=27.2630, ppl=inf, lr=4.51e-05][A
Training:   2%|‚ñè         | 200/10000 [06:09<3:20:08,  1.23s/it, loss=27.2661, ppl=inf, lr=7.52e-05][A
Training:   3%|‚ñé         | 300/10000 [06:09<3:19:02,  1.23s/it, loss=27.2661, ppl=inf, lr=7.52e-05][A

KeyboardInterrupt: 

In [None]:
## 8. L∆∞u k·∫øt qu·∫£ v√† commit l√™n GitHub
# L∆∞u training summary
training_summary = {
    'run_name': config.run_name,
    'total_steps': global_step,
    'best_eval_loss': best_eval_loss,
    'config': {
        'model_name': config.model_name,
        'batch_size': config.batch_size,
        'learning_rate': config.learning_rate,
        'max_seq_length': config.max_seq_length,
    },
    'timestamp': datetime.now().isoformat()
}

summary_path = os.path.join(config.output_dir, 'training_summary.json')
with open(summary_path, 'w') as f:
    json.dump(training_summary, f, indent=2)

print(f"üìù Training summary saved to {summary_path}")
