# Medical Sentiment Analysis - Encoder Training on Google Colab

This notebook provides a complete setup for training personalized emotion analysis models using transformer-based encoders (XLM-RoBERTa, mDeBERTa) on Google Colab.

## Overview
- **Task**: 21-dimensional emotion regression for medical text
- **Models**: XLM-RoBERTa-base, mDeBERTa-v3-base
- **Personalization**: Persona tokens + descriptions
- **Languages**: Polish + English

## Setup Steps
1. Enable GPU runtime (Runtime → Change runtime type → GPU)
2. Run all cells sequentially
3. Training will automatically save checkpoints to Google Drive

## 🔧 Environment Setup

In [None]:
# Report the interpreter, PyTorch build and GPU visible to this runtime
import sys

import torch

print(f"Python version: {sys.version}")
print(f"PyTorch version: {torch.__version__}")

cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")

if not cuda_ready:
    # Tell the user where to switch the runtime to a GPU one
    print("⚠️ No GPU detected! Please enable GPU runtime: Runtime → Change runtime type → Hardware accelerator: GPU")
else:
    # Device 0 is the only device inspected
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU memory: {total_gb:.1f} GB")

In [None]:
# Attach Google Drive, then make sure the project folder exists on it
import os

from google.colab import drive

drive.mount('/content/drive')

# Later cells build their output paths beneath this Drive folder
PROJECT_DIR = '/content/drive/MyDrive/medical_sentiment_analysis'
os.makedirs(PROJECT_DIR, exist_ok=True)
print(f"Project directory: {PROJECT_DIR}")

## 📥 Repository Setup

In [None]:
# Clone the repository
import os

REPO_URL = "https://github.com/your-username/medical_sentiment_analysis.git"  # Update with actual repo URL
WORK_DIR = "/content/medical_sentiment_analysis"

# Clone if not exists
if not os.path.exists(WORK_DIR):
    !git clone {REPO_URL} {WORK_DIR}
    print(f"✅ Repository cloned to {WORK_DIR}")
else:
    print(f"✅ Repository already exists at {WORK_DIR}")

# Change to project directory
os.chdir(WORK_DIR)
print(f"Current directory: {os.getcwd()}")

# Show project structure
!find . -type f -name "*.py" | head -10
!ls -la configs/experiment/

In [None]:
# Install dependencies
# Installs the `uv` tool with pip, then runs `uv sync --frozen` from the
# current directory (the repository root after the clone cell's chdir).
# NOTE(review): `uv sync` normally installs into a project virtualenv
# (.venv), while the `!python` check below and the later training cells call
# the runtime's own interpreter. Confirm the synced packages are the ones
# that actually get imported.
!pip install -q uv
!uv sync --frozen

# Verify key dependencies
# Imports each package in a fresh interpreter; an ImportError is printed to
# the cell output but does not stop the notebook.
!python -c "import transformers, torch, pandas, sklearn, yaml; print('✅ All dependencies installed')"

## 📊 Data Preparation

In [None]:
# Confirm that every data/tokenizer artifact the later cells read is present
import os
from pathlib import Path

# Paths are relative to the current working directory (the repository root)
data_files = [
    "data/processed/base/all.jsonl",
    "data/processed/encoder/train.jsonl",
    "data/processed/encoder/val.jsonl",
    "data/processed/encoder/test.jsonl",
    "data/personas/personas.json",
    "artifacts/tokenizers/xlmr-base-with-personas/tokenizer_config.json",
    "artifacts/tokenizers/mdeberta-v3-base-with-personas/tokenizer_config.json"
]

# Report each file in list order, collecting the ones that are absent
missing_files = []
for rel_path in data_files:
    candidate = Path(rel_path)
    if not candidate.exists():
        missing_files.append(rel_path)
        print(f"❌ {rel_path} - MISSING")
        continue
    print(f"✅ {rel_path} ({candidate.stat().st_size:,} bytes)")

if not missing_files:
    print("\n🎉 All required data files are present!")
else:
    # Summarise the gap and list the ways to close it
    for message in (
        f"\n⚠️ Missing {len(missing_files)} files. You may need to:",
        "1. Run data preparation steps locally first",
        "2. Upload missing files to the repository",
        "3. Or run data preparation here (if raw data is available)",
    ):
        print(message)

In [None]:
# Preview data to verify format
# Both commands are best-effort: if a script exits non-zero, the `|| echo`
# fallback prints a short notice instead.
!python scripts/preview_base_data.py --config configs/data_base.yaml --sample data/processed/base/sample.jsonl || echo "Base data preview not available"

# Preview encoder view
# ENC_MODE is set only for this one command.
!ENC_MODE=persona_token python scripts/preview_encoder_view.py --config configs/data_encoder.yaml --sample data/processed/base/sample.jsonl || echo "Encoder preview not available"

In [None]:
# Smoke-test the training pipeline: project imports, tokenizer, dataset
import sys
from pathlib import Path  # imported here so the cell does not rely on an earlier cell

# Make the repository importable; guard against duplicate entries on re-runs
if '/content/medical_sentiment_analysis' not in sys.path:
    sys.path.append('/content/medical_sentiment_analysis')

# Best-effort diagnostics: any failure is reported with a traceback instead
# of aborting the notebook, hence the deliberately broad `except`.
try:
    from modules.models.encoder_regressor import EncoderRegressor
    from modules.data.datasets import EncoderJsonlDataset, EncoderDatasetConfig
    from modules.training.trainer_encoder import build_trainer
    from transformers import AutoTokenizer

    print("✅ All modules imported successfully")

    # Test tokenizer loading
    tokenizer = AutoTokenizer.from_pretrained('artifacts/tokenizers/xlmr-base-with-personas', use_fast=False)
    print(f"✅ XLM-R tokenizer loaded: {len(tokenizer)} tokens")

    # Test dataset loading
    cfg = EncoderDatasetConfig(max_length=128)
    train_path = Path('data/processed/encoder/train.jsonl')
    if train_path.exists():
        ds = EncoderJsonlDataset('data/processed/encoder/train.jsonl', tokenizer, cfg)
        print(f"✅ Dataset loaded: {len(ds)} samples")

        # Test one sample
        sample = ds[0]
        print(f"✅ Sample shape: input={sample['input_ids'].shape}, labels={sample['labels'].shape}")
    else:
        # Say so explicitly rather than skipping the dataset test silently
        print(f"⚠️ {train_path} not found - dataset test skipped")

except Exception as e:
    print(f"❌ Pipeline test failed: {e}")
    import traceback
    traceback.print_exc()

## 🚀 Model Training

In [None]:
# Setup training output directories in Google Drive
import os
import shutil
from datetime import datetime

# Create a timestamped training run directory under the Drive project folder
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
TRAINING_RUN_DIR = f"{PROJECT_DIR}/training_runs/{timestamp}"
os.makedirs(TRAINING_RUN_DIR, exist_ok=True)

print(f"Training outputs will be saved to: {TRAINING_RUN_DIR}")

# Point a fixed local path at the current run directory.
LOCAL_OUTPUT_DIR = "/content/training_output"

# Remove whatever is left from a previous run. `os.path.exists` follows
# symlinks and reports False for a dangling link (e.g. after Drive was
# remounted), which would make the new link fail with "File exists";
# `islink`/`lexists` inspect the path entry itself.
if os.path.islink(LOCAL_OUTPUT_DIR):
    os.unlink(LOCAL_OUTPUT_DIR)  # drops only the link, never the Drive data
elif os.path.isdir(LOCAL_OUTPUT_DIR):
    shutil.rmtree(LOCAL_OUTPUT_DIR)
elif os.path.lexists(LOCAL_OUTPUT_DIR):
    os.remove(LOCAL_OUTPUT_DIR)

os.symlink(TRAINING_RUN_DIR, LOCAL_OUTPUT_DIR)

print(f"Local training directory: {LOCAL_OUTPUT_DIR}")

In [None]:
# Training configuration
# Maps a short experiment name to its YAML config path (relative to the
# repository root).
EXPERIMENTS = {
    "xlmr_persona_token": "configs/experiment/enc_persona_token_xlmr.yaml",
    "mdeberta_persona_token": "configs/experiment/enc_persona_token_mdeberta.yaml",
    "xlmr_personalized": "configs/experiment/enc_personalized_desc_xlmr.yaml", 
    "mdeberta_personalized": "configs/experiment/enc_personalized_desc_mdeberta.yaml"
}

# Select experiment to run
# EXPERIMENT_NAME must be one of the EXPERIMENTS keys; any other value
# raises KeyError on the lookup below.
EXPERIMENT_NAME = "xlmr_persona_token"  # Change this to run different experiments
CONFIG_PATH = EXPERIMENTS[EXPERIMENT_NAME]

print(f"Selected experiment: {EXPERIMENT_NAME}")
print(f"Config file: {CONFIG_PATH}")

# Show config
!cat {CONFIG_PATH}

In [None]:
# Derive a Colab-specific copy of the selected experiment config
import copy

import yaml

# Read the experiment's original YAML
with open(CONFIG_PATH, 'r') as source_file:
    config = yaml.safe_load(source_file)

# Work on a deep copy so `config` keeps the values from disk
colab_config = copy.deepcopy(config)

# Overrides applied to the `training` section for this Colab run
training_overrides = {
    'output_dir': f"{LOCAL_OUTPUT_DIR}/{EXPERIMENT_NAME}",
    'epochs': 3,  # Adjust as needed
    'train_bs': 8,  # Adjust based on GPU memory
    'eval_bs': 16,
    'logging_steps': 25,
    'evaluation_strategy': "steps",
    'eval_steps': 100,
    'save_strategy': "steps",
    'save_steps': 100,
}
colab_config['training'].update(training_overrides)

# Write the adjusted config where the training cell will pick it up
COLAB_CONFIG_PATH = f"/tmp/{EXPERIMENT_NAME}_colab.yaml"
with open(COLAB_CONFIG_PATH, 'w') as target_file:
    yaml.dump(colab_config, target_file, default_flow_style=False)

training_section = colab_config['training']
print(f"Modified config saved to: {COLAB_CONFIG_PATH}")
print(f"Output directory: {training_section['output_dir']}")
print(f"Batch sizes: train={training_section['train_bs']}, eval={training_section['eval_bs']}")

In [None]:
# Run training
import os
os.environ['PYTHONPATH'] = '/content/medical_sentiment_analysis'

print(f"🚀 Starting training: {EXPERIMENT_NAME}")
print(f"⏰ This may take 1-3 hours depending on data size and epochs")

# Run training script
!cd /content/medical_sentiment_analysis && python scripts/train_encoder.py --config {COLAB_CONFIG_PATH}

print("✅ Training completed!")

## 📈 Training Monitoring & Results

In [None]:
# Check training outputs
import os
from pathlib import Path

output_dir = f"{LOCAL_OUTPUT_DIR}/{EXPERIMENT_NAME}"

if Path(output_dir).exists():
    print(f"📁 Training outputs in: {output_dir}")
    !ls -la {output_dir}
    
    # Check for model checkpoints
    checkpoints = list(Path(output_dir).glob("checkpoint-*"))
    if checkpoints:
        print(f"\n🎯 Found {len(checkpoints)} checkpoints:")
        for cp in sorted(checkpoints):
            print(f"  - {cp.name}")
    
    # Check training logs
    if Path(f"{output_dir}/trainer_state.json").exists():
        print("\n📊 Training state saved")
        !head -20 {output_dir}/trainer_state.json
        
else:
    print(f"❌ No outputs found at: {output_dir}")

In [None]:
# Plot training metrics if available
import json
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path


def _plot_metric(ax, frame, column, style, label, title, ylabel):
    """Draw one metric-versus-step curve on ``ax`` with the shared axis styling."""
    ax.plot(frame['step'], frame[column], style, label=label)
    ax.set_title(title)
    ax.set_xlabel('Step')
    ax.set_ylabel(ylabel)
    ax.grid(True)


trainer_state_path = Path(f"{LOCAL_OUTPUT_DIR}/{EXPERIMENT_NAME}/trainer_state.json")

if trainer_state_path.exists():
    with open(trainer_state_path, 'r') as f:
        trainer_state = json.load(f)

    # Extract log history (a list of per-step metric dicts)
    log_history = trainer_state.get('log_history', [])

    if log_history:
        # One row per log entry; metrics absent from an entry become NaN
        df = pd.DataFrame(log_history)

        # 2x2 grid: train loss, validation loss, R², MAE
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        fig.suptitle(f'Training Metrics: {EXPERIMENT_NAME}', fontsize=16)

        # Training loss. Hugging Face-style trainers log the per-step loss
        # under 'loss', with 'train_loss' only in the end-of-training summary
        # entry (assumes the project trainer follows that layout - TODO
        # confirm). Prefer 'loss' so the curve has more than a single point,
        # and fall back to 'train_loss' otherwise.
        loss_column = next((c for c in ('loss', 'train_loss') if c in df.columns), None)
        if loss_column is not None:
            train_df = df.dropna(subset=[loss_column])
            _plot_metric(axes[0,0], train_df, loss_column, 'b-', 'Train Loss', 'Training Loss', 'Loss')

        # Validation metrics: keep only the rows that carry an eval_loss
        eval_df = df.dropna(subset=['eval_loss']) if 'eval_loss' in df.columns else pd.DataFrame()

        if not eval_df.empty:
            # Validation loss
            _plot_metric(axes[0,1], eval_df, 'eval_loss', 'r-', 'Eval Loss', 'Validation Loss', 'Loss')

            # R² score
            if 'eval_r2' in eval_df.columns:
                _plot_metric(axes[1,0], eval_df, 'eval_r2', 'g-', 'R²', 'R² Score', 'R²')

            # MAE
            if 'eval_mae' in eval_df.columns:
                _plot_metric(axes[1,1], eval_df, 'eval_mae', 'm-', 'MAE', 'Mean Absolute Error', 'MAE')

        plt.tight_layout()
        plt.show()

        # Print final metrics (last evaluation row, excluding the runtime)
        if not eval_df.empty:
            final_metrics = eval_df.iloc[-1]
            print("\n🎯 Final Validation Metrics:")
            for col in final_metrics.index:
                if col.startswith('eval_') and col != 'eval_runtime':
                    print(f"  {col}: {final_metrics[col]:.4f}")
    else:
        print("No training logs found")
else:
    print("No trainer state file found")

In [None]:
# Run evaluation on test set
import os
from pathlib import Path

# Find best checkpoint
output_dir = f"{LOCAL_OUTPUT_DIR}/{EXPERIMENT_NAME}"
checkpoint_dirs = list(Path(output_dir).glob("checkpoint-*"))

if checkpoint_dirs:
    # Use the last checkpoint (highest step number)
    best_checkpoint = max(checkpoint_dirs, key=lambda x: int(x.name.split('-')[1]))
    print(f"Using checkpoint: {best_checkpoint}")
    
    # Run evaluation
    eval_cmd = f"cd /content/medical_sentiment_analysis && EVAL_SPLIT=test python scripts/eval_encoder.py --config {COLAB_CONFIG_PATH} --checkpoint {best_checkpoint}"
    print(f"Running: {eval_cmd}")
    
    !{eval_cmd}
    
else:
    print("❌ No checkpoints found for evaluation")

## 💾 Save Results

In [None]:
# Assemble a JSON summary of this run and store it in the run directory
import json
from datetime import datetime
from pathlib import Path

# Metadata describing the run; relies on names defined by earlier cells
results_summary = dict(
    experiment_name=EXPERIMENT_NAME,
    timestamp=timestamp,
    config_path=CONFIG_PATH,
    output_directory=output_dir,
    colab_session=True,
    gpu_info=torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU",
)

# Attach the metrics of the last evaluation entry, when the trainer wrote one
trainer_state_path = Path(f"{output_dir}/trainer_state.json")
if trainer_state_path.exists():
    with open(trainer_state_path, 'r') as state_file:
        trainer_state = json.load(state_file)

    # Only entries carrying at least one `eval_` key count as evaluations
    eval_entries = [
        entry
        for entry in (trainer_state.get('log_history', []) or [])
        if any(key.startswith('eval_') for key in entry)
    ]
    if eval_entries:
        timing_keys = ['eval_runtime', 'eval_samples_per_second']
        results_summary['final_metrics'] = {
            name: value
            for name, value in eval_entries[-1].items()
            if name.startswith('eval_') and name not in timing_keys
        }

# Persist the summary alongside the other outputs of this run
summary_path = f"{TRAINING_RUN_DIR}/results_summary.json"
with open(summary_path, 'w') as summary_file:
    json.dump(results_summary, summary_file, indent=2)

print(f"📝 Results summary saved to: {summary_path}")
print("\n📊 Summary:")
print(json.dumps(results_summary, indent=2))

In [None]:
# Compress and download best model (optional)
from pathlib import Path

output_dir = f"{LOCAL_OUTPUT_DIR}/{EXPERIMENT_NAME}"
if Path(output_dir).exists():
    # Create compressed archive
    archive_name = f"{EXPERIMENT_NAME}_{timestamp}.tar.gz"
    archive_path = f"{TRAINING_RUN_DIR}/{archive_name}"
    
    !tar -czf {archive_path} -C {LOCAL_OUTPUT_DIR} {EXPERIMENT_NAME}
    
    print(f"📦 Model archived to: {archive_path}")
    
    # Show file size
    size_mb = Path(archive_path).stat().st_size / 1024 / 1024
    print(f"📏 Archive size: {size_mb:.1f} MB")
    
    # Option to download (uncomment if needed)
    # from google.colab import files
    # files.download(archive_path)
    
else:
    print("❌ No model outputs to archive")

## 🎯 Next Steps

1. **Run Other Experiments**: Change the `EXPERIMENT_NAME` variable, then re-run the config and training cells
2. **Hyperparameter Tuning**: Modify `colab_config` parameters (learning rate, batch size, etc.)
3. **Extended Training**: Increase epochs for better performance
4. **Model Analysis**: Use the evaluation results for model comparison
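5. **Download Models**: Your trained models are saved to Google Drive under `/content/drive/MyDrive/medical_sentiment_analysis/training_runs/<timestamp>/` (the exact path is printed by the training setup cell as `TRAINING_RUN_DIR`)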

## 📚 Available Experiments
- `xlmr_persona_token`: XLM-RoBERTa with persona tokens
- `mdeberta_persona_token`: mDeBERTa with persona tokens
- `xlmr_personalized`: XLM-RoBERTa with persona descriptions
- `mdeberta_personalized`: mDeBERTa with persona descriptions