# 🚀 SmolLM3 Financial Sentiment Training - Google Colab

This notebook trains SmolLM3-3B on financial sentiment data in Google Colab.

**Optimized for Colab T4 GPU (15GB VRAM)**

## 📋 Instructions:
1. **Upload your data**: Upload `all-data.csv` to Colab Files
2. **Run all cells**: Execute cells sequentially
3. **Download model**: Get the trained model ZIP file

## ⚡ Why Colab for SmolLM3:
- **Memory**: 15GB GPU vs 9GB MacBook MPS limit
- **Speed**: GPU training ~10-50x faster than CPU
- **Free**: No local resource usage
- **No thermal throttling**: Unlike laptops

## 🔧 Environment Setup

Install required packages and check GPU availability.

In [None]:
# Install required packages
!pip install transformers datasets accelerate torch -q
!pip install scikit-learn matplotlib seaborn pandas numpy -q

print("✅ Packages installed successfully!")

In [None]:
# Check GPU and environment
import os
import sys

import torch

print("🖥️ Environment Check:")
# Read the interpreter version from `sys` itself; `os.sys` is an undocumented
# alias that only exists because the `os` module happens to import `sys`.
print(f"Python version: {sys.version}")
print(f"PyTorch version: {torch.__version__}")

if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    # total_memory is divided by 1e9, i.e. reported in decimal gigabytes
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"✅ GPU: {gpu_name}")
    print(f"💾 GPU Memory: {gpu_memory:.1f} GB")

    if gpu_memory >= 12:
        print("🎉 Perfect! Sufficient memory for SmolLM3-3B training")
    else:
        print("⚠️ Limited GPU memory - will use aggressive optimizations")
else:
    print("❌ No GPU detected - training will be very slow!")
    # Anything other than an explicit "y" aborts the notebook run
    response = input("Continue with CPU training? (y/N): ")
    if response.lower() != 'y':
        raise RuntimeError("GPU required for efficient SmolLM3 training")

## 📊 Data Loading and Preprocessing

Load and prepare financial sentiment data for training.

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Try to find the data file in common locations
data_paths = [
    "/content/all-data.csv",
    "/content/data/FinancialPhraseBank/all-data.csv",
    "all-data.csv",
    "/content/drive/MyDrive/all-data.csv"
]

# Encodings are tried in order. latin-1 maps every byte to a character, so it
# serves as a last-resort fallback for files that are not valid UTF-8.
# NOTE(review): common FinancialPhraseBank exports appear to be ISO-8859-1
# encoded with the sentiment in the first column - confirm against your file.
csv_encodings = ("utf-8", "latin-1")
known_sentiments = {"positive", "neutral", "negative"}

data_df = None
for path in data_paths:
    if not Path(path).exists():
        continue
    print(f"📂 Found data at: {path}")
    for encoding in csv_encodings:
        try:
            data_df = pd.read_csv(path,
                                  names=['text', 'label'],
                                  encoding=encoding,
                                  on_bad_lines='skip')
            break
        except UnicodeDecodeError:
            # A decode failure used to be reported as "no data file found";
            # retry with the next encoding instead.
            print(f"⚠️ {path} is not valid {encoding}; trying next encoding")
        except Exception as e:
            # Any other failure: report it and move on to the next candidate path
            print(f"❌ Failed to load {path}: {e}")
            break
    if data_df is not None:
        break

if data_df is None:
    print("❌ No data file found!")
    print("📤 Please upload your 'all-data.csv' file to Colab:")
    print("   1. Click the Files tab (📁) on the left")
    print("   2. Click Upload (📤)")
    print("   3. Select your all-data.csv file")
    print("   4. Re-run this cell")
    raise FileNotFoundError("Data file not found")

# If every value in the first column is a sentiment word, the file is laid out
# as (label, text) rather than (text, label): swap the column names so the rest
# of the notebook sees the expected schema.
first_column = data_df['text'].dropna().astype(str).str.strip().str.lower()
if not first_column.empty and first_column.isin(known_sentiments).all():
    data_df = data_df.rename(columns={'text': 'label', 'label': 'text'})[['text', 'label']]
    print("🔄 Detected label-first column order; swapped columns")

print(f"✅ Loaded {len(data_df)} samples")
print(f"📊 Data shape: {data_df.shape}")
print(f"🔍 Sample data:")
print(data_df.head())

In [None]:
# Clean and preprocess data
print("🧹 Cleaning data...")

# Remove any rows with missing data
initial_size = len(data_df)
data_df = data_df.dropna()
print(f"   Removed {initial_size - len(data_df)} rows with missing data")

# Ensure text and labels are strings. Labels are also stripped of surrounding
# whitespace so that e.g. " positive" is recognised as a standard label instead
# of falling through to the auto-mapping branch.
data_df['text'] = data_df['text'].astype(str)
data_df['label'] = data_df['label'].astype(str).str.strip()

if data_df.empty:
    # Fail here with a clear message rather than later inside train_test_split
    raise ValueError("No usable rows left after removing missing data")

# Check unique labels
unique_labels = data_df['label'].unique()
print(f"🏷️ Found labels: {unique_labels}")

# Map sentiment labels to integers
if set(unique_labels).issubset({'positive', 'neutral', 'negative'}):
    # Standard sentiment labels
    label_mapping = {
        'negative': 0,
        'neutral': 1,
        'positive': 2
    }
else:
    # Auto-create mapping for other label formats (ids follow sorted label order)
    sorted_labels = sorted(unique_labels)
    label_mapping = {label: i for i, label in enumerate(sorted_labels)}
    print(f"🔄 Auto-mapped labels: {label_mapping}")

data_df['label_id'] = data_df['label'].map(label_mapping)
# Rows whose label is missing from the mapping become NaN and are dropped
data_df = data_df.dropna(subset=['label_id'])
data_df['label_id'] = data_df['label_id'].astype(int)

print(f"✅ Final dataset: {len(data_df)} samples")
print(f"📊 Label distribution:")
label_counts = data_df['label_id'].value_counts().sort_index()
# Build the id -> name lookup once instead of scanning the mapping per row
label_names_by_id = {idx: name for name, idx in label_mapping.items()}
for label_id, count in label_counts.items():
    print(f"   {label_names_by_id[label_id]} ({label_id}): {count} samples")

In [None]:
# Hold out 20% of the rows for validation, keeping label proportions equal
# in both parts (stratified on label_id, fixed seed for reproducibility).
print("🔄 Creating train/validation split...")

train_df, val_df = train_test_split(
    data_df,
    stratify=data_df['label_id'],
    test_size=0.2,
    random_state=42,
)

print(f"📊 Split complete:")
print(f"   Training: {len(train_df)} samples")
print(f"   Validation: {len(val_df)} samples")

# Per-label sample counts, ordered by label id
train_counts = train_df['label_id'].value_counts().sort_index()
val_counts = val_df['label_id'].value_counts().sort_index()

# One bar chart per split, side by side
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
panels = (
    (axes[0], train_counts, 'Training Set Distribution'),
    (axes[1], val_counts, 'Validation Set Distribution'),
)
for panel_ax, panel_counts, panel_title in panels:
    panel_ax.bar(panel_counts.index, panel_counts.values)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Label')
    panel_ax.set_ylabel('Count')

plt.tight_layout()
plt.show()

print("✅ Data preprocessing complete!")

## 🤖 SmolLM3 Model Setup

Load and configure SmolLM3-3B for financial sentiment classification.

In [None]:
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding
)
from datasets import Dataset
from sklearn.metrics import accuracy_score, classification_report
import json
from datetime import datetime

# Model identifiers; the label count is taken from the mapping built during cleaning
MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
MODEL_NAME = "smollm3-financial-sentiment"
NUM_LABELS = len(label_mapping)

print(
    f"🚀 Loading SmolLM3 model: {MODEL_ID}\n"
    f"📊 Number of labels: {NUM_LABELS}"
)

# Fetch the tokenizer that matches the checkpoint
print("📥 Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Padding needs a pad token; reuse EOS when the tokenizer defines none
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.pad_token = tokenizer.eos_token
    print("🔧 Set pad_token to eos_token")

print(f"✅ Tokenizer loaded: {len(tokenizer)} tokens in vocabulary")

In [None]:
# Load model
print("📥 Loading SmolLM3 model...")
print("⏳ This may take a few minutes (downloading ~6GB)...")

# Create reverse mapping (class id -> label name) for the model config
id2label = {v: k for k, v in label_mapping.items()}

# NOTE(review): the weights are loaded in float16. PyTorch's gradient scaler
# refuses to unscale float16 gradients, so AMP (`fp16=True` in
# TrainingArguments) must not be combined with a model loaded this way.
# Full fine-tuning of ~3B parameters may also exceed a 15GB GPU - confirm.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    num_labels=NUM_LABELS,
    id2label=id2label,
    label2id=label_mapping,
    torch_dtype=torch.float16,  # Use half precision to save memory
    device_map="auto"  # Automatically place on GPU
)

# The sequence-classification head pools the last non-padding token of each row
# and needs config.pad_token_id to find it; without it, batches larger than one
# are rejected. Mirror the tokenizer's pad token when the config has none.
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id
    print("🔧 Set model pad_token_id from tokenizer")

print(f"✅ Model loaded: {model.config.__class__.__name__}")
print(f"📊 Parameters: {model.num_parameters():,}")
# Two bytes per parameter in FP16
print(f"💾 Model size: ~{model.num_parameters() * 2 / 1e9:.1f}GB (FP16)")

# Check model device
device = next(model.parameters()).device
print(f"🔧 Model device: {device}")

## 🔄 Dataset Preparation

Tokenize and prepare datasets for training.

In [None]:
# Hyperparameters shared by tokenization and the Trainer setup
MAX_LENGTH = 512            # truncation limit in tokens
BATCH_SIZE = 2              # per-device batch, kept small for a 15GB GPU
GRADIENT_ACCUMULATION = 8   # 2 x 8 = effective batch size of 16
LEARNING_RATE = 1e-5        # low rate for a large model
NUM_EPOCHS = 2              # short run

# Echo the configuration, one setting per line
config_lines = [
    "⚙️ Training Configuration:",
    f"   Max length: {MAX_LENGTH}",
    f"   Batch size: {BATCH_SIZE}",
    f"   Gradient accumulation: {GRADIENT_ACCUMULATION}",
    f"   Effective batch size: {BATCH_SIZE * GRADIENT_ACCUMULATION}",
    f"   Learning rate: {LEARNING_RATE}",
    f"   Epochs: {NUM_EPOCHS}",
]
print("\n".join(config_lines))

def tokenize_function(examples):
    """Tokenize the ``text`` entries of a batch.

    Sequences are truncated to ``MAX_LENGTH`` tokens and left unpadded;
    the result is returned as plain Python lists (``return_tensors=None``).
    """
    encode_options = {
        'truncation': True,
        'padding': False,
        'max_length': MAX_LENGTH,
        'return_tensors': None,
    }
    return tokenizer(examples['text'], **encode_options)

def create_dataset(df):
    """Create a tokenized HuggingFace dataset from a pandas DataFrame.

    Args:
        df: DataFrame with a ``text`` column and an integer ``label_id`` column.

    Returns:
        A ``Dataset`` holding a ``labels`` column plus the tokenizer outputs;
        the raw ``text`` column is removed after tokenization.
    """
    frame = df[['text', 'label_id']].rename(columns={'label_id': 'labels'})

    # The split DataFrames keep their original, shuffled row index. Without
    # preserve_index=False it is stored as an extra `__index_level_0__` column.
    dataset = Dataset.from_pandas(frame, preserve_index=False)

    return dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=['text'],
        num_proc=1
    )

# Tokenize both splits with the same pipeline
print("🔄 Tokenizing datasets...")
train_dataset, val_dataset = (create_dataset(split) for split in (train_df, val_df))

# Report split sizes and the resulting column schema
print(
    f"✅ Datasets created:\n"
    f"   Training: {len(train_dataset)} samples\n"
    f"   Validation: {len(val_dataset)} samples\n"
    f"   Features: {train_dataset.features}"
)

## 🏋️ Model Training

Train SmolLM3 on financial sentiment data with GPU optimization.

In [None]:
# Setup training arguments
output_dir = f"/content/{MODEL_NAME}"

# Automatic mixed precision keeps float32 weights and scales the loss. The
# gradient scaler raises "Attempting to unscale FP16 gradients" when the weights
# are already float16, so AMP is enabled only for a float32 model on a GPU.
# NOTE(review): with float16 weights and AMP off, optimisation runs in pure half
# precision, which can be numerically unstable - monitor the loss.
use_fp16_amp = torch.cuda.is_available() and model.dtype == torch.float32

# NOTE(review): evaluation and checkpointing happen every 500 optimizer steps;
# a run shorter than that finishes without any intermediate evaluation - confirm
# this interval suits the dataset size.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_steps=100,
    logging_steps=50,
    save_steps=500,
    eval_steps=500,
    # `evaluation_strategy` was renamed to `eval_strategy`; the old keyword is
    # rejected by the transformers releases that support SmolLM3.
    eval_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # The string "none" disables all reporters; Python None falls back to
    # "all installed integrations", which includes wandb when it is installed.
    report_to="none",
    fp16=use_fp16_amp,  # Mixed precision only when the weights are float32
    dataloader_num_workers=2,
    save_total_limit=2,
    logging_dir=f"{output_dir}/logs",
    push_to_hub=False,
    remove_unused_columns=True,
)

# Data collator for dynamic padding (pads each batch to its longest sequence)
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding=True,
    return_tensors='pt'
)

# Metrics function
def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds the
            per-class scores with the class axis last; if it is a tuple of
            several model outputs, the first element is used as the scores.

    Returns:
        ``{'accuracy': float}`` - the fraction of rows whose highest-scoring
        class equals the label, as a plain Python float.
    """
    predictions, labels = eval_pred
    # Some models return extra outputs alongside the logits; the logits come first
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predicted_ids = np.argmax(predictions, axis=-1)
    accuracy = float(np.mean(predicted_ids == np.asarray(labels)))
    return {'accuracy': accuracy}

# Confirm the setup and show where training output will be written
print(
    "🔧 Training setup complete!\n"
    f"📁 Output directory: {output_dir}"
)

In [None]:
# Create trainer
import math

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `processing_class` replaces the deprecated `tokenizer` argument
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("✅ Trainer created successfully!")
print(f"🔧 Model device: {trainer.model.device}")

# Estimate training time. A partial final batch or accumulation group still
# costs an optimizer step, so round up rather than flooring.
batches_per_epoch = math.ceil(len(train_dataset) / BATCH_SIZE)
steps_per_epoch = max(1, math.ceil(batches_per_epoch / GRADIENT_ACCUMULATION))
total_steps = steps_per_epoch * NUM_EPOCHS
estimated_minutes = total_steps * 2 // 60  # ~2 seconds per step estimate

print(f"⏱️ Training estimates:")
print(f"   Steps per epoch: {steps_per_epoch}")
print(f"   Total steps: {total_steps}")
print(f"   Estimated time: ~{estimated_minutes} minutes")

In [None]:
# Announce the run, then hand control to the Trainer
print(
    "🚀 Starting SmolLM3 training...\n"
    f"⏳ Estimated completion: ~{estimated_minutes} minutes\n"
    "📊 Monitor GPU usage in the sidebar →"
)

train_result = trainer.train()

# Summarise the run; runtime and throughput default to 0 when not reported
run_metrics = train_result.metrics
runtime_seconds = run_metrics.get('train_runtime', 0)
samples_per_second = run_metrics.get('train_samples_per_second', 0)

print("✅ Training completed!")
print(f"📊 Training metrics:")
print(f"   Final loss: {train_result.training_loss:.4f}")
print(f"   Training time: {runtime_seconds:.1f}s")
print(f"   Samples/second: {samples_per_second:.2f}")

## 📊 Model Evaluation

Evaluate the trained model on validation data.

In [None]:
# Evaluate model
print("📊 Running final evaluation...")
eval_result = trainer.evaluate()

print(f"🎯 Final Results:")
print(f"   Validation accuracy: {eval_result['eval_accuracy']:.4f}")
print(f"   Validation loss: {eval_result['eval_loss']:.4f}")

# Get detailed predictions for analysis
predictions = trainer.predict(val_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)
true_labels = predictions.label_ids

# Classification report
print(f"\n📈 Detailed Classification Report:")
class_ids = sorted(id2label.keys())
target_names = [id2label[i] for i in class_ids]
# Pass the full class list explicitly. Otherwise the report infers the classes
# from the data and raises when a class appears in neither the labels nor the
# predictions, because the count no longer matches target_names.
report = classification_report(
    true_labels,
    predicted_labels,
    labels=class_ids,
    target_names=target_names,
    zero_division=0,
)
print(report)

In [None]:
# Visualize results
from sklearn.metrics import confusion_matrix

# Confusion matrix. Fix the class order explicitly so rows and columns always
# line up with target_names, even when a class is absent from the validation data.
cm = confusion_matrix(true_labels, predicted_labels, labels=sorted(id2label.keys()))

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=target_names,
            yticklabels=target_names)
plt.title('SmolLM3 Financial Sentiment - Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Convert logits to probabilities so that "confidence" is a value in [0, 1].
# Subtracting the row maximum first keeps the exponentials from overflowing.
logits = np.asarray(predictions.predictions, dtype=np.float64)
shifted_logits = logits - logits.max(axis=1, keepdims=True)
exp_logits = np.exp(shifted_logits)
probabilities = exp_logits / exp_logits.sum(axis=1, keepdims=True)

# Sample predictions (never request more rows than the validation set holds)
print(f"\n🔍 Sample Predictions:")
num_samples = min(5, len(val_df))
sample_indices = np.random.choice(len(val_df), num_samples, replace=False)

for i, idx in enumerate(sample_indices):
    full_text = val_df.iloc[idx]['text']
    # Only mark the preview as truncated when something was actually cut off
    text = full_text[:100] + ("..." if len(full_text) > 100 else "")
    true_label = id2label[true_labels[idx]]
    pred_label = id2label[predicted_labels[idx]]
    confidence = probabilities[idx].max()

    print(f"\n{i+1}. Text: {text}")
    print(f"   True: {true_label} | Predicted: {pred_label} | Confidence: {confidence:.3f}")
    if true_label != pred_label:
        print("   ❌ Incorrect prediction")
    else:
        print("   ✅ Correct prediction")

## 💾 Save and Download Model

Save the trained model and prepare for download.

In [None]:
# Persist the fine-tuned model and its tokenizer
import pickle

print("💾 Saving trained model...")

trainer.save_model()
tokenizer.save_pretrained(output_dir)

save_dir = Path(output_dir)
final_train_metrics = train_result.metrics

# Hyperparameters used for this run
training_config = {
    'max_length': MAX_LENGTH,
    'batch_size': BATCH_SIZE,
    'gradient_accumulation': GRADIENT_ACCUMULATION,
    'learning_rate': LEARNING_RATE,
    'num_epochs': NUM_EPOCHS,
}

# Headline numbers from training and the final evaluation
final_scores = {
    'train_loss': train_result.training_loss,
    'eval_loss': eval_result['eval_loss'],
    'eval_accuracy': eval_result['eval_accuracy'],
    'train_runtime': final_train_metrics.get('train_runtime', 0),
    'samples_per_second': final_train_metrics.get('train_samples_per_second', 0),
}

# Full run record written next to the model files
results = {
    'training_timestamp': datetime.now().isoformat(),
    'model_id': MODEL_ID,
    'model_name': MODEL_NAME,
    'num_labels': NUM_LABELS,
    'label_mapping': label_mapping,
    'id2label': id2label,
    'training_config': training_config,
    'results': final_scores,
}

# Run record as JSON
with open(save_dir / "training_results.json", 'w') as f:
    json.dump(results, f, indent=2)

# Label mappings (both directions) as a pickle
label_maps = {'label2id': label_mapping, 'id2label': id2label}
with open(save_dir / "label_encoder.pkl", 'wb') as f:
    pickle.dump(label_maps, f)

print(f"✅ Model saved to: {output_dir}")
print(f"📁 Files saved:")
for saved_item in (
    "config.json",
    "model.safetensors",
    "tokenizer files",
    "training_results.json",
    "label_encoder.pkl",
):
    print(f"   - {saved_item}")

In [None]:
# Create downloadable ZIP file
print("📦 Creating download package...")

# IPython shell magic: {MODEL_NAME} and {output_dir} are interpolated from the
# notebook namespace before the command runs. The archive is written to the
# current working directory.
# NOTE(review): the archive covers everything under output_dir, which may
# include Trainer checkpoint sub-directories and logs - confirm that is intended.
!zip -r {MODEL_NAME}.zip {output_dir}

# Re-imported here so the cell does not depend on the environment-check cell
import os
zip_size = os.path.getsize(f"{MODEL_NAME}.zip") / (1024**3)  # Size in GB

print(f"✅ Created {MODEL_NAME}.zip")
print(f"📦 Package size: {zip_size:.2f} GB")
print(f"")
print(f"📥 To download your trained model:")
print(f"   1. Click the Files tab (📁) on the left")
print(f"   2. Find '{MODEL_NAME}.zip'")
print(f"   3. Click the download icon (⬇️)")
print(f"")
print(f"🎊 Training complete! Your SmolLM3 model is ready.")

# Display final summary (metrics come from the evaluation and training cells)
print(f"\n{'='*50}")
print(f"🏆 SMOLLM3 TRAINING COMPLETED!")
print(f"{'='*50}")
print(f"📊 Final Performance:")
print(f"   🎯 Accuracy: {eval_result['eval_accuracy']:.1%}")
print(f"   📉 Loss: {eval_result['eval_loss']:.4f}")
# train_runtime is divided by 60 to display minutes; defaults to 0 if missing
print(f"   ⏱️ Training time: {train_result.metrics.get('train_runtime', 0)/60:.1f} minutes")
print(f"")
print(f"🎉 Your SmolLM3 model is now trained for financial sentiment analysis!")