# SMT-WEEX Notebook 2: Train/Test Split & Multi-Model Training
**Project:** smt-weex-2025

**Author:** Jannet Ekka

This notebook:
1. Loads cleaned data from GCS (output of notebook 01)
2. Splits into train/val/test (70/10/20)
3. Trains multiple models (CatBoost, XGBoost, RandomForest, LightGBM)
4. Runs an initial evaluation of each model on the test set
5. Saves models to GCS

## 1. Setup

In [None]:
# Install packages
# Installs the model libraries (CatBoost, XGBoost, LightGBM, scikit-learn), the
# data/plotting stack and the GCS client library; `-q` keeps pip's output quiet.
# NOTE(review): the cells in this notebook talk to GCS through `gsutil`, not the
# google-cloud-storage Python client — confirm whether that package is still needed.
!pip install -q catboost xgboost lightgbm scikit-learn pandas numpy matplotlib seaborn google-cloud-storage

In [None]:
# Authenticate
# Runs the Colab user-authentication flow; presumably this is what lets the
# gcloud/gsutil shell commands in later cells access the project's bucket.
from google.colab import auth
auth.authenticate_user()

# GCP project id and the bucket used by later cells for both data
# (gs://{BUCKET}/data/...) and model artifacts (gs://{BUCKET}/models/...).
PROJECT_ID = 'smt-weex-2025'
BUCKET = 'smt-weex-2025-models'

# Make PROJECT_ID the active gcloud project; IPython substitutes {PROJECT_ID}
# with the Python variable's value before running the shell command.
!gcloud config set project {PROJECT_ID}

In [None]:
import pandas as pd
import numpy as np
import json
import pickle
from datetime import datetime

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)
from sklearn.ensemble import RandomForestClassifier

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the rest of the session to keep cell output compact.
# NOTE(review): this is a blanket filter, so warnings raised by the model
# libraries during training and evaluation are hidden as well.
warnings.filterwarnings('ignore')

print("Libraries loaded")

## 2. Load Cleaned Data from GCS

In [None]:
# Download from GCS (output of notebook 01)
# Copies the cleaned feature table and the feature/target config into /content/.
# IPython substitutes {BUCKET} with the Python variable's value before running
# each shell command.
!gsutil cp gs://{BUCKET}/data/whale_features_cleaned.csv /content/
!gsutil cp gs://{BUCKET}/data/feature_config.json /content/

# Read the cleaned feature table into a DataFrame
df = pd.read_csv('/content/whale_features_cleaned.csv')

# The config file names the feature columns and the target column
with open('/content/feature_config.json', 'r') as config_file:
    config = json.load(config_file)

FEATURES, TARGET = config['features'], config['target']

# Quick sanity summary: row count, feature count, samples per class
summary_lines = (
    f"Loaded {len(df)} samples",
    f"Features: {len(FEATURES)}",
    f"Categories: {df[TARGET].value_counts().to_dict()}",
)
print("\n".join(summary_lines))

In [None]:
# Feature matrix and raw target column as NumPy arrays
X = df[FEATURES].values
y_raw = df[TARGET].values

# Fit the encoder on the raw labels and convert them to integer class codes
le = LabelEncoder()
y = le.fit_transform(y_raw)

# Integer code -> original class label, in the encoder's class order
label_mapping = dict(enumerate(le.classes_))

print("Label mapping:")
for idx, label in label_mapping.items():
    # Number of samples whose encoded label equals this code
    count = np.count_nonzero(y == idx)
    print(f"  {idx}: {label} ({count} samples)")

## 3. Train/Val/Test Split (70/10/20)

In [None]:
# Stage 1: hold out 20% of all samples as the test set, stratified by class
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# Stage 2: carve the validation set out of the remaining 80%.
# 12.5% of that 80% is 10% of the full dataset, leaving 70% for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.125, random_state=42, stratify=y_trainval
)

# Report the size of each split and its share of the full dataset
n_total = len(X)
for split_label, split_X in (('Train:', X_train), ('Val:  ', X_val), ('Test: ', X_test)):
    n_split = len(split_X)
    print(f"{split_label} {n_split} ({n_split/n_total*100:.1f}%)")

In [None]:
# Check stratification: print the per-class sample counts of each split so the
# class proportions can be compared by eye.
print("\n=== Class Distribution ===")
for name, y_subset in [('Train', y_train), ('Val', y_val), ('Test', y_test)]:
    unique, counts = np.unique(y_subset, return_counts=True)
    # .tolist() converts NumPy integer scalars to plain Python ints; otherwise
    # the dict repr shows them as `np.int64(...)` under NumPy >= 2.
    distribution = {
        label_mapping[code]: n
        for code, n in zip(unique.tolist(), counts.tolist())
    }
    print(f"{name}: {distribution}")

## 4. Model Training

In [None]:
# Registries filled in by the training and evaluation cells:
#   models  : display name -> fitted estimator
#   results : display name -> metrics dict
models, results = {}, {}

# One label_mapping entry exists per encoded class
n_classes = len(label_mapping)

### 4.1 CatBoost (Primary Model - Best for Small Data)

In [None]:
print("=" * 50, "Training CatBoost...", "=" * 50, sep="\n")

# Hyperparameters collected in one place; early stopping and best-model
# selection both use the validation set passed to fit() below.
catboost_params = dict(
    iterations=500,
    learning_rate=0.03,
    depth=4,
    l2_leaf_reg=3,
    loss_function='MultiClass',
    eval_metric='Accuracy',
    random_seed=42,
    verbose=100,
    early_stopping_rounds=50,
    auto_class_weights='Balanced',  # compensate for class imbalance
)
catboost_model = CatBoostClassifier(**catboost_params)

catboost_model.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True)

# Register the fitted model under its display name
models['CatBoost'] = catboost_model
print("CatBoost training complete!")

### 4.2 XGBoost

In [None]:
print("=" * 50, "Training XGBoost...", "=" * 50, sep="\n")

# Class imbalance is handled through per-sample weights computed from the
# training labels and passed to fit().
from sklearn.utils.class_weight import compute_sample_weight
sample_weights = compute_sample_weight('balanced', y_train)

xgb_params = dict(
    n_estimators=500,
    learning_rate=0.03,
    max_depth=4,
    reg_alpha=1,
    reg_lambda=3,
    objective='multi:softmax',
    num_class=n_classes,
    random_state=42,
    early_stopping_rounds=50,
    eval_metric='mlogloss',
)
xgb_model = XGBClassifier(**xgb_params)

# The validation set is the evaluation set; progress is logged every 100 rounds
xgb_model.fit(
    X_train, y_train,
    sample_weight=sample_weights,
    eval_set=[(X_val, y_val)],
    verbose=100,
)

# Register the fitted model under its display name
models['XGBoost'] = xgb_model
print("XGBoost training complete!")

### 4.3 Random Forest

In [None]:
print("=" * 50, "Training Random Forest...", "=" * 50, sep="\n")

# class_weight='balanced' handles class imbalance; n_jobs=-1 uses all cores
rf_params = dict(
    n_estimators=500,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=3,
    random_state=42,
    n_jobs=-1,
    class_weight='balanced',
)
rf_model = RandomForestClassifier(**rf_params)

# No validation set is involved here: the forest is fit on the training split only
rf_model.fit(X_train, y_train)

# Register the fitted model under its display name
models['RandomForest'] = rf_model
print("Random Forest training complete!")

### 4.4 LightGBM (Not Recommended for Small Data - Testing Only)

In [None]:
print("=" * 50)
print("Training LightGBM (WARNING: Not ideal for <10K samples)...")
print("=" * 50)

# Local import: the notebook's import cell only brings in LGBMClassifier.
from lightgbm import early_stopping

lgbm_model = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.03,
    max_depth=4,
    num_leaves=15,
    reg_alpha=1,
    reg_lambda=3,
    objective='multiclass',
    num_class=n_classes,
    random_state=42,
    n_jobs=-1,
    class_weight='balanced',  # handle class imbalance
    verbosity=-1
)

# Stop once the validation metric has not improved for 50 rounds, matching the
# early_stopping_rounds=50 used for CatBoost and XGBoost. Without the callback
# the eval_set is only scored and all 500 rounds are always trained.
lgbm_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[early_stopping(stopping_rounds=50, verbose=False)]
)

# Register the fitted model under its display name
models['LightGBM'] = lgbm_model
print("LightGBM training complete!")

## 5. Initial Evaluation

In [None]:
def evaluate_model(model, X_test, y_test, model_name):
    """Score a fitted classifier on a labelled dataset.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        X_test: Feature matrix to predict on.
        y_test: True class labels aligned with ``X_test``.
        model_name: Display name of the model. Kept in the signature for
            callers; it is not used in the computation.

    Returns:
        A ``(metrics, y_pred)`` tuple. ``metrics`` is a dict with the keys
        ``'accuracy'``, ``'precision_macro'``, ``'recall_macro'``,
        ``'f1_macro'`` and ``'f1_weighted'``; ``y_pred`` holds the predicted
        labels as a 1-D array.
    """
    y_pred = np.asarray(model.predict(X_test))
    # Some estimators (e.g. CatBoost with a MultiClass loss) return labels as
    # an (n, 1) column; flatten it so metrics and callers always see 1-D labels.
    if y_pred.ndim == 2 and y_pred.shape[1] == 1:
        y_pred = y_pred[:, 0]

    # zero_division=0 scores a class with no predicted/true samples as 0
    # instead of emitting an undefined-metric warning.
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision_macro': precision_score(y_test, y_pred, average='macro', zero_division=0),
        'recall_macro': recall_score(y_test, y_pred, average='macro', zero_division=0),
        'f1_macro': f1_score(y_test, y_pred, average='macro', zero_division=0),
        'f1_weighted': f1_score(y_test, y_pred, average='weighted', zero_division=0)
    }

    return metrics, y_pred

In [None]:
# Score every registered model on the held-out test split and print a report
print("=" * 60, "MODEL EVALUATION ON TEST SET", "=" * 60, sep="\n")

# (line prefix, metrics key) pairs, in the order they are printed
report_rows = (
    ("  Accuracy:         ", 'accuracy'),
    ("  Precision (macro): ", 'precision_macro'),
    ("  Recall (macro):    ", 'recall_macro'),
    ("  F1 (macro):        ", 'f1_macro'),
    ("  F1 (weighted):     ", 'f1_weighted'),
)

for name, model in models.items():
    metrics, y_pred = evaluate_model(model, X_test, y_test, name)
    # Keep the metrics for the comparison table, plot and best-model pick
    results[name] = metrics

    print(f"\n{name}:")
    for prefix, key in report_rows:
        print(f"{prefix}{metrics[key]:.4f}")

In [None]:
# One row per model, one column per metric, rounded to 4 decimals
results_df = pd.DataFrame(results).T.round(4)

# Show the table with the highest macro-F1 first
print("\n=== Model Comparison ===")
print(results_df.sort_values(by='f1_macro', ascending=False))

In [None]:
# Grouped bar chart: one cluster per model, one bar per metric
metrics_to_plot = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
model_names = list(models)
width = 0.2
x = np.arange(len(model_names))

fig, ax = plt.subplots(figsize=(12, 6))

# Each metric's bars are shifted right by one bar width per metric index
for i, metric in enumerate(metrics_to_plot):
    values = [results[model_name][metric] for model_name in model_names]
    ax.bar(x + i * width, values, width, label=metric)

# Place each tick at the centre of its four-bar cluster
ax.set_xticks(x + width * 1.5)
ax.set_xticklabels(model_names)
ax.set_ylabel('Score')
ax.set_title('Model Performance Comparison')
ax.set_ylim(0, 1)
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Pick the model with the highest macro-F1 among the stored results.
# NOTE(review): `results` holds test-set scores, so this selection is made on
# the test split rather than on the validation split.
best_model_name, best_scores = max(
    results.items(), key=lambda item: item[1]['f1_macro']
)
print(f"\nBest Model: {best_model_name} (F1 macro: {best_scores['f1_macro']:.4f})")

## 6. Save Models to GCS

In [None]:
# Write every trained artifact into /content/models before uploading
import os
os.makedirs('/content/models', exist_ok=True)

# CatBoost is saved through its own save_model() method
catboost_model.save_model('/content/models/catboost_whale_classifier.cbm')

# The remaining estimators and the label encoder are pickled, one file each
pickled_artifacts = (
    ('/content/models/xgboost_whale_classifier.pkl', xgb_model),
    ('/content/models/randomforest_whale_classifier.pkl', rf_model),
    ('/content/models/lightgbm_whale_classifier.pkl', lgbm_model),
    ('/content/models/label_encoder.pkl', le),
)
for artifact_path, artifact in pickled_artifacts:
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

# Metrics gathered during evaluation, stored as human-readable JSON
with open('/content/models/initial_results.json', 'w') as f:
    json.dump(results, f, indent=2)

print("Models saved locally")

In [None]:
# Upload to GCS
# Copies everything under /content/models/ to gs://{BUCKET}/models/initial/
# (-m: parallel transfers, -r: recursive). IPython substitutes {BUCKET} with
# the Python variable's value.
# NOTE(review): the print below runs whether or not the gsutil command succeeded.
!gsutil -m cp -r /content/models/* gs://{BUCKET}/models/initial/
print(f"Models uploaded to gs://{BUCKET}/models/initial/")

In [None]:
# Persist the exact train/val/test arrays so the split can be reproduced later
split_arrays = {
    'X_train': X_train, 'y_train': y_train,
    'X_val': X_val, 'y_val': y_val,
    'X_test': X_test, 'y_test': y_test,
}
np.savez('/content/data_splits.npz', **split_arrays)

# Upload the saved split archive next to the input data in the bucket.
# NOTE(review): the print below runs whether or not the gsutil command succeeded.
!gsutil cp /content/data_splits.npz gs://{BUCKET}/data/data_splits.npz
print("Data splits saved")

## Summary

Training completed:
1. Split data: 70% train, 10% val, 20% test
2. Trained 4 models: CatBoost, XGBoost, RandomForest, LightGBM
3. Initial evaluation on test set
4. Saved all models to GCS

**Best Model:** the model with the highest macro-F1 on the test set (see the output of the "Find best model" cell above)

**Next:** Run Notebook 3 for detailed evaluation and insights.