# SMT-WEEX Notebook 4: Hyperparameter Tuning & Final Model
**Project:** smt-weex-2025
**Author:** Jannet Ekka

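This notebook:
1. Tunes CatBoost and XGBoost hyperparameters with RandomizedSearchCV
2. Refits each model on train+val with its best parameters (done by the search itself) and selects the best tuned model
3. Runs the final evaluation on the held-out test set
4. Exports the production model to GCS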

## 1. Setup

In [None]:
!pip install -q catboost xgboost lightgbm scikit-learn pandas numpy matplotlib seaborn google-cloud-storage

In [None]:
# Authenticate the current user through Colab's auth helper.
# NOTE(review): this import only exists on a Colab runtime; the cell will fail elsewhere.
from google.colab import auth
auth.authenticate_user()

# GCP project id and the GCS bucket referenced by every gsutil command in this notebook.
PROJECT_ID = 'smt-weex-2025'
BUCKET = 'smt-weex-2025-models'

# Set the active gcloud project ({PROJECT_ID} is interpolated by IPython before the shell runs).
!gcloud config set project {PROJECT_ID}

In [None]:
import pandas as pd
import numpy as np
import json
import pickle
from datetime import datetime

from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, classification_report, make_scorer, confusion_matrix, balanced_accuracy_score

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from scipy.stats import randint, uniform

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Install a blanket "ignore" filter for every warning category.
# NOTE(review): this also hides deprecation/convergence warnings from the
# libraries used below — consider narrowing it to specific categories.
warnings.simplefilter('ignore')

print("Libraries loaded")

## 2. Load Data from GCS

In [None]:
# Download from GCS
# Copy the saved data splits and the feature config from the bucket into /content/.
!gsutil cp gs://{BUCKET}/data/data_splits.npz /content/
!gsutil cp gs://{BUCKET}/data/feature_config.json /content/
# Create the local models directory, then copy the label encoder stored under models/initial/.
!mkdir -p /content/models
!gsutil cp gs://{BUCKET}/models/initial/label_encoder.pkl /content/models/

In [None]:
# Load data splits.
# np.load on an .npz returns an archive object that keeps the file open, so use
# it as a context manager; indexing a key reads that array fully into memory,
# which means the arrays stay valid after the archive is closed.
with np.load('/content/data_splits.npz') as splits:
    X_train, y_train = splits['X_train'], splits['y_train']
    X_val, y_val = splits['X_val'], splits['y_val']
    X_test, y_test = splits['X_test'], splits['y_test']

# Combine train + val for tuning (cross-validation replaces the fixed val split)
X_trainval = np.vstack([X_train, X_val])
y_trainval = np.concatenate([y_train, y_val])

# Load feature config
with open('/content/feature_config.json', 'r') as f:
    config = json.load(f)
FEATURES = config['features']

# Load label encoder.
# NOTE: pickle.load can execute arbitrary code; this is only acceptable because
# the file was downloaded from the project's own bucket in the previous cell.
with open('/content/models/label_encoder.pkl', 'rb') as f:
    le = pickle.load(f)

# Encoded class id -> original label, in the encoder's class order.
label_mapping = dict(enumerate(le.classes_))
labels = list(label_mapping.values())
n_classes = len(label_mapping)

# Fail fast on inconsistent inputs: without these checks a mismatch would only
# surface after the (long) hyperparameter searches, e.g. when the feature
# importance table pairs FEATURES with the model's per-column importances.
assert len(X_trainval) == len(y_trainval), "X/y row count mismatch in train+val"
assert len(X_test) == len(y_test), "X/y row count mismatch in test"
assert X_trainval.shape[1] == X_test.shape[1] == len(FEATURES), (
    f"Feature count mismatch: train+val has {X_trainval.shape[1]} columns, "
    f"test has {X_test.shape[1]}, feature_config lists {len(FEATURES)}"
)

print(f"Train+Val: {len(X_trainval)}, Test: {len(X_test)}")
print(f"Features: {len(FEATURES)}")
print(f"Classes: {n_classes} - {labels}")

## 3. CatBoost Hyperparameter Tuning

In [None]:
# Cross-validation strategy: 5 stratified, shuffled folds with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Scoring metric: macro-averaged F1
f1_macro_scorer = make_scorer(f1_score, average='macro')

# Search space for CatBoost.
# scipy conventions: randint(low, high) samples integers in [low, high - 1];
# uniform(loc, scale) samples floats in [loc, loc + scale].
catboost_param_dist = dict(
    iterations=randint(200, 800),         # 200 .. 799
    learning_rate=uniform(0.01, 0.15),    # 0.01 .. 0.16
    depth=randint(3, 8),                  # 3 .. 7
    l2_leaf_reg=uniform(1, 10),           # 1 .. 11
    border_count=randint(32, 128),        # 32 .. 127
    bagging_temperature=uniform(0, 1),    # 0 .. 1
    random_strength=uniform(0, 5),        # 0 .. 5
)

print("CatBoost search space defined")
print(f"Parameters: {list(catboost_param_dist.keys())}")

In [None]:
%%time
print("=" * 60)
print("CatBoost Hyperparameter Tuning (30 iterations)")
print("=" * 60)

catboost_base = CatBoostClassifier(
    loss_function='MultiClass',
    random_seed=42,
    verbose=0,
    auto_class_weights='Balanced'
)

catboost_search = RandomizedSearchCV(
    catboost_base,
    param_distributions=catboost_param_dist,
    n_iter=30,
    scoring=f1_macro_scorer,
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=2
)

catboost_search.fit(X_trainval, y_trainval)

print(f"\nBest F1 (macro) CV score: {catboost_search.best_score_:.4f}")
print(f"Best params: {catboost_search.best_params_}")

## 4. XGBoost Hyperparameter Tuning

In [None]:
# Search space for XGBoost.
# scipy conventions: randint(low, high) samples integers in [low, high - 1];
# uniform(loc, scale) samples floats in [loc, loc + scale].
xgb_param_dist = dict(
    n_estimators=randint(200, 800),       # 200 .. 799
    learning_rate=uniform(0.01, 0.15),    # 0.01 .. 0.16
    max_depth=randint(3, 8),              # 3 .. 7
    min_child_weight=randint(1, 10),      # 1 .. 9
    subsample=uniform(0.6, 0.4),          # 0.6 .. 1.0
    colsample_bytree=uniform(0.6, 0.4),   # 0.6 .. 1.0
    gamma=uniform(0, 3),                  # 0 .. 3
    reg_alpha=uniform(0, 2),              # 0 .. 2
    reg_lambda=uniform(0, 3),             # 0 .. 3
)

print("XGBoost search space defined")

In [None]:
%%time
print("=" * 60)
print("XGBoost Hyperparameter Tuning (30 iterations)")
print("=" * 60)

xgb_base = XGBClassifier(
    objective='multi:softmax',
    num_class=n_classes,
    random_state=42,
    n_jobs=-1,
    verbosity=0
)

xgb_search = RandomizedSearchCV(
    xgb_base,
    param_distributions=xgb_param_dist,
    n_iter=30,
    scoring=f1_macro_scorer,
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=2
)

xgb_search.fit(X_trainval, y_trainval)

print(f"\nBest F1 (macro) CV score: {xgb_search.best_score_:.4f}")
print(f"Best params: {xgb_search.best_params_}")

## 5. Compare Tuned Models

In [None]:
# Best estimator from each search, keyed by display name.
tuned_models = dict(
    CatBoost_tuned=catboost_search.best_estimator_,
    XGBoost_tuned=xgb_search.best_estimator_,
)

banner = "=" * 60
print(banner)
print("TUNED MODEL COMPARISON ON TEST SET")
print(banner)

# name -> {metric name -> value}, all computed on the held-out test split.
tuned_results = {}
for model_name, estimator in tuned_models.items():
    test_pred = estimator.predict(X_test)
    scores = {
        'accuracy': accuracy_score(y_test, test_pred),
        'balanced_accuracy': balanced_accuracy_score(y_test, test_pred),
        'f1_macro': f1_score(y_test, test_pred, average='macro', zero_division=0),
        'f1_weighted': f1_score(y_test, test_pred, average='weighted', zero_division=0)
    }
    tuned_results[model_name] = scores

    print(f"\n{model_name}:")
    for metric_name, metric_value in scores.items():
        print(f"  {metric_name}: {metric_value:.4f}")

In [None]:
# Select best tuned model.
# The choice is made on the cross-validated F1 (macro) from the searches, NOT on
# the test-set scores: picking the winner by its test score would make the test
# metrics reported afterwards an optimistically biased estimate.
cv_scores = {
    'CatBoost_tuned': catboost_search.best_score_,
    'XGBoost_tuned': xgb_search.best_score_,
}
best_tuned_name = max(cv_scores, key=cv_scores.get)
best_tuned_model = tuned_models[best_tuned_name]
print(f"\nBest tuned model: {best_tuned_name}")
print(f"CV F1 macro: {cv_scores[best_tuned_name]:.4f}")
# Test-set F1 of the selected model, reported for reference only.
print(f"F1 macro: {tuned_results[best_tuned_name]['f1_macro']:.4f}")

## 6. Final Evaluation

In [None]:
# Predict on the held-out test split with the selected model. The result is
# flattened because some estimators may return an (n, 1) column instead of a
# 1-D vector; the metric functions expect 1-D labels.
y_pred_final = np.ravel(best_tuned_model.predict(X_test))

# Encoded class ids 0..n_classes-1, in the same order as `labels`. Passing them
# explicitly keeps the report rows aligned with the class names even when a
# class is absent from both the test labels and the predictions (otherwise the
# number of target_names would not match the classes found in the data).
class_ids = list(range(n_classes))

print("=" * 60)
print(f"FINAL MODEL: {best_tuned_name}")
print("=" * 60)
print("\nClassification Report:")
print(classification_report(y_test, y_pred_final, labels=class_ids, target_names=labels, zero_division=0))

In [None]:
# Final confusion matrix.
# Fix the row/column order to the encoded class ids 0..n_classes-1 so the matrix
# always has one row/column per entry in `labels`; without this, a class missing
# from both y_test and the predictions would shrink the matrix and the tick
# labels would no longer line up with it.
cm = confusion_matrix(y_test, y_pred_final, labels=list(range(n_classes)))

plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.title(f'{best_tuned_name} - Final Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
# Save before show(): the PNG is uploaded to the bucket in a later cell.
plt.savefig('/content/final_confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

print("Saved confusion matrix to /content/final_confusion_matrix.png")

In [None]:
# Feature importance for the selected model: CatBoost exposes it through a
# method, the other estimator through the sklearn-style attribute.
importance = (
    best_tuned_model.get_feature_importance()
    if 'CatBoost' in best_tuned_name
    else best_tuned_model.feature_importances_
)

# One row per feature, most important first.
importance_df = pd.DataFrame(
    {'feature': FEATURES, 'importance': importance}
).sort_values('importance', ascending=False)

# Plot at most the 20 highest-ranked features.
top_n = min(20, len(FEATURES))
top_features = importance_df.head(top_n)
bar_positions = list(range(top_n))

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(bar_positions, top_features['importance'].values, color='steelblue')
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['feature'].values)
ax.set_xlabel('Importance')
ax.set_title(f'Top {top_n} Features - {best_tuned_name}')
ax.invert_yaxis()  # highest-ranked feature at the top
fig.tight_layout()
fig.savefig('/content/final_feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

## 7. Export Production Model to GCS

In [None]:
import os

# Local staging directory; its contents are uploaded to the bucket later.
os.makedirs('/content/production_model', exist_ok=True)

# Persist the selected model: pickle for the non-CatBoost estimator, CatBoost's
# native binary format otherwise. `model_format` records which one was written.
if 'CatBoost' not in best_tuned_name:
    with open('/content/production_model/whale_classifier_final.pkl', 'wb') as model_file:
        pickle.dump(best_tuned_model, model_file)
    model_format = 'pkl'
else:
    best_tuned_model.save_model('/content/production_model/whale_classifier_final.cbm')
    model_format = 'cbm'

print(f"Model saved as {model_format}")

In [None]:
# Best hyperparameters of whichever search produced the selected model
best_params = (
    catboost_search.best_params_
    if 'CatBoost' in best_tuned_name
    else xgb_search.best_params_
)

# Convert numpy types to native Python types for JSON serialization
def convert_to_native(obj):
    """Recursively replace numpy values with their native Python equivalents.

    Args:
        obj: Any value. numpy booleans, integers and floats become ``bool``,
            ``int`` and ``float``; numpy arrays become (nested) lists; dicts,
            lists and tuples are rebuilt with their contents converted.

    Returns:
        The converted value. Anything that is not one of the types above is
        returned unchanged.
    """
    # np.bool_ is not a Python bool and json cannot serialise it.
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        # tolist() already yields native scalars at every nesting level.
        return obj.tolist()
    if isinstance(obj, dict):
        return {k: convert_to_native(v) for k, v in obj.items()}
    # Sequences may hold numpy scalars too (e.g. a list-valued hyperparameter).
    if isinstance(obj, list):
        return [convert_to_native(v) for v in obj]
    if isinstance(obj, tuple):
        return tuple(convert_to_native(v) for v in obj)
    return obj

# Search object that produced the selected model (source of the CV score).
winning_search = catboost_search if 'CatBoost' in best_tuned_name else xgb_search
cv_score = float(winning_search.best_score_)

# Model metadata written next to the model file. 'metrics' holds the test-set
# scores computed during the model comparison; 'cv_score' is the search's best
# cross-validated score.
model_metadata = {
    'model_name': best_tuned_name,
    'model_format': model_format,
    'best_params': convert_to_native(best_params),
    'features': FEATURES,
    'n_features': len(FEATURES),
    'classes': labels,
    'n_classes': n_classes,
    'metrics': convert_to_native(tuned_results[best_tuned_name]),
    'cv_score': cv_score,
    'timestamp': str(datetime.now()),
    'version': 'v1.0'
}

with open('/content/production_model/model_metadata.json', 'w') as metadata_file:
    json.dump(model_metadata, metadata_file, indent=2)

# Re-serialise the label encoder into the production directory
with open('/content/production_model/label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(le, encoder_file)

# Write the feature config alongside it
with open('/content/production_model/feature_config.json', 'w') as config_file:
    json.dump(config, config_file, indent=2)

print("All artifacts saved")
print(f"\nModel metadata:")
print(json.dumps(model_metadata, indent=2))

In [None]:
# Upload to GCS
# Copy everything in the local production_model directory to models/production/
# in the bucket, then copy the two result figures to results/.
# NOTE(review): objects already present under models/production/ are not removed
# first, so a model file from an earlier run in the other format (.pkl vs .cbm)
# could remain next to the new one — confirm this is acceptable.
!gsutil -m cp -r /content/production_model/* gs://{BUCKET}/models/production/
!gsutil cp /content/final_confusion_matrix.png gs://{BUCKET}/results/
!gsutil cp /content/final_feature_importance.png gs://{BUCKET}/results/

print(f"\nProduction model uploaded to gs://{BUCKET}/models/production/")
print(f"Results uploaded to gs://{BUCKET}/results/")

In [None]:
# List uploaded files
# Show what is now stored under the production-model and results prefixes.
print("\n=== GCS Contents ===")
!gsutil ls gs://{BUCKET}/models/production/
print("")
!gsutil ls gs://{BUCKET}/results/

## 8. Quick Test: Load and Predict

In [None]:
# Verify model can be loaded and used
print("=== Testing Production Model ===")

# Reload the model from the file written to the local production directory.
if model_format == 'cbm':
    test_model = CatBoostClassifier()
    test_model.load_model('/content/production_model/whale_classifier_final.cbm')
else:
    with open('/content/production_model/whale_classifier_final.pkl', 'rb') as f:
        test_model = pickle.load(f)

# Round-trip check: the reloaded model must reproduce the in-memory model's
# test-set predictions, otherwise the success message below would be unfounded.
# Predictions are flattened because an estimator may return an (n, 1) column.
reloaded_pred = np.ravel(test_model.predict(X_test))
in_memory_pred = np.ravel(best_tuned_model.predict(X_test))
assert np.array_equal(reloaded_pred, in_memory_pred), (
    "Reloaded model predictions differ from the in-memory model"
)

# Test prediction on a single sample (slice keeps the 2-D shape)
sample_idx = 0
sample_features = X_test[sample_idx:sample_idx+1]
pred = np.ravel(test_model.predict(sample_features))
proba = test_model.predict_proba(sample_features)

# Cast both ids to int so the label lookup is the same for prediction and truth.
print(f"Sample prediction: {label_mapping[int(pred[0])]}")
print(f"True label: {label_mapping[int(y_test[sample_idx])]}")
print(f"Probabilities: {dict(zip(labels, proba[0].round(4)))}")
print("\nModel loaded and working correctly!")

## Summary

Hyperparameter tuning completed:
1. RandomizedSearchCV (30 iterations) for CatBoost and XGBoost
2. 5-fold stratified cross-validation
3. Optimized for F1 (macro)
4. Final model exported to GCS

**Production Model Location:** `gs://smt-weex-2025-models/models/production/`

**Files:**
- `whale_classifier_final.cbm` (or .pkl) - trained model
- `model_metadata.json` - hyperparameters, metrics, feature list
- `label_encoder.pkl` - for encoding/decoding labels
- `feature_config.json` - feature names and config

**Next Steps:**
1. Deploy to Vertex AI for online prediction
2. Integrate with Gemini 2.5 Flash for signal validation
3. Connect to WEEX API for trading