# SMT-WEEX Notebook 4: Hyperparameter Tuning
**Project:** smt-weex-2025
**Author:** Jannet Ekka

**Focus:**
- Tune the **5-class merged model** (best from notebook 03)
- RandomizedSearchCV for CatBoost
- Investigate Exploiter detection drop
- Export production model

**Baseline from Notebook 03:**
- 5-class CV F1: 63.6% (+/- 1.0%)
- Target: 65-70% F1

## 1. Setup

In [None]:
# Install the modelling and plotting packages this notebook imports
# (-q keeps pip's output quiet).
!pip install -q catboost scikit-learn pandas numpy matplotlib seaborn

In [None]:
# Authenticate this Colab session so the gcloud/gsutil commands in later
# cells can run against the project's storage bucket.
from google.colab import auth
auth.authenticate_user()

# Identifiers interpolated into the shell commands below ({PROJECT_ID}, {BUCKET}).
PROJECT_ID = 'smt-weex-2025'
BUCKET = 'smt-weex-2025-models'

# Make PROJECT_ID the active gcloud project for this session.
!gcloud config set project {PROJECT_ID}

In [None]:
import pandas as pd
import numpy as np
import json
import pickle
from datetime import datetime

from sklearn.model_selection import (
    train_test_split, StratifiedKFold, RandomizedSearchCV, cross_val_score
)
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    f1_score, accuracy_score, classification_report, 
    confusion_matrix, make_scorer
)

from catboost import CatBoostClassifier

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

print("Libraries loaded")

## 2. Load Data and Prepare Merged Classes

In [None]:
# Download data
!gsutil cp gs://{BUCKET}/data/whale_features_cleaned.csv /content/
!gsutil cp gs://{BUCKET}/data/feature_config.json /content/

df = pd.read_csv('/content/whale_features_cleaned.csv')

with open('/content/feature_config.json', 'r') as f:
    config = json.load(f)
FEATURES = config['features']

print(f"Loaded {len(df)} samples, {len(FEATURES)} features")

In [None]:
# Create merged dataset (from notebook 03)
# Institutional and CEX_Wallet are collapsed into a single Large_Holder class;
# every other category passes through unchanged.
merge_map = dict.fromkeys(['Institutional', 'CEX_Wallet'], 'Large_Holder')
df['category_merged'] = df['category'].replace(merge_map)

print("=== Merged Class Distribution ===")
merged_counts = df['category_merged'].value_counts()
print(merged_counts)

In [None]:
# Prepare X and y
feature_frame = df[FEATURES]
X = feature_frame.values
y_raw = df['category_merged'].values

# Encode labels: learn the class vocabulary, then map names to integer ids
le = LabelEncoder()
le.fit(y_raw)
y = le.transform(y_raw)

# labels[i] is the class name behind encoded id i
labels = list(le.classes_)
n_classes = len(labels)

print(f"Labels: {labels}", f"Classes: {n_classes}", sep="\n")

In [None]:
# Train/test split (80/20), stratified so each class keeps its share
split_options = dict(test_size=0.20, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Train: {len(X_train)}, Test: {len(X_test)}")
print(f"\nTest distribution:")
# One count per encoded class id, in the same order as labels
test_counts = np.bincount(y_test, minlength=len(labels))
for label, count in zip(labels, test_counts):
    print(f"  {label}: {count}")

## 3. Baseline Model (from Notebook 03)

In [None]:
def get_predictions(model, X):
    """Return the model's predictions for X as a flat integer array.

    Output with more than one dimension (CatBoost's 2D prediction output)
    is flattened before the integer cast; 1D output is cast as-is.
    """
    raw = model.predict(X)
    needs_flatten = len(getattr(raw, 'shape', ())) > 1
    flat = raw.flatten() if needs_flatten else raw
    return flat.astype(int)

def evaluate_model(model, X_test, y_test, labels):
    """Print hold-out metrics for a fitted classifier and return its F1 scores.

    Args:
        model: Fitted classifier; predictions come from get_predictions.
        X_test: Feature matrix to score.
        y_test: Integer-encoded true classes for X_test.
        labels: Class names; labels[i] is assumed to name encoded class i
            (the notebook builds it from the LabelEncoder's classes_).

    Returns:
        Tuple (f1_macro, f1_per_class): the macro-averaged F1 and an array
        with one F1 score per entry of labels, in the same order.
    """
    y_pred = get_predictions(model, X_test)

    f1_macro = f1_score(y_test, y_pred, average='macro')
    f1_weighted = f1_score(y_test, y_pred, average='weighted')
    accuracy = accuracy_score(y_test, y_pred)

    # Pin the label set for the per-class scores. Without it f1_score only
    # reports classes that occur in y_test/y_pred, so a missing class would
    # shorten the array and zip() below would pair scores with wrong names.
    class_ids = list(range(len(labels)))
    f1_per_class = f1_score(
        y_test, y_pred, labels=class_ids, average=None, zero_division=0
    )

    print(f"Accuracy:     {accuracy:.4f}")
    print(f"F1 Macro:     {f1_macro:.4f}")
    print(f"F1 Weighted:  {f1_weighted:.4f}")
    print("\nPer-class F1:")
    for label, f1 in zip(labels, f1_per_class):
        print(f"  {label:15s}: {f1:.4f}")

    return f1_macro, f1_per_class

In [None]:
# Train baseline model (same params as notebook 03)
print("=" * 60, "BASELINE MODEL (from Notebook 03)", "=" * 60, sep="\n")

baseline_params = {
    'iterations': 300,
    'learning_rate': 0.03,
    'depth': 5,
    'l2_leaf_reg': 3,
    'loss_function': 'MultiClass',
    'random_seed': 42,
    'verbose': 50,  # log every 50th iteration
    'auto_class_weights': 'Balanced',
}
baseline_model = CatBoostClassifier(**baseline_params)
baseline_model.fit(X_train, y_train)

print("\n--- Baseline Results ---")
baseline_f1, baseline_per_class = evaluate_model(
    baseline_model, X_test, y_test, labels
)

In [None]:
# Baseline 5-fold CV
print("\n=== Baseline 5-Fold CV ===")
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Same settings as baseline_model, with training output silenced.
# loss_function is spelled out (it was previously left to CatBoost's default)
# so the CV estimate is guaranteed to use the same objective as the
# hold-out baseline it is compared against.
baseline_cv_params = dict(
    iterations=300, learning_rate=0.03, depth=5, l2_leaf_reg=3,
    loss_function='MultiClass', random_seed=42, verbose=0,
    auto_class_weights='Balanced',
)

baseline_cv_scores = []
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    # A fresh model per fold so no state carries over between folds
    cb_temp = CatBoostClassifier(**baseline_cv_params)
    cb_temp.fit(X[train_idx], y[train_idx])
    y_pred = get_predictions(cb_temp, X[val_idx])
    score = f1_score(y[val_idx], y_pred, average='macro')
    baseline_cv_scores.append(score)
    print(f"Fold {fold+1}: {score:.4f}")

print(f"\nBaseline CV: {np.mean(baseline_cv_scores):.4f} (+/- {np.std(baseline_cv_scores):.4f})")

## 4. Investigate Exploiter Detection

In notebook 03, Exploiter F1 dropped from 94% (6-class) to 53% (5-class merged).
Let's understand why and fix it.

In [None]:
# Check Exploiter confusion in merged model
y_pred_baseline = get_predictions(baseline_model, X_test)

# Get Exploiter index (encoded id of the 'Exploiter' class)
exploiter_idx = list(le.classes_).index('Exploiter')

# Find all Exploiters in test set
exploiter_mask = y_test == exploiter_idx
exploiter_total = exploiter_mask.sum()
# Predictions made for the true-Exploiter rows only
exploiter_preds = y_pred_baseline[exploiter_mask]
exploiter_correct = (exploiter_preds == exploiter_idx).sum()

print(f"=== Exploiter Detection Analysis ===")
print(f"Total Exploiters in test: {exploiter_total}")
print(f"Correctly identified: {exploiter_correct}")
# Guard the ratio: with no Exploiters in the test set it would print "nan%"
if exploiter_total > 0:
    print(f"Accuracy: {exploiter_correct/exploiter_total*100:.1f}%")
else:
    print("Accuracy: n/a (no Exploiters in test set)")

# What are Exploiters misclassified as?
print(f"\nMisclassification breakdown:")
for i, label in enumerate(labels):
    # Correct predictions are reported above; list only the wrong classes here
    if i == exploiter_idx:
        continue
    count = (exploiter_preds == i).sum()
    if count > 0:
        print(f"  Predicted as {label}: {count}")

In [None]:
# Plot confusion matrix for baseline (rows: true class, columns: predicted)
cm = confusion_matrix(y_test, y_pred_baseline)

baseline_cm_fig, baseline_cm_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
    ax=baseline_cm_ax,
)
baseline_cm_ax.set_title('Baseline Model Confusion Matrix (5-class)')
baseline_cm_ax.set_ylabel('True Label')
baseline_cm_ax.set_xlabel('Predicted Label')
baseline_cm_fig.tight_layout()
plt.show()

## 5. Hyperparameter Tuning with RandomizedSearchCV

### Parameter Ranges (based on research):
- `iterations`: 200-500 (more iterations for small data)
- `depth`: 4-7 (shallow to prevent overfitting)
- `learning_rate`: 0.01-0.1
- `l2_leaf_reg`: 1-10 (regularization)
- `border_count`: 32-255 (split precision)
- `bagging_temperature`: 0-1 (randomization)
- `random_strength`: 0-1 (randomness added when scoring candidate splits)

In [None]:
# Define parameter grid: each key is a CatBoost setting, each list holds the
# candidate values RandomizedSearchCV may sample for it
param_distributions = dict(
    iterations=[200, 250, 300, 350, 400, 500],
    depth=[4, 5, 6, 7],
    learning_rate=[0.01, 0.02, 0.03, 0.05, 0.07, 0.1],
    l2_leaf_reg=[1, 2, 3, 5, 7, 10],
    border_count=[32, 64, 128, 255],
    bagging_temperature=[0, 0.5, 1.0],
    random_strength=[0, 0.5, 1.0],
)

print("Parameter search space:")
for name, candidates in param_distributions.items():
    print(f"  {name}: {candidates}")

# Size of the full grid = product of the per-parameter candidate counts
candidate_counts = [len(candidates) for candidates in param_distributions.values()]
total_combinations = int(np.prod(candidate_counts))
print(f"\nTotal possible combinations: {total_combinations}")

In [None]:
# Create base model for RandomizedSearchCV; the sampled hyperparameters are
# applied on top of these fixed settings
# NOTE(review): thread_count=-1 here and n_jobs=-1 below both ask for every
# core, which may oversubscribe the CPU -- confirm on the target runtime.
base_model = CatBoostClassifier(
    auto_class_weights='Balanced',
    loss_function='MultiClass',
    random_seed=42,
    thread_count=-1,
    verbose=0,
)

# Custom scorer for macro F1
f1_macro_scorer = make_scorer(f1_score, average='macro')

# Fold layout shared by every sampled candidate
search_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# RandomizedSearchCV
print("=" * 60, "RANDOMIZED SEARCH CV (This may take 10-20 minutes)", "=" * 60, sep="\n")

search_settings = dict(
    n_iter=50,  # Test 50 random combinations
    cv=search_cv,
    scoring=f1_macro_scorer,
    n_jobs=-1,
    verbose=2,
    random_state=42,
    return_train_score=True,
)
random_search = RandomizedSearchCV(base_model, param_distributions, **search_settings)

# Tuning only sees the training split; the hold-out set stays untouched
random_search.fit(X_train, y_train)
print("\nSearch complete!")

In [None]:
# Best parameters found by the search and their mean CV score
print("=" * 60, "BEST PARAMETERS", "=" * 60, sep="\n")
print(f"Best CV Score: {random_search.best_score_:.4f}")
print(f"\nBest Parameters:")
for name, chosen in random_search.best_params_.items():
    print(f"  {name}: {chosen}")

In [None]:
# Top 10 parameter combinations, best rank first
results_df = pd.DataFrame(random_search.cv_results_).sort_values('rank_test_score')

summary_columns = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
top_10 = results_df[summary_columns].head(10)

print("\n=== Top 10 Parameter Combinations ===")
for row in top_10.itertuples(index=False):
    print(f"\nRank {int(row.rank_test_score)}: CV F1 = {row.mean_test_score:.4f} (+/- {row.std_test_score:.4f})")
    print(f"  Params: {row.params}")

## 6. Train Final Model with Best Parameters

In [None]:
# Train final model with best parameters
print("=" * 60, "TRAINING FINAL MODEL", "=" * 60, sep="\n")

# Search winners first, then the fixed settings the search did not vary
best_params = {
    **random_search.best_params_,
    'loss_function': 'MultiClass',
    'random_seed': 42,
    'auto_class_weights': 'Balanced',
    'verbose': 50,
}

final_model = CatBoostClassifier(**best_params)
final_model.fit(X_train, y_train)

print("\nFinal model trained!")

In [None]:
# Evaluate final model on test set (same hold-out rows as the baseline)
print("\n=== Final Model Test Results ===")
final_f1, final_per_class = evaluate_model(
    model=final_model,
    X_test=X_test,
    y_test=y_test,
    labels=labels,
)

In [None]:
# 5-fold CV with best parameters for reliable estimate
# NOTE(review): the folds are drawn from the full X/y, which includes the
# rows the search used to pick best_params -- the estimate may be mildly
# optimistic; confirm whether that is acceptable.
print("\n=== Final Model 5-Fold CV ===")
quiet_params = {**best_params, 'verbose': 0}  # same settings, no training log
final_cv_scores, final_cv_per_class = [], []

for fold_number, (train_idx, val_idx) in enumerate(skf.split(X, y), start=1):
    cb_temp = CatBoostClassifier(**quiet_params)
    cb_temp.fit(X[train_idx], y[train_idx])

    y_val = y[val_idx]
    y_pred = get_predictions(cb_temp, X[val_idx])

    score = f1_score(y_val, y_pred, average='macro')
    final_cv_scores.append(score)
    # Per-class scores are kept so the next cell can average them over folds
    final_cv_per_class.append(f1_score(y_val, y_pred, average=None))
    print(f"Fold {fold_number}: {score:.4f}")

print(f"\nFinal CV: {np.mean(final_cv_scores):.4f} (+/- {np.std(final_cv_scores):.4f})")

In [None]:
# Per-class CV performance
print("\n=== Per-Class CV Performance ===")
# Stack the fold results: one row per fold, one column per class
fold_scores = np.asarray(final_cv_per_class)
final_cv_per_class_mean = fold_scores.mean(axis=0)
final_cv_per_class_std = fold_scores.std(axis=0)

for class_id, label in enumerate(labels):
    mean_f1 = final_cv_per_class_mean[class_id]
    std_f1 = final_cv_per_class_std[class_id]
    print(f"{label:15s}: {mean_f1:.4f} (+/- {std_f1:.4f})")

## 7. Compare Baseline vs Tuned Model

In [None]:
# Comparison
def _comparison_row(name, baseline_value, tuned_value, name_width=25):
    """Format one table line: name, baseline, tuned, and (tuned - baseline) * 100."""
    delta = (tuned_value - baseline_value) * 100
    return f"{name:<{name_width}} {baseline_value:<15.4f} {tuned_value:<15.4f} {delta:+.2f}%"


# CV aggregates (also read by the plotting and export cells)
baseline_cv_mean, final_cv_mean = np.mean(baseline_cv_scores), np.mean(final_cv_scores)
baseline_cv_std, final_cv_std = np.std(baseline_cv_scores), np.std(final_cv_scores)

print("=" * 60, "BASELINE vs TUNED MODEL COMPARISON", "=" * 60, sep="\n")

print(f"\n{'Metric':<25} {'Baseline':<15} {'Tuned':<15} {'Improvement':<15}")
print("-" * 70)

summary_rows = [
    ('Holdout F1 Macro', baseline_f1, final_f1),
    ('CV F1 Macro (mean)', baseline_cv_mean, final_cv_mean),
    ('CV F1 Std', baseline_cv_std, final_cv_std),
]
for summary_row in summary_rows:
    print(_comparison_row(*summary_row))

# Per-class comparison
print("\n--- Per-Class F1 (Holdout) ---")
baseline_per_class_arr = np.array(baseline_per_class)
final_per_class_arr = np.array(final_per_class)

for class_id, label in enumerate(labels):
    print(_comparison_row(
        label,
        baseline_per_class_arr[class_id],
        final_per_class_arr[class_id],
        name_width=15,
    ))

In [None]:
# Visualization
width = 0.35


def _paired_bars(ax, names, baseline_scores, tuned_scores, title, **ticklabel_kwargs):
    """Draw side-by-side baseline/tuned bars for each name on the given axes."""
    x = np.arange(len(names))
    ax.bar(x - width/2, baseline_scores, width, label='Baseline', color='steelblue')
    ax.bar(x + width/2, tuned_scores, width, label='Tuned', color='coral')
    ax.set_ylabel('F1 Score')
    ax.set_title(title)
    ax.set_xticks(x)
    ax.set_xticklabels(names, **ticklabel_kwargs)
    ax.legend()
    ax.set_ylim(0, 1)


fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Overall comparison
metrics = ['Holdout F1', 'CV Mean F1']
baseline_vals = [baseline_f1, baseline_cv_mean]
tuned_vals = [final_f1, final_cv_mean]
_paired_bars(axes[0], metrics, baseline_vals, tuned_vals, 'Baseline vs Tuned Model')

# Per-class comparison (class names are tilted so they do not overlap)
_paired_bars(
    axes[1], labels, baseline_per_class_arr, final_per_class_arr,
    'Per-Class F1 Comparison', rotation=45, ha='right',
)

plt.tight_layout()
plt.show()

## 8. Final Confusion Matrix

In [None]:
# Final model confusion matrix (rows: true class, columns: predicted)
y_pred_final = get_predictions(final_model, X_test)
cm = confusion_matrix(y_test, y_pred_final)

final_cm_fig, final_cm_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
    ax=final_cm_ax,
)
final_cm_ax.set_title('Final Tuned Model Confusion Matrix')
final_cm_ax.set_ylabel('True Label')
final_cm_ax.set_xlabel('Predicted Label')
final_cm_fig.tight_layout()
plt.show()

# Classification report
print("\n=== Final Model Classification Report ===")
final_report = classification_report(
    y_test, y_pred_final, target_names=labels, zero_division=0
)
print(final_report)

## 9. Feature Importance (Tuned Model)

In [None]:
# Feature importance: one score per input column, highest first
importance = final_model.get_feature_importance()
importance_df = pd.DataFrame({'feature': FEATURES, 'importance': importance})
importance_df = importance_df.sort_values(by='importance', ascending=False)

print("=== Top 15 Features (Tuned Model) ===")
top_15 = importance_df.head(15)
print(top_15.to_string(index=False))

In [None]:
# Visualize the highest-ranked features as a horizontal bar chart
top_n = 20
top_features = importance_df.head(top_n)
bar_positions = list(range(len(top_features)))

importance_fig, importance_ax = plt.subplots(figsize=(12, 10))
importance_ax.barh(bar_positions, top_features['importance'].values, color='coral')
importance_ax.set_yticks(bar_positions)
importance_ax.set_yticklabels(top_features['feature'].values)
importance_ax.set_xlabel('Importance')
importance_ax.set_title(f'Top {top_n} Most Important Features (Tuned Model)')
# Flip the axis so the most important feature sits at the top
importance_ax.invert_yaxis()
importance_fig.tight_layout()
plt.show()

## 10. Save Production Model

In [None]:
import os

production_dir = '/content/production'
os.makedirs(production_dir, exist_ok=True)

# Save final model
final_model.save_model(f'{production_dir}/catboost_whale_classifier_production.cbm')

# Save label encoder (needed to turn predicted ids back into class names)
with open(f'{production_dir}/label_encoder_production.pkl', 'wb') as encoder_file:
    pickle.dump(le, encoder_file)

# Save feature list
with open(f'{production_dir}/features.json', 'w') as features_file:
    json.dump({'features': FEATURES}, features_file, indent=2)

# Save model config and results
performance = {
    'holdout_f1_macro': float(final_f1),
    'cv_f1_macro_mean': float(final_cv_mean),
    'cv_f1_macro_std': float(final_cv_std),
    # numpy floats are cast to plain floats before being written as JSON
    'per_class_f1': dict(zip(labels, map(float, final_per_class))),
}
baseline_comparison = {
    'baseline_cv_f1': float(baseline_cv_mean),
    'tuned_cv_f1': float(final_cv_mean),
    'improvement_pct': float((final_cv_mean - baseline_cv_mean) * 100),
}
merged_classes = {
    'original': ['Institutional', 'CEX_Wallet'],
    'merged_to': 'Large_Holder',
}
model_config = {
    'model_type': 'CatBoost',
    'n_classes': n_classes,
    'classes': labels,
    'n_features': len(FEATURES),
    'best_params': best_params,
    'performance': performance,
    'baseline_comparison': baseline_comparison,
    'training_date': datetime.now().isoformat(),
    'merged_classes': merged_classes,
}

with open(f'{production_dir}/model_config.json', 'w') as config_file:
    json.dump(model_config, config_file, indent=2)

print("Production model saved locally")
print(json.dumps(model_config, indent=2))

In [None]:
# Upload to GCS: copy everything under /content/production into the bucket's
# models/production/ prefix.
!gsutil -m cp -r /content/production/* gs://{BUCKET}/models/production/
print(f"\nUploaded to gs://{BUCKET}/models/production/")

In [None]:
# Also save search results for reference
results_df[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']].to_csv(
    '/content/production/tuning_results.csv', index=False
)
!gsutil cp /content/production/tuning_results.csv gs://{BUCKET}/models/production/
print("Tuning results saved")

## 11. Production Usage Example

In [None]:
# Example: How to use the production model
# The snippet below is only printed, never executed here; extract_features
# inside it is a placeholder the reader has to supply.
# NOTE(review): the snippet restores the label encoder with pickle.load, which
# can execute arbitrary code -- only load the .pkl from a trusted bucket.
print("=" * 60)
print("PRODUCTION USAGE EXAMPLE")
print("=" * 60)

print("""
# Load model
from catboost import CatBoostClassifier
import pickle
import json

# Download from GCS
# gsutil cp gs://smt-weex-2025-models/models/production/* ./

# Load
model = CatBoostClassifier()
model.load_model('catboost_whale_classifier_production.cbm')

with open('label_encoder_production.pkl', 'rb') as f:
    le = pickle.load(f)

with open('features.json', 'r') as f:
    features = json.load(f)['features']

# Predict
X_new = extract_features(wallet_address)  # Your feature extraction
X_new = X_new[features].values.reshape(1, -1)

prediction = model.predict(X_new)
category = le.inverse_transform(prediction.flatten())[0]

# Get probabilities
probas = model.predict_proba(X_new)
confidence = probas.max()

print(f"Category: {category}, Confidence: {confidence:.2%}")
""")

## Summary

### Results:
- **Baseline (5-class):** CV F1 = [see output]
- **Tuned (5-class):** CV F1 = [see output]
- **Improvement:** [see output]

### Best Parameters:
[see output above]

### Production Model:
- Saved to: `gs://smt-weex-2025-models/models/production/`
- Files:
  - `catboost_whale_classifier_production.cbm`
  - `label_encoder_production.pkl`
  - `features.json`
  - `model_config.json`

### Per-Class Reliability (for Trading Signals):
| Category | F1 | Trading Action |
|----------|-----|----------------|
| Miner | [see output] | BEARISH on sells |
| DeFi_Trader | [see output] | FOLLOW with caution |
| Large_Holder | [see output] | Exchange flow |
| Staker | [see output] | Weak unstake signal |
| Exploiter | [see output] | AVOID |

### Next Steps:
1. Integrate model into FastAPI backend
2. Set up real-time inference pipeline
3. Monitor model performance in production