In [1]:
#==============================================================================
# CELL 1: IMPORT LIBRARIES
#==============================================================================
"""
Import semua libraries yang dibutuhkan untuk hyperparameter tuning.
"""
# Data manipulation
import pandas as pd
import numpy as np
from collections import Counter

# Machine Learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import (
    RandomizedSearchCV, 
    cross_val_score, 
    StratifiedKFold,
    cross_validate
)

# Imbalanced learning
from imblearn.over_sampling import SMOTE, ADASYN

# Evaluation metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, auc,
    confusion_matrix, classification_report,
    precision_recall_curve, average_precision_score,
    cohen_kappa_score, matthews_corrcoef,
    balanced_accuracy_score, make_scorer
)

# Statistical testing
from scipy import stats
from statsmodels.stats.contingency_tables import mcnemar

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Utilities
import warnings
import json
import os
import joblib
from datetime import datetime
import time

# Suppress every warning category for the rest of the session.
# Note: this also hides deprecation and undefined-metric warnings.
warnings.filterwarnings('ignore')

# pandas display: never truncate columns, render floats with four decimals
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.4f}'.format)

# matplotlib / seaborn defaults applied to every figure created afterwards
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'savefig.dpi': 300,
})
sns.set_style("whitegrid")
sns.set_palette("husl")

# Startup banner (one print call, one item per output line)
print(
    "=" * 80,
    "FASE 4: HYPERPARAMETER TUNING & MODEL OPTIMIZATION",
    "=" * 80,
    "\n‚úÖ All libraries imported successfully!",
    "=" * 80,
    sep="\n",
)

FASE 4: HYPERPARAMETER TUNING & MODEL OPTIMIZATION

‚úÖ All libraries imported successfully!


In [2]:
#==============================================================================
# CELL 2: LOAD DATA & BASELINE MODEL
#==============================================================================
"""
Load the preprocessed train/test data together with the saved baseline model
and its metrics, which are used for comparison.
"""
print("\n" + "="*80, "üìÇ LOADING DATA & BASELINE MODEL", "="*80, sep="\n")

# Training split: features stay a DataFrame, the target is flattened to 1-D
X_train = pd.read_csv('../data/processed/X_train.csv')
y_train = pd.read_csv('../data/processed/y_train.csv').values.ravel()

# Test split, loaded the same way
X_test = pd.read_csv('../data/processed/X_test.csv')
y_test = pd.read_csv('../data/processed/y_test.csv').values.ravel()

# Previously trained baseline model (joblib pickle) and its stored metrics
baseline_model = joblib.load('../models/baseline_rf_model.pkl')
with open('../results/metrics/03_baseline_metrics.json', 'r') as metrics_file:
    baseline_metrics = json.load(metrics_file)

# --- Dataset summary ---------------------------------------------------------
print("\n‚úÖ Data Loaded:")
print(f"  ‚Ä¢ Training samples: {len(X_train)}")
print(f"  ‚Ä¢ Test samples: {len(X_test)}")
print(f"  ‚Ä¢ Features: {X_train.shape[1]}")
print(f"  ‚Ä¢ Feature names: {X_train.columns.tolist()}")

# --- Class balance per split (ratio = class 0 count / class 1 count) ---------
train_dist = Counter(y_train)
test_dist = Counter(y_test)
print("\n‚úÖ Class Distribution:")
for split_name, dist in (("Training", train_dist), ("Test", test_dist)):
    print(f"  ‚Ä¢ {split_name}: Class 0={dist[0]}, Class 1={dist[1]} (ratio: {dist[0]/dist[1]:.2f})")

# --- Baseline recap, read from the stored metrics JSON -----------------------
baseline_test = baseline_metrics['test_metrics']
print("\n‚úÖ Baseline Model Performance (Recap):")
for label, key in (("Test Accuracy", 'accuracy'),
                   ("Test ROC-AUC", 'roc_auc'),
                   ("Test F1-Score", 'f1_score')):
    print(f"  ‚Ä¢ {label}: {baseline_test[key]:.4f}")
print(f"  ‚Ä¢ Training Time: {baseline_metrics['model_info']['training_time_seconds']:.2f}s")

# Baseline predictions on the test set, kept for later comparison
y_test_pred_baseline = baseline_model.predict(X_test)

print("\n" + "="*80)


üìÇ LOADING DATA & BASELINE MODEL

‚úÖ Data Loaded:
  ‚Ä¢ Training samples: 1508
  ‚Ä¢ Test samples: 377
  ‚Ä¢ Features: 24
  ‚Ä¢ Feature names: ['Age_Encoded', 'Gender_Encoded', 'Education_Encoded', 'country_Australia', 'country_Canada', 'country_New Zealand', 'country_Other', 'country_Republic of Ireland', 'country_UK', 'country_USA', 'ethnicity_Asian', 'ethnicity_Black', 'ethnicity_Mixed-Black/Asian', 'ethnicity_Mixed-White/Asian', 'ethnicity_Mixed-White/Black', 'ethnicity_Other', 'ethnicity_White', 'Nscore', 'Escore', 'Oscore', 'Ascore', 'Cscore', 'Impulsive', 'SS']

‚úÖ Class Distribution:
  ‚Ä¢ Training: Class 0=568, Class 1=940 (ratio: 0.60)
  ‚Ä¢ Test: Class 0=142, Class 1=235 (ratio: 0.60)

‚úÖ Baseline Model Performance (Recap):
  ‚Ä¢ Test Accuracy: 0.8647
  ‚Ä¢ Test ROC-AUC: 0.9271
  ‚Ä¢ Test F1-Score: 0.8898
  ‚Ä¢ Training Time: 0.47s



In [3]:
#==============================================================================
# CELL 3: DEFINE HYPERPARAMETER SEARCH SPACE
#==============================================================================
"""
Define the hyperparameter search space that RandomizedSearchCV samples from.
"""
print("\n" + "="*80, "üîß DEFINING HYPERPARAMETER SEARCH SPACE", "="*80, sep="\n")

# Candidate values per RandomForestClassifier hyperparameter
param_distributions = dict(
    n_estimators=[100, 200, 300, 500, 1000],                   # number of trees
    max_depth=[10, 20, 30, 40, None],                          # tree depth limit (None = unlimited)
    min_samples_split=[2, 5, 10, 20],                          # min samples to split a node
    min_samples_leaf=[1, 2, 4, 8],                             # min samples at a leaf node
    max_features=['sqrt', 'log2', 0.3, 0.5],                   # features considered per split
    bootstrap=[True, False],                                   # bootstrap sampling on/off
    class_weight=['balanced', 'balanced_subsample', None],     # class-imbalance handling
    criterion=['gini', 'entropy'],                             # split quality measure
)

print("\n‚úÖ Search Space Defined:")
for name, candidates in param_distributions.items():
    print(f"  ‚Ä¢ {name:20s}: {candidates}")

# Size of the full grid = product of the number of candidates per parameter
total_combinations = int(np.prod([len(options) for options in param_distributions.values()]))

print(f"\n‚úÖ Total possible combinations: {total_combinations:,}")
print("‚úÖ RandomizedSearchCV will sample: 100 combinations")
print("‚úÖ With 5-fold CV: 100 √ó 5 = 500 model fits")
print("‚úÖ Estimated time: 10-30 minutes (depending on hardware)")

print("\n" + "="*80)


üîß DEFINING HYPERPARAMETER SEARCH SPACE

‚úÖ Search Space Defined:
  ‚Ä¢ n_estimators        : [100, 200, 300, 500, 1000]
  ‚Ä¢ max_depth           : [10, 20, 30, 40, None]
  ‚Ä¢ min_samples_split   : [2, 5, 10, 20]
  ‚Ä¢ min_samples_leaf    : [1, 2, 4, 8]
  ‚Ä¢ max_features        : ['sqrt', 'log2', 0.3, 0.5]
  ‚Ä¢ bootstrap           : [True, False]
  ‚Ä¢ class_weight        : ['balanced', 'balanced_subsample', None]
  ‚Ä¢ criterion           : ['gini', 'entropy']

‚úÖ Total possible combinations: 19,200
‚úÖ RandomizedSearchCV will sample: 100 combinations
‚úÖ With 5-fold CV: 100 √ó 5 = 500 model fits
‚úÖ Estimated time: 10-30 minutes (depending on hardware)



In [4]:
#==============================================================================
# CELL 4: RANDOMIZED SEARCH CV
#==============================================================================
"""
Run RandomizedSearchCV to find the best Random Forest hyperparameters.
This is the slowest cell: the notebook estimates 10-30 minutes depending on
hardware.
"""
print("\n" + "="*80)
print("üîç HYPERPARAMETER TUNING - RANDOMIZED SEARCH CV")
print("="*80)

# Single source of truth for the search configuration. The log messages below
# are derived from these values so they cannot drift from what actually runs.
SEARCH_N_ITER = 100          # number of sampled hyperparameter combinations
SEARCH_CV_SPLITS = 5         # number of stratified CV folds
SEARCH_SCORING = 'roc_auc'   # optimize for ROC-AUC as per proposal
SEARCH_SEED = 42             # shared seed for CV shuffling, the forest and sampling
search_total_fits = SEARCH_N_ITER * SEARCH_CV_SPLITS

# Define cross-validation strategy
cv_strategy = StratifiedKFold(
    n_splits=SEARCH_CV_SPLITS, shuffle=True, random_state=SEARCH_SEED
)

# Initialize base Random Forest
# NOTE(review): both the forest and the search request n_jobs=-1. Nested
# parallelism like this can oversubscribe the CPU; consider n_jobs=1 on the
# estimator while searching. Left unchanged here to keep the fitted
# best_estimator_ configured exactly as before.
rf_base = RandomForestClassifier(
    random_state=SEARCH_SEED,
    n_jobs=-1,  # Use all CPU cores
    verbose=0
)

# Initialize RandomizedSearchCV
print("\n‚öôÔ∏è  Setting up RandomizedSearchCV...")
random_search = RandomizedSearchCV(
    estimator=rf_base,
    param_distributions=param_distributions,
    n_iter=SEARCH_N_ITER,
    scoring=SEARCH_SCORING,
    cv=cv_strategy,
    verbose=2,  # Show progress
    random_state=SEARCH_SEED,
    n_jobs=-1,  # Parallel processing
    return_train_score=True
)

print("\n‚úÖ RandomizedSearchCV Configuration:")
print(f"  ‚Ä¢ Estimator: {type(rf_base).__name__}")
print(f"  ‚Ä¢ n_iter: {SEARCH_N_ITER} (sample {SEARCH_N_ITER} combinations)")
print(f"  ‚Ä¢ Scoring: {SEARCH_SCORING}")
print(f"  ‚Ä¢ CV: {SEARCH_CV_SPLITS}-fold Stratified")
print(f"  ‚Ä¢ n_jobs: -1 (all cores)")
print(f"  ‚Ä¢ Total fits: {SEARCH_N_ITER} √ó {SEARCH_CV_SPLITS} = {search_total_fits}")

# Perform hyperparameter search
print("\nüöÄ Starting hyperparameter search...")
print("‚è±Ô∏è  This will take 10-30 minutes depending on your hardware...")
print("-"*80)

# perf_counter() is monotonic, so the elapsed time is not affected by
# system clock adjustments during the multi-minute search.
start_time = time.perf_counter()
random_search.fit(X_train, y_train)
tuning_time = time.perf_counter() - start_time

print("\n" + "="*80)
print(f"‚úÖ HYPERPARAMETER TUNING COMPLETED!")
print(f"‚è±Ô∏è  Total time: {tuning_time/60:.2f} minutes ({tuning_time:.0f} seconds)")
print("="*80)


üîç HYPERPARAMETER TUNING - RANDOMIZED SEARCH CV

‚öôÔ∏è  Setting up RandomizedSearchCV...

‚úÖ RandomizedSearchCV Configuration:
  ‚Ä¢ Estimator: RandomForestClassifier
  ‚Ä¢ n_iter: 100 (sample 100 combinations)
  ‚Ä¢ Scoring: roc_auc
  ‚Ä¢ CV: 5-fold Stratified
  ‚Ä¢ n_jobs: -1 (all cores)
  ‚Ä¢ Total fits: 100 √ó 5 = 500

üöÄ Starting hyperparameter search...
‚è±Ô∏è  This will take 10-30 minutes depending on your hardware...
--------------------------------------------------------------------------------
Fitting 5 folds for each of 100 candidates, totalling 500 fits

‚úÖ HYPERPARAMETER TUNING COMPLETED!
‚è±Ô∏è  Total time: 5.03 minutes (302 seconds)


In [5]:
#==============================================================================
# CELL 5: BEST HYPERPARAMETERS
#==============================================================================
"""
Extract and display the best hyperparameters found by the search.
"""
print("\n" + "="*80, "üèÜ BEST HYPERPARAMETERS FOUND", "="*80, sep="\n")

# Winning configuration, its mean CV score, and the refitted estimator
best_params = random_search.best_params_
best_score = random_search.best_score_
best_rf_model = random_search.best_estimator_

print("\n‚úÖ Best Parameters:")
print("-"*80)
for param in sorted(best_params):  # alphabetical by parameter name
    print(f"  ‚Ä¢ {param:20s}: {best_params[param]}")

print("\n‚úÖ Best Cross-Validation Score:")
print(f"  ‚Ä¢ ROC-AUC: {best_score:.4f}")

# Read the same settings back from the estimator itself, in a fixed order
print("\n‚úÖ Best Model Info:")
for attr in ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf',
             'max_features', 'criterion', 'bootstrap', 'class_weight'):
    print(f"  ‚Ä¢ {attr}: {getattr(best_rf_model, attr)}")

print("\n" + "="*80)


üèÜ BEST HYPERPARAMETERS FOUND

‚úÖ Best Parameters:
--------------------------------------------------------------------------------
  ‚Ä¢ bootstrap           : True
  ‚Ä¢ class_weight        : None
  ‚Ä¢ criterion           : gini
  ‚Ä¢ max_depth           : 20
  ‚Ä¢ max_features        : log2
  ‚Ä¢ min_samples_leaf    : 8
  ‚Ä¢ min_samples_split   : 10
  ‚Ä¢ n_estimators        : 500

‚úÖ Best Cross-Validation Score:
  ‚Ä¢ ROC-AUC: 0.8984

‚úÖ Best Model Info:
  ‚Ä¢ n_estimators: 500
  ‚Ä¢ max_depth: 20
  ‚Ä¢ min_samples_split: 10
  ‚Ä¢ min_samples_leaf: 8
  ‚Ä¢ max_features: log2
  ‚Ä¢ criterion: gini
  ‚Ä¢ bootstrap: True
  ‚Ä¢ class_weight: None



In [6]:
#==============================================================================
# CELL 6: EVALUATE TUNED MODEL ON TEST SET
#==============================================================================
"""
Evaluate the tuned model on the training and test sets with a comprehensive
set of metrics.
"""
print("\n" + "="*80, "üìä EVALUATING TUNED MODEL ON TEST SET", "="*80, sep="\n")

print("\n‚öôÔ∏è  Generating predictions...")

# Hard class predictions for both splits
y_train_pred_tuned, y_test_pred_tuned = (
    best_rf_model.predict(features) for features in (X_train, X_test)
)

# Predicted probabilities for both splits; column 1 of predict_proba is kept
y_train_proba_tuned, y_test_proba_tuned = (
    best_rf_model.predict_proba(features)[:, 1] for features in (X_train, X_test)
)

print("‚úÖ Predictions generated!")

# Calculate comprehensive metrics
def calculate_all_metrics(y_true, y_pred, y_pred_proba, set_name=""):
    """Compute a dictionary of binary-classification metrics for one split.

    Labels are assumed to be encoded as 0 (negative) and 1 (positive); the
    confusion matrix is pinned to that order so it is always 2x2, even when
    one class is missing from both ``y_true`` and ``y_pred``.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels.
        y_pred_proba: Scores passed to ``roc_auc_score`` and
            ``average_precision_score`` (the notebook passes the second
            ``predict_proba`` column).
        set_name: Label stored under the ``'Set'`` key (e.g. "Training").

    Returns:
        dict with keys 'Set', 'Accuracy', 'Precision', 'Recall',
        'Specificity', 'F1-Score', 'Balanced Accuracy', 'ROC-AUC', 'PR-AUC',
        'Cohen Kappa', 'MCC', and the integer counts 'TP', 'TN', 'FP', 'FN'.
    """
    # Without explicit labels the matrix collapses to 1x1 when only one class
    # occurs, and the 4-way unpacking below would raise ValueError.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    # Specificity = TN / (TN + FP); defined as 0.0 when there are no negatives
    negatives = tn + fp
    specificity = tn / negatives if negatives > 0 else 0.0

    # NOTE(review): ROC-AUC / PR-AUC are delegated to scikit-learn; how a
    # split with a single class in y_true is handled depends on the installed
    # version (it may raise) -- confirm if such splits are possible.
    metrics = {
        'Set': set_name,
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, zero_division=0),
        'Recall': recall_score(y_true, y_pred, zero_division=0),
        'Specificity': specificity,
        'F1-Score': f1_score(y_true, y_pred, zero_division=0),
        'Balanced Accuracy': balanced_accuracy_score(y_true, y_pred),
        'ROC-AUC': roc_auc_score(y_true, y_pred_proba),
        'PR-AUC': average_precision_score(y_true, y_pred_proba),
        'Cohen Kappa': cohen_kappa_score(y_true, y_pred),
        'MCC': matthews_corrcoef(y_true, y_pred),
        # Cast numpy integers to plain ints
        'TP': int(tp),
        'TN': int(tn),
        'FP': int(fp),
        'FN': int(fn)
    }

    return metrics

# Metrics for each split, computed with calculate_all_metrics
train_metrics_tuned = calculate_all_metrics(
    y_train, y_train_pred_tuned, y_train_proba_tuned, "Training"
)
test_metrics_tuned = calculate_all_metrics(
    y_test, y_test_pred_tuned, y_test_proba_tuned, "Test"
)

# One row per split, indexed by the split name
tuned_df = pd.DataFrame([train_metrics_tuned, test_metrics_tuned])
tuned_df = tuned_df.set_index('Set')

# Score table
display_cols = [
    'Accuracy', 'Precision', 'Recall', 'Specificity', 'F1-Score',
    'Balanced Accuracy', 'ROC-AUC', 'PR-AUC', 'Cohen Kappa', 'MCC',
]
print("\n‚úÖ TUNED MODEL PERFORMANCE:")
print("="*80)
print(tuned_df[display_cols].round(4).to_string())

# Raw confusion-matrix counts
cm_cols = ['TP', 'TN', 'FP', 'FN']
print("\n‚úÖ Confusion Matrix Breakdown:")
print("-"*80)
print(tuned_df[cm_cols].to_string())

# Overfitting check: signed accuracy gap between training and test
train_accuracy = train_metrics_tuned['Accuracy']
test_accuracy = test_metrics_tuned['Accuracy']
train_test_gap = train_accuracy - test_accuracy

print("\n‚úÖ Overfitting Analysis:")
print(f"  ‚Ä¢ Training Accuracy: {train_accuracy:.4f}")
print(f"  ‚Ä¢ Test Accuracy: {test_accuracy:.4f}")
print(f"  ‚Ä¢ Gap: {train_test_gap:.4f} ({train_test_gap*100:.2f}%)")

# Thresholds are checked from largest to smallest; the first one exceeded wins
for gap_threshold, status_line in (
    (0.10, "  ‚Ä¢ Status: ‚ö†Ô∏è  Still some overfitting (gap > 10%)"),
    (0.05, "  ‚Ä¢ Status: ‚ö†Ô∏è  Moderate overfitting (gap 5-10%)"),
):
    if train_test_gap > gap_threshold:
        print(status_line)
        break
else:
    print("  ‚Ä¢ Status: ‚úÖ Good generalization (gap < 5%)")

print("\n" + "="*80)


üìä EVALUATING TUNED MODEL ON TEST SET

‚öôÔ∏è  Generating predictions...
‚úÖ Predictions generated!

‚úÖ TUNED MODEL PERFORMANCE:
          Accuracy  Precision  Recall  Specificity  F1-Score  Balanced Accuracy  ROC-AUC  PR-AUC  Cohen Kappa    MCC
Set                                                                                                                 
Training    0.8707     0.9045  0.8862       0.8451    0.8952             0.8656   0.9441  0.9680       0.7264 0.7267
Test        0.8621     0.9031  0.8723       0.8451    0.8874             0.8587   0.9347  0.9612       0.7095 0.7102

‚úÖ Confusion Matrix Breakdown:
--------------------------------------------------------------------------------
           TP   TN  FP   FN
Set                        
Training  833  480  88  107
Test      205  120  22   30

‚úÖ Overfitting Analysis:
  ‚Ä¢ Training Accuracy: 0.8707
  ‚Ä¢ Test Accuracy: 0.8621
  ‚Ä¢ Gap: 0.0086 (0.86%)
  ‚Ä¢ Status: ‚úÖ Good generalization (gap < 5%)

