# HW06 – Decision Trees & Ensembles

Семинар S06: Деревья решений и ансамбли (Bagging / Random Forest / Boosting)

**Dataset:** S06-hw-dataset-04.csv (бинарная классификация, сильный дисбаланс)

**Цель:** Честный ML-эксперимент с контролем сложности, подбором гиперпараметров и сравнением моделей.

## 1. Setup & Imports

In [None]:
import pandas as pd
import numpy as np
import warnings
# Silence every warning category for the whole session to keep cell output compact.
# NOTE(review): this blanket filter also hides library warnings (for example
# convergence or deprecation notices) — consider narrowing it; confirm intent.
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score, roc_curve, confusion_matrix,
    auc, precision_recall_curve
)
from sklearn.inspection import permutation_importance

import matplotlib.pyplot as plt
import seaborn as sns
import json
import joblib
import os

# Global experiment configuration: one fixed seed shared by the split and all
# estimators, plot styling, and the output folder for figures/artifacts.
RANDOM_STATE = 42

sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

# Figures are written to 'artifacts/figures'; the parent 'artifacts' folder
# is created along the way and reused by the JSON/joblib dumps.
os.makedirs('artifacts/figures', exist_ok=True)
print('‚úì Setup complete')

## 2. Data Loading & EDA

In [None]:
# Load the homework dataset and print a first-look overview:
# shape, leading rows, column dtypes / non-null counts, and summary statistics.
df = pd.read_csv('S06-hw-dataset-04.csv')

print('Dataset Shape:', df.shape)
print('\nFirst rows:')
print(df.head())

print('\nData Info:')
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()

print('\nBasic Stats:')
print(df.describe())

In [None]:
# Report how the binary target is distributed: absolute counts, shares,
# and the negative-to-positive ratio.
banner = '='*70
print('\n' + banner)
print('TARGET DISTRIBUTION')
print(banner)

target_counts = df['target'].value_counts()
target_props = df['target'].value_counts(normalize=True)

# Both Series are indexed by the class label itself (0 / 1).
n_class0, n_class1 = target_counts[0], target_counts[1]
share_class0, share_class1 = target_props[0], target_props[1]

print(f'\nClass 0: {n_class0} ({share_class0:.2%})')
print(f'Class 1: {n_class1} ({share_class1:.2%})')
print(f'\nImbalance ratio: {n_class0 / n_class1:.1f}:1')
print('\nNote: Strong class imbalance (fraud-like). Accuracy alone is insufficient!')
print('We will use F1 and ROC-AUC as primary metrics.')

In [None]:
# Separate predictors from the label. The 'id' column is dropped together
# with 'target', so it is not used as a feature.
X = df.drop(columns=['id', 'target'])
y = df['target']

n_features = X.shape[1]
leading_feature_names = list(X.columns[:10])
missing_in_features = X.isnull().sum().sum()
missing_in_target = y.isnull().sum()

print(f'Features: {n_features} columns')
print(f'Feature names (first 10): {leading_feature_names}')
print(f'\nTarget: {y.shape[0]} samples, {y.nunique()} classes')
print(f'\nMissing values: {missing_in_features} in features, {missing_in_target} in target')

## 3. Train/Test Split

In [None]:
# Hold out 25% of the rows as the test set. The split is stratified on the
# target and seeded with RANDOM_STATE so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=RANDOM_STATE,
)

n_total = len(X)
n_train = X_train.shape[0]
n_test = X_test.shape[0]

print('='*70)
print('TRAIN/TEST SPLIT')
print('='*70)
print(f'\nTrain size: {n_train} ({n_train/n_total:.1%})')
print(f'Test size: {n_test} ({n_test/n_total:.1%})')

# Show the class shares of each split side by side for a visual check.
for split_name, split_target in (('Train', y_train), ('Test', y_test)):
    print(f'\n{split_name} target distribution:')
    print(split_target.value_counts(normalize=True).round(4))

print('\n‚úì Stratification preserved class balance in both splits')

## 4. Baseline Models

In [None]:
# Registry of every evaluated model: name -> test metrics, fitted estimator,
# hard predictions and positive-class probabilities.
results = {}

# Baseline 1: DummyClassifier configured with strategy='most_frequent'.
dummy_clf = DummyClassifier(strategy='most_frequent', random_state=RANDOM_STATE)
dummy_clf.fit(X_train, y_train)

y_pred_dummy = dummy_clf.predict(X_test)
# Column 1 of predict_proba is taken as the score for class 1.
y_pred_proba_dummy = dummy_clf.predict_proba(X_test)[:, 1]

dummy_entry = {
    'accuracy': accuracy_score(y_test, y_pred_dummy),
    # zero_division=0 sets F1 to 0 instead of warning when a denominator is zero.
    'f1': f1_score(y_test, y_pred_dummy, zero_division=0),
    'roc_auc': roc_auc_score(y_test, y_pred_proba_dummy),
    'model': dummy_clf,
    'y_pred': y_pred_dummy,
    'y_pred_proba': y_pred_proba_dummy,
}
results['DummyClassifier'] = dummy_entry

print('Baseline 1: DummyClassifier (most_frequent)')
print(f"  Accuracy: {dummy_entry['accuracy']:.4f}")
print(f"  F1: {dummy_entry['f1']:.4f}")
print(f"  ROC-AUC: {dummy_entry['roc_auc']:.4f}")

In [None]:
# Baseline 2: logistic regression on standardized features with balanced
# class weights. The scaler lives inside the pipeline, so it is fitted on the
# training data only.
lr_estimator = LogisticRegression(
    random_state=RANDOM_STATE,
    max_iter=1000,
    class_weight='balanced',
)
pipeline_lr = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('lr', lr_estimator),
])
pipeline_lr.fit(X_train, y_train)

y_pred_lr = pipeline_lr.predict(X_test)
y_pred_proba_lr = pipeline_lr.predict_proba(X_test)[:, 1]

lr_entry = {
    'accuracy': accuracy_score(y_test, y_pred_lr),
    'f1': f1_score(y_test, y_pred_lr),
    'roc_auc': roc_auc_score(y_test, y_pred_proba_lr),
    'model': pipeline_lr,
    'y_pred': y_pred_lr,
    'y_pred_proba': y_pred_proba_lr,
    'params': {'class_weight': 'balanced'},
}
results['LogisticRegression'] = lr_entry

print('\nBaseline 2: LogisticRegression (with StandardScaler & balanced weights)')
print(f"  Accuracy: {lr_entry['accuracy']:.4f}")
print(f"  F1: {lr_entry['f1']:.4f}")
print(f"  ROC-AUC: {lr_entry['roc_auc']:.4f}")

## 5. Decision Tree

In [None]:
# Decision tree: complexity is controlled through depth and minimum node
# sizes; the grid is scored by 5-fold cross-validated ROC-AUC on the train set.
dt_params = {
    'max_depth': [3, 5, 7, 10, 15, None],
    'min_samples_leaf': [5, 10, 20],
    'min_samples_split': [10, 20],
}

dt_base = DecisionTreeClassifier(random_state=RANDOM_STATE, class_weight='balanced')
dt_grid = GridSearchCV(
    estimator=dt_base,
    param_grid=dt_params,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
dt_grid.fit(X_train, y_train)

print('DecisionTree: Hyperparameter Search Results')
print(f'Best params: {dt_grid.best_params_}')
print(f'Best CV ROC-AUC: {dt_grid.best_score_:.4f}')

# Evaluate the refitted best estimator on the held-out test set.
dt_best = dt_grid.best_estimator_
y_pred_dt = dt_best.predict(X_test)
y_pred_proba_dt = dt_best.predict_proba(X_test)[:, 1]

dt_entry = {
    'accuracy': accuracy_score(y_test, y_pred_dt),
    'f1': f1_score(y_test, y_pred_dt),
    'roc_auc': roc_auc_score(y_test, y_pred_proba_dt),
    'model': dt_best,
    'y_pred': y_pred_dt,
    'y_pred_proba': y_pred_proba_dt,
    'params': dt_grid.best_params_,
    'cv_score': dt_grid.best_score_,
}
results['DecisionTree'] = dt_entry

print("\nTest set metrics:")
print(f"  Accuracy: {dt_entry['accuracy']:.4f}")
print(f"  F1: {dt_entry['f1']:.4f}")
print(f"  ROC-AUC: {dt_entry['roc_auc']:.4f}")

## 6. Random Forest

In [None]:
# Random forest: grid search over forest size, tree depth, leaf size and the
# number of features tried per split, scored by 5-fold cross-validated ROC-AUC.
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, 15, None],
    'min_samples_leaf': [5, 10],
    'max_features': ['sqrt', 'log2']
}

# Parallelism is applied at one level only: GridSearchCV distributes the
# (candidate, fold) fits over all cores, so the forest keeps its default
# n_jobs during the search. Asking for n_jobs=-1 at both levels would start a
# full set of worker threads inside every search worker (oversubscription).
rf_base = RandomForestClassifier(random_state=RANDOM_STATE, class_weight='balanced')
rf_grid = GridSearchCV(
    rf_base, rf_params, cv=5, scoring='roc_auc', n_jobs=-1
)
rf_grid.fit(X_train, y_train)

print('RandomForest: Hyperparameter Search Results')
print(f'Best params: {rf_grid.best_params_}')
print(f'Best CV ROC-AUC: {rf_grid.best_score_:.4f}')

rf_best = rf_grid.best_estimator_
# The search is over, so the refitted forest may use every core again for
# prediction (and when reloaded from disk). n_jobs only affects how the work
# is scheduled, not the fitted trees or their predictions.
rf_best.set_params(n_jobs=-1)
y_pred_rf = rf_best.predict(X_test)
y_pred_proba_rf = rf_best.predict_proba(X_test)[:, 1]

results['RandomForest'] = {
    'accuracy': accuracy_score(y_test, y_pred_rf),
    'f1': f1_score(y_test, y_pred_rf),
    'roc_auc': roc_auc_score(y_test, y_pred_proba_rf),
    'model': rf_best,
    'y_pred': y_pred_rf,
    'y_pred_proba': y_pred_proba_rf,
    'params': rf_grid.best_params_,
    'cv_score': rf_grid.best_score_
}

print(f"\nTest set metrics:")
print(f"  Accuracy: {results['RandomForest']['accuracy']:.4f}")
print(f"  F1: {results['RandomForest']['f1']:.4f}")
print(f"  ROC-AUC: {results['RandomForest']['roc_auc']:.4f}")

## 7. Gradient Boosting

In [None]:
# Gradient boosting: grid search over ensemble size, learning rate, tree depth
# and leaf size, scored by 5-fold cross-validated ROC-AUC.
from sklearn.utils.class_weight import compute_sample_weight

gb_params = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5, 7],
    'min_samples_leaf': [5, 10]
}

gb_base = GradientBoostingClassifier(random_state=RANDOM_STATE)
gb_grid = GridSearchCV(
    gb_base, gb_params, cv=5, scoring='roc_auc', n_jobs=-1
)

# Every other trained model in this experiment uses class_weight='balanced'.
# GradientBoostingClassifier has no such parameter, so the equivalent
# re-weighting is supplied as per-sample weights. GridSearchCV slices the
# weights per fold and forwards them to each fit; without this the boosted
# model would be the only one trained on the raw imbalanced classes, which
# makes the thresholded metrics (F1, accuracy) not comparable across models.
gb_sample_weight = compute_sample_weight(class_weight='balanced', y=y_train)
gb_grid.fit(X_train, y_train, sample_weight=gb_sample_weight)

print('GradientBoosting: Hyperparameter Search Results')
print(f'Best params: {gb_grid.best_params_}')
print(f'Best CV ROC-AUC: {gb_grid.best_score_:.4f}')

# Evaluate the refitted best estimator on the held-out test set.
gb_best = gb_grid.best_estimator_
y_pred_gb = gb_best.predict(X_test)
y_pred_proba_gb = gb_best.predict_proba(X_test)[:, 1]

results['GradientBoosting'] = {
    'accuracy': accuracy_score(y_test, y_pred_gb),
    'f1': f1_score(y_test, y_pred_gb),
    'roc_auc': roc_auc_score(y_test, y_pred_proba_gb),
    'model': gb_best,
    'y_pred': y_pred_gb,
    'y_pred_proba': y_pred_proba_gb,
    'params': gb_grid.best_params_,
    'cv_score': gb_grid.best_score_
}

print(f"\nTest set metrics:")
print(f"  Accuracy: {results['GradientBoosting']['accuracy']:.4f}")
print(f"  F1: {results['GradientBoosting']['f1']:.4f}")
print(f"  ROC-AUC: {results['GradientBoosting']['roc_auc']:.4f}")

## 8. Results Summary

In [None]:
# Collect the test metrics of every model into one table (one row per model)
# and pick the model with the highest test ROC-AUC.
# NOTE(review): the winner is chosen on the same test set that is reported,
# so its score is optimistically biased — confirm this is acceptable here.
metric_rows = {
    model_name: {
        'Accuracy': data['accuracy'],
        'F1': data['f1'],
        'ROC-AUC': data['roc_auc'],
    }
    for model_name, data in results.items()
}
results_df = pd.DataFrame.from_dict(metric_rows, orient='index')

print('='*70)
print('FINAL TEST METRICS - ALL MODELS')
print('='*70)
print(results_df.round(4))

roc_auc_column = results_df['ROC-AUC']
best_model_name = roc_auc_column.idxmax()
best_roc_auc = roc_auc_column.max()
best_row = results_df.loc[best_model_name]

print(f'\nüèÜ BEST MODEL: {best_model_name}')
print(f'   ROC-AUC: {best_roc_auc:.4f}')
print(f'   Accuracy: {best_row["Accuracy"]:.4f}')
print(f'   F1: {best_row["F1"]:.4f}')

## 9. Diagnostics: ROC Curves

In [None]:
# Overlay the test-set ROC curve of every model on one figure and save it.
fig, ax = plt.subplots(figsize=(10, 7))

for model_name, data in results.items():
    fpr, tpr, _ = roc_curve(y_test, data['y_pred_proba'])
    # The legend reuses the AUC stored in the results registry.
    curve_label = f"{model_name} (AUC={data['roc_auc']:.3f})"
    ax.plot(fpr, tpr, label=curve_label, linewidth=2)

# Diagonal reference line, labelled as the random classifier.
ax.plot([0, 1], [0, 1], 'k--', label='Random (AUC=0.500)', linewidth=1.5)

ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title('ROC Curves ‚Äì All Models (Test Set)', fontsize=13, fontweight='bold')
ax.legend(loc='lower right', fontsize=10)
ax.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig('artifacts/figures/roc_curves.png', dpi=100, bbox_inches='tight')
plt.show()
print('‚úì ROC curves saved')

## 10. Confusion Matrix

In [None]:
# Confusion matrix of the selected model's hard predictions on the test set,
# drawn as an annotated heatmap and saved to disk.
best_entry = results[best_model_name]
best_model = best_entry['model']
best_y_pred = best_entry['y_pred']
cm = confusion_matrix(y_test, best_y_pred)

class_names = ['Class 0', 'Class 1']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    cbar_kws={'label': 'Count'},
    ax=ax,
)
ax.set_ylabel('True Label', fontsize=12)
ax.set_xlabel('Predicted Label', fontsize=12)
ax.set_title(f'Confusion Matrix ‚Äì {best_model_name}', fontsize=13, fontweight='bold')
fig.tight_layout()
fig.savefig('artifacts/figures/confusion_matrix.png', dpi=100, bbox_inches='tight')
plt.show()
print('‚úì Confusion matrix saved')

# Rows are true labels and columns are predicted labels, so the 2x2 matrix
# unpacks row by row into (TN, FP) and (FN, TP).
(tn, fp), (fn, tp) = cm
print(f'\nCM Analysis: TN={tn}, FP={fp}, FN={fn}, TP={tp}')

## 11. Permutation Importance

In [None]:
# Permutation importance of the selected model on the held-out test set.
# scoring='roc_auc' is passed explicitly: without it the estimator's default
# scorer (accuracy for classifiers) is used, which barely moves on a strongly
# imbalanced target and does not match the metric used to select the model.
perm_importance = permutation_importance(
    best_model,
    X_test,
    y_test,
    scoring='roc_auc',
    n_repeats=10,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

# Feature names are taken from the frame that was actually permuted.
importance_df = pd.DataFrame({
    'feature': X_test.columns,
    'importance_mean': perm_importance.importances_mean,
    'importance_std': perm_importance.importances_std
}).sort_values('importance_mean', ascending=False).head(15)

print('\nTop-15 Features by Permutation Importance:')
print(importance_df.to_string(index=False))

bar_positions = range(len(importance_df))
plt.figure(figsize=(10, 7))
plt.barh(bar_positions, importance_df['importance_mean'],
         xerr=importance_df['importance_std'], color='steelblue', capsize=5)
plt.yticks(bar_positions, importance_df['feature'], fontsize=10)
# barh puts position 0 at the bottom; flip the axis so the most important
# feature is drawn at the top, matching the order of the printed table.
plt.gca().invert_yaxis()
plt.xlabel('Permutation Importance (Mean ¬± Std)', fontsize=12)
plt.title(f'Top-15 Features ‚Äì {best_model_name}', fontsize=13, fontweight='bold')
plt.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.savefig('artifacts/figures/permutation_importance.png', dpi=100, bbox_inches='tight')
plt.show()
print('‚úì Permutation importance plot saved')

## 12. Save Artifacts

In [None]:
# Persist the test metrics of every model as JSON. Values are cast to plain
# Python floats before serialization.
metric_keys = ('accuracy', 'f1', 'roc_auc')
metrics_test = {
    model_name: {key: float(data[key]) for key in metric_keys}
    for model_name, data in results.items()
}

with open('artifacts/metrics_test.json', 'w') as f:
    f.write(json.dumps(metrics_test, indent=2))
print('‚úì Test metrics saved')

In [None]:
# Persist the winning hyperparameters and CV score of each grid-searched model.
tuned_model_names = ('DecisionTree', 'RandomForest', 'GradientBoosting')
search_summaries = {
    name: {
        'best_params': results[name]['params'],
        'best_cv_score': float(results[name].get('cv_score', 0)),
    }
    for name in tuned_model_names
    if 'params' in results[name]
}

with open('artifacts/search_summaries.json', 'w') as f:
    f.write(json.dumps(search_summaries, indent=2))
print('‚úì Hyperparameter search summaries saved')

In [None]:
# Persist the selected model and a small JSON descriptor next to it.
joblib.dump(best_model, 'artifacts/best_model.joblib')
print(f'‚úì Best model ({best_model_name}) saved')

best_entry = results[best_model_name]
# Only grid-searched models carry a 'cv_score'. If a baseline wins there is
# no CV score to report, so it is stored as null rather than a made-up 0.0
# that would read as "cross-validated ROC-AUC of zero".
best_cv_score = best_entry.get('cv_score')

best_model_meta = {
    'model_name': best_model_name,
    'test_metrics': metrics_test[best_model_name],
    'hyperparameters': best_entry.get('params', {}),
    'cv_score': None if best_cv_score is None else float(best_cv_score)
}

with open('artifacts/best_model_meta.json', 'w') as f:
    json.dump(best_model_meta, f, indent=2)
print('‚úì Best model metadata saved')

## 13. Summary

In [None]:
# Closing recap: dataset size, class imbalance and the selected model.
separator = '='*70
print('\n' + separator)
print('EXPERIMENT SUMMARY')
print(separator)

n_samples = len(df)
n_features = X.shape[1]
n_negative = len(df[df.target == 0])
n_positive = len(df[df.target == 1])

print('\nDataset: S06-hw-dataset-04.csv')
print(f'  Samples: {n_samples}, Features: {n_features}')
print(f'  Class imbalance: {n_negative/n_positive:.1f}:1')
print(f'\nüèÜ Best Model: {best_model_name} (ROC-AUC={best_roc_auc:.4f})')
print('\n‚úÖ HW06 completed successfully!')