# HW06: Decision Trees and Ensembles

**Студент**: Фех Алексей Александрович  
**Датасет**: S06-hw-dataset-02.csv  
**Задача**: Бинарная классификация с дисбалансом классов

## Цель

Сравнить деревья решений и ансамбли (RandomForest, GradientBoosting) на задаче бинарной классификации.

In [None]:
# Imports
import json
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score,
    confusion_matrix, RocCurveDisplay, ConfusionMatrixDisplay
)
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
# Suppress every warning category so library noise stays out of the cell output.
# NOTE(review): this also hides potentially useful diagnostics - confirm it is intended.
warnings.filterwarnings('ignore')

# Fixed seed for reproducibility; it also seeds NumPy's global RNG on the next line
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

## 1. Загрузка и первичный анализ данных

In [None]:
# Read the homework dataset (the CSV is expected in the working directory)
df = pd.read_csv('S06-hw-dataset-02.csv')

# Report the (rows, columns) shape, then preview the first rows as the cell output
dataset_shape = df.shape
print(f"Размер датасета: {dataset_shape}")
print("\nПервые строки:")
df.head()

In [None]:
# Column overview printed by DataFrame.info()
print("Информация о датасете:")
df.info()

# Total number of missing cells across the whole frame
total_missing = df.isna().sum().sum()
print(f"\nПропущенные значения: {total_missing}")

# Descriptive statistics are rendered as the cell output
print("\nСтатистика:")
df.describe()

In [None]:
# Class shares of the target variable
target_dist = df['target'].value_counts(normalize=True)
print("Распределение целевой переменной:")
print(target_dist)

# Majority-to-minority ratio. Using max/min instead of indexing the labels 0 and 1
# keeps the ratio >= 1 regardless of which class dominates and of how the
# classes are labelled.
imbalance_ratio = target_dist.max() / target_dist.min()
print(f"\nДисбаланс классов: {imbalance_ratio:.2f}:1")

# Bar chart of absolute class counts
plt.figure(figsize=(8, 5))
df['target'].value_counts().plot(kind='bar')
plt.title('Распределение классов')
plt.xlabel('Класс')
plt.ylabel('Количество')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

## 2. Train/Test Split

Разделяем данные на train/test (75% / 25%) с фиксированным `random_state=42` и стратификацией по целевой переменной:
- фиксированный `random_state` обеспечивает **воспроизводимость** результатов;
- стратификация **сохраняет пропорции** классов в обеих выборках.

In [None]:
# Features are every column except the 'id' and 'target' columns; 'target' is the label
X = df.drop(columns=['id', 'target'])
y = df['target']

# Stratified 75/25 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=RANDOM_STATE,
)

print(f"Train размер: {X_train.shape}")
print(f"Test размер: {X_test.shape}")

# Class shares in each part, to confirm that stratification kept the proportions
train_shares = y_train.value_counts(normalize=True).to_dict()
test_shares = y_test.value_counts(normalize=True).to_dict()
print(f"\nTrain распределение: {train_shares}")
print(f"Test распределение: {test_shares}")

## 3. Baseline модели

In [None]:
# Majority-class baseline: strategy='most_frequent' always predicts the most
# frequent label seen during fit
dummy = DummyClassifier(strategy='most_frequent', random_state=RANDOM_STATE)
dummy.fit(X_train, y_train)
y_pred_dummy = dummy.predict(X_test)

print("DummyClassifier (baseline):")
print(f"  Accuracy: {accuracy_score(y_test, y_pred_dummy):.4f}")
print(f"  F1-score: {f1_score(y_test, y_pred_dummy):.4f}")
# DummyClassifier does implement predict_proba, so the reason for "N/A" is not
# missing probabilities: a constant prediction cannot rank the samples.
print("  ROC-AUC: N/A (константный прогноз не ранжирует объекты)")

In [None]:
# Logistic-regression baseline; features are standardized inside the pipeline
lr_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)),
])
lr_pipe.fit(X_train, y_train)

# Hard labels feed accuracy/F1; the second predict_proba column feeds ROC-AUC
y_pred_lr = lr_pipe.predict(X_test)
y_proba_lr = lr_pipe.predict_proba(X_test)[:, 1]

lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_f1 = f1_score(y_test, y_pred_lr)
lr_roc_auc = roc_auc_score(y_test, y_proba_lr)

print("LogisticRegression (baseline):")
print(f"  Accuracy: {lr_accuracy:.4f}")
print(f"  F1-score: {lr_f1:.4f}")
print(f"  ROC-AUC: {lr_roc_auc:.4f}")

## 4. Decision Tree с контролем сложности

In [None]:
# Single decision tree; complexity is limited by max_depth and min_samples_leaf
dt = DecisionTreeClassifier(
    random_state=RANDOM_STATE,
    min_samples_leaf=20,
    max_depth=10,
)
dt.fit(X_train, y_train)

# Hard labels and second-column probabilities on the test part
y_pred_dt = dt.predict(X_test)
y_proba_dt = dt.predict_proba(X_test)[:, 1]

dt_accuracy = accuracy_score(y_test, y_pred_dt)
dt_f1 = f1_score(y_test, y_pred_dt)
dt_roc_auc = roc_auc_score(y_test, y_proba_dt)

print("DecisionTree:")
print(f"  Accuracy: {dt_accuracy:.4f}")
print(f"  F1-score: {dt_f1:.4f}")
print(f"  ROC-AUC: {dt_roc_auc:.4f}")

## 5. Random Forest

In [None]:
# Random forest: bagged trees with a random feature subset per split
rf_params = dict(
    n_estimators=100,
    max_depth=15,
    min_samples_leaf=10,
    max_features='sqrt',
    n_jobs=-1,
    random_state=RANDOM_STATE,
)
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train, y_train)

# Hard labels and second-column probabilities on the test part
y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:, 1]

rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_f1 = f1_score(y_test, y_pred_rf)
rf_roc_auc = roc_auc_score(y_test, y_proba_rf)

print("RandomForest:")
print(f"  Accuracy: {rf_accuracy:.4f}")
print(f"  F1-score: {rf_f1:.4f}")
print(f"  ROC-AUC: {rf_roc_auc:.4f}")

## 6. Gradient Boosting

In [None]:
# Gradient boosting: trees are added sequentially, each improving on the previous ones
gb = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=5,
    random_state=RANDOM_STATE,
)
gb.fit(X_train, y_train)

# Hard labels and second-column probabilities on the test part
y_pred_gb = gb.predict(X_test)
y_proba_gb = gb.predict_proba(X_test)[:, 1]

gb_accuracy = accuracy_score(y_test, y_pred_gb)
gb_f1 = f1_score(y_test, y_pred_gb)
gb_roc_auc = roc_auc_score(y_test, y_proba_gb)

print("GradientBoosting:")
print(f"  Accuracy: {gb_accuracy:.4f}")
print(f"  F1-score: {gb_f1:.4f}")
print(f"  ROC-AUC: {gb_roc_auc:.4f}")

## 7. Сравнение моделей

In [None]:
# Summary table: one row per model, built from the predictions computed earlier
model_names = ['DummyClassifier', 'LogisticRegression', 'DecisionTree', 'RandomForest', 'GradientBoosting']
label_preds = [y_pred_dummy, y_pred_lr, y_pred_dt, y_pred_rf, y_pred_gb]
# The dummy baseline has no ROC-AUC entry, hence the leading None below
score_preds = [y_proba_lr, y_proba_dt, y_proba_rf, y_proba_gb]

results_df = pd.DataFrame({
    'Model': model_names,
    'Accuracy': [accuracy_score(y_test, labels) for labels in label_preds],
    'F1': [f1_score(y_test, labels) for labels in label_preds],
    'ROC-AUC': [None] + [roc_auc_score(y_test, scores) for scores in score_preds],
})

print("\nСравнение всех моделей:")
print(results_df.to_string(index=False))

# Best model = highest ROC-AUC among the rows that have one
auc_column = results_df['ROC-AUC'].dropna()
best_idx = auc_column.idxmax()
print(f"\n\nЛучшая модель: {results_df.loc[best_idx, 'Model']} (ROC-AUC={results_df.loc[best_idx, 'ROC-AUC']:.4f})")

## 8. ROC-кривые

In [None]:
# ROC curves for every model that has probability scores
fig, ax = plt.subplots(figsize=(10, 7))

models_roc = [
    ('LogisticRegression', lr_pipe, y_proba_lr),
    ('DecisionTree', dt, y_proba_dt),
    ('RandomForest', rf, y_proba_rf),
    ('GradientBoosting', gb, y_proba_gb)
]

# Only the label and the precomputed scores are needed to draw a curve
for name, _model, y_proba in models_roc:
    RocCurveDisplay.from_predictions(y_test, y_proba, ax=ax, name=name)

# Diagonal reference line for a random classifier
ax.plot([0, 1], [0, 1], 'k--', label='Random')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves Comparison')
ax.legend(loc='lower right')
ax.grid(True, alpha=0.3)
plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists
os.makedirs('artifacts/figures', exist_ok=True)
plt.savefig('artifacts/figures/roc_curves.png', dpi=150, bbox_inches='tight')
plt.show()

print("ROC кривые сохранены в artifacts/figures/roc_curves.png")

## 9. Confusion Matrix для лучшей модели

In [None]:
# Confusion matrix of the GradientBoosting predictions on the test part.
# NOTE(review): the model is hard-coded here rather than derived from a metric
# comparison - confirm GradientBoosting is indeed the best model.
cm = confusion_matrix(y_test, y_pred_gb)
print("Confusion Matrix (GradientBoosting):")
print(cm)

fig, ax = plt.subplots(figsize=(8, 6))
ConfusionMatrixDisplay(cm, display_labels=['Class 0', 'Class 1']).plot(ax=ax, cmap='Blues')
ax.set_title('Confusion Matrix - GradientBoosting')
plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists
os.makedirs('artifacts/figures', exist_ok=True)
plt.savefig('artifacts/figures/confusion_matrix.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nConfusion matrix сохранена в artifacts/figures/confusion_matrix.png")

## 10. Feature Importance

In [None]:
# Permutation importance of the GradientBoosting model, measured on the test part
perm_importance = permutation_importance(
    gb, X_test, y_test, n_repeats=10, random_state=RANDOM_STATE, n_jobs=-1
)

# Up to 15 features with the largest mean importance, in descending order.
# Names are taken from X_test, the frame that was actually permuted.
indices = np.argsort(perm_importance.importances_mean)[::-1][:15]
top_features = X_test.columns[indices]
top_importances = perm_importance.importances_mean[indices]

print("Top-15 признаков по permutation importance:")
for i, (feat, imp) in enumerate(zip(top_features, top_importances), 1):
    print(f"{i:2d}. {feat}: {imp:.4f}")

# Horizontal bar chart; the y-axis is inverted so the top feature is drawn first
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(range(len(top_features)), top_importances)
ax.set_yticks(range(len(top_features)))
ax.set_yticklabels(top_features)
ax.set_xlabel('Permutation Importance')
ax.set_title('Top-15 Feature Importance (GradientBoosting)')
ax.invert_yaxis()
ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists
os.makedirs('artifacts/figures', exist_ok=True)
plt.savefig('artifacts/figures/feature_importance.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nFeature importance сохранена в artifacts/figures/feature_importance.png")

## 11. Сохранение артефактов

In [None]:
# Neither joblib.dump nor open() creates missing directories, so make sure
# the output directory exists before writing anything into it
os.makedirs('artifacts', exist_ok=True)

# Persist the GradientBoosting model as the chosen best model
joblib.dump(gb, 'artifacts/best_model.joblib')
print("Модель сохранена в artifacts/best_model.joblib")

# Test-part outputs per model: (hard labels, probability scores or None).
# The dummy baseline has no scores, so its ROC-AUC is stored as null.
model_outputs = {
    'DummyClassifier': (y_pred_dummy, None),
    'LogisticRegression': (y_pred_lr, y_proba_lr),
    'DecisionTree': (y_pred_dt, y_proba_dt),
    'RandomForest': (y_pred_rf, y_proba_rf),
    'GradientBoosting': (y_pred_gb, y_proba_gb),
}

# Metrics are cast to built-in float so that json can serialize them
metrics_test = {
    model_name: {
        'accuracy': float(accuracy_score(y_test, labels)),
        'f1': float(f1_score(y_test, labels)),
        'roc_auc': None if scores is None else float(roc_auc_score(y_test, scores)),
    }
    for model_name, (labels, scores) in model_outputs.items()
}

with open('artifacts/metrics_test.json', 'w', encoding='utf-8') as f:
    json.dump(metrics_test, f, indent=2)

print("Метрики сохранены в artifacts/metrics_test.json")
print("\nВсе артефакты созданы!")