In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.datasets import make_moons, load_breast_cancer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.inspection import permutation_importance
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.dummy import DummyClassifier
import sklearn.metrics as skm
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

def print_metrics(y_true, y_pred, y_proba=None, title=None):
    """Print accuracy, F1, optional ROC-AUC and the confusion matrix.

    Args:
        y_true: Ground-truth class labels (e.g. 0/1).
        y_pred: Predicted class labels, aligned with ``y_true``.
        y_proba: Optional scores/probabilities of class 1. When given,
            ROC-AUC is reported in addition to the label-based metrics.
        title: Optional heading printed before the metrics.

    Returns:
        None. All results are written to stdout.
    """
    if title:
        print(title)
    # Accuracy is the share of correct answers: a good first metric,
    # but not always sufficient on its own.
    acc = accuracy_score(y_true, y_pred)
    # F1 balances precision and recall; it is often more informative than
    # accuracy when the classes are imbalanced.
    f1 = f1_score(y_true, y_pred)
    print(f"accuracy = {acc:.4f}")
    print(f"f1       = {f1:.4f}")
    if y_proba is not None:
        # ROC-AUC should be computed from probabilities/scores rather than
        # from hard class labels.
        try:
            auc = roc_auc_score(y_true, y_proba)
        except ValueError as exc:
            # AUC can be undefined (e.g. only one class present in y_true).
            # Stay best-effort, but say why the value is missing instead of
            # dropping the line silently.
            print(f"roc_auc  = n/a ({exc})")
        else:
            print(f"roc_auc  = {auc:.4f}")
    print("confusion_matrix:")
    print(confusion_matrix(y_true, y_pred))
    print()

# Download the homework dataset and print a quick overview of it.
url = 'https://raw.githubusercontent.com/mirea-aie-2025/aie-course-meta/refs/heads/main/seminars/S06/S06-hw-dataset-01.csv'
df = pd.read_csv(url)
print(df.head())
print('=' * 52)
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would emit an extra "None" line.
df.info()
print('=' * 52)
print(df.describe())
print('=' * 52)
# Features are every column except the row identifier and the label.
X = df.drop(columns=['id', 'target'])
y = df.target
# Share of rows with target == 1, printed after its complement.
positive_share = len(df[df.target == 1]) / len(df)
print('Доля классов:', abs(1 - positive_share), ':', positive_share)
print('=' * 52)

# Hold out 20% of the rows for testing. The split is stratified on y and
# uses a fixed seed so that reruns produce the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Baseline: a classifier configured to always answer with the most
# frequent class. Its metrics are the floor the real models must beat.
bl = DummyClassifier(
    strategy='most_frequent',
    random_state=42,
)
bl.fit(X_train, y_train)
y_pred = bl.predict(X_test)
# Column 1 of predict_proba is passed on as the class-1 score.
y_proba = bl.predict_proba(X_test)[:, 1]
print_metrics(
    y_test,
    y_pred,
    y_proba=y_proba,
    title='Dummy',
)
print('=' * 52)

# Linear model: standardize the features, then fit an L2-penalized
# logistic regression. Both steps live in one pipeline so the scaler is
# fitted on the training data passed to fit().
lr = Pipeline(
    steps=[
        ('skelter', StandardScaler()),
        (
            "logreg",
            LogisticRegression(
                penalty='l2',
                C=1.0,
                solver='liblinear',
                random_state=42,
            ),
        ),
    ]
)
lr.fit(X_train, y_train)
# Column 1 of predict_proba is used as the class-1 score for ROC-AUC.
y_lr_proba = lr.predict_proba(X_test)[:, 1]
y_lr_pred = lr.predict(X_test)
print_metrics(
    y_test,
    y_lr_pred,
    y_proba=y_lr_proba,
    title='LogisticRegression',
)
print('=' * 52)

# Single decision tree, limited in depth and minimum leaf size.
dt = DecisionTreeClassifier(
    max_depth=5,
    min_samples_leaf=10,
    random_state=42,
)
dt.fit(X_train, y_train)
y_dt_proba = dt.predict_proba(X_test)[:, 1]
# Hard labels are derived by thresholding the class-1 score at 0.5.
dt_above_threshold = y_dt_proba >= 0.5
y_dt_pred = dt_above_threshold.astype(int)
print_metrics(
    y_test,
    y_dt_pred,
    y_proba=y_dt_proba,
    title='DecisionTree',
)
print('=' * 52)

# Random forest of 600 depth-limited trees, trained on all available cores.
# The out-of-bag score is requested at construction time.
rf = RandomForestClassifier(
    n_estimators=600,
    max_depth=5,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
)
rf.fit(X_train, y_train)
y_rf_proba = rf.predict_proba(X_test)[:, 1]
# Hard labels are derived by thresholding the class-1 score at 0.5.
rf_above_threshold = y_rf_proba >= 0.5
y_rf_pred = rf_above_threshold.astype(int)
print_metrics(
    y_test,
    y_rf_pred,
    y_proba=y_rf_proba,
    title='RandomForest',
)
print('=' * 52)

# AdaBoost with the decision tree `dt` as its base estimator.
ada = AdaBoostClassifier(
    estimator=dt,
    n_estimators=200,
    learning_rate=0.6,
    random_state=42,
)
ada.fit(X_train, y_train)
pred = ada.predict(X_test)
# AdaBoostClassifier always provides predict_proba, so no hasattr() guard
# or None fallback is needed here.
proba = ada.predict_proba(X_test)[:, 1]
print_metrics(y_test, pred, y_proba=proba, title='AdaBoost')