In [None]:
!pip install xgboost

import os
import random
import logging
import joblib
import numpy as np
import pandas as pd
import mlflow
import mlflow.sklearn
from datetime import datetime
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import dvc.api

# ----------------------------
# 1. Настройки
# ----------------------------
# Global constants: RNG seed, MLflow experiment name, local artifact directory.
SEED = 42
EXPERIMENT_NAME = "customer_churn_prediction"
MODELS_DIR = "models"

# Seed both the stdlib and the NumPy generators with the same fixed value.
np.random.seed(SEED)
random.seed(SEED)

# Timestamped, INFO-level logging for the whole script.
logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(message)s",
    level=logging.INFO,
)

# Create the output directory up front; it is not an error if it already exists.
os.makedirs(MODELS_DIR, exist_ok=True)

# ----------------------------
# 2. Загрузка и подготовка данных
# ----------------------------
def load_data():
    """Load the churn dataset tracked by DVC and return a cleaned frame.

    The location of ``customer_churn.csv`` is resolved with
    ``dvc.api.get_url`` and the file is read with pandas. Only the target
    and the four feature columns are kept; rows containing missing values
    and duplicated rows are dropped, and ``churn`` is cast to ``int``.

    Returns:
        A DataFrame with the columns ``churn``, ``tenure``, ``age``,
        ``contract_type`` and ``monthly_charges``.
    """
    wanted_columns = ["churn", "tenure", "age", "contract_type", "monthly_charges"]

    raw = pd.read_csv(dvc.api.get_url("customer_churn.csv"))

    cleaned = raw[wanted_columns].dropna()
    cleaned = cleaned.drop_duplicates()
    cleaned["churn"] = cleaned["churn"].astype(int)
    return cleaned

def preprocess_split(df):
    """Separate features from the ``churn`` target and split train/test.

    Args:
        df: frame containing a ``churn`` column plus the feature columns.

    Returns:
        The ``train_test_split`` result ``X_train, X_test, y_train, y_test``
        for a 20% test share, stratified on the target and seeded with
        ``SEED``.
    """
    target = df["churn"]
    features = df.drop(columns="churn")
    return train_test_split(
        features,
        target,
        test_size=0.2,
        stratify=target,
        random_state=SEED,
    )

def get_preprocessor():
    """Build the column-wise preprocessing step shared by all models.

    Returns:
        An unfitted ``ColumnTransformer`` that one-hot encodes
        ``contract_type`` (categories unseen at fit time are ignored) and
        standard-scales ``tenure``, ``age`` and ``monthly_charges``.
    """
    onehot_step = (
        "onehot",
        OneHotEncoder(handle_unknown="ignore"),
        ["contract_type"],
    )
    scaler_step = (
        "scaler",
        StandardScaler(),
        ["tenure", "age", "monthly_charges"],
    )
    return ColumnTransformer([onehot_step, scaler_step])

# ----------------------------
# 3. Определение моделей
# ----------------------------
def get_model_configs(preprocessor):
    """Build the candidate pipelines and their hyper-parameter grids.

    Args:
        preprocessor: unfitted transformer used as the ``prep`` step. Every
            pipeline receives its own clone, so fitting one pipeline
            directly can never alter the preprocessing step of another.

    Returns:
        A dict mapping a model name (``"LogReg"``, ``"RandomForest"``,
        ``"XGBoost"``) to ``{"pipeline": Pipeline, "params": grid}``, where
        the grid keys use the ``model__`` prefix expected by
        ``GridSearchCV``.
    """
    # Local import so this function does not depend on a change to the
    # file-level import block; scikit-learn is already a dependency.
    from sklearn.base import clone

    def _make_pipeline(model):
        # A fresh, unfitted copy of the preprocessor per pipeline.
        return Pipeline([
            ("prep", clone(preprocessor)),
            ("model", model),
        ])

    return {
        "LogReg": {
            "pipeline": _make_pipeline(
                LogisticRegression(max_iter=1000, random_state=SEED)
            ),
            "params": {
                "model__C": [0.1, 1, 10],
                "model__penalty": ["l2"],
            },
        },
        "RandomForest": {
            "pipeline": _make_pipeline(
                RandomForestClassifier(random_state=SEED)
            ),
            "params": {
                "model__n_estimators": [200, 400],
                "model__max_depth": [None, 10, 20],
            },
        },
        "XGBoost": {
            # `use_label_encoder` is deliberately not passed: it was
            # deprecated in XGBoost 1.x and is no longer a recognised
            # parameter in current releases, where supplying it only
            # produces an "unused parameter" warning.
            "pipeline": _make_pipeline(
                XGBClassifier(
                    objective="binary:logistic",
                    eval_metric="logloss",
                    random_state=SEED,
                )
            ),
            "params": {
                "model__n_estimators": [300, 600],
                "model__learning_rate": [0.05, 0.1],
                "model__max_depth": [4, 6],
            },
        },
    }

# ----------------------------
# 4. Тренировка и логгинг
# ----------------------------
def train_and_log_model(name, pipeline, param_grid, X_train, y_train, X_test, y_test):
    """Tune one pipeline with grid search, evaluate it and log it to MLflow.

    Inside a dedicated MLflow run the function performs a 5-fold,
    ROC-AUC-scored grid search, logs the best parameters, the
    cross-validated AUC (mean and std) of the winning configuration and the
    hold-out AUC / accuracy / F1, then stores the refitted pipeline both as
    a pickle under ``MODELS_DIR`` and as an MLflow model artifact.

    Args:
        name: label used for the MLflow run name, the artifact path and the
            pickle file name.
        pipeline: estimator handed to ``GridSearchCV``.
        param_grid: hyper-parameter grid for the search.
        X_train: training features.
        y_train: training target.
        X_test: hold-out features.
        y_test: hold-out target.

    Returns:
        Tuple ``(best_pipeline, test_auc, best_params)``.
    """
    mlflow.end_run()  # Just in case a run is still active from an earlier call.

    with mlflow.start_run(run_name=name):
        grid = GridSearchCV(
            estimator=pipeline,
            param_grid=param_grid,
            cv=5,
            scoring="roc_auc",
            n_jobs=-1,
        )
        grid.fit(X_train, y_train)
        best_pipeline = grid.best_estimator_
        best_params = grid.best_params_

        mlflow.log_params(best_params)

        # The search has already scored the winning configuration on the
        # same 5 folds with the same metric, so read the mean/std from its
        # results instead of refitting the model five more times.
        best_idx = grid.best_index_
        cv_auc_mean = float(grid.cv_results_["mean_test_score"][best_idx])
        cv_auc_std = float(grid.cv_results_["std_test_score"][best_idx])
        mlflow.log_metric("cv_auc_mean", cv_auc_mean)
        mlflow.log_metric("cv_auc_std", cv_auc_std)

        # Positive-class probabilities; hard labels use a 0.5 cut-off.
        y_pred_proba = best_pipeline.predict_proba(X_test)[:, 1]
        y_pred = (y_pred_proba >= 0.5).astype(int)

        test_auc = roc_auc_score(y_test, y_pred_proba)
        test_acc = accuracy_score(y_test, y_pred)
        test_f1 = f1_score(y_test, y_pred)

        mlflow.log_metric("test_auc", test_auc)
        mlflow.log_metric("test_accuracy", test_acc)
        mlflow.log_metric("test_f1", test_f1)

        # Persist the fitted pipeline locally and as an MLflow artifact.
        model_path = os.path.join(MODELS_DIR, f"{name}_pipeline.pkl")
        joblib.dump(best_pipeline, model_path)
        mlflow.sklearn.log_model(best_pipeline, f"model_{name}")

        logging.info(f"{name} | Test AUC: {test_auc:.3f} | ACC: {test_acc:.3f} | F1: {test_f1:.3f}")
        return best_pipeline, test_auc, best_params

# ----------------------------
# 5. Основной pipeline
# ----------------------------
def main():
    """Run the full experiment: load data, tune every model, keep the best.

    Each candidate from ``get_model_configs`` is tuned and logged by
    ``train_and_log_model``. The candidate with the highest test AUC is
    pickled under ``MODELS_DIR`` and logged to MLflow in its own run.

    Raises:
        RuntimeError: if no candidate produced a comparable AUC, so there
            is no model to save.
    """
    df = load_data()
    X_train, X_test, y_train, y_test = preprocess_split(df)
    preprocessor = get_preprocessor()
    models = get_model_configs(preprocessor)

    mlflow.set_experiment(EXPERIMENT_NAME)

    best_model = None
    best_auc = -np.inf
    best_model_name = ""

    # NOTE(review): the winner is chosen by its score on the test split, so
    # the reported test AUC of the selected model is optimistically biased.
    # Consider selecting on the cross-validated AUC instead.
    for name, cfg in models.items():
        model, auc, _ = train_and_log_model(
            name,
            cfg["pipeline"],
            cfg["params"],
            X_train,
            y_train,
            X_test,
            y_test,
        )
        if auc > best_auc:
            best_auc = auc
            best_model = model
            best_model_name = name

    if best_model is None:
        # Happens when the config dict is empty or every AUC was NaN;
        # pickling ``None`` as the "best model" would hide the failure.
        raise RuntimeError("No model produced a valid AUC; nothing to save.")

    # Final model
    final_model_path = os.path.join(MODELS_DIR, f"best_model_{best_model_name}.pkl")
    joblib.dump(best_model, final_model_path)

    # All per-model runs are closed at this point. Open an explicit run so
    # the artifact is attached to a run that is ended when the block exits,
    # instead of an implicitly created run that stays active.
    with mlflow.start_run(run_name=f"best_model_{best_model_name}"):
        mlflow.sklearn.log_model(best_model, f"best_model_{best_model_name}")

    logging.info(f"🏆 Лучший модель: {best_model_name} с AUC = {best_auc:.3f}")

# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.27.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-3.0.2-py3-none-manylinux_2_28_x86_64.whl (253.9 MB)
[2K   [38;2;249;38;114m━━━━━━━━━━━━━━━━━━━[0m[38;2;249;38;114m╸[0m[38;5;237m━━━━━━━━━━━━━━━━━━━━[0m [32m125.4/253.9 MB[0m [31m1.5 MB/s[0m eta [36m0:01:25[0m