In [2]:
# ПОДБОР ГИПЕРПАРАМЕТРОВ OPTUNA
# Шаг 1. Загружаем данные
import pandas as pd
from sqlalchemy import create_engine
import os
from dotenv import load_dotenv
import yaml

load_dotenv()
host = os.environ.get('DB_DESTINATION_HOST')
port = os.environ.get('DB_DESTINATION_PORT')
db = os.environ.get('DB_DESTINATION_NAME')
username = os.environ.get('DB_DESTINATION_USER')
password = os.environ.get('DB_DESTINATION_PASSWORD')

print(f'postgresql://{username}:{password}@{host}:{port}/{db}')
conn = create_engine(f'postgresql://{username}:{password}@{host}:{port}/{db}', connect_args={'sslmode':'require'})

with open('params.yaml', 'r') as fd:
    params = yaml.safe_load(fd)
df = pd.read_sql('select * from clean_users_churn', conn, index_col=params['index_col'])
conn.dispose()

os.makedirs('data', exist_ok=True)
df.to_csv('data/initial_data.csv', index=None)

print("Данные загружены")
print(df.shape)
print(df.head(2))

postgresql://mle_20240325_54955bf804:6e3f607018b444f69359510efb12da90@rc1b-uh7kdmcx67eomesf.mdb.yandexcloud.net:6432/playground_mle_20240325_54955bf804
Данные загружены
(7019, 21)
             id begin_date end_date      type paperless_billing   
customer_id                                                       
8191-XWSZG    1 2015-10-01      NaT  One year                No  \
3957-SQXML    2 2017-04-01      NaT  Two year                No   

                      payment_method  monthly_charges  total_charges   
customer_id                                                            
8191-XWSZG              Mailed check            20.65        1022.95  \
3957-SQXML   Credit card (automatic)            24.95         894.30   

            internet_service online_security  ... device_protection   
customer_id                                   ...                     
8191-XWSZG       Fiber optic              No  ...                No  \
3957-SQXML       Fiber optic              No  ...

In [3]:
from sklearn.model_selection import train_test_split

TABLE_NAME = "clean_users_churn"

features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"

split_column = "customer_id"
stratify_column = "target"
test_size = 0.3

df = df.sort_values(by=[split_column])

X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=test_size, shuffle=False)

print(f"Размер выборки для обучения: {X_train.shape}")
print(f"Размер выборки для теста: {X_test.shape}")


Размер выборки для обучения: (4913, 3)
Размер выборки для теста: (2106, 3)


In [5]:
import mlflow
import os
import optuna

from catboost import CatBoostClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score, log_loss
from collections import defaultdict
from sklearn.model_selection import StratifiedKFold
from optuna.integration.mlflow import MLflowCallback
from numpy import median, array
from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID

os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("AWS_ACCESS_KEY_ID")
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("AWS_SECRET_ACCESS_KEY")

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

EXPERIMENT_NAME = "krosh_exp_30_07"
RUN_NAME = "model_bayesian_search"

STUDY_DB_NAME = "sqlite:///local.study.db"
STUDY_NAME = "churn_model_4"


def objective(trial: optuna.Trial) -> float:
    param = {
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 12),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.1, 5),
        "random_strength": trial.suggest_float("random_strength", 0.1, 5),
        "loss_function": "Logloss",
        "task_type": "CPU",
        "random_seed": 0,
        "iterations": 300,
        "verbose": False,
    }
 
    model = CatBoostClassifier(**param)

    skf = StratifiedKFold(n_splits=2)

    metrics = defaultdict(list)
    for i, (train_index, val_index) in enumerate(skf.split(X_train, y_train)):
        # ваш код здесь #
        train_x = X_train.iloc[train_index]
        val_x = X_train.iloc[val_index]
        
        train_y = y_train.iloc[train_index]
        val_y = y_train.iloc[val_index]

        model.fit(train_x, train_y, eval_set=(val_x, val_y), early_stopping_rounds=50, verbose=False)

        prediction = model.predict(val_x)
        probas = model.predict_proba(val_x)[:, 1]

        _, err1, _, err2 = confusion_matrix(val_y, prediction, normalize='all').ravel()
        auc = roc_auc_score(val_y, probas)
        precision = precision_score(val_y, prediction)
        recall = recall_score(val_y, prediction)
        f1 = f1_score(val_y, prediction)
        logloss = log_loss(val_y, probas)
        
        metrics["err1"].append(err1)
        metrics["err2"].append(err2)
        metrics["auc"].append(auc)
        metrics["precision"].append(precision)
        metrics["recall"].append(recall)
        metrics["f1"].append(f1)
        metrics["logloss"].append(logloss)


    # ваш код здесь #
    err_1 = median(array(metrics['err1']))
    err_2 = median(array(metrics['err2']))
    auc = median(array(metrics['auc']))
    precision = median(array(metrics['precision']))
    recall = median(array(metrics['recall']))
    f1 = median(array(metrics['f1']))
    logloss = median(array(metrics['logloss']))

    return auc


experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if not experiment:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id
    

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    print(run_id)


mlflc = MLflowCallback(
    tracking_uri=f'http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}',
    metric_name='AUC',
    create_experiment=False,
    mlflow_kwargs={'experiment_id': experiment_id, 'tags': {MLFLOW_PARENT_RUN_ID: run_id}})

study = optuna.create_study(direction='maximize',
                            sampler=optuna.samplers.TPESampler(),
                            study_name=STUDY_NAME,
                            storage=STUDY_DB_NAME)
study.optimize(objective,
               n_trials=10,
               callbacks=[mlflc])
best_params = study.best_params

with mlflow.start_run(run_id=run_id, experiment_id=experiment_id) as run:
    model_best = CatBoostClassifier(**best_params)
    model_best.fit(X_train, y_train)
    mlflow.sklearn.log_model(
        sk_model=model_best,
        await_registration_for=60,
        artifact_path="cv")

print(f"Number of finished trials: {len(study.trials)}")
print(f"Best params: {best_params}")

  mlflc = MLflowCallback(
[I 2024-07-30 21:03:08,265] A new study created in RDB with name: churn_model_4


84882776fd3a4ddcbcf9a0606473b00d


[I 2024-07-30 21:03:09,853] Trial 0 finished with value: 0.8135993200258812 and parameters: {'learning_rate': 0.016068299071428326, 'depth': 6, 'l2_leaf_reg': 1.351318637022291, 'random_strength': 4.505661499669862}. Best is trial 0 with value: 0.8135993200258812.
[I 2024-07-30 21:03:13,213] Trial 1 finished with value: 0.8081324768581615 and parameters: {'learning_rate': 0.002393609597775497, 'depth': 10, 'l2_leaf_reg': 2.1155988571712023, 'random_strength': 1.7647500571667774}. Best is trial 0 with value: 0.8135993200258812.
[I 2024-07-30 21:03:17,003] Trial 2 finished with value: 0.8132531418233233 and parameters: {'learning_rate': 0.04263756495809249, 'depth': 11, 'l2_leaf_reg': 0.34257572762867, 'random_strength': 2.925167156932399}. Best is trial 0 with value: 0.8135993200258812.
[I 2024-07-30 21:03:18,194] Trial 3 finished with value: 0.8148684102059782 and parameters: {'learning_rate': 0.07991815428210872, 'depth': 6, 'l2_leaf_reg': 3.9848157401271975, 'random_strength': 3.6467

0:	learn: 0.6871748	total: 6.43ms	remaining: 6.43s
1:	learn: 0.6764126	total: 9.6ms	remaining: 4.79s
2:	learn: 0.6671337	total: 11.7ms	remaining: 3.88s
3:	learn: 0.6573004	total: 13.5ms	remaining: 3.35s
4:	learn: 0.6466833	total: 15ms	remaining: 2.98s
5:	learn: 0.6371213	total: 16.5ms	remaining: 2.73s
6:	learn: 0.6287479	total: 18ms	remaining: 2.56s
7:	learn: 0.6215178	total: 19.9ms	remaining: 2.46s
8:	learn: 0.6138146	total: 21.5ms	remaining: 2.37s
9:	learn: 0.6069823	total: 23ms	remaining: 2.28s
10:	learn: 0.6001361	total: 24.6ms	remaining: 2.21s
11:	learn: 0.5937910	total: 26.2ms	remaining: 2.16s
12:	learn: 0.5875453	total: 27.8ms	remaining: 2.11s
13:	learn: 0.5818303	total: 29.4ms	remaining: 2.07s
14:	learn: 0.5764669	total: 30.9ms	remaining: 2.03s
15:	learn: 0.5709912	total: 33.2ms	remaining: 2.04s
16:	learn: 0.5662441	total: 34.8ms	remaining: 2.01s
17:	learn: 0.5615406	total: 41.2ms	remaining: 2.25s
18:	learn: 0.5560067	total: 44.2ms	remaining: 2.28s
19:	learn: 0.5514678	total: 4