In [2]:
# HYPERPARAMETER TUNING
# Step 1. Load the data
import pandas as pd
from sqlalchemy import create_engine
import os
from dotenv import load_dotenv
import yaml

# Read the DB_DESTINATION_* connection settings (python-dotenv loads a local .env file if present).
load_dotenv()
host = os.environ.get('DB_DESTINATION_HOST')
port = os.environ.get('DB_DESTINATION_PORT')
db = os.environ.get('DB_DESTINATION_NAME')
username = os.environ.get('DB_DESTINATION_USER')
password = os.environ.get('DB_DESTINATION_PASSWORD')

# Fail early with a readable message instead of building a URL that contains the text "None".
missing_db_settings = [
    env_name
    for env_name, env_value in {
        'DB_DESTINATION_HOST': host,
        'DB_DESTINATION_PORT': port,
        'DB_DESTINATION_NAME': db,
        'DB_DESTINATION_USER': username,
        'DB_DESTINATION_PASSWORD': password,
    }.items()
    if env_value is None
]
if missing_db_settings:
    raise RuntimeError(f"Missing environment variables: {', '.join(missing_db_settings)}")

# Show where we connect, but never echo the password: cell output is saved with the notebook.
print(f'postgresql://{username}:***@{host}:{port}/{db}')
conn = create_engine(f'postgresql://{username}:{password}@{host}:{port}/{db}', connect_args={'sslmode':'require'})

# params.yaml supplies the name of the column used as the DataFrame index.
with open('params.yaml', 'r') as fd:
    params = yaml.safe_load(fd)

# Release pooled connections even when the query fails.
try:
    df = pd.read_sql('select * from clean_users_churn', conn, index_col=params['index_col'])
finally:
    conn.dispose()

# Snapshot the table to disk. The index is deliberately not written (same result as the
# previous `index=None`), so the CSV holds only the regular columns.
os.makedirs('data', exist_ok=True)
df.to_csv('data/initial_data.csv', index=False)

print("Данные загружены")
print(df.shape)
print(df.head(2))

postgresql://mle_20240325_54955bf804:***@rc1b-uh7kdmcx67eomesf.mdb.yandexcloud.net:6432/playground_mle_20240325_54955bf804
Данные загружены
(7019, 21)
             id begin_date end_date      type paperless_billing   
customer_id                                                       
8191-XWSZG    1 2015-10-01      NaT  One year                No  \
3957-SQXML    2 2017-04-01      NaT  Two year                No   

                      payment_method  monthly_charges  total_charges   
customer_id                                                            
8191-XWSZG              Mailed check            20.65        1022.95  \
3957-SQXML   Credit card (automatic)            24.95         894.30   

            internet_service online_security  ... device_protection   
customer_id                                   ...                     
8191-XWSZG       Fiber optic              No  ...                No  \
3957-SQXML       Fiber optic              No  ...

In [3]:
# GRID SEARCH
# Tunes CatBoost hyperparameters with GridSearchCV, refits a model with the best
# parameters, evaluates it on a hold-out split and logs everything to MLflow.
# Expects `df` (the churn table) to have been loaded by an earlier cell.
import os
import mlflow
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from catboost import CatBoostClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score, log_loss


TABLE_NAME = "clean_users_churn"

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = "krosh_exp_28_07"
RUN_NAME = 'model_grid_search'
REGISTRY_MODEL_NAME = "model_krosh_3"

features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"

split_column = "customer_id"
stratify_column = "target"
test_size = 0.3

# MLflow / S3 configuration. Checked before the expensive search so that missing
# credentials surface immediately rather than after training.
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
missing_aws_settings = [
    env_name
    for env_name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
    if os.getenv(env_name) is None
]
if missing_aws_settings:
    raise RuntimeError(f"Missing environment variables: {', '.join(missing_aws_settings)}")

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

# Deterministic, unshuffled split: rows are ordered by split_column and the last
# `test_size` share becomes the test set.
df = df.sort_values(by=[split_column])

X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=test_size, shuffle=False)

print(f"Размер выборки для обучения: {X_train.shape}")
print(f"Размер выборки для теста: {X_test.shape}")

loss_function = "Logloss"
task_type = 'CPU'
random_seed = 0
iterations = 300
verbose = False

# Search space. NOTE: this rebinds `params`, which an earlier cell uses for the
# parsed params.yaml; the name is kept so later cells see the same state as before.
params = {
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.1, 0.2],
    'iterations': [100, 200]
}

# Base estimator; `iterations` here is overridden by every grid candidate.
model = CatBoostClassifier(verbose=verbose,
                           loss_function=loss_function,
                           iterations=iterations,
                           task_type=task_type,
                           random_seed=random_seed)

cv = GridSearchCV(estimator=model,
                  param_grid=params,
                  cv=2,
                  scoring='accuracy',
                  n_jobs=-1)

clf = cv.fit(X_train, y_train)

cv_results = pd.DataFrame(clf.cv_results_)

best_params = clf.best_params_

# Train a fresh model with the winning hyperparameters on the full training split.
model_best = CatBoostClassifier(**best_params,
                                verbose=verbose,
                                loss_function=loss_function,
                                task_type=task_type,
                                random_seed=random_seed)

model_best.fit(X_train, y_train)

prediction = model_best.predict(X_test)
probas = model_best.predict_proba(X_test)[:, 1]

# quality metrics
metrics = {}

# ravel() of a binary confusion matrix yields (tn, fp, fn, tp); with normalize='all'
# each entry is a share of all samples. Type I error = fp, type II error = fn.
_, err1, err2, _ = confusion_matrix(y_test, prediction, normalize='all').ravel()
auc = roc_auc_score(y_test, probas) # area under the ROC curve
precision = precision_score(y_test, prediction) # precision
recall = recall_score(y_test, prediction) # recall
f1 = f1_score(y_test, prediction) # F1 score
logloss = log_loss(y_test, probas) # LogLoss

# collect the metrics into the dictionary
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

# extra metrics taken from the cross-validation results (averaged over all candidates)
metrics['mean_fit_time'] = cv_results['mean_fit_time'].mean() # mean fit time
metrics['std_fit_time'] = cv_results['std_fit_time'].mean() # mean of the fit-time standard deviations
metrics['mean_test_score'] = cv_results['mean_test_score'].mean() # mean validation score
metrics['std_test_score'] = cv_results['std_test_score'].mean() # mean of the validation-score standard deviations
metrics['best_score'] = clf.best_score_ # best cross-validation score

# MLflow logging settings
pip_requirements= "../requirements.txt"
signature = mlflow.models.infer_signature(X_test, prediction)
input_example = X_test[:10]

print("Начинаем эксперимент")
# Reuse the experiment when it already exists, otherwise create it.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    mlflow.log_metrics(metrics)
    mlflow.log_params(best_params)
    # The fitted search object is stored as an artifact alongside the final model.
    cv_info = mlflow.sklearn.log_model(cv, artifact_path='cv')

    model_info = mlflow.catboost.log_model(
        cb_model=model_best,
        await_registration_for=60,
        signature=signature,
        input_example=input_example,
        registered_model_name=REGISTRY_MODEL_NAME,
        artifact_path="models",
        pip_requirements=pip_requirements)
    print("Финиш!")

Размер выборки для обучения: (4913, 3)
Размер выборки для теста: (2106, 3)


  inputs = _infer_schema(model_input) if model_input is not None else None


Начинаем эксперимент


Registered model 'model_krosh_3' already exists. Creating a new version of this model...
2024/07/28 22:31:19 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: model_krosh_3, version 10


Финиш!


Created version '10' of model 'model_krosh_3'.


In [4]:
# RANDOM SEARCH
# Same pipeline as the grid-search cell, but candidates are sampled with
# RandomizedSearchCV. Relies on `df` and on the imports (os, mlflow, pd,
# train_test_split, CatBoostClassifier, sklearn metrics) made by earlier cells.
from sklearn.model_selection import RandomizedSearchCV

TABLE_NAME = "clean_users_churn"

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = "krosh_exp_28_07"
RUN_NAME = 'model_random_search'
REGISTRY_MODEL_NAME = "model_krosh_3"

features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"

split_column = "customer_id"
stratify_column = "target"
test_size = 0.3

# MLflow / S3 configuration. Checked before the expensive search so that missing
# credentials surface immediately rather than after training.
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
missing_aws_settings = [
    env_name
    for env_name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
    if os.getenv(env_name) is None
]
if missing_aws_settings:
    raise RuntimeError(f"Missing environment variables: {', '.join(missing_aws_settings)}")

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

# Deterministic, unshuffled split: rows are ordered by split_column and the last
# `test_size` share becomes the test set.
df = df.sort_values(by=[split_column])

X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=test_size, shuffle=False)

print(f"Размер выборки для обучения: {X_train.shape}")
print(f"Размер выборки для теста: {X_test.shape}")

loss_function = "Logloss"
task_type = 'CPU'
random_seed = 0
iterations = 300
verbose = False

param_distributions = {
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.1, 0.2],
    'iterations': [100, 200]
}

# Base estimator; `iterations` here is overridden by every sampled candidate.
model = CatBoostClassifier(verbose=verbose,
                           loss_function=loss_function,
                           iterations=iterations,
                           task_type=task_type,
                           random_seed=random_seed)

# NOTE(review): the space above has 3 * 3 * 2 = 18 combinations, fewer than n_iter=20;
# presumably the search then simply evaluates all 18 - confirm against sklearn docs.
# scoring is spelled out so this cell ranks candidates by the same metric as the grid search.
cv = RandomizedSearchCV(estimator=model,
                        param_distributions=param_distributions,
                        n_iter=20,
                        cv=2,
                        scoring='accuracy',
                        random_state=42,
                        n_jobs=-1)

clf = cv.fit(X_train, y_train)

cv_results = pd.DataFrame(clf.cv_results_)

best_params = clf.best_params_

# Train a fresh model with the winning hyperparameters on the full training split.
model = CatBoostClassifier(**best_params,
                           verbose=verbose,
                           loss_function=loss_function,
                           task_type=task_type,
                           random_seed=random_seed)

model.fit(X_train, y_train)

# Evaluate the model trained in THIS cell, not `model_best` left over from the
# grid-search cell: the logged metrics and signature must describe the model that
# is registered below.
prediction = model.predict(X_test)
probas = model.predict_proba(X_test)[:, 1]

# quality metrics
metrics = {}

# ravel() of a binary confusion matrix yields (tn, fp, fn, tp); with normalize='all'
# each entry is a share of all samples. Type I error = fp, type II error = fn.
_, err1, err2, _ = confusion_matrix(y_test, prediction, normalize='all').ravel()
auc = roc_auc_score(y_test, probas) # area under the ROC curve
precision = precision_score(y_test, prediction) # precision
recall = recall_score(y_test, prediction) # recall
f1 = f1_score(y_test, prediction) # F1 score
logloss = log_loss(y_test, probas) # LogLoss

# collect the metrics into the dictionary
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

# extra metrics taken from the cross-validation results (averaged over all candidates)
metrics['mean_fit_time'] = cv_results['mean_fit_time'].mean() # mean fit time
metrics['std_fit_time'] = cv_results['std_fit_time'].mean() # mean of the fit-time standard deviations
metrics['mean_test_score'] = cv_results['mean_test_score'].mean() # mean validation score
metrics['std_test_score'] = cv_results['std_test_score'].mean() # mean of the validation-score standard deviations
metrics['best_score'] = clf.best_score_ # best cross-validation score

# MLflow logging settings
pip_requirements= "../requirements.txt"
signature = mlflow.models.infer_signature(X_test, prediction)
input_example = X_test[:10]


# Reuse the experiment when it already exists, otherwise create it.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    mlflow.log_metrics(metrics)
    mlflow.log_params(best_params)
    # The fitted search object is stored as an artifact alongside the final model.
    cv_info = mlflow.sklearn.log_model(cv, artifact_path='cv')

    model_info = mlflow.catboost.log_model(
        cb_model=model,
        await_registration_for=60,
        signature=signature,
        input_example=input_example,
        registered_model_name=REGISTRY_MODEL_NAME,
        artifact_path="models",
        pip_requirements=pip_requirements)



Размер выборки для обучения: (4913, 3)
Размер выборки для теста: (2106, 3)


  inputs = _infer_schema(model_input) if model_input is not None else None
Registered model 'model_krosh_3' already exists. Creating a new version of this model...
2024/07/28 22:31:41 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: model_krosh_3, version 11
Created version '11' of model 'model_krosh_3'.
