In [1]:
# Обучим новую модель.
# Шаг 1. Загружаем очищенные данные из таблицы clean_users_churn
import pandas as pd
from sqlalchemy import create_engine
import os
from dotenv import load_dotenv
import yaml

pd.set_option('display.max_columns', None)

def create_connection():

    load_dotenv()
    host = os.environ.get('DB_DESTINATION_HOST')
    port = os.environ.get('DB_DESTINATION_PORT')
    db = os.environ.get('DB_DESTINATION_NAME')
    username = os.environ.get('DB_DESTINATION_USER')
    password = os.environ.get('DB_DESTINATION_PASSWORD')
    
    print(f'postgresql://{username}:{password}@{host}:{port}/{db}')
    conn = create_engine(f'postgresql://{username}:{password}@{host}:{port}/{db}', connect_args={'sslmode':'require'})
    return conn

def get_data():
    with open('params.yaml', 'r') as fd:
        params = yaml.safe_load(fd)

    conn = create_connection()
    data = pd.read_sql('select * from clean_users_churn', conn, index_col=params['index_col'])
    conn.dispose()

    os.makedirs('data', exist_ok=True)
    data.to_csv('data/initial_data.csv', index=None)

    print("Данные загружены")
    print(data)

if __name__ == '__main__':
    get_data()

postgresql://mle_20240325_54955bf804:6e3f607018b444f69359510efb12da90@rc1b-uh7kdmcx67eomesf.mdb.yandexcloud.net:6432/playground_mle_20240325_54955bf804
Данные загружены
               id begin_date   end_date            type paperless_billing   
customer_id                                                                 
8191-XWSZG      1 2015-10-01        NaT        One year                No  \
3957-SQXML      2 2017-04-01        NaT        Two year                No   
6837-BJYDQ      3 2019-11-01        NaT        One year                No   
0486-LGCCH      4 2019-03-01        NaT        Two year                No   
7590-VHVEG      5 2020-01-01        NaT  Month-to-month               Yes   
...           ...        ...        ...             ...               ...   
2823-LKABH   7015 2018-08-01        NaT  Month-to-month               Yes   
8775-CEBBJ   7016 2019-02-01 2019-11-01  Month-to-month               Yes   
0550-DCXLH   7017 2019-01-01        NaT  Month-to-month      

In [2]:
# Шаг 2. Обучаем новую модель

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import (OneHotEncoder, SplineTransformer, 
    QuantileTransformer, RobustScaler,PolynomialFeatures,KBinsDiscretizer)
from sklearn.model_selection import train_test_split
import yaml
import os
import joblib

with open('params.yaml', 'r') as fd:
    params = yaml.safe_load(fd)

data = pd.read_csv('data/initial_data.csv')

X = data.drop(columns=[params['target_col'], 'end_date']) # Признаки без утечек
y = data[params['target_col']] # Целевая переменная
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print("Данные разделены")
print(f"X_train: {X_train.shape}, X_test: {X_test.shape},y_train: {y_train.shape}, y_test: {y_test.shape}")

cat_columns = ["type", "payment_method", "internet_service", "gender"]
bin_columns = ["paperless_billing", "online_security", "online_backup", "device_protection",
                "tech_support", "streaming_tv", "streaming_movies", "senior_citizen",
                "partner", "dependents", "multiple_lines"]
num_columns = ["monthly_charges", "total_charges"]

encoder_oh = OneHotEncoder(
categories='auto',
handle_unknown='ignore',
max_categories=10,
sparse_output=False,
drop='first'
)

encoder_spl = SplineTransformer(n_knots=3, degree=4)
encoder_q = QuantileTransformer(n_quantiles=100)
encoder_rb = RobustScaler()
encoder_pol = PolynomialFeatures(degree=3)
encoder_kbd = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform', subsample=None)

numeric_transformer = ColumnTransformer(
    transformers=[
        ('spl', encoder_spl, num_columns),
        ('q', encoder_q, num_columns),
        ('rb', encoder_rb, num_columns),
        ('pol', encoder_pol, num_columns),
        ('kbd', encoder_kbd, num_columns)
    ]
)

categorical_transformer = Pipeline(
    steps=[
        ('encoder', encoder_oh)
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_columns),
        ('cat', categorical_transformer, cat_columns + bin_columns)
        ], 
    n_jobs=-1)


model = LogisticRegression(C=params['C'], penalty=params['penalty'], max_iter=200)

pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('model', model)
    ]
)
pipeline.fit(X_train, y_train)


os.makedirs('models', exist_ok=True)
with open('models/fitted_model.pkl', 'wb') as fd:
    joblib.dump(pipeline, fd)

print("Модель обучена и сохранена")

Данные разделены
X_train: (4913, 19), X_test: (2106, 19),y_train: (4913,), y_test: (2106,)
Модель обучена и сохранена


In [3]:
# Шаг 3. Считаем метрики
import pandas as pd
import numpy as np
import joblib
import json
import yaml
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score, log_loss

with open('params.yaml', 'r') as fd:
    params = yaml.safe_load(fd)

with open('models/fitted_model.pkl', 'rb') as fd:
    model = joblib.load(fd)

data = pd.read_csv('data/initial_data.csv')

X = data.drop(columns=[params['target_col'], 'end_date']) # Признаки без утечек
y = data[params['target_col']] # Целевая переменная
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print("Данные разделены")

prediction = model.predict(X_test)
print("Предсказания получены")
print(prediction)
# Подсчет количества 1 и 0 в предсказаниях
binary_predictions = (prediction > 0.5).astype(int)
count_zeros = np.sum(binary_predictions == 0)
count_ones = np.sum(binary_predictions == 1)
print(f"Количество 0: {count_zeros}")
print(f"Количество 1: {count_ones}")
probas = model.predict_proba(X_test)[:, 1]
print(probas)

# Заводим словарь для хранения метрик
metrics = {}

# Подсчитываем матрицу ошибок (конфузионную матрицу)
_, err1, _, err2 = confusion_matrix(y_test, prediction, normalize='all').ravel()

# Подсчитываем метрики
auc = roc_auc_score(y_test, probas)
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
logloss = log_loss(y_test, probas)

# Записываем значения метрик в словарь
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

# Выводим метрики
print(metrics)

for key, value in metrics.items():
        metrics[key] = round(value.mean(), 3) 

os.makedirs('cv_results', exist_ok=True)
with open('cv_results/cv_res.json', 'w') as fd:
    json.dump(metrics, fd)

Данные разделены
Предсказания получены
[0 0 0 ... 0 0 0]
Количество 0: 2106
Количество 1: 0
[0.49574716 0.4971613  0.49915886 ... 0.48415099 0.47933361 0.4636907 ]
{'err1': 0.0, 'err2': 0.0, 'auc': 0.6125046847950886, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'logloss': 0.6173206426093819}


  _warn_prf(average, modifier, msg_start, len(result))


In [4]:
import os
import mlflow
from dotenv import load_dotenv

EXPERIMENT_NAME = "krosh_exp_21_07"
RUN_NAME = "preprocrssing"
REGISTRY_MODEL_NAME = "churn_model_krosh_2"

load_dotenv()
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("AWS_ACCESS_KEY_ID")
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("AWS_SECRET_ACCESS_KEY")

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

pip_requirements = "../requirements.txt"
signature = mlflow.models.infer_signature(X_test, prediction)
input_example = X_test[:2]
metadata = {'model_type': 'monthly'}

experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    metrics = {
        "err1": err1,
        "err2": err2,
        "auc": auc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "logloss": logloss
    }
    
    for metric_name, metric_value in metrics.items():
        mlflow.log_metric(metric_name, metric_value)
    
    model_info = mlflow.sklearn.log_model(
    sk_model=model,
    await_registration_for=60,
    signature=signature,
    input_example=input_example,
    metadata=metadata,
    pip_requirements=pip_requirements,
    registered_model_name=REGISTRY_MODEL_NAME,
    artifact_path="models")

  inputs = _infer_schema(model_input) if model_input is not None else None
Registered model 'churn_model_krosh_2' already exists. Creating a new version of this model...
2024/07/24 07:17:33 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_model_krosh_2, version 3
Created version '3' of model 'churn_model_krosh_2'.


In [5]:
print(run_id)

3057330f7f8941bebe88119191f2a873
