In [1]:
# Обучим новую модель. Автогенерация признаков (AutoFeat)

# Шаг 1. Загружаем очищенные данные из таблицы clean_users_churn
import pandas as pd
from sqlalchemy import create_engine
import os
from dotenv import load_dotenv
import yaml

pd.set_option('display.max_columns', None)

def create_connection():

    load_dotenv()
    host = os.environ.get('DB_DESTINATION_HOST')
    port = os.environ.get('DB_DESTINATION_PORT')
    db = os.environ.get('DB_DESTINATION_NAME')
    username = os.environ.get('DB_DESTINATION_USER')
    password = os.environ.get('DB_DESTINATION_PASSWORD')
    
    print(f'postgresql://{username}:{password}@{host}:{port}/{db}')
    conn = create_engine(f'postgresql://{username}:{password}@{host}:{port}/{db}', connect_args={'sslmode':'require'})
    return conn

def get_data():
    with open('params.yaml', 'r') as fd:
        params = yaml.safe_load(fd)

    conn = create_connection()
    data = pd.read_sql('select * from clean_users_churn', conn, index_col=params['index_col'])
    conn.dispose()

    os.makedirs('data', exist_ok=True)
    data.to_csv('data/initial_data.csv', index=None)

    print("Данные загружены")

if __name__ == '__main__':
    get_data()

postgresql://mle_20240325_54955bf804:6e3f607018b444f69359510efb12da90@rc1b-uh7kdmcx67eomesf.mdb.yandexcloud.net:6432/playground_mle_20240325_54955bf804
Данные загружены


In [2]:
# Шаг 2. Разделяем данные на train и test

import pandas as pd
from sklearn.model_selection import train_test_split
import yaml

with open('params.yaml', 'r') as fd:
    params = yaml.safe_load(fd)

df = pd.read_csv('data/initial_data.csv')

features = df.drop(columns=[params['target_col'], 'end_date']).columns.to_list()
target = [params['target_col']] # колонка с таргетом вашей модели

split_column = "begin_date"
test_size = 0.2

df = df.sort_values(by=[split_column])

X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=test_size,
    shuffle=False,
) 
print("Данные разделены!")
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


Данные разделены!
(5615, 19)
(1404, 19)
(5615, 1)
(1404, 1)


In [3]:
# Шаг 3. Проводим автогенерацию новых признаков
from autofeat import AutoFeatClassifier

cat_features = [
    'paperless_billing',
    'payment_method',
    'internet_service',
    'online_security',
    'online_backup',
    'device_protection',
    'tech_support',
    'streaming_tv',
    'streaming_movies',
    'gender',
    'senior_citizen',
    'partner',
    'dependents',
    'multiple_lines',
    'type',
]
num_features = ["monthly_charges", "total_charges"]

features = cat_features + num_features

transformations = ('1/', 'log', 'abs', 'sqrt')

afc = AutoFeatClassifier(
    categorical_cols=cat_features,
    feateng_cols=num_features,
    transformations=transformations,
    feateng_steps=1,
    n_jobs=-1)
print("Начинаем!")
X_train_features = afc.fit_transform(X_train.drop(columns=['begin_date']), y_train)
print(X_train_features.shape)
print(X_train_features.columns.to_list())
X_test_features = afc.transform(X_test.drop(columns=['begin_date']))
print(X_test_features.shape)

Начинаем!


  y = column_or_1d(y, warn=True)


(5615, 37)
['id', 'monthly_charges', 'total_charges', 'cat_paperless_billing_No', 'cat_paperless_billing_Yes', 'cat_payment_method_Bank transfer (automatic)', 'cat_payment_method_Credit card (automatic)', 'cat_payment_method_Electronic check', 'cat_payment_method_Mailed check', 'cat_internet_service_DSL', 'cat_internet_service_Fiber optic', 'cat_online_security_No', 'cat_online_security_Yes', 'cat_online_backup_No', 'cat_online_backup_Yes', 'cat_device_protection_No', 'cat_device_protection_Yes', 'cat_tech_support_No', 'cat_tech_support_Yes', 'cat_streaming_tv_No', 'cat_streaming_tv_Yes', 'cat_streaming_movies_No', 'cat_streaming_movies_Yes', 'cat_gender_Female', 'cat_gender_Male', 'cat_senior_citizen_0', 'cat_senior_citizen_1', 'cat_partner_No', 'cat_partner_Yes', 'cat_dependents_No', 'cat_dependents_Yes', 'cat_multiple_lines_No', 'cat_multiple_lines_Yes', 'cat_type_Month-to-month', 'cat_type_One year', 'cat_type_Two year', '1/total_charges']
(1404, 37)


In [4]:
# Шаг 4. Логируем afc как артефакт
import os
import mlflow
from dotenv import load_dotenv

EXPERIMENT_NAME = "krosh_exp_21_07"
RUN_NAME = "preprocessing_"
REGISTRY_MODEL_NAME = "churn_model_krosh_2"

load_dotenv()
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("AWS_ACCESS_KEY_ID")
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("AWS_SECRET_ACCESS_KEY")

# Подключаемся к трекинг серверу
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

artifact_path = "afc"
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

# Если эксперимент не найден, создайте его
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    
    afc_info = mlflow.sklearn.log_model(afc, artifact_path=artifact_path)



KeyboardInterrupt: 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import joblib

model = LogisticRegression(C=params['C'], penalty=params['penalty'], max_iter=200)

pipeline = Pipeline(
    [
        ('afc', afc),
        ('model', model)
    ]
)
pipeline.fit(X_train_features, y_train)


os.makedirs('models', exist_ok=True)
with open('models/fitted_model.pkl', 'wb') as fd:
    joblib.dump(pipeline, fd)

print("Модель обучена и сохранена")