In [20]:
import os
import mlflow

from sklearn.model_selection import train_test_split
import pandas as pd
import numpy

from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, TargetEncoder, LabelEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve, f1_score
from sklearn.metrics import classification_report


In [21]:
df = pd.read_pickle('../data/clean_dataset.pkl')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   age        4521 non-null   uint8   
 1   job        4521 non-null   category
 2   marital    4521 non-null   category
 3   education  4521 non-null   category
 4   default    4521 non-null   category
 5   balance    4521 non-null   float32 
 6   housing    4521 non-null   category
 7   loan       4521 non-null   category
 8   contact    4521 non-null   category
 9   day        4521 non-null   uint32  
 10  month      4521 non-null   category
 11  duration   4521 non-null   uint32  
 12  campaign   4521 non-null   uint32  
 13  y          4521 non-null   category
dtypes: category(9), float32(1), uint32(3), uint8(1)
memory usage: 115.2 KB


In [22]:
df = df.rename(columns={'y': 'target'})

In [23]:
lb = LabelEncoder() 
df['target'] = lb.fit_transform(df['target'])

In [24]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('target', axis=1), df['target'], test_size=0.25, random_state=123)

In [25]:
X_test.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign
2982,38,technician,married,tertiary,no,1161.0,yes,no,unknown,8,may,665,1
276,35,housemaid,married,tertiary,no,11219.0,no,no,cellular,12,aug,699,2
4132,37,technician,married,secondary,no,1063.0,yes,no,cellular,31,jul,413,2
511,32,management,married,tertiary,no,820.0,yes,no,cellular,17,jul,738,3
2061,31,services,married,secondary,no,-331.0,yes,no,unknown,23,may,203,5


In [26]:
num_features = list(df.select_dtypes(include=['number']).columns)
num_features.remove("target")
num_features

['age', 'balance', 'day', 'duration', 'campaign']

In [27]:
cat_features = list(df.select_dtypes(include=['category']).columns)
cat_features

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month']

In [28]:
s_scaler = StandardScaler()
t_encoder = TargetEncoder()
classifier = RandomForestClassifier(n_estimators=300, max_depth=50)

In [29]:
# Для удобной работы со столбцами
preprocessor = ColumnTransformer(
    transformers=[
        ('num', s_scaler, num_features),  # преобразования для числовых признаков
        ('cat', t_encoder, cat_features), # преобразования для категориальных признаков
    ],
    remainder='drop' ) # Удаляем столбцы, которые не затронуты преобразования

In [30]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor), 
                           ('model', classifier)])

In [31]:
pipeline.fit(X_train, y_train)

In [32]:
predictions = pipeline.predict(X_test) 
predictions

array([0, 0, 0, ..., 0, 0, 0])

In [33]:
metrics = {}
metrics["recall"] = recall_score(y_test, predictions)   
metrics["precision"] = precision_score(y_test, predictions)
metrics["f1"] = f1_score(y_test, predictions)
metrics["roc_auc"] = roc_auc_score(y_test, predictions)
metrics


{'recall': 0.2682926829268293,
 'precision': 0.5892857142857143,
 'f1': 0.3687150837988827,
 'roc_auc': 0.6227376113046844}

In [34]:
# Обязательно логируем сигнатуру модели и пример входных данных. Подготовим их
from mlflow.models import infer_signature

signature =  infer_signature(model_input = X_train.head(5))
input_example = X_train.head(5)

# Будем логировать requirements и артефакт - текстовый файл
req_file = '../requirements.txt'
art = 'comment.txt'

# Параметры, котороые будут залогированы, можем задавать вручную или полностью взять из модели
#params_dict = {'n_estimators': 10, 'max_depth': 10}
params_dict = pipeline.get_params()



In [35]:
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

registry_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
tracking_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"

mlflow.set_tracking_uri(tracking_uri)   
mlflow.set_registry_uri(registry_uri) 

In [43]:
# название тестового эксперимента, запуска (run) внутри него, имени, под которым модель будет регистрироваться
EXPERIMENT_NAME = "estate_project1"
RUN_NAME = "baseline model"
REGISTRY_MODEL_NAME = "estate_model_rf"

# Когда создаем новый эксперимент, то: 
experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)

# Впоследствии. чтобы добавлять запуски в этот же эксепримент мы должны получить его id:
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # получаем уникальный идентификатор запуска эксперимента
    run_id = run.info.run_id 
    mlflow.sklearn.log_model(pipeline, 
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             pip_requirements=req_file
                             )
    mlflow.log_metrics(metrics)
    # mlflow.log_artifact(art)
    mlflow.log_params(params_dict)

run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')

2024/12/19 14:34:43 INFO mlflow.tracking._tracking_service.client: 🏃 View run baseline model at: http://127.0.0.1:5000/#/experiments/1/runs/eeb78b5a77454e129032b2f8f2e3d1e4.
2024/12/19 14:34:43 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.


In [44]:
from sklearn.preprocessing import QuantileTransformer, SplineTransformer, PolynomialFeatures, MinMaxScaler, KBinsDiscretizer

In [45]:
X_train_fe_sklearn = X_train.copy()
Y_train_fe_sklearn = y_train.copy()

In [46]:
f = PolynomialFeatures(degree=2)
qt = QuantileTransformer()
kb = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')

In [49]:
preprocessor_sklearn = ColumnTransformer(
    transformers=[
        ('num', s_scaler, num_features),  # преобразования для числовых признаков
        ('cat', t_encoder, cat_features), # преобразования для категориальных признаков
        ('quantile', qt,["balance"]),
        ('poly', Pipeline(steps=[
                                    ('poly', f),
                                    ('scale', StandardScaler())
                                ]), ["age", "balance"]
            ), # В преобразования добавляем созданный ранее pipeline
        ('kbins', kb, num_features),
    ],
    remainder='drop') # Удаляем столбцы, которые не затронуты преобразования


In [50]:
X_train_fe_sklearn_raw = preprocessor_sklearn.fit_transform(X_train_fe_sklearn, Y_train_fe_sklearn)
X_train_fe_sklearn = pd.DataFrame(X_train_fe_sklearn_raw, columns=preprocessor_sklearn.get_feature_names_out())


In [51]:
X_train_fe_sklearn.columns

Index(['num__age', 'num__balance', 'num__day', 'num__duration',
       'num__campaign', 'cat__job', 'cat__marital', 'cat__education',
       'cat__default', 'cat__housing', 'cat__loan', 'cat__contact',
       'cat__month', 'quantile__balance', 'poly__1', 'poly__age',
       'poly__balance', 'poly__age^2', 'poly__age balance', 'poly__balance^2',
       'kbins__age', 'kbins__balance', 'kbins__day', 'kbins__duration',
       'kbins__campaign'],
      dtype='object')

In [52]:
with open("column_names.txt", "w") as output:
    output.write(str(X_train_fe_sklearn.columns))

In [53]:
pipeline_sklearn = Pipeline(steps=[('preprocessor', preprocessor_sklearn), 
                           ('model', classifier)])

In [54]:
pipeline_sklearn.fit(X_train, y_train)

predictions = pipeline_sklearn.predict(X_test) 
metrics = {}
metrics["recall"] = recall_score(y_test, predictions)   
metrics["precision"] = precision_score(y_test, predictions)
metrics["f1"] = f1_score(y_test, predictions)
metrics["roc_auc"] = roc_auc_score(y_test, predictions)
metrics


{'recall': 0.2682926829268293,
 'precision': 0.6111111111111112,
 'f1': 0.3728813559322034,
 'roc_auc': 0.623729674796748}

In [55]:
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
RUN_NAME = 'fe_sklearn'

input_example = X_train_fe_sklearn.head(5)

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
# Параметры, котороые будут залогированы
params_dict = pipeline_sklearn.get_params()

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # получаем уникальный идентификатор запуска эксперимента
    run_id = run.info.run_id 
    mlflow.sklearn.log_model(pipeline_sklearn, 
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             pip_requirements=req_file
                             )
    mlflow.log_metrics(metrics)
    mlflow.log_artifact('column_names.txt', artifact_path = 'Фичи')
    mlflow.log_params(pipeline_sklearn.get_params())

run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')

  "dataframe_split": {
    "columns": [
      "num__age",
      "num__balance",
      "num__day",
      "num__duration",
      "num__campaign",
      "cat__job",
      "cat__marital",
      "cat__education",
      "cat__default",
      "cat__housing",
      "cat__loan",
      "cat__contact",
      "cat__month",
      "quantile__balance",
      "poly__1",
      "poly__age",
      "poly__balance",
      "poly__age^2",
      "poly__age balance",
      "poly__balance^2",
      "kbins__age",
      "kbins__balance",
      "kbins__day",
      "kbins__duration",
      "kbins__campaign"
    ],
    "data": [
      [
        -0.8761525926346357,
        -0.4302728449277611,
        0.3877878384311291,
        -0.2868751456737427,
        0.4111881426913859,
        0.07291275242862363,
        0.10267849712837226,
        0.10461721967255963,
        0.11760300462936908,
        0.08400060114828468,
        0.12629359176744787,
        0.035886328301262135,
        0.0644759810150084,
        0.2