In [1]:
import os
import mlflow
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy

from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from catboost import CatBoostRegressor

from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error

In [2]:
df = pd.read_pickle('../data/clean_data.pkl').sample(frac = 0.1, random_state = 2)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2285 entries, 1626 to 22026
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   Date           2285 non-null   object  
 1   Gender         2285 non-null   category
 2   Annual Income  2285 non-null   float32 
 3   Dealer_Name    2285 non-null   category
 4   Company        2285 non-null   category
 5   Model          2285 non-null   category
 6   Color          2285 non-null   category
 7   Price ($)      2285 non-null   float32 
 8   Dealer_No      2285 non-null   object  
 9   Body Style     2285 non-null   category
 10  Dealer_Region  2285 non-null   category
 11  Config         2285 non-null   category
 12  Month          2285 non-null   int32   
 13  Year           2285 non-null   int32   
 14  Price/Income   2285 non-null   float32 
 15  norm_Income    2285 non-null   float32 
 16  norm_Price     2285 non-null   float32 
dtypes: category(8), float32(5), int32(

In [3]:
df = df.rename(columns={'Price ($)': 'target'})
df = df.drop(columns = ['Date', 'Month', 'Year'])

In [4]:
df

Unnamed: 0,Gender,Annual Income,Dealer_Name,Company,Model,Color,target,Dealer_No,Body Style,Dealer_Region,Config,Price/Income,norm_Income,norm_Price
1626,Male,900000.0,Iceberg Rentals,Mercury,Mercury Sable,Red,39000.0,53546-9427,Sedan,Janesville,DoubleÂ Overhead Camshaft Auto,0.043333,0.079529,0.440191
723,Female,13500.0,Race Car Help,Toyota,Toyota Land Cruiser,Black,21000.0,78758-7841,SUV,Austin,DoubleÂ Overhead Camshaft Auto,1.555556,0.000306,0.224880
19756,Female,620000.0,Clay Johnson Auto Sales,Dodge,Dodge Intrepid,Black,11001.0,78758-7841,Sedan,Austin,Overhead Camshaft Manual,0.017744,0.054506,0.105275
20920,Male,13500.0,McKinney Dodge Chrysler Jeep,Audi,Audi A6,Pale White,24000.0,85257-3102,SUV,Scottsdale,Overhead Camshaft Manual,1.777778,0.000306,0.260766
10420,Female,13500.0,New Castle Ford Lincoln Mercury,Nissan,Nissan Altima,Pale White,21001.0,60504-7114,Hatchback,Aurora,DoubleÂ Overhead Camshaft Auto,1.555630,0.000306,0.224892
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17526,Male,657500.0,Ryder Truck Rental and Leasing,Cadillac,Cadillac Eldorado,Black,29500.0,06457-3834,Passenger,Middletown,Overhead Camshaft Manual,0.044867,0.057857,0.326555
10550,Male,903000.0,Suburban Ford,Lexus,Lexus LS400,Pale White,27001.0,53546-9427,Sedan,Janesville,DoubleÂ Overhead Camshaft Auto,0.029901,0.079797,0.296663
6414,Male,981000.0,Saab-Belle Dodge,Saturn,Saturn LS,Pale White,51000.0,60504-7114,Sedan,Aurora,DoubleÂ Overhead Camshaft Auto,0.051988,0.086767,0.583732
23086,Male,600000.0,Clay Johnson Auto Sales,Chrysler,Chrysler Cirrus,Red,12001.0,78758-7841,Passenger,Austin,DoubleÂ Overhead Camshaft Auto,0.020002,0.052719,0.117237


In [5]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('target', axis=1), df['target'], test_size = 0.25, random_state = 2)

In [6]:
cat_features = X_train.select_dtypes(include=['category', 'object']).columns.to_list()
cat_features

['Gender',
 'Dealer_Name',
 'Company',
 'Model',
 'Color',
 'Dealer_No ',
 'Body Style',
 'Dealer_Region',
 'Config']

In [7]:
num_features = X_train.select_dtypes(include = ['number']).columns.to_list()
num_features

['Annual Income', 'Price/Income', 'norm_Income', 'norm_Price']

In [8]:
s_scaler = StandardScaler()
l_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=99999999)
regressor = CatBoostRegressor()

In [9]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', s_scaler, num_features),  # преобразования для числовых признаков
        ('cat', l_encoder, cat_features), # преобразования для категориальных признаков
    ],
    remainder='drop' ) # Удаляем столбцы, которые не затронуты преобразования

In [10]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor), 
                           ('model', regressor)])

pipeline.fit(X_train, y_train)

Learning rate set to 0.044577
0:	learn: 13815.5147903	total: 53.6ms	remaining: 53.5s
1:	learn: 13327.9685779	total: 55.6ms	remaining: 27.7s
2:	learn: 12801.3269962	total: 57.8ms	remaining: 19.2s
3:	learn: 12338.1180410	total: 59ms	remaining: 14.7s
4:	learn: 11914.6950734	total: 60.7ms	remaining: 12.1s
5:	learn: 11471.2387639	total: 62.3ms	remaining: 10.3s
6:	learn: 11032.0400173	total: 64.6ms	remaining: 9.16s
7:	learn: 10615.6392177	total: 66ms	remaining: 8.18s
8:	learn: 10216.6709309	total: 67.1ms	remaining: 7.39s
9:	learn: 9845.0129704	total: 68.7ms	remaining: 6.8s
10:	learn: 9480.8287173	total: 71ms	remaining: 6.38s
11:	learn: 9130.0155008	total: 72.6ms	remaining: 5.97s
12:	learn: 8807.8636577	total: 73.9ms	remaining: 5.61s
13:	learn: 8461.8300792	total: 75.4ms	remaining: 5.31s
14:	learn: 8171.3703568	total: 76.9ms	remaining: 5.05s
15:	learn: 7880.2276926	total: 79.4ms	remaining: 4.88s
16:	learn: 7586.0697397	total: 80.9ms	remaining: 4.68s
17:	learn: 7334.0436813	total: 82.6ms	remai

In [11]:
predictions = pipeline.predict(X_test) 

metrics = {}
metrics["mae"] = mean_absolute_error(y_test, predictions)   
metrics["mape"] = mean_absolute_percentage_error(y_test, predictions)
metrics["mse"] = mean_squared_error(y_test, predictions)

metrics



{'mae': 129.06360043413292,
 'mape': 0.004855717526113309,
 'mse': 129118.23156988967}

In [12]:
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

registry_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
tracking_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"

mlflow.set_tracking_uri(tracking_uri)   
mlflow.set_registry_uri(registry_uri)   

In [13]:
EXPERIMENT_NAME = "estate_project"
RUN_NAME = "baseline model"
REGISTRY_MODEL_NAME = "estate_model_rf"

In [14]:
from mlflow.models import infer_signature

signature =  infer_signature(model_input = X_train.head(5))
input_example = X_train.head(5)

In [15]:
req_file = '../requirements.txt'
art = '../comment.txt'

In [16]:
params_dict = pipeline.get_params()

In [17]:
# Когда создаем новый эксперимент, то: 
experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)

# Впоследствии. чтобы добавлять запуски в этот же эксепримент мы должны получить его id:
#experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # получаем уникальный идентификатор запуска эксперимента
    run_id = run.info.run_id 
    mlflow.sklearn.log_model(pipeline, 
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             pip_requirements=req_file
                             )
    mlflow.log_metrics(metrics)
    mlflow.log_artifact(art)
    mlflow.log_params(params_dict)

run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')

  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 184.43it/s] 
2024/10/22 18:17:00 INFO mlflow.tracking._tracking_service.client: 🏃 View run baseline model at: http://127.0.0.1:5000/#/experiments/1/runs/7e7e5d36439346238cac406fdd9b0a4b.
2024/10/22 18:17:00 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.


In [18]:
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
mlflow.search_runs(
    experiment_names=[EXPERIMENT_NAME],
)


Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.mse,metrics.mae,metrics.mape,params.preprocessor__verbose_feature_names_out,...,params.preprocessor__cat__encoded_missing_value,params.steps,params.preprocessor__transformer_weights,params.model,params.preprocessor__cat__min_frequency,tags.mlflow.source.type,tags.mlflow.log-model.history,tags.mlflow.user,tags.mlflow.runName,tags.mlflow.source.name
0,7e7e5d36439346238cac406fdd9b0a4b,1,FINISHED,mlflow-artifacts:/1/7e7e5d36439346238cac406fdd...,2024-10-22 15:16:57.611000+00:00,2024-10-22 15:17:00.775000+00:00,129118.23157,129.0636,0.004856,True,...,,"[('preprocessor', ColumnTransformer(transforme...",,<catboost.core.CatBoostRegressor object at 0x7...,,LOCAL,"[{""run_id"": ""7e7e5d36439346238cac406fdd9b0a4b""...",mainuser,baseline model,/home/mainuser/my_proj/.venv_my_proj/lib/pytho...


In [19]:
mlflow.sklearn.autolog()

with mlflow.start_run(run_name='auto', experiment_id=experiment_id) as run:
    pipeline.fit(X_train, y_train)



Learning rate set to 0.044577
0:	learn: 13815.5147903	total: 1.7ms	remaining: 1.7s
1:	learn: 13327.9685779	total: 3.42ms	remaining: 1.7s
2:	learn: 12801.3269962	total: 5.25ms	remaining: 1.74s
3:	learn: 12338.1180410	total: 7ms	remaining: 1.74s
4:	learn: 11914.6950734	total: 8.33ms	remaining: 1.66s
5:	learn: 11471.2387639	total: 9.36ms	remaining: 1.55s
6:	learn: 11032.0400173	total: 10.4ms	remaining: 1.47s
7:	learn: 10615.6392177	total: 12.6ms	remaining: 1.56s
8:	learn: 10216.6709309	total: 13.9ms	remaining: 1.53s
9:	learn: 9845.0129704	total: 18ms	remaining: 1.78s
10:	learn: 9480.8287173	total: 19.3ms	remaining: 1.74s
11:	learn: 9130.0155008	total: 20.4ms	remaining: 1.68s
12:	learn: 8807.8636577	total: 21.7ms	remaining: 1.65s
13:	learn: 8461.8300792	total: 23.5ms	remaining: 1.65s
14:	learn: 8171.3703568	total: 30.8ms	remaining: 2.02s
15:	learn: 7880.2276926	total: 35ms	remaining: 2.15s
16:	learn: 7586.0697397	total: 36.1ms	remaining: 2.09s
17:	learn: 7334.0436813	total: 37.2ms	remainin

2024/10/22 18:17:14 INFO mlflow.tracking._tracking_service.client: 🏃 View run auto at: http://127.0.0.1:5000/#/experiments/1/runs/7d14f4fbd931401393bfb55e9e708184.
2024/10/22 18:17:14 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.


In [20]:
mlflow.sklearn.autolog(disable=True)

In [21]:
regressor2 = RandomForestRegressor(n_estimators=10, max_depth=6)

In [22]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor), 
                           ('model', regressor2)])

pipeline.fit(X_train, y_train)

In [23]:
predictions = pipeline.predict(X_test) 
metrics = {}
metrics["mae"] = mean_absolute_error(y_test, predictions)   
metrics["mape"] = mean_absolute_percentage_error(y_test, predictions)
metrics["mse"] = mean_squared_error(y_test, predictions)

metrics

{'mae': 51.04003304212416,
 'mape': 0.00243597199616633,
 'mse': 12090.258349603355}

In [24]:
RUN_NAME = 'smaller_model'

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # получаем уникальный идентификатор запуска эксперимента
    run_id = run.info.run_id 
    mlflow.sklearn.log_model(pipeline, 
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             pip_requirements=req_file
                             )
    mlflow.log_metrics(metrics)
    mlflow.log_artifact(art)
    mlflow.log_params(pipeline.get_params())

run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 87.47it/s]  
2024/10/22 18:19:13 INFO mlflow.tracking._tracking_service.client: 🏃 View run smaller_model at: http://127.0.0.1:5000/#/experiments/1/runs/2132b0e8c93c40449e4a4610e06d4189.
2024/10/22 18:19:13 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.


In [25]:
RUN_NAME = 'no_model'
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id 
    mlflow.log_artifact(art)


run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')

2024/10/22 18:19:42 INFO mlflow.tracking._tracking_service.client: 🏃 View run no_model at: http://127.0.0.1:5000/#/experiments/1/runs/dbb22d3e818848aa80d1a912e4033453.
2024/10/22 18:19:42 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.


In [28]:
run_id = 'dbb22d3e818848aa80d1a912e4033453' # Указываем run id
mlflow.register_model(f"runs:/{run_id}/models", REGISTRY_MODEL_NAME)

Registered model 'estate_model_rf' already exists. Creating a new version of this model...
2024/10/22 18:21:33 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: estate_model_rf, version 2
Created version '2' of model 'estate_model_rf'.


<ModelVersion: aliases=[], creation_timestamp=1729610493120, current_stage='None', description='', last_updated_timestamp=1729610493120, name='estate_model_rf', run_id='dbb22d3e818848aa80d1a912e4033453', run_link='', source='mlflow-artifacts:/1/dbb22d3e818848aa80d1a912e4033453/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='2'>

In [29]:
from sklearn.preprocessing import QuantileTransformer, SplineTransformer, PolynomialFeatures, MinMaxScaler

In [30]:
X_train_sklearn = X_train.copy()