In [8]:
import numpy as np 
import pandas as pd 

import mlflow 
from mlflow.models.signature import infer_signature

import matplotlib.pyplot as plt 
from yellowbrick.regressor import residuals_plot, prediction_error

from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn import metrics 
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.dummy import DummyRegressor 
from sklearn.linear_model import LinearRegression,LassoCV, RidgeCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn import set_config 
set_config(transform_output='pandas')

In [2]:
def get_metrics(y_true,y_pred):
    """Compute regression quality metrics for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    dict
        Scores keyed by ``'R2'``, ``'MAE'``, ``'MAPE'`` and ``'RMSE'``.
    """
    r2 = metrics.r2_score(y_true, y_pred)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    mape = metrics.mean_absolute_percentage_error(y_true, y_pred)
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(metrics.mean_squared_error(y_true, y_pred))
    return {'R2': r2, 'MAE': mae, 'MAPE': mape, 'RMSE': rmse}

In [3]:
# Read the insurance dataset from the data folder next to the notebooks directory.
df = pd.read_csv(filepath_or_buffer='../data/insurance.csv')

# Preview the first rows.
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# Column roles: numeric inputs, categorical inputs, and the regression target.
num_features = ['age', 'bmi', 'children']
cat_features = ['smoker']
target = 'charges'

# Full list of model inputs: numeric columns first, then categorical ones.
features = [*num_features, *cat_features]

In [5]:
# Separate the model inputs from the target column.
x, y = df[features], df[target]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2023,
)

In [9]:
# Numeric columns: fill missing values with the column mean, then rescale with MinMaxScaler.
numerical_transformer = Pipeline([
    ('inputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
])

# Categorical columns: dense one-hot encoding; a two-level feature keeps a single column.
categorical_transformer = OneHotEncoder(sparse_output=False, drop='if_binary')

In [10]:
# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num_features),
        ('cat', categorical_transformer, cat_features),
    ]
)

# Display the assembled transformer.
preprocessor

In [11]:
# Fit the preprocessor on the training split and preview the transformed features.
preprocessor.fit_transform(x_train)

Unnamed: 0,num__age,num__bmi,num__children,cat__smoker_yes
1068,0.978261,0.153349,0.2,0.0
1018,0.782609,0.534167,0.6,0.0
995,0.456522,0.196798,0.6,0.0
1248,0.000000,0.641916,0.0,0.0
549,0.543478,0.813559,0.0,1.0
...,...,...,...,...
884,0.152174,0.288808,0.8,0.0
515,0.869565,0.531073,0.0,0.0
695,0.173913,0.651735,0.0,0.0
454,0.304348,0.822437,0.4,0.0


In [12]:
# Apply the already-fitted preprocessor to the test split (no refitting) and preview the result.
preprocessor.transform(x_test)

Unnamed: 0,num__age,num__bmi,num__children,cat__smoker_yes
748,0.630435,0.539144,0.2,0.0
745,0.695652,0.380818,0.2,0.0
57,0.000000,0.422922,0.4,1.0
546,0.217391,0.523944,0.0,0.0
279,0.717391,0.150659,0.2,0.0
...,...,...,...,...
681,0.021739,0.116761,0.0,0.0
340,0.130435,0.313156,0.0,0.0
1199,0.282609,0.264730,0.4,0.0
61,0.152174,0.476190,0.8,0.0


In [14]:
# Candidate regressors, keyed by the name used for the MLflow registered model.
# The tree ensembles are stochastic, so they are seeded with the same value as the
# train/test split; without a seed their logged metrics change on every re-run.
models = {
    'Dummy':DummyRegressor(),
    'LinearRegression':LinearRegression(),
    'LassoCV':LassoCV(),
    'RidgeCV':RidgeCV(),
    'RandomForestRegressor':RandomForestRegressor(random_state=2023),
    'GradientBoostingRegressor':GradientBoostingRegressor(random_state=2023)
}

In [15]:
# Select the MLflow experiment that the runs started below are logged under.
mlflow.set_experiment('Insurance')

2024/01/31 18:55:40 INFO mlflow.tracking.fluent: Experiment with name 'Insurance' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///d:/Desktop/Scripts/MLFlow_Class/notebooks/mlruns/315046187671784937', creation_time=1706738140290, experiment_id='315046187671784937', last_update_time=1706738140290, lifecycle_stage='active', name='Insurance', tags={}>

In [44]:
# Train every candidate model behind the shared preprocessor, score it on the
# held-out split, and record metrics, parameters and the fitted pipeline in MLflow.
for model_name, model in models.items():
    model_pipeline = Pipeline(steps=[
        ('preprocessor',preprocessor),
        ('model',model)
    ])

    model_pipeline.fit(x_train,y_train)
    y_pred = model_pipeline.predict(x_test)
    model_metrics = get_metrics(y_test,y_pred)
    model_params = model_pipeline.named_steps['model'].get_params()

    # Name the run after the model so runs can be told apart in the MLflow UI.
    with mlflow.start_run(run_name=model_name):
        mlflow.log_metrics(model_metrics)
        mlflow.log_params(model_params)

        # Diagnostic plots are currently disabled.
        # NOTE(review): the yellowbrick quick methods take
        # (estimator, X_train, y_train, X_test, y_test); the calls below were
        # corrected accordingly -- confirm before re-enabling.
        #fig,ax = plt.subplots(figsize=(12,8))
        #prediction_error(model_pipeline,x_train,y_train,x_test,y_test, show=False,ax=ax)
        #mlflow.log_figure(fig,artifact_file='plots/prediction_error.png')
        #fig,ax = plt.subplots(figsize=(12,8))
        #residuals_plot(model_pipeline,x_train,y_train,x_test,y_test, show=False,ax=ax)
        #mlflow.log_figure(fig,artifact_file='plots/residuals_plot.png')

        # Reuse the predictions computed above instead of predicting a second time.
        signature = infer_signature(x_test,y_pred)
        mlflow.sklearn.log_model(model_pipeline,'model_pipeline',registered_model_name=model_name,signature=signature)


  inputs = _infer_schema(model_input)
Registered model 'Dummy' already exists. Creating a new version of this model...
2024/01/31 19:22:47 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Dummy, version 2
Created version '2' of model 'Dummy'.
  inputs = _infer_schema(model_input)
Registered model 'LinearRegression' already exists. Creating a new version of this model...
2024/01/31 19:22:53 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: LinearRegression, version 2
Created version '2' of model 'LinearRegression'.
  inputs = _infer_schema(model_input)
Registered model 'LassoCV' already exists. Creating a new version of this model...
2024/01/31 19:22:59 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: LassoCV, versio

In [48]:
# Pipeline to tune: the shared preprocessor followed by a seeded gradient-boosting model.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', GradientBoostingRegressor(random_state=2023)),
])

# Search space; the `model__` prefix addresses the pipeline's 'model' step.
params = {'model__learning_rate': [0.1, 0.01]}

# 4-fold cross-validated grid search scored by R2, using all available cores.
grid_model = GridSearchCV(
    model_pipeline,
    param_grid=params,
    scoring='r2',
    cv=4,
    n_jobs=-1,
    verbose=1,
)

grid_model.fit(x_train, y_train)

Fitting 4 folds for each of 2 candidates, totalling 8 fits


In [50]:
# Cross-validation results, one row per candidate, ordered by rank (best first).
df_cv = (
    pd.DataFrame(grid_model.cv_results_)
    .set_index('rank_test_score')
    .sort_index()
)

# Hide the per-split and timing columns to keep the summary readable.
df_cv.loc[:, ~df_cv.columns.str.contains('split|time')].head(10)

Unnamed: 0_level_0,param_model__learning_rate,params,mean_test_score,std_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.1,{'model__learning_rate': 0.1},0.847861,0.011963
2,0.01,{'model__learning_rate': 0.01},0.738393,0.020669


In [52]:
# Evaluate the best pipeline found by the grid search and log it to MLflow.
tuned_model_pipeline = grid_model.best_estimator_
tuned_model = tuned_model_pipeline.named_steps['model']

y_pred = tuned_model_pipeline.predict(x_test)
model_metrics = get_metrics(y_test,y_pred)
model_params = tuned_model.get_params()

# Derive the registry name from the tuned estimator itself rather than relying on
# `model_name`, which was only defined as a leftover loop variable of an earlier
# cell. This resolves to 'GradientBoostingRegressor', so the tuned pipeline is
# still registered as a new version of that model.
tuned_model_name = type(tuned_model).__name__

with mlflow.start_run(run_name=f'{tuned_model_name}_tuned'):
    mlflow.log_metrics(model_metrics)
    mlflow.log_params(model_params)

    # Diagnostic plots are currently disabled.
    # NOTE(review): the yellowbrick quick methods take
    # (estimator, X_train, y_train, X_test, y_test); the calls below were
    # corrected accordingly and now target the tuned pipeline -- confirm
    # before re-enabling.
    #fig,ax = plt.subplots(figsize=(12,8))
    #prediction_error(tuned_model_pipeline,x_train,y_train,x_test,y_test, show=False,ax=ax)
    #mlflow.log_figure(fig,artifact_file='plots/prediction_error.png')
    #fig,ax = plt.subplots(figsize=(12,8))
    #residuals_plot(tuned_model_pipeline,x_train,y_train,x_test,y_test, show=False,ax=ax)
    #mlflow.log_figure(fig,artifact_file='plots/residuals_plot.png')

    # Reuse the predictions computed above instead of predicting a second time.
    signature = infer_signature(x_test,y_pred)
    mlflow.sklearn.log_model(tuned_model_pipeline,'tuned_model_pipeline',registered_model_name=tuned_model_name,signature=signature)

  inputs = _infer_schema(model_input)
Registered model 'GradientBoostingRegressor' already exists. Creating a new version of this model...
2024/01/31 20:16:07 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: GradientBoostingRegressor, version 3
Created version '3' of model 'GradientBoostingRegressor'.
