<center><h2 style="color:brown">Experiment tracking using MLflow</h2></center>

<h4 style="color:blue">Importing libraries:</h4>

In [52]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [53]:
# Load the diamonds dataset from the project-relative CSV file.
DATA_PATH = 'data/diamonds.csv'
diamond_data = pd.read_csv(DATA_PATH)

In [54]:
# Preview the first rows of the raw dataset.
diamond_data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [55]:
# Dataset dimensions as (rows, columns).
diamond_data.shape

(53940, 10)

In [56]:
# Per-column dtype and non-null count, plus the frame's memory usage.
diamond_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [57]:
# Summary statistics for the numeric columns.
# NOTE(review): the recorded output shows a minimum of 0.0 for x, y and z —
# presumably invalid measurements; confirm whether those rows need cleaning.
diamond_data.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


<h4 style="color:blue"> Identifying the inputs (X) and output (y)</h4>

In [58]:
# The model predicts the price column from the nine feature columns below.
TARGET_COLUMN = 'price'
FEATURE_COLUMNS = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z']

X = diamond_data[FEATURE_COLUMNS]
y = diamond_data[TARGET_COLUMN]

<h4 style="color:blue">Split data into train and test data</h4>

In [59]:
from sklearn.model_selection import train_test_split

# 70% of the rows go to training; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, train_size=0.7, random_state=33)
X_train, X_test, y_train, y_test = split_parts

In [60]:
# Preview the training features after the split.
X_train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
30634,0.34,Ideal,G,VVS2,62.0,56.0,4.46,4.48,2.77
17601,1.33,Ideal,H,SI1,62.7,56.0,6.99,7.04,4.4
10590,0.91,Very Good,E,VS2,62.9,55.0,6.13,6.17,3.87
53663,0.8,Very Good,D,SI2,63.7,57.0,5.87,5.9,3.75
6235,0.9,Good,F,SI1,62.6,58.0,6.1,6.14,3.83


In [61]:
# Report the (features, target) shapes of the train and test partitions.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, target.shape)

(37758, 9) (37758,)
(16182, 9) (16182,)


<h4 style = "color:blue">Separating Categorical and Numerical Columns:</h4>

In [62]:
# Inspect column dtypes to tell the object (categorical) columns from the numeric ones.
X_train.dtypes

carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
x          float64
y          float64
z          float64
dtype: object

In [63]:
# Keep only the object-typed (categorical) columns of the training features.
X_train_cat = X_train.select_dtypes(include='object')
X_train_cat.head()

Unnamed: 0,cut,color,clarity
30634,Ideal,G,VVS2
17601,Ideal,H,SI1
10590,Very Good,E,VS2
53663,Very Good,D,SI2
6235,Good,F,SI1


In [64]:
# Keep only the int64/float64 columns of the training features.
numeric_dtypes = ['int64', 'float64']
X_train_num = X_train.select_dtypes(include=numeric_dtypes)
X_train_num.head()

Unnamed: 0,carat,depth,table,x,y,z
30634,0.34,62.0,56.0,4.46,4.48,2.77
17601,1.33,62.7,56.0,6.99,7.04,4.4
10590,0.91,62.9,55.0,6.13,6.17,3.87
53663,0.8,63.7,57.0,5.87,5.9,3.75
6235,0.9,62.6,58.0,6.1,6.14,3.83


<h4 style = "color:blue">Scaling the Numerical Features:</h4>

In [65]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training numeric columns and standardize them in one step.
scaler = StandardScaler()
train_num_scaled_values = scaler.fit_transform(X_train_num)

# Wrap the scaler's output back into a DataFrame carrying the original labels.
X_train_num_rescaled = pd.DataFrame(
    train_num_scaled_values,
    index=X_train_num.index,
    columns=X_train_num.columns,
)
X_train_num_rescaled.head()

Unnamed: 0,carat,depth,table,x,y,z
30634,-0.966886,0.174186,-0.653685,-1.134489,-1.088617,-1.082475
17601,1.121469,0.662259,-0.653685,1.121742,1.130137,1.210629
10590,0.2355,0.801709,-1.100909,0.354802,0.376108,0.465019
53663,0.003461,1.359507,-0.20646,0.122936,0.142098,0.296201
6235,0.214406,0.592535,0.240764,0.328048,0.350107,0.408746


<h4 style = "color:blue">Applying Label Encoding on Categorical Columns:</h4>

In [66]:
from sklearn import preprocessing

# Turn every categorical training column into integer codes.
# NOTE(review): DataFrame.apply calls fit_transform once per column, so this
# single encoder is re-fit each time and ends up holding only the classes of
# the last column it processed.
label_encode = preprocessing.LabelEncoder()
X_train_cat_le = X_train_cat.apply(lambda column: label_encode.fit_transform(column))

In [67]:
# Preview the integer codes that replaced the category labels.
X_train_cat_le.head()

Unnamed: 0,cut,color,clarity
30634,2,3,7
17601,2,4,2
10590,4,1,5
53663,4,0,3
6235,1,2,2


<h4 style = "color:blue">Concatenating the Encoded Categorical Features and Rescaled Numerical Features:</h4>

In [68]:
# Place the rescaled numeric columns and the encoded categorical columns side by side.
train_feature_parts = [X_train_num_rescaled, X_train_cat_le]
X_train_transformed = pd.concat(train_feature_parts, axis='columns')
X_train_transformed.head()

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
30634,-0.966886,0.174186,-0.653685,-1.134489,-1.088617,-1.082475,2,3,7
17601,1.121469,0.662259,-0.653685,1.121742,1.130137,1.210629,2,4,2
10590,0.2355,0.801709,-1.100909,0.354802,0.376108,0.465019,4,1,5
53663,0.003461,1.359507,-0.20646,0.122936,0.142098,0.296201,4,0,3
6235,0.214406,0.592535,0.240764,0.328048,0.350107,0.408746,1,2,2


<h4 style = "color:blue">Preparing Test Data:</h4>

In [69]:
# Preview the held-out test features before preprocessing.
X_test.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
47741,0.7,Premium,F,SI2,62.0,58.0,5.7,5.64,3.52
46368,0.54,Premium,G,VS1,61.1,60.0,5.28,5.23,3.21
35288,0.3,Very Good,H,VS2,60.9,61.0,4.32,4.35,2.64
44106,0.6,Very Good,H,VS2,60.8,60.0,5.39,5.44,3.29
31088,0.33,Very Good,G,VVS2,60.0,63.0,4.45,4.49,2.68


In [70]:
# Keep only the object-typed (categorical) columns of the test features.
X_test_cat = X_test.select_dtypes(include='object')
X_test_cat.head()

Unnamed: 0,cut,color,clarity
47741,Premium,F,SI2
46368,Premium,G,VS1
35288,Very Good,H,VS2
44106,Very Good,H,VS2
31088,Very Good,G,VVS2


In [71]:
# Keep only the int64/float64 columns of the test features.
test_numeric_dtypes = ['int64', 'float64']
X_test_num = X_test.select_dtypes(include=test_numeric_dtypes)
X_test_num.head()

Unnamed: 0,carat,depth,table,x,y,z
47741,0.7,62.0,58.0,5.7,5.64,3.52
46368,0.54,61.1,60.0,5.28,5.23,3.21
35288,0.3,60.9,61.0,4.32,4.35,2.64
44106,0.6,60.8,60.0,5.39,5.44,3.29
31088,0.33,60.0,63.0,4.45,4.49,2.68


In [72]:
# Standardize the test numeric columns with the already-fitted scaler
# (transform only — the scaler is not re-fit here).
test_num_scaled_values = scaler.transform(X_test_num)
X_test_num_rescaled = pd.DataFrame(
    test_num_scaled_values,
    index=X_test_num.index,
    columns=X_test_num.columns,
)
X_test_num_rescaled.head()

Unnamed: 0,carat,depth,table,x,y,z
47741,-0.207484,0.174186,0.240764,-0.028668,-0.083244,-0.027366
46368,-0.544996,-0.453336,1.135213,-0.403221,-0.438591,-0.463478
35288,-1.051264,-0.592786,1.582438,-1.25934,-1.201288,-1.26536
44106,-0.418429,-0.662511,1.135213,-0.305124,-0.256584,-0.350933
31088,-0.98798,-1.220308,2.476887,-1.143407,-1.07995,-1.209088


In [73]:
from sklearn import preprocessing

# Encode the test categories with the mapping learned from the TRAINING data.
# Calling fit_transform on the test split would derive the integer codes from
# whichever labels happen to occur in the test rows, which can silently
# disagree with the codes the models were trained on.
label_encode = preprocessing.LabelEncoder()

X_test_cat_le = X_test_cat.apply(
    # `column.name` picks the matching training column to fit on; transform
    # raises ValueError if the test split holds a label unseen in training.
    lambda column: label_encode.fit(X_train_cat[column.name]).transform(column)
)

In [74]:
# Preview the integer codes assigned to the test categories.
X_test_cat_le.head()

Unnamed: 0,cut,color,clarity
47741,3,2,3
46368,3,3,4
35288,4,4,5
44106,4,4,5
31088,4,3,7


In [75]:
# Place the rescaled numeric columns and the encoded categorical columns side by side.
test_feature_parts = [X_test_num_rescaled, X_test_cat_le]
X_test_transformed = pd.concat(test_feature_parts, axis='columns')
X_test_transformed.head()

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
47741,-0.207484,0.174186,0.240764,-0.028668,-0.083244,-0.027366,3,2,3
46368,-0.544996,-0.453336,1.135213,-0.403221,-0.438591,-0.463478,3,3,4
35288,-1.051264,-0.592786,1.582438,-1.25934,-1.201288,-1.26536,4,4,5
44106,-0.418429,-0.662511,1.135213,-0.305124,-0.256584,-0.350933,4,4,5
31088,-0.98798,-1.220308,2.476887,-1.143407,-1.07995,-1.209088,4,3,7


<h4 style = "color:blue">Experiment run</h4>

In [76]:
import mlflow

In [78]:
# Point MLflow at a local SQLite tracking store, then activate the experiment
# that all following runs are recorded under.
TRACKING_URI = "sqlite:///mlflow.db"
EXPERIMENT_NAME = "Diamond_Price Prediction"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

2022/09/21 00:38:15 INFO mlflow.tracking.fluent: Experiment with name 'Diamond_Price Prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location='./mlruns/2', creation_time=1663700895876, experiment_id='2', last_update_time=1663700895876, lifecycle_stage='active', name='Diamond_Price Prediction', tags={}>

In [79]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn import metrics

In [80]:
import os
from pickle import dump

# The target directory is not created anywhere else in the visible notebook,
# so create it here to keep a fresh checkout from failing on open().
os.makedirs('pickle_files', exist_ok=True)

# Persist the fitted preprocessing objects so they can be logged as run
# artifacts. `with` guarantees each file handle is flushed and closed.
# NOTE(review): `label_encode` is a single LabelEncoder that was re-fit once
# per column, so the pickled object only retains the classes of the last
# column it encoded.
with open('pickle_files/label_encoderfile.pkl', 'wb') as encoder_file:
    dump(label_encode, encoder_file)
with open('pickle_files/standard_scaler.pkl', 'wb') as scaler_file:
    dump(scaler, scaler_file)

<h4 style = "color:blue">Experiment 1 - Training LinearRegression</h4>

In [84]:
with mlflow.start_run():
    # Identify who ran the experiment and which algorithm it covers.
    mlflow.set_tags({"dev": "Kusuma", "algorithm": "LinearRegression"})
    # Record where the training data came from.
    mlflow.log_param("data-path", "data/diamonds.csv")

    # Train on the transformed training set and score on the held-out set.
    linear_regressor = LinearRegression().fit(X_train_transformed, y_train)
    y_test_pred = linear_regressor.predict(X_test_transformed)
    MAE = metrics.mean_absolute_error(y_true=y_test, y_pred=y_test_pred)
    mlflow.log_metric("Mean Absolute Error", MAE)

    # Store the model together with the preprocessing objects.
    mlflow.sklearn.log_model(linear_regressor, artifact_path="models")
    for artifact_file in ("pickle_files/label_encoderfile.pkl",
                          "pickle_files/standard_scaler.pkl"):
        mlflow.log_artifact(artifact_file)

<h4 style = "color:blue">Experiment 2 - Training KNeighborsRegressor</h4>

In [81]:
with mlflow.start_run():
    # Identify who ran the experiment and which algorithm it covers.
    mlflow.set_tags({"dev": "Kusuma", "algorithm": "KNN"})

    # Neighbour count under test, recorded next to the dataset location.
    k = 5
    mlflow.log_params({"data-path": "data/diamonds.csv", "n_neighbors": k})

    # Train on the transformed training set and score on the held-out set.
    knn_regressor = KNeighborsRegressor(n_neighbors=k).fit(X_train_transformed, y_train)
    y_test_pred = knn_regressor.predict(X_test_transformed)
    MAE = metrics.mean_absolute_error(y_true=y_test, y_pred=y_test_pred)
    mlflow.log_metric("Mean Absolute Error", MAE)

    # Store the model together with the preprocessing objects.
    mlflow.sklearn.log_model(knn_regressor, artifact_path="models")
    for artifact_file in ("pickle_files/label_encoderfile.pkl",
                          "pickle_files/standard_scaler.pkl"):
        mlflow.log_artifact(artifact_file)

<h4 style = "color:blue">Experiment 3 - Training DecisionTreeRegressor</h4>

In [85]:
with mlflow.start_run():
    mlflow.set_tag("dev", "Kusuma")
    mlflow.set_tag("algorithm", "DecisionTreeRegressor")
    # Record where the training data came from.
    mlflow.log_param("data-path", "data/diamonds.csv")

    # Hyperparameters for this run.
    max_depth = 8
    min_samples_leaf = 3
    # "auto" was deprecated in scikit-learn 1.1 and removed in 1.3. For
    # regressors it meant "consider all features", which the fraction 1.0
    # expresses on old and new versions alike.
    max_features = 1.0
    max_leaf_nodes = 20
    # Fixed seed (same value as the train/test split) so the run is
    # repeatable; it is logged so the run can be reproduced from MLflow alone.
    random_state = 33
    mlflow.log_params({
        "max_depth": max_depth,
        "min_samples_leaf": min_samples_leaf,
        "max_features": max_features,
        "max_leaf_nodes": max_leaf_nodes,
        "random_state": random_state,
    })

    # Train on the transformed training set and score on the held-out set.
    dt_regressor = DecisionTreeRegressor(max_depth=max_depth,
                                         min_samples_leaf=min_samples_leaf,
                                         max_features=max_features,
                                         max_leaf_nodes=max_leaf_nodes,
                                         random_state=random_state)
    dt_regressor.fit(X_train_transformed, y_train)
    y_test_pred = dt_regressor.predict(X_test_transformed)
    MAE = metrics.mean_absolute_error(y_test, y_test_pred)
    mlflow.log_metric("Mean Absolute Error", MAE)

    # Store the model together with the preprocessing objects.
    mlflow.sklearn.log_model(dt_regressor, artifact_path="models")
    mlflow.log_artifact("pickle_files/label_encoderfile.pkl")
    mlflow.log_artifact("pickle_files/standard_scaler.pkl")

<h4 style = "color:blue">Experiment 4 - Training Support Vector Regressor</h4>

In [45]:
with mlflow.start_run():
    # Identify who ran the experiment and which algorithm it covers.
    mlflow.set_tags({"dev": "Kusuma", "algorithm": "SVR"})

    # Regularization strength under test, recorded next to the dataset location.
    C = 1.0
    mlflow.log_params({"data-path": "data/diamonds.csv", "C": C})

    # Train on the transformed training set and score on the held-out set.
    sv_regressor = SVR(C=C).fit(X_train_transformed, y_train)
    y_test_pred = sv_regressor.predict(X_test_transformed)
    MAE = metrics.mean_absolute_error(y_true=y_test, y_pred=y_test_pred)
    mlflow.log_metric("Mean Absolute Error", MAE)

    # Store the model together with the preprocessing objects.
    mlflow.sklearn.log_model(sv_regressor, artifact_path="models")
    for artifact_file in ("pickle_files/label_encoderfile.pkl",
                          "pickle_files/standard_scaler.pkl"):
        mlflow.log_artifact(artifact_file)

<h4 style = "color:blue">Experiment 5 - Training RandomForestRegressor </h4>

In [83]:
with mlflow.start_run():
    mlflow.set_tag("dev", "Kusuma")
    mlflow.set_tag("algorithm", "RandomForestRegressor")
    # Record where the training data came from.
    mlflow.log_param("data-path", "data/diamonds.csv")

    # Hyperparameters for this run.
    max_depth = 30
    min_samples_leaf = 3
    # "auto" was deprecated in scikit-learn 1.1 and removed in 1.3. For
    # regressors it meant "consider all features", which the fraction 1.0
    # expresses on old and new versions alike.
    max_features = 1.0
    max_leaf_nodes = 80
    n_estimators = 100
    # Fixed seed (same value as the train/test split) so the forest is
    # repeatable; it is logged so the run can be reproduced from MLflow alone.
    random_state = 33
    # The "n_estimators" key previously carried a trailing space, which made
    # it a different parameter name from the one other runs would use.
    mlflow.log_params({
        "max_depth": max_depth,
        "min_samples_leaf": min_samples_leaf,
        "max_features": max_features,
        "max_leaf_nodes": max_leaf_nodes,
        "n_estimators": n_estimators,
        "random_state": random_state,
    })

    # Train on the transformed training set and score on the held-out set.
    fr_regressor = RandomForestRegressor(max_depth=max_depth,
                                         min_samples_leaf=min_samples_leaf,
                                         max_features=max_features,
                                         max_leaf_nodes=max_leaf_nodes,
                                         n_estimators=n_estimators,
                                         random_state=random_state)
    fr_regressor.fit(X_train_transformed, y_train)
    y_test_pred = fr_regressor.predict(X_test_transformed)
    MAE = metrics.mean_absolute_error(y_test, y_test_pred)
    mlflow.log_metric("Mean Absolute Error", MAE)

    # Store the model together with the preprocessing objects.
    mlflow.sklearn.log_model(fr_regressor, artifact_path="models")
    mlflow.log_artifact("pickle_files/label_encoderfile.pkl")
    mlflow.log_artifact("pickle_files/standard_scaler.pkl")

<h4 style = "color:blue">Experiment 6 - Training GradientBoostingRegressor </h4>

In [86]:
with mlflow.start_run():
    mlflow.set_tag("dev", "Kusuma")
    mlflow.set_tag("algorithm", "GradientBoostingRegressor")
    # Record where the training data came from.
    mlflow.log_param("data-path", "data/diamonds.csv")

    # Hyperparameters for this run.
    learning_rate = 0.1
    n_estimators = 100
    max_depth = 8  # deeper than the estimator's default of 3
    # Fixed seed (same value as the train/test split) so the run is
    # repeatable; it is logged so the run can be reproduced from MLflow alone.
    random_state = 33
    # The "n_estimators" key previously carried a trailing space, which made
    # it a different parameter name from the one other runs would use.
    mlflow.log_params({
        "learning_rate": learning_rate,
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "random_state": random_state,
    })

    # Train on the transformed training set and score on the held-out set.
    gbr_regressor = GradientBoostingRegressor(learning_rate=learning_rate,
                                              n_estimators=n_estimators,
                                              max_depth=max_depth,
                                              random_state=random_state)
    gbr_regressor.fit(X_train_transformed, y_train)
    y_test_pred = gbr_regressor.predict(X_test_transformed)
    MAE = metrics.mean_absolute_error(y_test, y_test_pred)
    mlflow.log_metric("Mean Absolute Error", MAE)

    # Store the model together with the preprocessing objects.
    mlflow.sklearn.log_model(gbr_regressor, artifact_path="models")
    mlflow.log_artifact("pickle_files/label_encoderfile.pkl")
    mlflow.log_artifact("pickle_files/standard_scaler.pkl")