Dataset link: https://www.kaggle.com/datasets/shivam2503/diamonds

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [5]:
#index_col=[0]
#While you read csv file, if you set index_col=[0] you're explicitly stating to treat the first column as the index.

df = pd.read_csv('diamonds.csv',index_col=[0])

df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [6]:
df.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [7]:
df.shape

(53940, 10)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53940 entries, 1 to 53940
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.5+ MB


In [10]:
df.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

### Machine Learning Problem
Build a system which can take features of diamond like carat, cut, color, clarity, x, y, z, etc.. and predicts the price of diamond.

In [11]:
df.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [12]:
df = df[['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z', 'price']]

df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
1,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43,326
2,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31,326
3,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31,327
4,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63,334
5,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75,335


#### a. Identify the Target Variable and Splitting the Data into Train and Test

In [13]:
import sklearn

print(sklearn.__version__)

1.0.2


In [10]:
!pip install -U scikit-learn





In [31]:
!pip upgrade scikit-learn

ERROR: unknown command "upgrade"



In [15]:
# Identifying the inputs (X) and output (y)

y = df['price']

X = df[['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z']]

In [16]:
# split into train and test

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=100)

In [17]:
X_train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
29478,0.32,Ideal,F,VS1,61.3,56.0,4.4,4.44,2.71
52543,0.72,Very Good,I,VVS2,61.7,55.0,5.76,5.81,3.57
8726,0.38,Very Good,D,SI1,62.0,55.0,4.67,4.72,2.91
15578,1.0,Very Good,H,VVS2,62.6,56.0,6.36,6.39,3.99
19652,1.7,Premium,H,SI2,59.8,61.0,7.67,7.62,4.57


In [18]:
print(X_train.shape, y_train.shape)

print(X_test.shape, y_test.shape)

(37758, 9) (37758,)
(16182, 9) (16182,)


#### b. Separating Categorical and Numerical Columns:

In [19]:
X_train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
29478,0.32,Ideal,F,VS1,61.3,56.0,4.4,4.44,2.71
52543,0.72,Very Good,I,VVS2,61.7,55.0,5.76,5.81,3.57
8726,0.38,Very Good,D,SI1,62.0,55.0,4.67,4.72,2.91
15578,1.0,Very Good,H,VVS2,62.6,56.0,6.36,6.39,3.99
19652,1.7,Premium,H,SI2,59.8,61.0,7.67,7.62,4.57


In [20]:

X_train.dtypes

carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
x          float64
y          float64
z          float64
dtype: object

In [21]:
X_train_cat = X_train.select_dtypes(include=['object'])

X_train_cat.head()

Unnamed: 0,cut,color,clarity
29478,Ideal,F,VS1
52543,Very Good,I,VVS2
8726,Very Good,D,SI1
15578,Very Good,H,VVS2
19652,Premium,H,SI2


In [22]:
X_train_num = X_train.select_dtypes(include=['int64', 'float64'])

X_train_num.head()

Unnamed: 0,carat,depth,table,x,y,z
29478,0.32,61.3,56.0,4.4,4.44,2.71
52543,0.72,61.7,55.0,5.76,5.81,3.57
8726,0.38,62.0,55.0,4.67,4.72,2.91
15578,1.0,62.6,56.0,6.36,6.39,3.99
19652,1.7,59.8,61.0,7.67,7.62,4.57


#### c. Scaling the Numerical Features

In [23]:
X_train_num.head()

Unnamed: 0,carat,depth,table,x,y,z
29478,0.32,61.3,56.0,4.4,4.44,2.71
52543,0.72,61.7,55.0,5.76,5.81,3.57
8726,0.38,62.0,55.0,4.67,4.72,2.91
15578,1.0,62.6,56.0,6.36,6.39,3.99
19652,1.7,59.8,61.0,7.67,7.62,4.57


In [24]:
# scaling the numerical features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# column names are (annoyingly) lost after Scaling
# (i.e. the dataframe is converted to a numpy ndarray)

X_train_num_rescaled = pd.DataFrame(scaler.fit_transform(X_train_num), 
                                    columns = X_train_num.columns, 
                                    index = X_train_num.index)

X_train_num_rescaled.head()

Unnamed: 0,carat,depth,table,x,y,z
29478,-1.007784,-0.316595,-0.647912,-1.187006,-1.122457,-1.191717
52543,-0.166198,-0.038325,-1.095448,0.023744,0.062997,0.043156
8726,-0.881547,0.170377,-1.095448,-0.946636,-0.880174,-0.904537
15578,0.422912,0.587782,-0.647912,0.557898,0.564868,0.646234
19652,1.895688,-1.360107,1.589767,1.724135,1.62918,1.479055


In [25]:

X_train_cat['color'].value_counts(normalize=True)

G    0.210419
E    0.181736
F    0.178479
H    0.153583
D    0.122888
I    0.100244
J    0.052651
Name: color, dtype: float64

In [26]:
X_train_cat['clarity'].value_counts(normalize=True)

SI1     0.242094
VS2     0.226283
SI2     0.169765
VS1     0.152153
VVS2    0.095212
VVS1    0.068515
IF      0.032602
I1      0.013375
Name: clarity, dtype: float64

#### e. Applying Label Encoding on Categorical Columns

In [27]:
X_train_cat_le = pd.DataFrame(index=X_train_cat.index)

X_train_cat_le.head()

29478
52543
8726
15578
19652


In [28]:

X_train_cat.cut.unique()

array(['Ideal', 'Very Good', 'Premium', 'Good', 'Fair'], dtype=object)

In [29]:
cut_encoder = {'Fair' : 1, 'Good' : 2, 'Very Good' : 3, 'Ideal' : 4, 'Premium' : 5}

X_train_cat_le['cut'] = X_train_cat['cut'].apply(lambda x : cut_encoder[x])

X_train_cat_le.head()

Unnamed: 0,cut
29478,4
52543,3
8726,3
15578,3
19652,5


In [30]:
X_train_cat.color.unique()

array(['F', 'I', 'D', 'H', 'G', 'E', 'J'], dtype=object)

In [31]:
color_encoder = {'J':1, 'I':2, 'H':3, 'G':4, 'F':5, 'E':6, 'D':7}

X_train_cat_le['color'] = X_train_cat['color'].apply(lambda x : color_encoder[x])

X_train_cat_le.head()

Unnamed: 0,cut,color
29478,4,5
52543,3,2
8726,3,7
15578,3,3
19652,5,3


In [32]:
X_train_cat.clarity.unique()

array(['VS1', 'VVS2', 'SI1', 'SI2', 'VVS1', 'VS2', 'I1', 'IF'],
      dtype=object)

In [33]:
clarity_encoder = {'I1':1, 'SI2':2, 'SI1':3, 'VS2':4, 'VS1':5, 'VVS2':6, 'VVS1':7, 'IF':8}

X_train_cat_le['clarity'] = X_train_cat['clarity'].apply(lambda x : clarity_encoder[x])

X_train_cat_le.head()

Unnamed: 0,cut,color,clarity
29478,4,5,5
52543,3,2,6
8726,3,7,3
15578,3,3,6
19652,5,3,2


#### f. Concatinating the Encoded Categorical Features and Rescaled Numerical Features:

In [34]:
X_train_transformed = pd.concat([X_train_num_rescaled, X_train_cat_le], axis=1)

X_train_transformed.head()

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
29478,-1.007784,-0.316595,-0.647912,-1.187006,-1.122457,-1.191717,4,5,5
52543,-0.166198,-0.038325,-1.095448,0.023744,0.062997,0.043156,3,2,6
8726,-0.881547,0.170377,-1.095448,-0.946636,-0.880174,-0.904537,3,7,3
15578,0.422912,0.587782,-0.647912,0.557898,0.564868,0.646234,3,3,6
19652,1.895688,-1.360107,1.589767,1.724135,1.62918,1.479055,5,3,2


#### g. Preparing Test Data

In [35]:
X_test.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
52265,0.57,Ideal,E,VS2,61.5,57.0,5.35,5.32,3.28
21074,1.16,Ideal,G,VS1,61.5,55.0,6.75,6.81,4.17
42162,0.51,Ideal,G,SI1,63.2,58.0,5.05,5.08,3.2
35975,0.42,Ideal,F,VS1,60.6,56.0,4.83,4.87,2.94
7642,0.8,Premium,G,IF,62.6,58.0,5.89,5.93,3.7


In [36]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16182 entries, 52265 to 1320
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    16182 non-null  float64
 1   cut      16182 non-null  object 
 2   color    16182 non-null  object 
 3   clarity  16182 non-null  object 
 4   depth    16182 non-null  float64
 5   table    16182 non-null  float64
 6   x        16182 non-null  float64
 7   y        16182 non-null  float64
 8   z        16182 non-null  float64
dtypes: float64(6), object(3)
memory usage: 1.2+ MB


In [37]:
X_test_cat = X_test.select_dtypes(include=['object'])

X_test_cat.head()

Unnamed: 0,cut,color,clarity
52265,Ideal,E,VS2
21074,Ideal,G,VS1
42162,Ideal,G,SI1
35975,Ideal,F,VS1
7642,Premium,G,IF


In [38]:
X_test_num = X_test.select_dtypes(include=['int64', 'float64'])

X_test_num.head()

Unnamed: 0,carat,depth,table,x,y,z
52265,0.57,61.5,57.0,5.35,5.32,3.28
21074,1.16,61.5,55.0,6.75,6.81,4.17
42162,0.51,63.2,58.0,5.05,5.08,3.2
35975,0.42,60.6,56.0,4.83,4.87,2.94
7642,0.8,62.6,58.0,5.89,5.93,3.7


In [39]:
X_test_num_rescaled = pd.DataFrame(scaler.transform(X_test_num), 
                                   columns = X_test_num.columns, 
                                   index = X_test_num.index)

X_test_num_rescaled.head()

Unnamed: 0,carat,depth,table,x,y,z
52265,-0.481793,-0.17746,-0.200377,-0.341261,-0.360998,-0.373255
21074,0.759547,-0.17746,-1.095448,0.905098,0.928292,0.904696
42162,-0.608031,1.005187,0.247159,-0.608339,-0.568668,-0.488127
35975,-0.797388,-0.803567,-0.647912,-0.804195,-0.75038,-0.86146
7642,0.002119,0.587782,0.247159,0.139477,0.166832,0.229823


In [40]:
X_test_cat_le = pd.DataFrame(index = X_test_cat.index)

X_test_cat_le.head()

52265
21074
42162
35975
7642


In [41]:
X_test_cat_le['cut'] = X_test_cat['cut'].apply(lambda x : cut_encoder[x])

X_test_cat_le['color'] = X_test_cat['color'].apply(lambda x : color_encoder[x])

X_test_cat_le['clarity'] = X_test_cat['clarity'].apply(lambda x : clarity_encoder[x])

X_test_cat_le.head()

Unnamed: 0,cut,color,clarity
52265,4,6,4
21074,4,4,5
42162,4,4,3
35975,4,5,5
7642,5,4,8


In [42]:
X_test_transformed = pd.concat([X_test_num_rescaled, X_test_cat_le], axis=1)

X_test_transformed.head()

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
52265,-0.481793,-0.17746,-0.200377,-0.341261,-0.360998,-0.373255,4,6,4
21074,0.759547,-0.17746,-1.095448,0.905098,0.928292,0.904696,4,4,5
42162,-0.608031,1.005187,0.247159,-0.608339,-0.568668,-0.488127,4,4,3
35975,-0.797388,-0.803567,-0.647912,-0.804195,-0.75038,-0.86146,4,5,5
7642,0.002119,0.587782,0.247159,0.139477,0.166832,0.229823,5,4,8


#### Linear Regression

In [43]:
from sklearn.linear_model import LinearRegression
li_regressor = LinearRegression()
li_regressor.fit(X_train_transformed, y_train)

LinearRegression()

In [44]:
y_test_pred = li_regressor.predict(X_test_transformed)

In [45]:
temp_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred})

temp_df.head()

Unnamed: 0,Actual,Predicted
52265,2491,2363.83474
21074,9248,7469.644228
42162,1284,643.298938
35975,921,1516.80821
7642,4268,5721.128606


In [46]:
from sklearn import metrics

print('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, y_test_pred))

print('Mean Squared Error: ', metrics.mean_squared_error(y_test, y_test_pred))

print('Root Mean Squared Error: ', np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

Mean Absolute Error:  795.4560135879746
Mean Squared Error:  1471939.156075997
Root Mean Squared Error:  1213.234996229501


#### KNN Regression

In [47]:
from sklearn.neighbors import KNeighborsRegressor
knn_regressor = KNeighborsRegressor()
knn_regressor.fit(X_train_transformed, y_train)

KNeighborsRegressor()

In [48]:
y_test_pred = knn_regressor.predict(X_test_transformed)

In [49]:
temp_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred})

temp_df.head()

Unnamed: 0,Actual,Predicted
52265,2491,1779.6
21074,9248,9000.6
42162,1284,1136.6
35975,921,960.4
7642,4268,5510.6


In [50]:
print('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, y_test_pred))

print('Mean Squared Error: ', metrics.mean_squared_error(y_test, y_test_pred))

print('Root Mean Squared Error: ', np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

Mean Absolute Error:  384.03119515511065
Mean Squared Error:  567200.9668570017
Root Mean Squared Error:  753.127457245453


#### Decision Tree Regression

In [51]:
from sklearn.tree import DecisionTreeRegressor
dt_regressor = DecisionTreeRegressor()
dt_regressor.fit(X_train_transformed, y_train)

DecisionTreeRegressor()

In [52]:
y_test_pred = dt_regressor.predict(X_test_transformed)

In [53]:
temp_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred})

temp_df.head()

Unnamed: 0,Actual,Predicted
52265,2491,1625.0
21074,9248,8020.0
42162,1284,1197.0
35975,921,1031.0
7642,4268,6338.0


In [54]:
print('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, y_test_pred))

print('Mean Squared Error: ', metrics.mean_squared_error(y_test, y_test_pred))

print('Root Mean Squared Error: ', np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

Mean Absolute Error:  364.1750607671075
Mean Squared Error:  552929.35770369
Root Mean Squared Error:  743.5921985226108


#### Random Forest Regression

In [55]:
from sklearn.ensemble import RandomForestRegressor
rf_regressor = RandomForestRegressor()
rf_regressor.fit(X_train_transformed, y_train)

RandomForestRegressor()

In [56]:
y_test_pred = rf_regressor.predict(X_test_transformed)

In [57]:
temp_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred})

temp_df.head()

Unnamed: 0,Actual,Predicted
52265,2491,1839.17
21074,9248,8660.35
42162,1284,1252.95
35975,921,1032.82
7642,4268,4472.18


In [58]:
print('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, y_test_pred))

print('Mean Squared Error: ', metrics.mean_squared_error(y_test, y_test_pred))

print('Root Mean Squared Error: ', np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

Mean Absolute Error:  269.5033074447799
Mean Squared Error:  290711.4418123234
Root Mean Squared Error:  539.1766332217331


#### Saving the Model (Serialization)

In [59]:
from pickle import dump

dump(scaler, open('project/models/standard_scaler.pkl', 'wb'))
#dump(encoder, open('models/onehot_encoder.pkl', 'wb'))
dump(li_regressor, open('project/models/linearregression.pkl', 'wb'))
dump(knn_regressor, open('project/models/knnregression.pkl', 'wb'))
dump(dt_regressor, open('project/models/decisiontreeregression.pkl', 'wb'))
dump(rf_regressor, open('project/models/randomforestregression.pkl', 'wb'))


#### Deserialization

In [57]:
from pickle import load

In [58]:
rf_regressor = load(open('project/models/randomforestregression.pkl', 'rb'))
scaler = load(open('project/models/standard_scaler.pkl', 'rb'))

In [59]:
clarity_encoder = {'I1':1, 'SI2':2, 'SI1':3, 'VS2':4, 'VS1':5, 'VVS2':6, 'VVS1':7, 'IF':8}
color_encoder = {'J':1, 'I':2, 'H':3, 'G':4, 'F':5, 'E':6, 'D':7}
cut_encoder = {'Fair' : 1, 'Good' : 2, 'Very Good' : 3, 'Ideal' : 4, 'Premium' : 5}

In [60]:
clarity_encoder.keys()

dict_keys(['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF'])

In [61]:
num_scal=scaler.transform([[1.16 ,61.5 ,55.0 ,6.75 ,6.81 ,4.17]])
num_scal.flatten()

  "X does not have valid feature names, but"


array([ 0.7595468 , -0.1774599 , -1.09544817,  0.90509846,  0.92829164,
        0.90469557])

In [62]:
cat_encod=np.array([clarity_encoder["I1"],color_encoder["J"],cut_encoder["Good"]])
cat_encod

array([1, 1, 2])

In [63]:
np.concatenate((cat_encod, num_scal.flatten()), axis=None).reshape(1,-1)

array([[ 1.        ,  1.        ,  2.        ,  0.7595468 , -0.1774599 ,
        -1.09544817,  0.90509846,  0.92829164,  0.90469557]])

In [64]:
rf_regressor.predict(np.concatenate((cat_encod, num_scal.flatten()), axis=None).reshape(1,-1)).item()

  "X does not have valid feature names, but"


2460.31

### Running the Experiment

In [41]:
!pip install mlflow

Collecting mlflow
  Using cached mlflow-1.28.0-py3-none-any.whl (17.0 MB)
Collecting querystring-parser<2
  Using cached querystring_parser-1.2.4-py2.py3-none-any.whl (7.9 kB)
Collecting docker<6,>=4.0.0
  Using cached docker-5.0.3-py2.py3-none-any.whl (146 kB)
Collecting alembic<2
  Downloading alembic-1.8.1-py3-none-any.whl (209 kB)
     -------------------------------------- 209.8/209.8 kB 2.1 MB/s eta 0:00:00
Collecting prometheus-flask-exporter<1
  Using cached prometheus_flask_exporter-0.20.3-py3-none-any.whl (18 kB)
Collecting gitpython<4,>=2.1.0
  Using cached GitPython-3.1.27-py3-none-any.whl (181 kB)
Collecting databricks-cli<1,>=0.8.7
  Using cached databricks-cli-0.17.3.tar.gz (77 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'


  error: subprocess-exited-with-error
  
  python setup.py egg_info did not run successfully.
  exit code: 1
  
  [14 lines of output]
  Traceback (most recent call last):
    File "<string>", line 36, in <module>
    File "<pip-setuptools-caller>", line 14, in <module>
    File "C:\Users\admin\anaconda3\lib\site-packages\setuptools\__init__.py", line 18, in <module>
      from setuptools.dist import Distribution
    File "C:\Users\admin\anaconda3\lib\site-packages\setuptools\dist.py", line 34, in <module>
      from ._importlib import metadata
    File "C:\Users\admin\anaconda3\lib\site-packages\setuptools\_importlib.py", line 39, in <module>
      disable_importlib_metadata_finder(metadata)
    File "C:\Users\admin\anaconda3\lib\site-packages\setuptools\_importlib.py", line 30, in disable_importlib_metadata_finder
      for ob in sys.meta_path
    File "C:\Users\admin\anaconda3\lib\site-packages\setuptools\_importlib.py", line 31, in <listcomp>
      if isinstance(ob, importlib_metad

In [5]:
pip install -U setuptools

Collecting setuptools
  Using cached setuptools-65.3.0-py3-none-any.whl (1.2 MB)
Installing collected packages: setuptools
  Attempting uninstall: setuptools
    Found existing installation: setuptools 63.4.1
    Uninstalling setuptools-63.4.1:
      Successfully uninstalled setuptools-63.4.1
Successfully installed setuptools-65.3.0
Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
anaconda-project 0.11.1 requires ruamel-yaml, which is not installed.
tensorflow 2.10.0 requires protobuf<3.20,>=3.9.2, but you have protobuf 3.20.1 which is incompatible.
tensorboard 2.10.0 requires protobuf<3.20,>=3.9.2, but you have protobuf 3.20.1 which is incompatible.


### Running the Experiment

In [2]:
import mlflow

In [60]:
mlflow.set_tracking_uri('sqlite:///mlflow1.db')
mlflow.set_experiment("Diamond price prediction logs")

2022/09/18 11:42:11 INFO mlflow.tracking.fluent: Experiment with name 'Diamond price prediction logs' does not exist. Creating a new experiment.


<Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='Diamond price prediction logs', tags={}>

### Experiment 1 - Training KNN Regressor

In [63]:
from sklearn import metrics

In [68]:
with mlflow.start_run():
    mlflow.set_tag("developer","Mohan")
    mlflow.set_tag("Algorithm","KNN")
    # log the data for each run using log_param, log_metric, log_model
    mlflow.log_param("data-path","Web_application_on_diamonds_dataset/diamonds.csv")
    k=30
    mlflow.log_param("n_neighbors",k)
    knn_regressor = KNeighborsRegressor(n_neighbors=k)
    knn_regressor.fit(X_train_transformed, y_train)
    y_test_pred = knn_regressor.predict(X_test_transformed)
    MAE=metrics.mean_absolute_error(y_test, y_test_pred)
    #acc=metrics.accuracy_score(y_test,y_test_pred)
    mlflow.log_metric("Mean Absolute Error",MAE)
    mlflow.sklearn.log_model(knn_regressor,artifact_path="models")
    mlflow.log_artifact("project/models/standard_scaler.pkl")

### Experiment 2 - Training Decision Tree Regression

In [79]:
from pprint import pprint
pprint(dt_regressor.get_params())

{'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': 30,
 'max_features': 'auto',
 'max_leaf_nodes': 50,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}


In [77]:
with mlflow.start_run():
    mlflow.set_tag("developer","Mohan")
    mlflow.set_tag("Algorithm","DT")
    # log the data for each run using log_param, log_metric, log_model
    mlflow.log_param("data-path","Web_application_on_diamonds_dataset/diamonds.csv")
    d=5
    f='auto'
    l=50
    mlflow.log_param("max_depth",k)
    mlflow.log_param("max_features",f)
    mlflow.log_param("max_leaf_nodes",l)
    dt_regressor =DecisionTreeRegressor(max_depth=k,max_features=f,max_leaf_nodes=l)#,min_samples_leaf=2,min_weight_fraction_leaf=0.1,splitter='random')
    dt_regressor.fit(X_train_transformed, y_train)
    y_test_pred = dt_regressor.predict(X_test_transformed)
    MAE=metrics.mean_absolute_error(y_test, y_test_pred)
    mlflow.log_metric("Mean Absolute Error",MAE)
    mlflow.sklearn.log_model(dt_regressor,artifact_path="models")
    mlflow.log_artifact("project/models/standard_scaler.pkl")

### Experiment 3 - Training Random Forest Regression

In [81]:
with mlflow.start_run():
    mlflow.set_tag("developer","Mohan")
    mlflow.set_tag("Algorithm","RT")
    # log the data for each run using log_param, log_metric, log_model
    mlflow.log_param("data-path","Web_application_on_diamonds_dataset/diamonds.csv")
    d=5
    f='auto'
    l=50
    n=100
    mlflow.log_param("max_depth",k)
    mlflow.log_param("max_features",f)
    mlflow.log_param("max_leaf_nodes",l)
    mlflow.log_param("n_estimators",n)
    rf_regressor =RandomForestRegressor(max_depth=k,max_features=f,max_leaf_nodes=l,n_estimators=n)
    rf_regressor.fit(X_train_transformed, y_train)
    y_test_pred = dt_regressor.predict(X_test_transformed)
    MAE=metrics.mean_absolute_error(y_test, y_test_pred)
    mlflow.log_metric("Mean Absolute Error",MAE)
    mlflow.sklearn.log_model(dt_regressor,artifact_path="models")
    mlflow.log_artifact("project/models/standard_scaler.pkl")

### Experiment 4- Training KNN Regressor with Hyperparameter Tuning

In [83]:
from sklearn.model_selection import GridSearchCV

In [86]:
# Enabling automatic MLflow logging for scikit-learn runs
mlflow.sklearn.autolog(max_tuning_runs=None)

with mlflow.start_run():
    tuned_parameters = [{'n_neighbors':[i for i in range(1, 51)], 'p':[1, 2]}]

    reg = GridSearchCV(
        estimator=KNeighborsRegressor(), 
        param_grid=tuned_parameters, 
        scoring='neg_mean_absolute_error',
        cv=5,
        return_train_score=True,
        verbose=1
    )
    reg.fit(X_train_transformed, y_train)
    
    # Disabling autologging
    mlflow.sklearn.autolog(disable=True)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


