## 0. Libraries import

In [322]:
# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [323]:
# imports 
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", 50)
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
#train test split
from sklearn.model_selection import train_test_split
# Hyperparameters selection
from sklearn.model_selection import RandomizedSearchCV
# Importing cross validation function from sklearn
from sklearn.model_selection import cross_val_score
# models
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
# error
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error

## 1. Data import

### 1.1 Impor train data

In [324]:
df_diamonds_train=pd.read_csv('../data/diamonds_train.csv')
df_diamonds_train.pop("Unnamed: 0") #dropped an unnecessary column
df_diamonds_train

Unnamed: 0,index_id,depth,table,x,y,z,price,carat,cut,color,clarity,city
0,5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91...,62.4,58.0,6.83,6.79,4.25,4268,1.21,Premium,J,VS2,Dubai
1,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d...,63.0,57.0,4.35,4.38,2.75,505,0.32,Very Good,H,VS2,Kimberly
2,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,65.5,55.0,5.62,5.53,3.65,2686,0.71,Fair,G,VS1,Las Vegas
3,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,63.8,56.0,4.68,4.72,3.00,738,0.41,Good,D,SI1,Kimberly
4,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328c...,60.5,59.0,6.55,6.51,3.95,4882,1.02,Ideal,G,SI1,Dubai
...,...,...,...,...,...,...,...,...,...,...,...,...
40450,f0bc79169405ebeb24e308055156b946ffd819db9b4f75...,62.7,57.0,7.10,7.04,4.43,10070,1.34,Ideal,G,VS1,Antwerp
40451,339916a23bf22b052b54cb2a9b36ee8418c1c68b46acad...,57.1,60.0,8.31,8.25,4.73,12615,2.02,Good,F,SI2,Madrid
40452,46957922b99954654c1deb8d854c3f069bf118b2ce9415...,62.7,56.0,6.37,6.42,4.01,5457,1.01,Ideal,H,SI1,Kimberly
40453,9d733392d362d5c6f1d9b9659b601c7d4b5a1c1c8df579...,61.9,54.3,4.45,4.47,2.76,456,0.33,Ideal,J,VS1,Kimberly


### 1.2 Impor test data

In [325]:
df_diamonds_test=pd.read_csv('../data/diamonds_test.csv')
df_diamonds_test

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.20,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.90,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.90,Kimberly
4,4,0.50,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam
...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.57,Ideal,E,SI1,61.9,56.0,5.35,5.32,3.30,Amsterdam
13481,13481,0.71,Ideal,I,VS2,62.2,55.0,5.71,5.73,3.56,New York City
13482,13482,0.70,Ideal,F,VS1,61.6,55.0,5.75,5.71,3.53,Tel Aviv
13483,13483,0.70,Very Good,F,SI2,58.8,57.0,5.85,5.89,3.45,Surat


## 2. Data preparation (training set)

### 2.1 Defining numerical and categorical features

In [326]:
# Excluding city (with trees sometimes is better to keep variables even if they are correlated)
num_features_list=['x','y','z','depth','table','carat']
cat_features_list=['cut','color','clarity']
target='price'
features_list=['x','y','z','depth','table','carat','cut','color','clarity']

### 2.2 Checking if null values

In [327]:
df_diamonds_train.isna().sum()

index_id    0
depth       0
table       0
x           0
y           0
z           0
price       0
carat       0
cut         0
color       0
clarity     0
city        0
dtype: int64

In [268]:
df_diamonds_train=df_diamonds_train.dropna()

### 2.2 Checking if zero values

In [269]:
# In the train set
df_diamonds_train[(df_diamonds_train[num_features_list]==0).any(axis=1)]

Unnamed: 0,index_id,depth,table,x,y,z,price,carat,cut,color,clarity,city
1606,25d114d44f1ee498521f51cd12e524e8fd6e67c82a9d21...,59.2,58.0,6.5,6.47,0.0,3837,1.01,Premium,F,SI2,New York City
3945,5366ec7df49331f43da1f43fedc75ce2b333ccc2b39fd9...,62.7,53.0,8.02,7.95,0.0,18207,2.02,Premium,H,VS2,Madrid
6465,0c73ebfedfb4af1e074a8cc2e9c530a9ae8fbc79eae5b5...,64.1,60.0,0.0,0.0,0.0,2130,0.71,Good,F,SI2,Madrid
13839,c279a17c96f126fc4784ddecd87267c9b8dfc9ed047d4b...,63.8,58.0,8.9,8.85,0.0,18788,2.8,Good,G,SI2,Kimberly
14815,2659730a46782b733d4e18192e92a17ee69b2eaa7538e2...,61.6,56.0,0.0,6.62,0.0,4954,1.07,Ideal,F,SI2,Kimberly
14891,105a4d48d04d0d1f1c95676eacd8442d4d2611eab8a107...,59.4,61.0,8.49,8.45,0.0,12631,2.18,Premium,H,SI2,Surat
16425,3366cbd949ed025b9bb9a706f82eb6d53993c012e64360...,61.2,59.0,8.42,8.37,0.0,17265,2.2,Premium,H,SI1,Surat
19856,c9f96f95732be31d3446ba4400a7fe200f252d639074f0...,59.1,59.0,6.55,6.48,0.0,3142,1.0,Premium,G,SI2,Madrid
21602,fa0cb0b93e1bdf27a23aaf0d8f7a775ec055dd432cc8c6...,59.2,56.0,6.88,6.83,0.0,5564,1.15,Ideal,G,VS2,New York City
24795,b7d286a95afa0f5410efa442abe03f1b0182b270035007...,63.0,59.0,6.5,6.47,0.0,3696,1.1,Premium,G,SI2,Dubai


In [270]:
# In the test set
df_diamonds_test[(df_diamonds_test[num_features_list]==0).any(axis=1)]

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
2901,2901,1.2,Premium,D,VVS1,62.1,59.0,0.0,0.0,0.0,Tel Aviv
5465,5465,1.12,Premium,G,I1,60.4,59.0,6.71,6.67,0.0,London
6685,6685,1.0,Very Good,H,VS2,63.3,53.0,0.0,0.0,0.0,Paris
7488,7488,1.56,Ideal,G,VS2,62.2,54.0,0.0,0.0,0.0,Madrid


### 2.3 Check outliers

In [271]:
# In the test set

In [272]:
def check_outliers(df,feature):
    # IQR
    Q1 = np.percentile(df[feature], 25,
                   interpolation = 'midpoint')
    Q3 = np.percentile(df[feature], 75,
                   interpolation = 'midpoint')
    IQR = Q3 - Q1
    # Upper and lower
    upper_limit=Q3+1.5*IQR
    lower_limit=Q1-1.5*IQR
    # Removing the Outliers
    return df[(df[feature]<lower_limit) | (df[feature]>upper_limit)]

In [273]:
check_outliers(df_diamonds_train,'x')
check_outliers(df_diamonds_test,'x')

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
94,94,5.01,Fair,J,I1,65.5,59.0,10.74,10.54,6.98,Luxembourg
1388,1388,3.0,Good,J,SI2,59.3,64.0,9.32,9.19,5.5,New York City
2901,2901,1.2,Premium,D,VVS1,62.1,59.0,0.0,0.0,0.0,Tel Aviv
6399,6399,3.0,Premium,G,I1,59.7,60.0,9.42,9.26,5.58,Las Vegas
6685,6685,1.0,Very Good,H,VS2,63.3,53.0,0.0,0.0,0.0,Paris
7488,7488,1.56,Ideal,G,VS2,62.2,54.0,0.0,0.0,0.0,Madrid
8355,8355,3.01,Premium,J,SI2,60.7,59.0,9.35,9.22,5.64,Zurich
9838,9838,3.65,Fair,H,I1,67.1,53.0,9.53,9.48,6.38,London
12305,12305,3.67,Premium,I,I1,62.4,56.0,9.86,9.81,6.13,Luxembourg


### 2.3 Remove outliers

In [274]:
def remove_outliers(df,feature):
    # IQR
    Q1 = np.percentile(df[feature], 25,
                   interpolation = 'midpoint')
    Q3 = np.percentile(df[feature], 75,
                   interpolation = 'midpoint')
    IQR = Q3 - Q1
    # Upper and lower
    upper_limit=Q3+1.5*IQR
    lower_limit=Q1-1.5*IQR
    # Removing the Outliers
    return df[(df[feature]>=lower_limit) & (df[feature]<=upper_limit)]

In [275]:
df_diamonds_train=remove_outliers(df_diamonds_train,'x')
df_diamonds_train=remove_outliers(df_diamonds_train,'y')
df_diamonds_train=remove_outliers(df_diamonds_train,'z')
df_diamonds_train=remove_outliers(df_diamonds_train,'depth')
df_diamonds_train=remove_outliers(df_diamonds_train,'table')
df_diamonds_train=remove_outliers(df_diamonds_train,'carat')

In [276]:
df_diamonds_train.shape

(37264, 12)

### 3. Feature engineering (training set)

### 3.1 Target encoding (mean and std)

In [328]:
# 1.Target encoding for categorical variables
# Mean
cut_encoding = df_diamonds_train.groupby(['cut'])['price'].mean().to_dict()
df_diamonds_train['cut_encoding'] = df_diamonds_train['cut'].map(cut_encoding).astype(float)
color_encoding = df_diamonds_train.groupby(['color'])['price'].mean().to_dict()
df_diamonds_train['color_encoding'] = df_diamonds_train['color'].map(color_encoding).astype(float)
clarity_encoding = df_diamonds_train.groupby(['clarity'])['price'].mean().to_dict()
df_diamonds_train['clarity_encoding'] = df_diamonds_train['clarity'].map(clarity_encoding).astype(float)
# Std
cut_encoding_std = df_diamonds_train.groupby(['cut'])['price'].std().to_dict()
df_diamonds_train['cut_encoding_std'] = df_diamonds_train['cut'].map(cut_encoding_std).astype(float)
color_encoding_std = df_diamonds_train.groupby(['color'])['price'].std().to_dict()
df_diamonds_train['color_encoding_std'] = df_diamonds_train['color'].map(color_encoding_std).astype(float)
clarity_encoding_std = df_diamonds_train.groupby(['clarity'])['price'].std().to_dict()
df_diamonds_train['clarity_encoding_std'] = df_diamonds_train['clarity'].map(clarity_encoding_std).astype(float)

In [329]:
df_diamonds_train.head()

Unnamed: 0,index_id,depth,table,x,y,z,price,carat,cut,color,clarity,city,cut_encoding,color_encoding,clarity_encoding,cut_encoding_std,color_encoding_std,clarity_encoding_std
0,5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91...,62.4,58.0,6.83,6.79,4.25,4268,1.21,Premium,J,VS2,Dubai,4617.322612,5346.234112,3913.590182,4380.357286,4437.967123,4029.640798
1,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d...,63.0,57.0,4.35,4.38,2.75,505,0.32,Very Good,H,VS2,Kimberly,3994.44442,4476.469014,3913.590182,3955.185677,4204.035086,4029.640798
2,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,65.5,55.0,5.62,5.53,3.65,2686,0.71,Fair,G,VS1,Las Vegas,4333.27198,4023.214902,3796.813551,3496.467642,4063.947046,4001.986722
3,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,63.8,56.0,4.68,4.72,3.0,738,0.41,Good,D,SI1,Kimberly,3880.611794,3134.943157,3999.856908,3647.03984,3315.698012,3821.246565
4,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328c...,60.5,59.0,6.55,6.51,3.95,4882,1.02,Ideal,G,SI1,Dubai,3436.112577,4023.214902,3999.856908,3790.911135,4063.947046,3821.246565


### 3.2 Including volume

In [330]:
#df_diamonds_train['volume']=df_diamonds_train['x']*df_diamonds_train['y']*df_diamonds_train['z']

### 3.3 Cross Target encoding (mean and std)

In [331]:
# 2. Cross target encoding
# Mean
cut_color_encoding = df_diamonds_train.groupby(['cut','color'])['price'].mean().to_dict()
df_diamonds_train['cut_color_encoding'] = df_diamonds_train.set_index(['cut',
                                                                       'color']).index.map(cut_color_encoding.get).astype(float)
cut_clarity_encoding = df_diamonds_train.groupby(['cut','clarity'])['price'].mean().to_dict()
df_diamonds_train['cut_clarity_encoding'] = df_diamonds_train.set_index(['cut',
                                                                       'clarity']).index.map(cut_clarity_encoding.get).astype(float)
color_clarity_encoding = df_diamonds_train.groupby(['color','clarity'])['price'].mean().to_dict()
df_diamonds_train['color_clarity_encoding'] = df_diamonds_train.set_index(['color',
                                                                          'clarity']).index.map(color_clarity_encoding.get).astype(float)
# Std  
#cut_color_encoding_std = df_diamonds_train.groupby(['cut','color'])['price'].std().to_dict()
#df_diamonds_train['cut_color_encoding_std'] = df_diamonds_train.set_index(['cut',
#                                                                       'color']).index.map(cut_color_encoding_std.get).astype(float)
#cut_clarity_encoding_std = df_diamonds_train.groupby(['cut','clarity'])['price'].std().to_dict()
#df_diamonds_train['cut_clarity_encoding_std'] = df_diamonds_train.set_index(['cut',
#                                                                       'clarity']).index.map(cut_clarity_encoding_std.get).astype(float)
#color_clarity_encoding_std = df_diamonds_train.groupby(['color','clarity'])['price'].std().to_dict()
#df_diamonds_train['color_clarity_encoding_std'] = df_diamonds_train.set_index(['color',
#                                                                           'clarity']).index.map(color_clarity_encoding_std.get).astype(float)


In [332]:
features_list_encoding=['x','y','z','depth','table','carat','cut','color','clarity','cut_encoding','color_encoding'
                        ,'clarity_encoding','cut_encoding_std','color_encoding_std','clarity_encoding_std'
                        ,'cut_color_encoding','cut_clarity_encoding','color_clarity_encoding'
#                       ,'cut_color_encoding_std','cut_clarity_encoding_std','color_clarity_encoding_std'
                       ]

In [333]:
#num_features_encoding=['x','y','z','depth','table','carat','cut_encoding','color_encoding','clarity_encoding',
#                        'cut_encoding_std','color_encoding_std','clarity_encoding_std','volume','cut_color_encoding',
#                        'cut_clarity_encoding','color_clarity_encoding','cut_color_encoding_std',
#                        'cut_clarity_encoding_std','color_clarity_encoding_std']
#target='price'

### 3.4 Feature Scaling

With few exceptions, Machine Learning algrorithms don't perform well when the input numerical attributes have very different scales. We sure want our models to work well, so how can we go about it?

Feature scaling can be done in 2 ways: Min-max scaling and Standardization. I would preferably use Standardization, because it is much less affected by outliers. Scikit-Learn provides a transformer called StandardScaler for this transformation.

In [334]:
#Nodf_diamonds_train_num=df_diamonds_train[num_features_encoding]
#df_diamonds_train_num=df_diamonds_train[num_features_list]
#df_diamonds_train_carat=df_diamonds_train[cat_features_list]
#df_diamonds_train_target=df_diamonds_train['price']

In [335]:
# Perform the feature scaling on the numeric attributes of the dataset
#num_scaler = StandardScaler()
#df_diamonds_train_num_scaled = num_scaler.fit_transform(df_diamonds_train_num)
#df_diamonds_train_num_scaled
#df_diamonds_train_num_scaled=pd.DataFrame(df_diamonds_train_num_scaled)
#NOdf_diamonds_train_num_scaled.rename(columns = {0:'x', 1:'y', 2:'z', 3:'depth', 4:'table', 5:'carat', 6:'cut_encoding',
#                                              7:'color_encoding', 8:'clarity_encoding', 9:'cut_encoding_std',
#                                               10:'color_encoding_std',11:'clarity_encoding_std', 12:'volume',
#                                               13:'cut_color_encoding',14:'cut_clarity_encoding',
#                                               15:'color_clarity_encoding',16:'cut_color_encoding_std',
#                                               17:'cut_clarity_encoding_std',18:'color_clarity_encoding_std'}, 
#                           inplace = True)
#df_diamonds_train_num_scaled.rename(columns = {0:'x', 1:'y', 2:'z', 3:'depth', 4:'table', 5:'carat'}, 
#                           inplace = True)
#df_diamonds_train_num_scaled.head()

In [336]:
#df_diamonds_train=pd.merge(df_diamonds_train_num_scaled, df_diamonds_train_carat, left_index=True, right_index=True)
#df_diamonds_train=pd.merge(df_diamonds_train, df_diamonds_train_target, left_index=True, right_index=True)
#df_diamonds_train.head()

### 3.5 One hot encoding

In [337]:
# 3. Defining features y target
#features_list_volume=['x', 'y', 'z', 'depth', 'table', 'carat', 'cut', 'color', 'clarity','volume']
X=df_diamonds_train[features_list_encoding]
y=df_diamonds_train['price']

In [338]:
# 4.One-hot encoding for categorical variables
#X=pd.get_dummies(X,columns=cat_features_list,drop_first=True)

In [339]:
#X.head()

### 3.6 Change categorical variables to code

In [340]:
for column in cat_features_list:
    X[column]=X[column].astype('category')
    X[column]=X[column].cat.codes

### 3.7 Define train and validation

In [341]:
# 5.Splitting train and test
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [342]:
X_train.shape

(32364, 18)

In [343]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32364 entries, 32121 to 15795
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   x                       32364 non-null  float64
 1   y                       32364 non-null  float64
 2   z                       32364 non-null  float64
 3   depth                   32364 non-null  float64
 4   table                   32364 non-null  float64
 5   carat                   32364 non-null  float64
 6   cut                     32364 non-null  int8   
 7   color                   32364 non-null  int8   
 8   clarity                 32364 non-null  int8   
 9   cut_encoding            32364 non-null  float64
 10  color_encoding          32364 non-null  float64
 11  clarity_encoding        32364 non-null  float64
 12  cut_encoding_std        32364 non-null  float64
 13  color_encoding_std      32364 non-null  float64
 14  clarity_encoding_std    32364 non-

## 4. Data preparation (test set)

## 5. Feature engineering (test set) 

In [344]:
# 0. Adapting categorical features for validation model
X_test=df_diamonds_test[features_list]

### 5.1 Target encoding (mean and std)

In [345]:
# 1.Target encoding for categorical variables
# Mean
cut_encoding = df_diamonds_train.groupby(['cut'])['price'].mean().to_dict()
X_test['cut_encoding'] = X_test['cut'].map(cut_encoding).astype(float)
color_encoding = df_diamonds_train.groupby(['color'])['price'].mean().to_dict()
X_test['color_encoding'] = X_test['color'].map(color_encoding).astype(float)
clarity_encoding = df_diamonds_train.groupby(['clarity'])['price'].mean().to_dict()
X_test['clarity_encoding'] = X_test['clarity'].map(clarity_encoding).astype(float)
# Std
cut_encoding_std = df_diamonds_train.groupby(['cut'])['price'].std().to_dict()
X_test['cut_encoding_std'] = X_test['cut'].map(cut_encoding_std).astype(float)
color_encoding_std = df_diamonds_train.groupby(['color'])['price'].std().to_dict()
X_test['color_encoding_std'] = X_test['color'].map(color_encoding_std).astype(float)
clarity_encoding_std = df_diamonds_train.groupby(['clarity'])['price'].std().to_dict()
X_test['clarity_encoding_std'] = X_test['clarity'].map(clarity_encoding_std).astype(float)

### 5.2 Including volume

In [346]:
#X_test['volume']=X_test['x']*X_test['y']*X_test['z']

### 5.3 Cross Target encoding (mean and std)

In [347]:
# 2. Cross target encoding
# Mean
cut_color_encoding = df_diamonds_train.groupby(['cut','color'])['price'].mean().to_dict()
X_test['cut_color_encoding'] = X_test.set_index(['cut','color']).index.map(cut_color_encoding.get).astype(float)
cut_clarity_encoding = df_diamonds_train.groupby(['cut','clarity'])['price'].mean().to_dict()
X_test['cut_clarity_encoding'] = X_test.set_index(['cut','clarity']).index.map(cut_clarity_encoding.get).astype(float)
color_clarity_encoding = df_diamonds_train.groupby(['color','clarity'])['price'].mean().to_dict()
X_test['color_clarity_encoding'] = X_test.set_index(['color','clarity']).index.map(color_clarity_encoding.get).astype(float)

# Std  
#cut_color_encoding_std = df_diamonds_train.groupby(['cut','color'])['price'].std().to_dict()
#X_test['cut_color_encoding_std'] = X_test.set_index(['cut','color']).index.map(cut_color_encoding_std.get).astype(float)
#cut_clarity_encoding_std = df_diamonds_train.groupby(['cut','clarity'])['price'].std().to_dict()
#X_test['cut_clarity_encoding_std'] = X_test.set_index(['cut','clarity']).index.map(cut_clarity_encoding_std.get).astype(float)
#color_clarity_encoding_std = df_diamonds_train.groupby(['color','clarity'])['price'].std().to_dict()
#X_test['color_clarity_encoding_std'] = X_test.set_index(['color','clarity']).index.map(color_clarity_encoding_std.get).astype(float)

### 5.4 Feature Scaling

In [348]:
#X_test_num=X_test[num_features_list]
#X_test_carat=X_test[cat_features_list]

In [349]:
# Perform the feature scaling on the numeric attributes of the dataset
#num_scaler = StandardScaler()
#X_test_num_scaled = num_scaler.fit_transform(X_test_num)
#X_test_num_scaled=pd.DataFrame(X_test_num_scaled)
#X_test_num_scaled.rename(columns = {0:'x', 1:'y', 2:'z', 3:'depth', 4:'table', 5:'carat'}, 
#                           inplace = True)
#X_test_num_scaled.head()

In [350]:
#X_test=pd.merge(X_test_num_scaled, X_test_carat, left_index=True, right_index=True)
#X_test.head()

### 5.5 One hot encoding

In [351]:
# 3. One-hot encoding for categorical variables
#X_test=pd.get_dummies(X_test,columns=cat_features_list,drop_first=True)

### 3.6 Change categorical variables to code

In [352]:
for column in features_list_encoding:
    X_test[column]=X_test[column].astype('category')
    X_test[column]=X_test[column].cat.codes

In [353]:
X_test.shape

(13485, 18)

In [354]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13485 entries, 0 to 13484
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   x                       13485 non-null  int16
 1   y                       13485 non-null  int16
 2   z                       13485 non-null  int16
 3   depth                   13485 non-null  int16
 4   table                   13485 non-null  int8 
 5   carat                   13485 non-null  int16
 6   cut                     13485 non-null  int8 
 7   color                   13485 non-null  int8 
 8   clarity                 13485 non-null  int8 
 9   cut_encoding            13485 non-null  int8 
 10  color_encoding          13485 non-null  int8 
 11  clarity_encoding        13485 non-null  int8 
 12  cut_encoding_std        13485 non-null  int8 
 13  color_encoding_std      13485 non-null  int8 
 14  clarity_encoding_std    13485 non-null  int8 
 15  cut_color_encoding 

## Model definition - RandomForestRegressor - with Random Hyperparameter Grid

RandomForestRegressor: multiple trees in paralel changing samples and convining diferrent features (overfitting when the tree is big and good to reduce error variance)

Main Parameters:
   - bootstrap -> method for sampling data points (TRUE bagging and FALSE pasting, with/without replacement)
   - n_estimators -> number of trees in the foreset
   - max_depth -> max number of levels in each decision tree
   - max_features -> max number of features considered for splitting a node
   - ccp_alpha ->
   - criterion ->
   - max_leaf_nodes -> max number of solution nodes 
   - max_samples ->
   - min_impurity_decrease ->
   - min_samples_leaf -> min number of data points allowed in a leaf node
   - min_samples_split -> min number of data points placed in a node before the node is split
   - min_weight_fraction_leaf
   - n_estimators -> number of trees in the foreset
   - n_jobs 
   - oob_score 
   - random_state
   - verbose
   - warm_start   

In [355]:
# 0. Random Hyperparameter Grid - Grid definition

#n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)] # Number of trees in random forest
#max_features = ['auto', 'sqrt'] # Number of features to consider at every split
#max_depth = [int(x) for x in np.linspace(10, 110, num = 11)] # Maximum number of levels in tree
#max_depth.append(None)
#min_samples_split = [2, 5, 10] # Minimum number of samples required to split a node
#min_samples_leaf = [1, 2, 4] # Minimum number of samples required at each leaf node
#bootstrap = [True, False] # Method of selecting samples for training each tree

# Create the random grid
#random_grid = {'n_estimators': n_estimators,
#               'max_features': max_features,
#               'max_depth': max_depth,
#               'min_samples_split': min_samples_split,
#               'min_samples_leaf': min_samples_leaf,
#               'bootstrap': bootstrap}
#print(random_grid)

In [356]:
#%%time
#1. Random Hyperparameter Grid - Use the random grid to search for best hyperparameters

# First create the base model to tune
#rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
#search across 100 different combinations, and use all available cores
#rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100,
#                               cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
#rf_random.fit([(X_train,y_train),(X_val,y_val)])

In [357]:
#rf_random.best_params_

In [358]:
#best_params={'n_estimators': 1600,
# 'min_samples_split': 5,
# 'min_samples_leaf': 1,
# 'max_features': 'auto',
# 'max_depth': 90,
# 'bootstrap': True}

#(n_estimators=100,min_samples_split=5,min_samples_leaf=1,
#                              max_features='auto',max_depth=10,bootstrap=True)

#n_estimators=100,min_samples_split=5,min_samples_leaf=1,
#                              max_features='auto',max_depth=40,bootstrap=True

In [359]:
# 2. RandomForestRegressor definition
model = RandomForestRegressor(n_estimators=500,min_samples_split=5,min_samples_leaf=1,
                              max_features='auto',max_depth=10,bootstrap=True, random_state=42)

In [360]:
#Importance
#feats = {} # a dict to hold feature_name: feature_importance
#for feature, importance in zip(X_train.columns, model.feature_importances_):
#    feats[feature] = importance #add the name/value pair 

In [361]:
#Importance=pd.DataFrame(list(feats.items()),columns = ['feature','importance']).sort_values('importance',ascending=False)

In [362]:
# 1. XGBRegressor 
model = XGBRegressor(n_estimators=200,colsample_bylevel=1,colsample_bynode=1,
                     colsample_bytree=0.8,reg_alpha=1, reg_lambda=1,gamma=0,learning_rate=0.1, random_state=42)
hyperparameters = model.get_params()
print(type(model), '\n')
print('Model hyperparameters:', hyperparameters, '\n')
#n_estimators=200,colsample_bylevel=1,colsample_bynode=1,
#                     colsample_bytree=0.8,reg_alpha=1, reg_lambda=1,gamma=0,learning_rate=0.1, random_state=42

<class 'xgboost.sklearn.XGBRegressor'> 

Model hyperparameters: {'objective': 'reg:squarederror', 'base_score': None, 'booster': None, 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 0.8, 'enable_categorical': False, 'gamma': 0, 'gpu_id': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.1, 'max_delta_step': None, 'max_depth': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'n_estimators': 200, 'n_jobs': None, 'num_parallel_tree': None, 'predictor': None, 'random_state': 42, 'reg_alpha': 1, 'reg_lambda': 1, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None} 



In [363]:
#model = KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
#                    metric_params=None, n_jobs=None, n_neighbors=4, p=2,
#                    weights='uniform')

## Model training with validation

In [364]:
model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=200, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=42,
             reg_alpha=1, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [365]:
%%time
# 1. XGBRegressor early_stopping_rounds=40
# Model training
model.fit(X_train, y_train,eval_set=[(X_train,y_train),(X_val,y_val)],early_stopping_rounds=40)

[0]	validation_0-rmse:5048.96484	validation_1-rmse:5089.81201
[1]	validation_0-rmse:4563.50439	validation_1-rmse:4601.07861
[2]	validation_0-rmse:4124.07471	validation_1-rmse:4158.65674
[3]	validation_0-rmse:3728.56104	validation_1-rmse:3762.89600
[4]	validation_0-rmse:3373.49121	validation_1-rmse:3407.07275
[5]	validation_0-rmse:3054.22095	validation_1-rmse:3086.47656
[6]	validation_0-rmse:2767.23633	validation_1-rmse:2798.41357
[7]	validation_0-rmse:2510.88477	validation_1-rmse:2541.00879
[8]	validation_0-rmse:2280.06763	validation_1-rmse:2309.65991
[9]	validation_0-rmse:2072.98193	validation_1-rmse:2101.57056
[10]	validation_0-rmse:1887.13037	validation_1-rmse:1914.84143
[11]	validation_0-rmse:1721.07996	validation_1-rmse:1747.58313
[12]	validation_0-rmse:1572.71997	validation_1-rmse:1598.60388
[13]	validation_0-rmse:1441.27441	validation_1-rmse:1466.87939
[14]	validation_0-rmse:1323.80286	validation_1-rmse:1349.67029
[15]	validation_0-rmse:1219.64624	validation_1-rmse:1246.28503
[1

[134]	validation_0-rmse:413.46680	validation_1-rmse:526.42810
[135]	validation_0-rmse:412.67105	validation_1-rmse:526.53565
[136]	validation_0-rmse:411.95956	validation_1-rmse:526.50629
[137]	validation_0-rmse:411.34442	validation_1-rmse:526.54333
[138]	validation_0-rmse:411.16040	validation_1-rmse:526.56018
[139]	validation_0-rmse:409.90637	validation_1-rmse:526.71960
[140]	validation_0-rmse:409.01340	validation_1-rmse:526.38293
[141]	validation_0-rmse:408.37881	validation_1-rmse:526.60498
[142]	validation_0-rmse:407.31415	validation_1-rmse:526.66174
[143]	validation_0-rmse:406.46494	validation_1-rmse:526.70880
[144]	validation_0-rmse:405.80295	validation_1-rmse:526.57391
[145]	validation_0-rmse:405.64966	validation_1-rmse:526.57642
[146]	validation_0-rmse:405.15369	validation_1-rmse:526.55621
[147]	validation_0-rmse:404.54208	validation_1-rmse:526.69080
[148]	validation_0-rmse:403.62512	validation_1-rmse:526.50775
[149]	validation_0-rmse:403.46097	validation_1-rmse:526.48114
[150]	va

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=200, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=42,
             reg_alpha=1, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [367]:
%%time
# Model predictions
y_pred_val = model.predict(X_val)

CPU times: user 93.3 ms, sys: 5.31 ms, total: 98.7 ms
Wall time: 20.1 ms


In [368]:
%%time
# Model predictions
y_pred_train = model.predict(X_train)

CPU times: user 219 ms, sys: 4.35 ms, total: 223 ms
Wall time: 43.1 ms


## Training set error

In [369]:
# Model predictions
rmse_train = mean_squared_error(y_train, y_pred_train)**0.5
rmse_train

379.59381436660425

In [370]:
mae_train=mean_absolute_error(y_train, y_pred_train)
mae_train

216.9793579710707

In [371]:
r2r = r2_score(y_val, y_pred_val)
r2r

0.9830573064173702

## Model validation

In [372]:
#432
rmse_val = mean_squared_error(y_val, y_pred_val)**0.5
rmse_val

525.2697440888562

In [373]:
mae_val=mean_absolute_error(y_val, y_pred_val)
mae_val

267.6166781570572

In [374]:
r2r = r2_score(y_val, y_pred_val)
r2r

0.9830573064173702

## Model training without validation

In [375]:
%%time
# Model training
model.fit(X, y,eval_set=[(X_train,y_train),(X_val,y_val)],early_stopping_rounds=40)

[0]	validation_0-rmse:5048.47363	validation_1-rmse:5086.58057
[1]	validation_0-rmse:4562.79932	validation_1-rmse:4597.12988
[2]	validation_0-rmse:4122.91357	validation_1-rmse:4154.11865
[3]	validation_0-rmse:3727.56299	validation_1-rmse:3758.37769
[4]	validation_0-rmse:3371.26978	validation_1-rmse:3399.22583
[5]	validation_0-rmse:3051.79028	validation_1-rmse:3077.73999
[6]	validation_0-rmse:2764.68066	validation_1-rmse:2788.13086
[7]	validation_0-rmse:2509.36353	validation_1-rmse:2531.80249
[8]	validation_0-rmse:2278.72485	validation_1-rmse:2299.95532
[9]	validation_0-rmse:2071.74976	validation_1-rmse:2090.28174
[10]	validation_0-rmse:1886.24206	validation_1-rmse:1902.30029
[11]	validation_0-rmse:1720.46521	validation_1-rmse:1734.51758
[12]	validation_0-rmse:1573.17481	validation_1-rmse:1585.65332
[13]	validation_0-rmse:1441.78357	validation_1-rmse:1452.22083
[14]	validation_0-rmse:1324.17737	validation_1-rmse:1333.74951
[15]	validation_0-rmse:1220.12463	validation_1-rmse:1228.19470
[1

[134]	validation_0-rmse:423.01740	validation_1-rmse:418.93131
[135]	validation_0-rmse:422.37857	validation_1-rmse:417.88095
[136]	validation_0-rmse:421.99902	validation_1-rmse:417.52188
[137]	validation_0-rmse:421.03671	validation_1-rmse:416.35525
[138]	validation_0-rmse:420.60040	validation_1-rmse:416.01343
[139]	validation_0-rmse:419.79877	validation_1-rmse:415.40445
[140]	validation_0-rmse:419.72760	validation_1-rmse:415.34641
[141]	validation_0-rmse:419.14121	validation_1-rmse:414.96301
[142]	validation_0-rmse:418.20050	validation_1-rmse:414.19037
[143]	validation_0-rmse:417.55972	validation_1-rmse:413.14166
[144]	validation_0-rmse:417.33548	validation_1-rmse:412.89334
[145]	validation_0-rmse:417.27286	validation_1-rmse:412.83530
[146]	validation_0-rmse:416.74231	validation_1-rmse:412.19540
[147]	validation_0-rmse:415.94064	validation_1-rmse:411.61093
[148]	validation_0-rmse:415.71976	validation_1-rmse:411.36939
[149]	validation_0-rmse:415.15228	validation_1-rmse:410.82690
[150]	va

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=200, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=42,
             reg_alpha=1, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

## Test Preditions

In [376]:
predictions = model.predict(X_test)

In [377]:
predictions=pd.DataFrame(predictions)

In [378]:
predictions.reset_index(inplace=True)

In [379]:
predictions=predictions.rename({0: 'price','index': 'id'}, axis=1)

## Save Preditions

In [380]:
predictions.to_csv('../data/diamonds_predictions_XGBRegressor_13.csv',index=False)