## 0. Libraries import

In [127]:
# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [128]:
# imports 
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", 50)
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
#train test split
from sklearn.model_selection import train_test_split
# Hyperparameters selection
from sklearn.model_selection import RandomizedSearchCV
# Importing cross validation function from sklearn
from sklearn.model_selection import cross_val_score
# models
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
# error
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error

## 1. Data import

### 1.1 Impor train data

In [129]:
df_diamonds_train=pd.read_csv('../data/diamonds_train.csv')
df_diamonds_train.pop("Unnamed: 0") #dropped an unnecessary column
df_diamonds_train

Unnamed: 0,index_id,depth,table,x,y,z,price,carat,cut,color,clarity,city
0,5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91...,62.4,58.0,6.83,6.79,4.25,4268,1.21,Premium,J,VS2,Dubai
1,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d...,63.0,57.0,4.35,4.38,2.75,505,0.32,Very Good,H,VS2,Kimberly
2,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,65.5,55.0,5.62,5.53,3.65,2686,0.71,Fair,G,VS1,Las Vegas
3,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,63.8,56.0,4.68,4.72,3.00,738,0.41,Good,D,SI1,Kimberly
4,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328c...,60.5,59.0,6.55,6.51,3.95,4882,1.02,Ideal,G,SI1,Dubai
...,...,...,...,...,...,...,...,...,...,...,...,...
40450,f0bc79169405ebeb24e308055156b946ffd819db9b4f75...,62.7,57.0,7.10,7.04,4.43,10070,1.34,Ideal,G,VS1,Antwerp
40451,339916a23bf22b052b54cb2a9b36ee8418c1c68b46acad...,57.1,60.0,8.31,8.25,4.73,12615,2.02,Good,F,SI2,Madrid
40452,46957922b99954654c1deb8d854c3f069bf118b2ce9415...,62.7,56.0,6.37,6.42,4.01,5457,1.01,Ideal,H,SI1,Kimberly
40453,9d733392d362d5c6f1d9b9659b601c7d4b5a1c1c8df579...,61.9,54.3,4.45,4.47,2.76,456,0.33,Ideal,J,VS1,Kimberly


### 1.2 Impor test data

In [130]:
df_diamonds_test=pd.read_csv('../data/diamonds_test.csv')
df_diamonds_test

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.20,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.90,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.90,Kimberly
4,4,0.50,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam
...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.57,Ideal,E,SI1,61.9,56.0,5.35,5.32,3.30,Amsterdam
13481,13481,0.71,Ideal,I,VS2,62.2,55.0,5.71,5.73,3.56,New York City
13482,13482,0.70,Ideal,F,VS1,61.6,55.0,5.75,5.71,3.53,Tel Aviv
13483,13483,0.70,Very Good,F,SI2,58.8,57.0,5.85,5.89,3.45,Surat


## 2. Data preparation (training set)

### 2.1 Defining numerical and categorical features

In [131]:
# Excluding city (with trees sometimes is better to keep variables even if they are correlated)
num_features_list=['x','y','z','depth','table','carat']
cat_features_list=['cut','color','clarity']
target='price'
features_list=['x','y','z','depth','table','carat','cut','color','clarity']

### 2.2 Checking if zero/null values

In [132]:
df_diamonds_train[num_features_list].isna().sum()

x        0
y        0
z        0
depth    0
table    0
carat    0
dtype: int64

In [133]:
if 0 in df_diamonds_train[num_features_list]:
    print('0 values')
else:
    print('No 0 values')

No 0 values


### 2.3 Remove outliers

In [134]:
def remove_outliers(df,feature):
    # IQR
    Q1 = np.percentile(df[feature], 25,
                   interpolation = 'midpoint')
    Q3 = np.percentile(df[feature], 75,
                   interpolation = 'midpoint')
    IQR = Q3 - Q1
    # Upper and lower
    upper_limit=Q3+1.5*IQR
    lower_limit=Q1-1.5*IQR
    # Removing the Outliers
    return df[(df[feature]>=lower_limit) & (df[feature]<=upper_limit)]

In [135]:
df_diamonds_train=remove_outliers(df_diamonds_train,'x')
df_diamonds_train=remove_outliers(df_diamonds_train,'y')
df_diamonds_train=remove_outliers(df_diamonds_train,'z')
df_diamonds_train=remove_outliers(df_diamonds_train,'depth')
df_diamonds_train=remove_outliers(df_diamonds_train,'table')
df_diamonds_train=remove_outliers(df_diamonds_train,'carat')

In [136]:
df_diamonds_train.shape

(37264, 12)

### 3. Feature engineering (training set)

### 3.1 Target encoding (mean and std)

In [137]:
# 1.Target encoding for categorical variables
# Mean
cut_encoding = df_diamonds_train.groupby(['cut'])['price'].mean().to_dict()
df_diamonds_train['cut_encoding'] = df_diamonds_train['cut'].map(cut_encoding).astype(float)
color_encoding = df_diamonds_train.groupby(['color'])['price'].mean().to_dict()
df_diamonds_train['color_encoding'] = df_diamonds_train['color'].map(color_encoding).astype(float)
clarity_encoding = df_diamonds_train.groupby(['clarity'])['price'].mean().to_dict()
df_diamonds_train['clarity_encoding'] = df_diamonds_train['clarity'].map(clarity_encoding).astype(float)
# Std
cut_encoding_std = df_diamonds_train.groupby(['cut'])['price'].std().to_dict()
df_diamonds_train['cut_encoding_std'] = df_diamonds_train['cut'].map(cut_encoding_std).astype(float)
color_encoding_std = df_diamonds_train.groupby(['color'])['price'].std().to_dict()
df_diamonds_train['color_encoding_std'] = df_diamonds_train['color'].map(color_encoding_std).astype(float)
clarity_encoding_std = df_diamonds_train.groupby(['clarity'])['price'].std().to_dict()
df_diamonds_train['clarity_encoding_std'] = df_diamonds_train['clarity'].map(clarity_encoding_std).astype(float)

In [138]:
df_diamonds_train.head()

Unnamed: 0,index_id,depth,table,x,y,z,price,carat,cut,color,clarity,city,cut_encoding,color_encoding,clarity_encoding,cut_encoding_std,color_encoding_std,clarity_encoding_std
0,5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91...,62.4,58.0,6.83,6.79,4.25,4268,1.21,Premium,J,VS2,Dubai,4090.451413,4205.372106,3706.916046,3786.059445,3203.931679,3773.320777
1,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d...,63.0,57.0,4.35,4.38,2.75,505,0.32,Very Good,H,VS2,Kimberly,3716.148388,3989.238859,3706.916046,3552.522893,3617.921017,3773.320777
3,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,63.8,56.0,4.68,4.72,3.0,738,0.41,Good,D,SI1,Kimberly,3629.673247,3032.494785,3667.76725,3352.922564,3210.14758,3345.026353
4,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328c...,60.5,59.0,6.55,6.51,3.95,4882,1.02,Ideal,G,SI1,Dubai,3209.761477,3813.444988,3667.76725,3433.146709,3821.613156,3345.026353
5,ef2d127de37b942baad06145e54b0c619a1f22327b2ebb...,61.2,57.0,7.45,7.39,4.54,9057,1.52,Ideal,F,SI2,Tel Aviv,3209.761477,3553.853659,4133.827826,3433.146709,3603.654261,3073.028701


### 3.2 Including volume

In [139]:
#df_diamonds_train['volume']=df_diamonds_train['x']*df_diamonds_train['y']*df_diamonds_train['z']

### 3.3 Cross Target encoding (mean and std)

In [140]:
# 2. Cross target encoding
# Mean
cut_color_encoding = df_diamonds_train.groupby(['cut','color'])['price'].mean().to_dict()
df_diamonds_train['cut_color_encoding'] = df_diamonds_train.set_index(['cut',
                                                                       'color']).index.map(cut_color_encoding.get).astype(float)
cut_clarity_encoding = df_diamonds_train.groupby(['cut','clarity'])['price'].mean().to_dict()
df_diamonds_train['cut_clarity_encoding'] = df_diamonds_train.set_index(['cut',
                                                                       'clarity']).index.map(cut_clarity_encoding.get).astype(float)
color_clarity_encoding = df_diamonds_train.groupby(['color','clarity'])['price'].mean().to_dict()
df_diamonds_train['color_clarity_encoding'] = df_diamonds_train.set_index(['color',
                                                                          'clarity']).index.map(color_clarity_encoding.get).astype(float)
# Std  
#cut_color_encoding_std = df_diamonds_train.groupby(['cut','color'])['price'].std().to_dict()
#df_diamonds_train['cut_color_encoding_std'] = df_diamonds_train.set_index(['cut',
#                                                                       'color']).index.map(cut_color_encoding_std.get).astype(float)
#cut_clarity_encoding_std = df_diamonds_train.groupby(['cut','clarity'])['price'].std().to_dict()
#df_diamonds_train['cut_clarity_encoding_std'] = df_diamonds_train.set_index(['cut',
#                                                                       'clarity']).index.map(cut_clarity_encoding_std.get).astype(float)
#color_clarity_encoding_std = df_diamonds_train.groupby(['color','clarity'])['price'].std().to_dict()
#df_diamonds_train['color_clarity_encoding_std'] = df_diamonds_train.set_index(['color',
#                                                                           'clarity']).index.map(color_clarity_encoding_std.get).astype(float)


In [141]:
features_list_encoding=['x','y','z','depth','table','carat','cut','color','clarity','cut_encoding','color_encoding'
                        ,'clarity_encoding','cut_encoding_std','color_encoding_std','clarity_encoding_std'
                        ,'cut_color_encoding','cut_clarity_encoding','color_clarity_encoding'
#                       ,'cut_color_encoding_std','cut_clarity_encoding_std','color_clarity_encoding_std'
                       ]

In [142]:
#num_features_encoding=['x','y','z','depth','table','carat','cut_encoding','color_encoding','clarity_encoding',
#                        'cut_encoding_std','color_encoding_std','clarity_encoding_std','volume','cut_color_encoding',
#                        'cut_clarity_encoding','color_clarity_encoding','cut_color_encoding_std',
#                        'cut_clarity_encoding_std','color_clarity_encoding_std']
#target='price'

### 3.4 Feature Scaling

With few exceptions, Machine Learning algrorithms don't perform well when the input numerical attributes have very different scales. We sure want our models to work well, so how can we go about it?

Feature scaling can be done in 2 ways: Min-max scaling and Standardization. I would preferably use Standardization, because it is much less affected by outliers. Scikit-Learn provides a transformer called StandardScaler for this transformation.

In [143]:
#Nodf_diamonds_train_num=df_diamonds_train[num_features_encoding]
#df_diamonds_train_num=df_diamonds_train[num_features_list]
#df_diamonds_train_carat=df_diamonds_train[cat_features_list]
#df_diamonds_train_target=df_diamonds_train['price']

In [144]:
# Perform the feature scaling on the numeric attributes of the dataset
#num_scaler = StandardScaler()
#df_diamonds_train_num_scaled = num_scaler.fit_transform(df_diamonds_train_num)
#df_diamonds_train_num_scaled
#df_diamonds_train_num_scaled=pd.DataFrame(df_diamonds_train_num_scaled)
#NOdf_diamonds_train_num_scaled.rename(columns = {0:'x', 1:'y', 2:'z', 3:'depth', 4:'table', 5:'carat', 6:'cut_encoding',
#                                              7:'color_encoding', 8:'clarity_encoding', 9:'cut_encoding_std',
#                                               10:'color_encoding_std',11:'clarity_encoding_std', 12:'volume',
#                                               13:'cut_color_encoding',14:'cut_clarity_encoding',
#                                               15:'color_clarity_encoding',16:'cut_color_encoding_std',
#                                               17:'cut_clarity_encoding_std',18:'color_clarity_encoding_std'}, 
#                           inplace = True)
#df_diamonds_train_num_scaled.rename(columns = {0:'x', 1:'y', 2:'z', 3:'depth', 4:'table', 5:'carat'}, 
#                           inplace = True)
#df_diamonds_train_num_scaled.head()

In [145]:
#df_diamonds_train=pd.merge(df_diamonds_train_num_scaled, df_diamonds_train_carat, left_index=True, right_index=True)
#df_diamonds_train=pd.merge(df_diamonds_train, df_diamonds_train_target, left_index=True, right_index=True)
#df_diamonds_train.head()

### 3.5 One hot encoding

In [146]:
# 3. Defining features y target
#features_list_volume=['x', 'y', 'z', 'depth', 'table', 'carat', 'cut', 'color', 'clarity','volume']
X=df_diamonds_train[features_list_encoding]
y=df_diamonds_train['price']

In [147]:
# 4.One-hot encoding for categorical variables
X=pd.get_dummies(X,columns=cat_features_list,drop_first=True)

In [148]:
X.head()

Unnamed: 0,x,y,z,depth,table,carat,cut_encoding,color_encoding,clarity_encoding,cut_encoding_std,color_encoding_std,clarity_encoding_std,cut_color_encoding,cut_clarity_encoding,color_clarity_encoding,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_E,color_F,color_G,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,6.83,6.79,4.25,62.4,58.0,1.21,4090.451413,4205.372106,3706.916046,3786.059445,3203.931679,3773.320777,4723.032323,4279.325321,4072.978678,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0
1,4.35,4.38,2.75,63.0,57.0,0.32,3716.148388,3989.238859,3706.916046,3552.522893,3617.921017,3773.320777,4131.209839,4064.35642,4536.957035,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0
3,4.68,4.72,3.0,63.8,56.0,0.41,3629.673247,3032.494785,3667.76725,3352.922564,3210.14758,3345.026353,3092.344221,3398.12844,2890.454856,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,6.55,6.51,3.95,60.5,59.0,1.02,3209.761477,3813.444988,3667.76725,3433.146709,3821.613156,3345.026353,3622.447529,3442.506284,3608.093343,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0
5,7.45,7.39,4.54,61.2,57.0,1.52,3209.761477,3553.853659,4133.827826,3433.146709,3603.654261,3073.028701,3214.139157,3850.286193,3891.265835,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0


### 3.6 Define train and validation

In [149]:
# 5.Splitting train and test
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [150]:
X_train.shape

(29811, 32)

In [151]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29811 entries, 11656 to 17132
Data columns (total 32 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   x                       29811 non-null  float64
 1   y                       29811 non-null  float64
 2   z                       29811 non-null  float64
 3   depth                   29811 non-null  float64
 4   table                   29811 non-null  float64
 5   carat                   29811 non-null  float64
 6   cut_encoding            29811 non-null  float64
 7   color_encoding          29811 non-null  float64
 8   clarity_encoding        29811 non-null  float64
 9   cut_encoding_std        29811 non-null  float64
 10  color_encoding_std      29811 non-null  float64
 11  clarity_encoding_std    29811 non-null  float64
 12  cut_color_encoding      29811 non-null  float64
 13  cut_clarity_encoding    29811 non-null  float64
 14  color_clarity_encoding  29811 non-

## 4. Data preparation (test set)

## 5. Feature engineering (test set) 

In [152]:
# 0. Adapting categorical features for validation model
X_test=df_diamonds_test[features_list]

### 5.1 Target encoding (mean and std)

In [153]:
# 1.Target encoding for categorical variables
# Mean
cut_encoding = df_diamonds_train.groupby(['cut'])['price'].mean().to_dict()
X_test['cut_encoding'] = X_test['cut'].map(cut_encoding).astype(float)
color_encoding = df_diamonds_train.groupby(['color'])['price'].mean().to_dict()
X_test['color_encoding'] = X_test['color'].map(color_encoding).astype(float)
clarity_encoding = df_diamonds_train.groupby(['clarity'])['price'].mean().to_dict()
X_test['clarity_encoding'] = X_test['clarity'].map(clarity_encoding).astype(float)
# Std
cut_encoding_std = df_diamonds_train.groupby(['cut'])['price'].std().to_dict()
X_test['cut_encoding_std'] = X_test['cut'].map(cut_encoding_std).astype(float)
color_encoding_std = df_diamonds_train.groupby(['color'])['price'].std().to_dict()
X_test['color_encoding_std'] = X_test['color'].map(color_encoding_std).astype(float)
clarity_encoding_std = df_diamonds_train.groupby(['clarity'])['price'].std().to_dict()
X_test['clarity_encoding_std'] = X_test['clarity'].map(clarity_encoding_std).astype(float)

### 5.2 Including volume

In [154]:
#X_test['volume']=X_test['x']*X_test['y']*X_test['z']

### 5.3 Cross Target encoding (mean and std)

In [155]:
# 2. Cross target encoding
# Mean
cut_color_encoding = df_diamonds_train.groupby(['cut','color'])['price'].mean().to_dict()
X_test['cut_color_encoding'] = X_test.set_index(['cut','color']).index.map(cut_color_encoding.get).astype(float)
cut_clarity_encoding = df_diamonds_train.groupby(['cut','clarity'])['price'].mean().to_dict()
X_test['cut_clarity_encoding'] = X_test.set_index(['cut','clarity']).index.map(cut_clarity_encoding.get).astype(float)
color_clarity_encoding = df_diamonds_train.groupby(['color','clarity'])['price'].mean().to_dict()
X_test['color_clarity_encoding'] = X_test.set_index(['color','clarity']).index.map(color_clarity_encoding.get).astype(float)

# Std  
#cut_color_encoding_std = df_diamonds_train.groupby(['cut','color'])['price'].std().to_dict()
#X_test['cut_color_encoding_std'] = X_test.set_index(['cut','color']).index.map(cut_color_encoding_std.get).astype(float)
#cut_clarity_encoding_std = df_diamonds_train.groupby(['cut','clarity'])['price'].std().to_dict()
#X_test['cut_clarity_encoding_std'] = X_test.set_index(['cut','clarity']).index.map(cut_clarity_encoding_std.get).astype(float)
#color_clarity_encoding_std = df_diamonds_train.groupby(['color','clarity'])['price'].std().to_dict()
#X_test['color_clarity_encoding_std'] = X_test.set_index(['color','clarity']).index.map(color_clarity_encoding_std.get).astype(float)

### 5.4 Feature Scaling

In [156]:
#X_test_num=X_test[num_features_list]
#X_test_carat=X_test[cat_features_list]

In [95]:
# Perform the feature scaling on the numeric attributes of the dataset
#num_scaler = StandardScaler()
#X_test_num_scaled = num_scaler.fit_transform(X_test_num)
#X_test_num_scaled=pd.DataFrame(X_test_num_scaled)
#X_test_num_scaled.rename(columns = {0:'x', 1:'y', 2:'z', 3:'depth', 4:'table', 5:'carat'}, 
#                           inplace = True)
#X_test_num_scaled.head()

In [96]:
#X_test=pd.merge(X_test_num_scaled, X_test_carat, left_index=True, right_index=True)
#X_test.head()

### 5.5 One hot encoding

In [157]:
# 3. One-hot encoding for categorical variables
X_test=pd.get_dummies(X_test,columns=cat_features_list,drop_first=True)

In [158]:
X_test.shape

(13485, 32)

In [159]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13485 entries, 0 to 13484
Data columns (total 32 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   x                       13485 non-null  float64
 1   y                       13485 non-null  float64
 2   z                       13485 non-null  float64
 3   depth                   13485 non-null  float64
 4   table                   13485 non-null  float64
 5   carat                   13485 non-null  float64
 6   cut_encoding            13485 non-null  float64
 7   color_encoding          13485 non-null  float64
 8   clarity_encoding        13485 non-null  float64
 9   cut_encoding_std        13485 non-null  float64
 10  color_encoding_std      13485 non-null  float64
 11  clarity_encoding_std    13485 non-null  float64
 12  cut_color_encoding      13485 non-null  float64
 13  cut_clarity_encoding    13485 non-null  float64
 14  color_clarity_encoding  13485 non-null

## Model definition - RandomForestRegressor - with Random Hyperparameter Grid

RandomForestRegressor: multiple trees in paralel changing samples and convining diferrent features (overfitting when the tree is big and good to reduce error variance)

Main Parameters:
   - bootstrap -> method for sampling data points (TRUE bagging and FALSE pasting, with/without replacement)
   - n_estimators -> number of trees in the foreset
   - max_depth -> max number of levels in each decision tree
   - max_features -> max number of features considered for splitting a node
   - ccp_alpha ->
   - criterion ->
   - max_leaf_nodes -> max number of solution nodes 
   - max_samples ->
   - min_impurity_decrease ->
   - min_samples_leaf -> min number of data points allowed in a leaf node
   - min_samples_split -> min number of data points placed in a node before the node is split
   - min_weight_fraction_leaf
   - n_estimators -> number of trees in the foreset
   - n_jobs 
   - oob_score 
   - random_state
   - verbose
   - warm_start   

In [166]:
# 0. Random Hyperparameter Grid - Grid definition

#n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)] # Number of trees in random forest
#max_features = ['auto', 'sqrt'] # Number of features to consider at every split
#max_depth = [int(x) for x in np.linspace(10, 110, num = 11)] # Maximum number of levels in tree
#max_depth.append(None)
#min_samples_split = [2, 5, 10] # Minimum number of samples required to split a node
#min_samples_leaf = [1, 2, 4] # Minimum number of samples required at each leaf node
#bootstrap = [True, False] # Method of selecting samples for training each tree

# Create the random grid
#random_grid = {'n_estimators': n_estimators,
#               'max_features': max_features,
#               'max_depth': max_depth,
#               'min_samples_split': min_samples_split,
#               'min_samples_leaf': min_samples_leaf,
#               'bootstrap': bootstrap}
#print(random_grid)

In [167]:
#%%time
#1. Random Hyperparameter Grid - Use the random grid to search for best hyperparameters

# First create the base model to tune
#rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
#search across 100 different combinations, and use all available cores
#rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100,
#                               cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
#rf_random.fit([(X_train,y_train),(X_val,y_val)])

In [102]:
#rf_random.best_params_

In [103]:
#best_params={'n_estimators': 1600,
# 'min_samples_split': 5,
# 'min_samples_leaf': 1,
# 'max_features': 'auto',
# 'max_depth': 90,
# 'bootstrap': True}

#(n_estimators=100,min_samples_split=5,min_samples_leaf=1,
#                              max_features='auto',max_depth=10,bootstrap=True)

#n_estimators=100,min_samples_split=5,min_samples_leaf=1,
#                              max_features='auto',max_depth=40,bootstrap=True

In [125]:
# 2. RandomForestRegressor definition
model = RandomForestRegressor(n_estimators=500,min_samples_split=5,min_samples_leaf=1,
                              max_features='auto',max_depth=10,bootstrap=True)

In [223]:
#Importance
#feats = {} # a dict to hold feature_name: feature_importance
#for feature, importance in zip(X_train.columns, model.feature_importances_):
#    feats[feature] = importance #add the name/value pair 

In [224]:
#Importance=pd.DataFrame(list(feats.items()),columns = ['feature','importance']).sort_values('importance',ascending=False)

In [124]:
# 1. XGBRegressor 
model = XGBRegressor(n_estimators=200,colsample_bylevel=1,colsample_bynode=1,
                     colsample_bytree=0.8,reg_alpha=1, reg_lambda=1,gamma=0,learning_rate=0.1)
hyperparameters = model.get_params()
print(type(model), '\n')
print('Model hyperparameters:', hyperparameters, '\n')

<class 'xgboost.sklearn.XGBRegressor'> 

Model hyperparameters: {'objective': 'reg:squarederror', 'base_score': None, 'booster': None, 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 0.8, 'enable_categorical': False, 'gamma': 0, 'gpu_id': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.1, 'max_delta_step': None, 'max_depth': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'n_estimators': 200, 'n_jobs': None, 'num_parallel_tree': None, 'predictor': None, 'random_state': None, 'reg_alpha': 1, 'reg_lambda': 1, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None} 



In [551]:
#model = KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
#                    metric_params=None, n_jobs=None, n_neighbors=4, p=2,
#                    weights='uniform')

## Model training with validation

In [126]:
model.fit(X_train, y_train)

RandomForestRegressor(max_depth=10, min_samples_split=5, n_estimators=500)

In [40]:
%%time
# 1. XGBRegressor 
# Model training
model.fit(X_train, y_train,eval_set=[(X_train,y_train),(X_val,y_val)],early_stopping_rounds=40)

[0]	validation_0-rmse:4597.60498	validation_1-rmse:4445.81787
[1]	validation_0-rmse:4153.07617	validation_1-rmse:4013.94238
[2]	validation_0-rmse:3756.91260	validation_1-rmse:3630.57153
[3]	validation_0-rmse:3397.47168	validation_1-rmse:3282.39014
[4]	validation_0-rmse:3073.40454	validation_1-rmse:2967.96875
[5]	validation_0-rmse:2781.83716	validation_1-rmse:2684.21973
[6]	validation_0-rmse:2521.44116	validation_1-rmse:2430.68774
[7]	validation_0-rmse:2286.37354	validation_1-rmse:2202.94507
[8]	validation_0-rmse:2077.30200	validation_1-rmse:1999.03906
[9]	validation_0-rmse:1887.39282	validation_1-rmse:1815.12976
[10]	validation_0-rmse:1718.90271	validation_1-rmse:1652.15759
[11]	validation_0-rmse:1567.44006	validation_1-rmse:1505.20117
[12]	validation_0-rmse:1431.38818	validation_1-rmse:1374.61023
[13]	validation_0-rmse:1310.32849	validation_1-rmse:1257.15369
[14]	validation_0-rmse:1202.43030	validation_1-rmse:1153.43616
[15]	validation_0-rmse:1106.48938	validation_1-rmse:1061.90149
[1

[134]	validation_0-rmse:355.78497	validation_1-rmse:436.48981
[135]	validation_0-rmse:355.48486	validation_1-rmse:436.43372
[136]	validation_0-rmse:354.92654	validation_1-rmse:436.47815
[137]	validation_0-rmse:354.48920	validation_1-rmse:436.51016
[138]	validation_0-rmse:354.13834	validation_1-rmse:436.67044
[139]	validation_0-rmse:353.75394	validation_1-rmse:436.55679
[140]	validation_0-rmse:353.30063	validation_1-rmse:436.54114
[141]	validation_0-rmse:352.27365	validation_1-rmse:436.52615
[142]	validation_0-rmse:351.96234	validation_1-rmse:436.55096
[143]	validation_0-rmse:351.64963	validation_1-rmse:436.65836
[144]	validation_0-rmse:350.90720	validation_1-rmse:436.64740
[145]	validation_0-rmse:350.54047	validation_1-rmse:436.23776
[146]	validation_0-rmse:350.31567	validation_1-rmse:436.21707
[147]	validation_0-rmse:350.13995	validation_1-rmse:436.33942
[148]	validation_0-rmse:349.49503	validation_1-rmse:436.56708
[149]	validation_0-rmse:348.75702	validation_1-rmse:436.43704
[150]	va

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=200, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=1,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [115]:
%%time
# Model predictions
y_pred_val = model.predict(X_val)

CPU times: user 153 ms, sys: 2.21 ms, total: 155 ms
Wall time: 155 ms


In [116]:
%%time
# Model predictions
y_pred_train = model.predict(X_train)

CPU times: user 518 ms, sys: 3.3 ms, total: 521 ms
Wall time: 522 ms


## Training set error

In [117]:
# Model predictions
rmse_train = mean_squared_error(y_train, y_pred_train)**0.5
rmse_train

387.97152511723135

In [118]:
mae_train=mean_absolute_error(y_train, y_pred_train)
mae_train

220.1684072850851

In [119]:
r2r = r2_score(y_val, y_pred_val)
r2r

0.9824080905577942

## Model validation

In [120]:
#432
rmse_val = mean_squared_error(y_val, y_pred_val)**0.5
rmse_val

458.8931212374295

In [121]:
mae_val=mean_absolute_error(y_val, y_pred_val)
mae_val

250.2266356184842

In [122]:
r2r = r2_score(y_val, y_pred_val)
r2r

0.9824080905577942

## Model training without validation

In [236]:
%%time
# Model training
model.fit(X, y,eval_set=[(X_train,y_train),(X_val,y_val)],early_stopping_rounds=40)

[0]	validation_0-rmse:4597.65234	validation_1-rmse:4445.54053
[1]	validation_0-rmse:4153.35352	validation_1-rmse:4013.25537
[2]	validation_0-rmse:3757.12988	validation_1-rmse:3629.02173
[3]	validation_0-rmse:3397.85059	validation_1-rmse:3280.38062
[4]	validation_0-rmse:3074.06885	validation_1-rmse:2965.40698
[5]	validation_0-rmse:2782.83203	validation_1-rmse:2682.07422
[6]	validation_0-rmse:2523.01367	validation_1-rmse:2428.66357
[7]	validation_0-rmse:2288.53931	validation_1-rmse:2200.68994
[8]	validation_0-rmse:2079.38330	validation_1-rmse:1996.21948
[9]	validation_0-rmse:1890.61060	validation_1-rmse:1812.64136
[10]	validation_0-rmse:1722.10022	validation_1-rmse:1649.11829
[11]	validation_0-rmse:1571.44898	validation_1-rmse:1502.40833
[12]	validation_0-rmse:1435.38709	validation_1-rmse:1370.18933
[13]	validation_0-rmse:1314.66370	validation_1-rmse:1252.56372
[14]	validation_0-rmse:1207.35303	validation_1-rmse:1149.10095
[15]	validation_0-rmse:1112.12878	validation_1-rmse:1056.93469
[1

[134]	validation_0-rmse:364.24530	validation_1-rmse:355.72629
[135]	validation_0-rmse:363.75162	validation_1-rmse:355.08414
[136]	validation_0-rmse:363.67737	validation_1-rmse:355.01236
[137]	validation_0-rmse:363.28522	validation_1-rmse:354.65683
[138]	validation_0-rmse:363.10425	validation_1-rmse:354.42325
[139]	validation_0-rmse:362.37460	validation_1-rmse:353.87824
[140]	validation_0-rmse:361.81726	validation_1-rmse:353.52335
[141]	validation_0-rmse:361.30771	validation_1-rmse:352.97806
[142]	validation_0-rmse:360.86020	validation_1-rmse:352.56506
[143]	validation_0-rmse:360.26148	validation_1-rmse:352.11652
[144]	validation_0-rmse:359.53445	validation_1-rmse:351.47607
[145]	validation_0-rmse:359.42099	validation_1-rmse:351.37286
[146]	validation_0-rmse:358.86246	validation_1-rmse:351.12546
[147]	validation_0-rmse:358.52841	validation_1-rmse:350.60715
[148]	validation_0-rmse:357.92682	validation_1-rmse:349.93311
[149]	validation_0-rmse:357.86557	validation_1-rmse:349.87619
[150]	va

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=200, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=1,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

## Test Preditions

In [36]:
predictions = model.predict(X_test)

In [37]:
predictions=pd.DataFrame(predictions)

In [600]:
predictions.reset_index(inplace=True)

In [463]:
predictions=predictions.rename({0: 'price','index': 'id'}, axis=1)

## Save Preditions

In [464]:
predictions.to_csv('../data/diamonds_predictions_XGBRegressor_Cross_Target.csv',index=False)