## 0. Libraries import

In [2]:
# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# imports 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# outliers
from sklearn.datasets import load_boston
#train test split
from sklearn.model_selection import train_test_split
# Hyperparameters selection
from sklearn.model_selection import RandomizedSearchCV
# models
from sklearn.ensemble import RandomForestRegressor
#import lightgbm as ltb
from xgboost import XGBRegressor
# error
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error

## 1. Data import

In [4]:
# Impor train data
df_diamonds_train=pd.read_csv('../data/diamonds_train.csv')
df_diamonds_train.pop("Unnamed: 0")
df_diamonds_train

Unnamed: 0,index_id,depth,table,x,y,z,price,carat,cut,color,clarity,city
0,5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91...,62.4,58.0,6.83,6.79,4.25,4268,1.21,Premium,J,VS2,Dubai
1,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d...,63.0,57.0,4.35,4.38,2.75,505,0.32,Very Good,H,VS2,Kimberly
2,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,65.5,55.0,5.62,5.53,3.65,2686,0.71,Fair,G,VS1,Las Vegas
3,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,63.8,56.0,4.68,4.72,3.00,738,0.41,Good,D,SI1,Kimberly
4,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328c...,60.5,59.0,6.55,6.51,3.95,4882,1.02,Ideal,G,SI1,Dubai
...,...,...,...,...,...,...,...,...,...,...,...,...
40450,f0bc79169405ebeb24e308055156b946ffd819db9b4f75...,62.7,57.0,7.10,7.04,4.43,10070,1.34,Ideal,G,VS1,Antwerp
40451,339916a23bf22b052b54cb2a9b36ee8418c1c68b46acad...,57.1,60.0,8.31,8.25,4.73,12615,2.02,Good,F,SI2,Madrid
40452,46957922b99954654c1deb8d854c3f069bf118b2ce9415...,62.7,56.0,6.37,6.42,4.01,5457,1.01,Ideal,H,SI1,Kimberly
40453,9d733392d362d5c6f1d9b9659b601c7d4b5a1c1c8df579...,61.9,54.3,4.45,4.47,2.76,456,0.33,Ideal,J,VS1,Kimberly


In [5]:
# Import test data
df_diamonds_test=pd.read_csv('../data/diamonds_test.csv')
df_diamonds_test

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.20,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.90,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.90,Kimberly
4,4,0.50,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam
...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.57,Ideal,E,SI1,61.9,56.0,5.35,5.32,3.30,Amsterdam
13481,13481,0.71,Ideal,I,VS2,62.2,55.0,5.71,5.73,3.56,New York City
13482,13482,0.70,Ideal,F,VS1,61.6,55.0,5.75,5.71,3.53,Tel Aviv
13483,13483,0.70,Very Good,F,SI2,58.8,57.0,5.85,5.89,3.45,Surat


## 2. Data preparation (training set)

In [6]:
# 0.Defining numerical and categorical features
# Excluding city (with trees sometimes is better to keep variables even if they are correlated)
num_features_list=['x','y','z','depth','table','carat']
cat_features_list=['cut','color','clarity']
features_list=['x','y','z','depth','table','carat','cut','color','clarity']

In [7]:
# 1.Checking if 0 values
if 0 in df_diamonds_train[num_features_list]:
    print('0 values')
else:
    print('No 0 values')

No 0 values


In [8]:
# 2.Checking if null values
df_diamonds_train[num_features_list].isna().sum()

x        0
y        0
z        0
depth    0
table    0
carat    0
dtype: int64

In [9]:
# 3.remove outliers
def remove_outliers(df,feature):
    # IQR
    Q1 = np.percentile(df[feature], 25,
                   interpolation = 'midpoint')
    Q3 = np.percentile(df[feature], 75,
                   interpolation = 'midpoint')
    IQR = Q3 - Q1
    # Upper and lower
    upper_limit=Q3+1.5*IQR
    lower_limit=Q1-1.5*IQR
    # Removing the Outliers
    return df[(df[feature]>=lower_limit) & (df[feature]<=upper_limit)]

In [10]:
df_diamonds_train=remove_outliers(df_diamonds_train,'x')
df_diamonds_train=remove_outliers(df_diamonds_train,'y')
df_diamonds_train=remove_outliers(df_diamonds_train,'z')
df_diamonds_train=remove_outliers(df_diamonds_train,'depth')
df_diamonds_train=remove_outliers(df_diamonds_train,'table')
df_diamonds_train=remove_outliers(df_diamonds_train,'carat')

In [11]:
#4. Change colums to categoric
df_diamonds_train['cut'] = df_diamonds_train['cut'].astype('category')
df_diamonds_train['color'] = df_diamonds_train['color'].astype('category')
df_diamonds_train['clarity'] = df_diamonds_train['clarity'].astype('category')

In [12]:
df_diamonds_train.shape

(37264, 12)

## 3. Feature engineering (training set)

Adapting categorical features for training model

In [13]:
# 1.Target encoding for categorical variables
# Mean
cut_encoding = df_diamonds_train.groupby(['cut'])['price'].mean().to_dict()
df_diamonds_train['cut_encoding'] = df_diamonds_train['cut'].map(cut_encoding).astype(float)
color_encoding = df_diamonds_train.groupby(['color'])['price'].mean().to_dict()
df_diamonds_train['color_encoding'] = df_diamonds_train['color'].map(color_encoding).astype(float)
clarity_encoding = df_diamonds_train.groupby(['clarity'])['price'].mean().to_dict()
df_diamonds_train['clarity_encoding'] = df_diamonds_train['clarity'].map(clarity_encoding).astype(float)
# Std
cut_encoding_std = df_diamonds_train.groupby(['cut'])['price'].std().to_dict()
df_diamonds_train['cut_encoding_std'] = df_diamonds_train['cut'].map(cut_encoding_std).astype(float)
color_encoding_std = df_diamonds_train.groupby(['color'])['price'].std().to_dict()
df_diamonds_train['color_encoding_std'] = df_diamonds_train['color'].map(color_encoding_std).astype(float)
clarity_encoding_std = df_diamonds_train.groupby(['clarity'])['price'].std().to_dict()
df_diamonds_train['clarity_encoding_std'] = df_diamonds_train['clarity'].map(clarity_encoding_std).astype(float)

In [14]:
df_diamonds_train['volume']=df_diamonds_train['x']*df_diamonds_train['y']*df_diamonds_train['z']

In [15]:
# 2. Cross target encoding
cut_color_encoding = df_diamonds_train.groupby(['cut','color'])['price'].mean().to_dict()
df_diamonds_train['cut_color_encoding'] = df_diamonds_train.set_index(['cut',
                                                                       'color']).index.map(cut_color_encoding.get).astype(float)
cut_clarity_encoding = df_diamonds_train.groupby(['cut','clarity'])['price'].mean().to_dict()
df_diamonds_train['cut_clarity_encoding'] = df_diamonds_train.set_index(['cut',
                                                                       'clarity']).index.map(cut_clarity_encoding.get).astype(float)
color_clarity_encoding = df_diamonds_train.groupby(['color','clarity'])['price'].mean().to_dict()
df_diamonds_train['color_clarity_encoding'] = df_diamonds_train.set_index(['color',
                                                                       'clarity']).index.map(color_clarity_encoding.get).astype(float)

In [16]:
features_list_encoding=['x','y','z','depth','table','carat','cut','color','clarity','cut_encoding',
                        'color_encoding','clarity_encoding','volume','cut_color_encoding','cut_clarity_encoding',
                        'color_clarity_encoding']

In [17]:
# 3. Defining features y target
X=df_diamonds_train[features_list_encoding]
y=df_diamonds_train['price']

In [18]:
# 4.One-hot encoding for categorical variables
X=pd.get_dummies(X,columns=cat_features_list)

In [19]:
# 5.Splitting train and test
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
X_train.shape

(29811, 33)

In [21]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29811 entries, 11656 to 17132
Data columns (total 33 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   x                       29811 non-null  float64
 1   y                       29811 non-null  float64
 2   z                       29811 non-null  float64
 3   depth                   29811 non-null  float64
 4   table                   29811 non-null  float64
 5   carat                   29811 non-null  float64
 6   cut_encoding            29811 non-null  float64
 7   color_encoding          29811 non-null  float64
 8   clarity_encoding        29811 non-null  float64
 9   volume                  29811 non-null  float64
 10  cut_color_encoding      29811 non-null  float64
 11  cut_clarity_encoding    29811 non-null  float64
 12  color_clarity_encoding  29811 non-null  float64
 13  cut_Fair                29811 non-null  uint8  
 14  cut_Good                29811 non-

## 4. Data preparation (test set)

## 5. Feature engineering (test set) 

In [22]:
# 0. Adapting categorical features for validation model
X_test=df_diamonds_test[features_list]

In [23]:
# 1.Target encoding for categorical variables
# Mean
cut_encoding = df_diamonds_train.groupby(['cut'])['price'].mean().to_dict()
X_test['cut_encoding'] = X_test['cut'].map(cut_encoding).astype(float)
color_encoding = df_diamonds_train.groupby(['color'])['price'].mean().to_dict()
X_test['color_encoding'] = X_test['color'].map(color_encoding).astype(float)
clarity_encoding = df_diamonds_train.groupby(['clarity'])['price'].mean().to_dict()
X_test['clarity_encoding'] = X_test['clarity'].map(clarity_encoding).astype(float)
# Std
cut_encoding_std = df_diamonds_train.groupby(['cut'])['price'].std().to_dict()
X_test['cut_encoding_std'] = X_test['cut'].map(cut_encoding_std).astype(float)
color_encoding_std = df_diamonds_train.groupby(['color'])['price'].std().to_dict()
X_test['color_encoding_std'] = X_test['color'].map(color_encoding_std).astype(float)
clarity_encoding_std = df_diamonds_train.groupby(['clarity'])['price'].std().to_dict()
X_test['clarity_encoding_std'] = X_test['clarity'].map(clarity_encoding_std).astype(float)

In [24]:
X_test['volume']=X_test['x']*X_test['y']*X_test['z']

In [25]:
# 2. Cross target encoding
cut_color_encoding = df_diamonds_train.groupby(['cut','color'])['price'].mean().to_dict()
X_test['cut_color_encoding'] = X_test.set_index(['cut','color']).index.map(cut_color_encoding.get).astype(float)
cut_clarity_encoding = df_diamonds_train.groupby(['cut','clarity'])['price'].mean().to_dict()
X_test['cut_clarity_encoding'] = X_test.set_index(['cut','clarity']).index.map(cut_clarity_encoding.get).astype(float)
color_clarity_encoding = df_diamonds_train.groupby(['color','clarity'])['price'].mean().to_dict()
X_test['color_clarity_encoding'] = X_test.set_index(['color','clarity']).index.map(color_clarity_encoding.get).astype(float)

In [26]:
# 3. One-hot encoding for categorical variables
X_test=pd.get_dummies(X_test,columns=cat_features_list)

In [27]:
X_test.shape

(13485, 36)

In [28]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13485 entries, 0 to 13484
Data columns (total 36 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   x                       13485 non-null  float64
 1   y                       13485 non-null  float64
 2   z                       13485 non-null  float64
 3   depth                   13485 non-null  float64
 4   table                   13485 non-null  float64
 5   carat                   13485 non-null  float64
 6   cut_encoding            13485 non-null  float64
 7   color_encoding          13485 non-null  float64
 8   clarity_encoding        13485 non-null  float64
 9   cut_encoding_std        13485 non-null  float64
 10  color_encoding_std      13485 non-null  float64
 11  clarity_encoding_std    13485 non-null  float64
 12  volume                  13485 non-null  float64
 13  cut_color_encoding      13485 non-null  float64
 14  cut_clarity_encoding    13485 non-null

## Model definition - RandomForestRegressor - with Random Hyperparameter Grid

RandomForestRegressor: multiple trees in paralel changing samples and convining diferrent features (overfitting when the tree is big and good to reduce error variance)

Main Parameters:
   - bootstrap -> method for sampling data points (TRUE bagging and FALSE pasting, with/without replacement)
   - n_estimators -> number of trees in the foreset
   - max_depth -> max number of levels in each decision tree
   - max_features -> max number of features considered for splitting a node
   - ccp_alpha ->
   - criterion ->
   - max_leaf_nodes -> max number of solution nodes 
   - max_samples ->
   - min_impurity_decrease ->
   - min_samples_leaf -> min number of data points allowed in a leaf node
   - min_samples_split -> min number of data points placed in a node before the node is split
   - min_weight_fraction_leaf
   - n_estimators -> number of trees in the foreset
   - n_jobs 
   - oob_score 
   - random_state
   - verbose
   - warm_start   

In [29]:
# 0. Random Hyperparameter Grid - Grid definition

#n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)] # Number of trees in random forest
#max_features = ['auto', 'sqrt'] # Number of features to consider at every split
#max_depth = [int(x) for x in np.linspace(10, 110, num = 11)] # Maximum number of levels in tree
#max_depth.append(None)
#min_samples_split = [2, 5, 10] # Minimum number of samples required to split a node
#min_samples_leaf = [1, 2, 4] # Minimum number of samples required at each leaf node
#bootstrap = [True, False] # Method of selecting samples for training each tree

# Create the random grid
#random_grid = {'n_estimators': n_estimators,
#               'max_features': max_features,
#               'max_depth': max_depth,
#               'min_samples_split': min_samples_split,
#               'min_samples_leaf': min_samples_leaf,
#               'bootstrap': bootstrap}
#print(random_grid)

In [30]:
#%%time
#1. Random Hyperparameter Grid - Use the random grid to search for best hyperparameters

# First create the base model to tune
#rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
#search across 100 different combinations, and use all available cores
#rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100,
#                               cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
#rf_random.fit(X_train, y_train)

In [805]:
#rf_random.best_params_

In [806]:
best_params={'n_estimators': 1600,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 90,
 'bootstrap': True}

In [870]:
# 2. RandomForestRegressor definition
#model = RandomForestRegressor(n_estimators=100,min_samples_split=5,min_samples_leaf=1,
#                              max_features='auto',max_depth=10,bootstrap=True)
#hyperparameters = model.get_params()
#print(type(model), '\n')
#print('Model hyperparameters:', hyperparameters, '\n')

In [486]:
#Importance
#feats = {} # a dict to hold feature_name: feature_importance
#for feature, importance in zip(X_train.columns, model.feature_importances_):
#    feats[feature] = importance #add the name/value pair 

In [518]:
#Importance=pd.DataFrame(list(feats.items()),columns = ['feature','importance']).sort_values('importance',ascending=False)

In [71]:
# 1. XGBRegressor 
model = XGBRegressor(n_estimators=200,colsample_bylevel=1,colsample_bynode=1,
                     colsample_bytree=0.8,reg_alpha=1, reg_lambda=1,gamma=0,learning_rate=0.1)
hyperparameters = model.get_params()
print(type(model), '\n')
print('Model hyperparameters:', hyperparameters, '\n')

<class 'xgboost.sklearn.XGBRegressor'> 

Model hyperparameters: {'objective': 'reg:squarederror', 'base_score': None, 'booster': None, 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 0.8, 'enable_categorical': False, 'gamma': 0, 'gpu_id': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.1, 'max_delta_step': None, 'max_depth': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'n_estimators': 200, 'n_jobs': None, 'num_parallel_tree': None, 'predictor': None, 'random_state': None, 'reg_alpha': 1, 'reg_lambda': 1, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None} 



## Model training with validation

In [72]:
%%time
# Model training
model.fit(X_train, y_train,eval_set=[(X_train,y_train),(X_val,y_val)],early_stopping_rounds=40)

[0]	validation_0-rmse:4597.86719	validation_1-rmse:4446.51709
[1]	validation_0-rmse:4154.05176	validation_1-rmse:4016.38989
[2]	validation_0-rmse:3755.45312	validation_1-rmse:3630.23901
[3]	validation_0-rmse:3398.71362	validation_1-rmse:3283.22827
[4]	validation_0-rmse:3076.13110	validation_1-rmse:2970.41894
[5]	validation_0-rmse:2786.62207	validation_1-rmse:2689.10156
[6]	validation_0-rmse:2528.18408	validation_1-rmse:2438.35474
[7]	validation_0-rmse:2294.86645	validation_1-rmse:2213.35474
[8]	validation_0-rmse:2085.76392	validation_1-rmse:2010.49927
[9]	validation_0-rmse:1898.60156	validation_1-rmse:1829.47827
[10]	validation_0-rmse:1731.58240	validation_1-rmse:1667.54468
[11]	validation_0-rmse:1581.85364	validation_1-rmse:1523.21728
[12]	validation_0-rmse:1448.12134	validation_1-rmse:1394.22571
[13]	validation_0-rmse:1327.11792	validation_1-rmse:1277.87317
[14]	validation_0-rmse:1221.89075	validation_1-rmse:1175.31030
[15]	validation_0-rmse:1127.50476	validation_1-rmse:1084.45801
[1

[134]	validation_0-rmse:356.05078	validation_1-rmse:437.44522
[135]	validation_0-rmse:355.93729	validation_1-rmse:437.36624
[136]	validation_0-rmse:355.56338	validation_1-rmse:437.26260
[137]	validation_0-rmse:354.86835	validation_1-rmse:437.43369
[138]	validation_0-rmse:354.57373	validation_1-rmse:437.25006
[139]	validation_0-rmse:354.28912	validation_1-rmse:437.06625
[140]	validation_0-rmse:353.90326	validation_1-rmse:437.03711
[141]	validation_0-rmse:353.76443	validation_1-rmse:437.05603
[142]	validation_0-rmse:353.63696	validation_1-rmse:437.11060
[143]	validation_0-rmse:352.92044	validation_1-rmse:437.17844
[144]	validation_0-rmse:352.55740	validation_1-rmse:437.02118
[145]	validation_0-rmse:352.28189	validation_1-rmse:437.04645
[146]	validation_0-rmse:351.35535	validation_1-rmse:436.90628
[147]	validation_0-rmse:350.76575	validation_1-rmse:436.84860
[148]	validation_0-rmse:349.89053	validation_1-rmse:437.22345
[149]	validation_0-rmse:349.37772	validation_1-rmse:437.28180
[150]	va

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=200, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=1,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [30]:
%%time
# Model predictions
y_pred_val = model.predict(X_val)
print(type(y_pred_val))

<class 'numpy.ndarray'>
CPU times: user 87.2 ms, sys: 5.77 ms, total: 93 ms
Wall time: 18.9 ms


## Training set error

In [31]:
%%time
# Model predictions
y_pred_train = model.predict(X_train)
print(type(y_pred_train))

<class 'numpy.ndarray'>
CPU times: user 212 ms, sys: 4.26 ms, total: 216 ms
Wall time: 42.8 ms


In [32]:
%%time
# Model predictions
rmse_train = mean_squared_error(y_train, y_pred_train)**0.5
rmse_train

CPU times: user 810 µs, sys: 188 µs, total: 998 µs
Wall time: 864 µs


327.9151117133446

In [33]:
r2r = r2_score(y_val, y_pred_val)
r2r

0.9843800696974195

## Model validation

In [34]:
%%time
# Model predictions
y_pred_val = model.predict(X_val)
print(type(y_pred_val))

<class 'numpy.ndarray'>
CPU times: user 79.8 ms, sys: 3.78 ms, total: 83.6 ms
Wall time: 14.8 ms


In [35]:
#432
rmse_val = mean_squared_error(y_val, y_pred_val)**0.5
rmse_val

432.4088790927343

In [36]:
r2r = r2_score(y_val, y_pred_val)
r2r

0.9843800696974195

## Model training without validation

In [37]:
%%time
# Model training
model.fit(X, y)
print('Model:', model, '\n')
print('Model hyperparameters:', hyperparameters, '\n')

Model: XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=200, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=1,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None) 

Model hyperparameters: {'objective': 'reg:squarederror', 'base_score': None, 'booster': None, 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 0.8, 'enable_categorical': False, 'gamma': 0, 'gpu_id': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.1, 'max_delta_step': None, 'max_depth': None, 'min_child_weight': No

## Test Preditions

In [38]:
predictions = model.predict(X_test)

In [39]:
predictions=pd.DataFrame(predictions)

In [40]:
predictions.reset_index(inplace=True)

In [41]:
predictions=predictions.rename({0: 'price','index': 'id'}, axis=1)

## Save Preditions

In [42]:
predictions.to_csv('../data/diamonds_predictions_XGBRegressor.csv',index=False)

In [43]:
predictions

Unnamed: 0,id,price
0,0,2846.529785
1,1,5471.448242
2,2,9773.541016
3,3,3964.055420
4,4,1631.235474
...,...,...
13480,13480,1640.777710
13481,13481,2487.817627
13482,13482,3234.020020
13483,13483,2202.828125
