## 0. Libraries import

In [2]:
# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# imports 
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", 50)
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
#train test split
from sklearn.model_selection import train_test_split
# Hyperparameters selection
from sklearn.model_selection import RandomizedSearchCV
# Importing cross validation function from sklearn
from sklearn.model_selection import cross_val_score
# models
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
# error
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error

## 1. Data import

### 1.1 Impor train data

In [4]:
df_diamonds_train=pd.read_csv('../data/diamonds_train.csv')
df_diamonds_train.pop("Unnamed: 0") #dropped an unnecessary column
df_diamonds_train

Unnamed: 0,index_id,depth,table,x,y,z,price,carat,cut,color,clarity,city
0,5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91...,62.4,58.0,6.83,6.79,4.25,4268,1.21,Premium,J,VS2,Dubai
1,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d...,63.0,57.0,4.35,4.38,2.75,505,0.32,Very Good,H,VS2,Kimberly
2,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,65.5,55.0,5.62,5.53,3.65,2686,0.71,Fair,G,VS1,Las Vegas
3,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,63.8,56.0,4.68,4.72,3.00,738,0.41,Good,D,SI1,Kimberly
4,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328c...,60.5,59.0,6.55,6.51,3.95,4882,1.02,Ideal,G,SI1,Dubai
...,...,...,...,...,...,...,...,...,...,...,...,...
40450,f0bc79169405ebeb24e308055156b946ffd819db9b4f75...,62.7,57.0,7.10,7.04,4.43,10070,1.34,Ideal,G,VS1,Antwerp
40451,339916a23bf22b052b54cb2a9b36ee8418c1c68b46acad...,57.1,60.0,8.31,8.25,4.73,12615,2.02,Good,F,SI2,Madrid
40452,46957922b99954654c1deb8d854c3f069bf118b2ce9415...,62.7,56.0,6.37,6.42,4.01,5457,1.01,Ideal,H,SI1,Kimberly
40453,9d733392d362d5c6f1d9b9659b601c7d4b5a1c1c8df579...,61.9,54.3,4.45,4.47,2.76,456,0.33,Ideal,J,VS1,Kimberly


### 1.2 Impor test data

In [5]:
df_diamonds_test=pd.read_csv('../data/diamonds_test.csv')
df_diamonds_test

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.20,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.90,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.90,Kimberly
4,4,0.50,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam
...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.57,Ideal,E,SI1,61.9,56.0,5.35,5.32,3.30,Amsterdam
13481,13481,0.71,Ideal,I,VS2,62.2,55.0,5.71,5.73,3.56,New York City
13482,13482,0.70,Ideal,F,VS1,61.6,55.0,5.75,5.71,3.53,Tel Aviv
13483,13483,0.70,Very Good,F,SI2,58.8,57.0,5.85,5.89,3.45,Surat


## 2. Data preparation (training set)

### 2.1 Defining numerical and categorical features

In [6]:
# Excluding city (with trees sometimes is better to keep variables even if they are correlated)
num_features_list=['x','y','z','depth','table','carat']
cat_features_list=['cut','color','clarity','city']
target='price'
features_list=['x','y','z','depth','table','carat','cut','color','clarity','city']

## 3. Feature engineering (training set)

### 3.1 Target encoding 

In [7]:
# 1.Target encoding for categorical variables
# Mean
cut_encoding = df_diamonds_train.groupby(['cut'])['price'].mean().to_dict()
df_diamonds_train['cut_encoding'] = df_diamonds_train['cut'].map(cut_encoding).astype(float)
color_encoding = df_diamonds_train.groupby(['color'])['price'].mean().to_dict()
df_diamonds_train['color_encoding'] = df_diamonds_train['color'].map(color_encoding).astype(float)
clarity_encoding = df_diamonds_train.groupby(['clarity'])['price'].mean().to_dict()
df_diamonds_train['clarity_encoding'] = df_diamonds_train['clarity'].map(clarity_encoding).astype(float)

### 3.2 Cross Target encoding 

In [8]:
# 2. Cross target encoding
# Mean
cut_color_encoding = df_diamonds_train.groupby(['cut','color'])['price'].mean().to_dict()
df_diamonds_train['cut_color_encoding'] = df_diamonds_train.set_index(['cut',
                                                                       'color']).index.map(cut_color_encoding.get).astype(float)
cut_clarity_encoding = df_diamonds_train.groupby(['cut','clarity'])['price'].mean().to_dict()
df_diamonds_train['cut_clarity_encoding'] = df_diamonds_train.set_index(['cut',
                                                                       'clarity']).index.map(cut_clarity_encoding.get).astype(float)
color_clarity_encoding = df_diamonds_train.groupby(['color','clarity'])['price'].mean().to_dict()
df_diamonds_train['color_clarity_encoding'] = df_diamonds_train.set_index(['color',
                                                                          'clarity']).index.map(color_clarity_encoding.get).astype(float)

In [9]:
features_list_encoding=['x','y','z','depth','table','carat','cut','color','clarity','city','cut_encoding','color_encoding'
                        ,'clarity_encoding','cut_color_encoding','cut_clarity_encoding','color_clarity_encoding']

### 3.3 Category encoding

In [10]:
# 3. Defining features y target
X=df_diamonds_train[features_list_encoding]
y=df_diamonds_train['price']

In [11]:
for column in cat_features_list:
    X[column]=X[column].astype('category')
    X[column]=X[column].cat.codes

In [12]:
X

Unnamed: 0,x,y,z,depth,table,carat,cut,color,clarity,city,cut_encoding,color_encoding,clarity_encoding,cut_color_encoding,cut_clarity_encoding,color_clarity_encoding
0,6.83,6.79,4.25,62.4,58.0,1.21,3,6,5,2,4617.322612,5346.234112,3913.590182,6376.983740,4577.414201,5260.516874
1,4.35,4.38,2.75,63.0,57.0,0.32,4,4,5,3,3994.444420,4476.469014,3913.590182,4541.993464,4277.315195,4686.621902
2,5.62,5.53,3.65,65.5,55.0,0.71,0,3,4,4,4333.271980,4023.214902,3796.813551,4473.021368,4151.344262,4153.819453
3,4.68,4.72,3.00,63.8,56.0,0.41,1,0,2,3,3880.611794,3134.943157,3999.856908,3311.693089,3601.061697,2970.951792
4,6.55,6.51,3.95,60.5,59.0,1.02,2,3,2,2,3436.112577,4023.214902,3999.856908,3751.364560,3759.057457,3787.558804
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,7.10,7.04,4.43,62.7,57.0,1.34,2,3,4,1,3436.112577,4023.214902,3796.813551,3751.364560,3484.675277,4153.819453
40451,8.31,8.25,4.73,57.1,60.0,2.02,1,2,3,7,3880.611794,3677.355720,5101.044307,3477.504518,4562.165025,4511.178542
40452,6.37,6.42,4.01,62.7,56.0,1.01,2,4,2,3,3436.112577,4476.469014,3999.856908,3873.835802,3759.057457,5042.731524
40453,4.45,4.47,2.76,61.9,54.3,0.33,2,6,4,3,3436.112577,5346.234112,3796.813551,4854.274311,3484.675277,4781.833741


### 3.3 Define train and validation

In [13]:
# 5.Splitting train and test
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## 5. Feature engineering (test set) 

In [14]:
# 0. Adapting categorical features for validation model
X_test=df_diamonds_test[features_list]

### 5.1 Target encoding (mean and std)

In [15]:
# 1.Target encoding for categorical variables
# Mean
cut_encoding = df_diamonds_train.groupby(['cut'])['price'].mean().to_dict()
X_test['cut_encoding'] = X_test['cut'].map(cut_encoding).astype(float)
color_encoding = df_diamonds_train.groupby(['color'])['price'].mean().to_dict()
X_test['color_encoding'] = X_test['color'].map(color_encoding).astype(float)
clarity_encoding = df_diamonds_train.groupby(['clarity'])['price'].mean().to_dict()
X_test['clarity_encoding'] = X_test['clarity'].map(clarity_encoding).astype(float)

In [16]:
# 2. Cross target encoding
# Mean
cut_color_encoding = df_diamonds_train.groupby(['cut','color'])['price'].mean().to_dict()
X_test['cut_color_encoding'] = X_test.set_index(['cut','color']).index.map(cut_color_encoding.get).astype(float)
cut_clarity_encoding = df_diamonds_train.groupby(['cut','clarity'])['price'].mean().to_dict()
X_test['cut_clarity_encoding'] = X_test.set_index(['cut','clarity']).index.map(cut_clarity_encoding.get).astype(float)
color_clarity_encoding = df_diamonds_train.groupby(['color','clarity'])['price'].mean().to_dict()
X_test['color_clarity_encoding'] = X_test.set_index(['color','clarity']).index.map(color_clarity_encoding.get).astype(float)

In [17]:
X_train.shape

(32364, 16)

In [18]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32364 entries, 32121 to 15795
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   x                       32364 non-null  float64
 1   y                       32364 non-null  float64
 2   z                       32364 non-null  float64
 3   depth                   32364 non-null  float64
 4   table                   32364 non-null  float64
 5   carat                   32364 non-null  float64
 6   cut                     32364 non-null  int8   
 7   color                   32364 non-null  int8   
 8   clarity                 32364 non-null  int8   
 9   city                    32364 non-null  int8   
 10  cut_encoding            32364 non-null  float64
 11  color_encoding          32364 non-null  float64
 12  clarity_encoding        32364 non-null  float64
 13  cut_color_encoding      32364 non-null  float64
 14  cut_clarity_encoding    32364 non-

### 5.2 Change categorical variables to code

In [19]:
for column in cat_features_list:
    X_test[column]=X_test[column].astype('category')
    X_test[column]=X_test[column].cat.codes

In [20]:
X_test

Unnamed: 0,x,y,z,depth,table,carat,cut,color,clarity,city,cut_encoding,color_encoding,clarity_encoding,cut_color_encoding,cut_clarity_encoding,color_clarity_encoding
0,5.82,5.89,3.67,62.7,60.0,0.79,4,2,2,0,3994.444420,3677.355720,3999.856908,3790.583636,3959.649629,3679.771031
1,6.81,6.89,4.18,61.0,57.0,1.20,2,6,4,10,3436.112577,5346.234112,3796.813551,4854.274311,3484.675277,4781.833741
2,7.38,7.32,4.57,62.2,61.0,1.57,3,4,2,3,4617.322612,4476.469014,3999.856908,5241.424619,4494.566378,5042.731524
3,6.09,6.13,3.90,63.8,54.0,0.90,4,2,2,3,3994.444420,3677.355720,3999.856908,3790.583636,3959.649629,3679.771031
4,5.05,5.09,3.19,62.9,58.0,0.50,4,2,4,0,3994.444420,3677.355720,3796.813551,3790.583636,3763.722052,3659.599024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13480,5.35,5.32,3.30,61.9,56.0,0.57,2,1,2,0,3436.112577,3088.342526,3999.856908,2624.818649,3759.057457,3132.153118
13481,5.71,5.73,3.56,62.2,55.0,0.71,2,5,5,8,3436.112577,5090.868800,3913.590182,4409.012354,3214.791678,5714.858295
13482,5.75,5.71,3.53,61.6,55.0,0.70,2,2,4,11,3436.112577,3677.355720,3796.813551,3284.470003,3484.675277,3659.599024
13483,5.85,5.89,3.45,58.8,57.0,0.70,4,2,3,10,3994.444420,3677.355720,5101.044307,3790.583636,5017.045820,4511.178542


In [21]:
X_test.shape

(13485, 16)

In [22]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13485 entries, 0 to 13484
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   x                       13485 non-null  float64
 1   y                       13485 non-null  float64
 2   z                       13485 non-null  float64
 3   depth                   13485 non-null  float64
 4   table                   13485 non-null  float64
 5   carat                   13485 non-null  float64
 6   cut                     13485 non-null  int8   
 7   color                   13485 non-null  int8   
 8   clarity                 13485 non-null  int8   
 9   city                    13485 non-null  int8   
 10  cut_encoding            13485 non-null  float64
 11  color_encoding          13485 non-null  float64
 12  clarity_encoding        13485 non-null  float64
 13  cut_color_encoding      13485 non-null  float64
 14  cut_clarity_encoding    13485 non-null

## Model definition - RandomForestRegressor - with Random Hyperparameter Grid

RandomForestRegressor: multiple trees in paralel changing samples and convining diferrent features (overfitting when the tree is big and good to reduce error variance)

Main Parameters:
   - bootstrap -> method for sampling data points (TRUE bagging and FALSE pasting, with/without replacement)
   - n_estimators -> number of trees in the foreset
   - max_depth -> max number of levels in each decision tree
   - max_features -> max number of features considered for splitting a node
   - ccp_alpha ->
   - criterion ->
   - max_leaf_nodes -> max number of solution nodes 
   - max_samples ->
   - min_impurity_decrease ->
   - min_samples_leaf -> min number of data points allowed in a leaf node
   - min_samples_split -> min number of data points placed in a node before the node is split
   - min_weight_fraction_leaf
   - n_estimators -> number of trees in the foreset
   - n_jobs 
   - oob_score 
   - random_state
   - verbose
   - warm_start   

In [23]:
# 1. XGBRegressor 
model = XGBRegressor(n_estimators=200,colsample_bylevel=1,colsample_bynode=1,
                     colsample_bytree=0.8,reg_alpha=1, reg_lambda=1,gamma=0,learning_rate=0.1, random_state=42)
hyperparameters = model.get_params()
print(type(model), '\n')

<class 'xgboost.sklearn.XGBRegressor'> 



## Model training with validation

In [24]:
%%time
# 1. XGBRegressor early_stopping_rounds=40
# Model training
model.fit(X_train, y_train,eval_set=[(X_train,y_train),(X_val,y_val)],early_stopping_rounds=40)

[0]	validation_0-rmse:5049.29785	validation_1-rmse:5089.10205
[1]	validation_0-rmse:4561.64062	validation_1-rmse:4598.88281
[2]	validation_0-rmse:4122.19189	validation_1-rmse:4157.25439
[3]	validation_0-rmse:3729.27954	validation_1-rmse:3761.79150
[4]	validation_0-rmse:3374.24805	validation_1-rmse:3404.29688
[5]	validation_0-rmse:3054.86157	validation_1-rmse:3084.18750
[6]	validation_0-rmse:2771.08228	validation_1-rmse:2800.33154
[7]	validation_0-rmse:2513.56787	validation_1-rmse:2539.74805
[8]	validation_0-rmse:2282.36865	validation_1-rmse:2306.34058
[9]	validation_0-rmse:2075.56226	validation_1-rmse:2097.65625
[10]	validation_0-rmse:1890.42297	validation_1-rmse:1913.19373
[11]	validation_0-rmse:1724.94324	validation_1-rmse:1747.57153
[12]	validation_0-rmse:1576.95776	validation_1-rmse:1598.12036
[13]	validation_0-rmse:1444.91528	validation_1-rmse:1466.46313
[14]	validation_0-rmse:1328.07739	validation_1-rmse:1350.16919
[15]	validation_0-rmse:1224.84790	validation_1-rmse:1248.22839
[1

[134]	validation_0-rmse:414.17831	validation_1-rmse:531.33539
[135]	validation_0-rmse:413.41440	validation_1-rmse:531.57361
[136]	validation_0-rmse:412.51959	validation_1-rmse:531.29742
[137]	validation_0-rmse:411.90402	validation_1-rmse:531.17981
[138]	validation_0-rmse:411.15424	validation_1-rmse:531.25250
[139]	validation_0-rmse:409.93103	validation_1-rmse:531.11487
[140]	validation_0-rmse:409.14999	validation_1-rmse:531.12714
[141]	validation_0-rmse:408.35004	validation_1-rmse:530.96088
[142]	validation_0-rmse:407.14923	validation_1-rmse:530.86639
[143]	validation_0-rmse:405.97943	validation_1-rmse:531.13117
[144]	validation_0-rmse:405.12332	validation_1-rmse:531.21417
[145]	validation_0-rmse:404.99487	validation_1-rmse:531.19440
[146]	validation_0-rmse:403.92456	validation_1-rmse:531.06750
[147]	validation_0-rmse:402.89252	validation_1-rmse:531.01154
[148]	validation_0-rmse:401.82660	validation_1-rmse:530.78528
[149]	validation_0-rmse:401.21555	validation_1-rmse:530.67621
[150]	va

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=200, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=42,
             reg_alpha=1, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [25]:
%%time
# Model predictions
y_pred_val = model.predict(X_val)

CPU times: user 68.6 ms, sys: 3.69 ms, total: 72.3 ms
Wall time: 13.3 ms


In [26]:
%%time
# Model predictions
y_pred_train = model.predict(X_train)

CPU times: user 188 ms, sys: 3.28 ms, total: 192 ms
Wall time: 38.8 ms


## Training set error

In [27]:
# Model predictions
rmse_train = mean_squared_error(y_train, y_pred_train)**0.5
rmse_train

395.37694523786644

In [28]:
mae_train=mean_absolute_error(y_train, y_pred_train)
mae_train

224.81418314193445

In [29]:
r2r = r2_score(y_val, y_pred_val)
r2r

0.9827747362918281

## Model validation

In [30]:
#432
rmse_val = mean_squared_error(y_val, y_pred_val)**0.5
rmse_val

529.6318540534799

In [31]:
mae_val=mean_absolute_error(y_val, y_pred_val)
mae_val

271.76663438087195

In [32]:
r2r = r2_score(y_val, y_pred_val)
r2r

0.9827747362918281

## Model training without validation

In [33]:
%%time
# Model training
model.fit(X, y,eval_set=[(X_train,y_train),(X_val,y_val)],early_stopping_rounds=40)

[0]	validation_0-rmse:5048.64600	validation_1-rmse:5087.40723
[1]	validation_0-rmse:4560.85449	validation_1-rmse:4596.12500
[2]	validation_0-rmse:4121.59277	validation_1-rmse:4152.06201
[3]	validation_0-rmse:3728.06567	validation_1-rmse:3756.77759
[4]	validation_0-rmse:3371.70606	validation_1-rmse:3398.64233
[5]	validation_0-rmse:3053.12109	validation_1-rmse:3079.27905
[6]	validation_0-rmse:2768.07935	validation_1-rmse:2791.39380
[7]	validation_0-rmse:2510.64551	validation_1-rmse:2531.61426
[8]	validation_0-rmse:2279.86353	validation_1-rmse:2299.05103
[9]	validation_0-rmse:2073.26831	validation_1-rmse:2090.51440
[10]	validation_0-rmse:1888.02844	validation_1-rmse:1904.29358
[11]	validation_0-rmse:1722.48376	validation_1-rmse:1737.36975
[12]	validation_0-rmse:1574.47632	validation_1-rmse:1587.89307
[13]	validation_0-rmse:1441.59045	validation_1-rmse:1454.49231
[14]	validation_0-rmse:1324.77441	validation_1-rmse:1335.63403
[15]	validation_0-rmse:1220.93897	validation_1-rmse:1230.77588
[1

[134]	validation_0-rmse:423.55588	validation_1-rmse:420.35248
[135]	validation_0-rmse:422.49857	validation_1-rmse:419.05100
[136]	validation_0-rmse:421.97549	validation_1-rmse:418.44125
[137]	validation_0-rmse:420.94904	validation_1-rmse:417.35523
[138]	validation_0-rmse:420.05023	validation_1-rmse:416.32407
[139]	validation_0-rmse:419.57858	validation_1-rmse:416.10986
[140]	validation_0-rmse:419.42017	validation_1-rmse:415.89780
[141]	validation_0-rmse:419.27591	validation_1-rmse:415.69913
[142]	validation_0-rmse:418.31043	validation_1-rmse:414.67294
[143]	validation_0-rmse:418.00983	validation_1-rmse:414.27976
[144]	validation_0-rmse:417.86722	validation_1-rmse:414.12564
[145]	validation_0-rmse:416.68018	validation_1-rmse:413.29205
[146]	validation_0-rmse:416.07233	validation_1-rmse:412.15500
[147]	validation_0-rmse:416.01209	validation_1-rmse:412.07953
[148]	validation_0-rmse:415.84335	validation_1-rmse:411.88986
[149]	validation_0-rmse:414.78479	validation_1-rmse:410.43219
[150]	va

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=200, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=42,
             reg_alpha=1, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

## Test Preditions

In [34]:
predictions = model.predict(X_test)

In [35]:
predictions=pd.DataFrame(predictions)

In [36]:
predictions.reset_index(inplace=True)

In [37]:
predictions=predictions.rename({0: 'price','index': 'id'}, axis=1)

## Save Preditions

In [38]:
predictions.to_csv('../data/diamonds_predictions_cat_target_cross_target_codes_XGB.csv',index=False)