## 0. Libraries import

In [2]:
# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# imports 
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", 50)
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
#train test split
from sklearn.model_selection import train_test_split
# Hyperparameters selection
from sklearn.model_selection import RandomizedSearchCV
# Importing cross validation function from sklearn
from sklearn.model_selection import cross_val_score
# models
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import RANSACRegressor
# error
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error

## 1. Data import

### 1.1 Impor train data

In [4]:
df_diamonds_train=pd.read_csv('../data/diamonds_train.csv')
df_diamonds_train.pop("Unnamed: 0") #dropped an unnecessary column
df_diamonds_train

Unnamed: 0,index_id,depth,table,x,y,z,price,carat,cut,color,clarity,city
0,5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91...,62.4,58.0,6.83,6.79,4.25,4268,1.21,Premium,J,VS2,Dubai
1,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d...,63.0,57.0,4.35,4.38,2.75,505,0.32,Very Good,H,VS2,Kimberly
2,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,65.5,55.0,5.62,5.53,3.65,2686,0.71,Fair,G,VS1,Las Vegas
3,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,63.8,56.0,4.68,4.72,3.00,738,0.41,Good,D,SI1,Kimberly
4,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328c...,60.5,59.0,6.55,6.51,3.95,4882,1.02,Ideal,G,SI1,Dubai
...,...,...,...,...,...,...,...,...,...,...,...,...
40450,f0bc79169405ebeb24e308055156b946ffd819db9b4f75...,62.7,57.0,7.10,7.04,4.43,10070,1.34,Ideal,G,VS1,Antwerp
40451,339916a23bf22b052b54cb2a9b36ee8418c1c68b46acad...,57.1,60.0,8.31,8.25,4.73,12615,2.02,Good,F,SI2,Madrid
40452,46957922b99954654c1deb8d854c3f069bf118b2ce9415...,62.7,56.0,6.37,6.42,4.01,5457,1.01,Ideal,H,SI1,Kimberly
40453,9d733392d362d5c6f1d9b9659b601c7d4b5a1c1c8df579...,61.9,54.3,4.45,4.47,2.76,456,0.33,Ideal,J,VS1,Kimberly


### 1.2 Impor test data

In [5]:
df_diamonds_test=pd.read_csv('../data/diamonds_test.csv')
df_diamonds_test

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.20,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.90,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.90,Kimberly
4,4,0.50,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam
...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.57,Ideal,E,SI1,61.9,56.0,5.35,5.32,3.30,Amsterdam
13481,13481,0.71,Ideal,I,VS2,62.2,55.0,5.71,5.73,3.56,New York City
13482,13482,0.70,Ideal,F,VS1,61.6,55.0,5.75,5.71,3.53,Tel Aviv
13483,13483,0.70,Very Good,F,SI2,58.8,57.0,5.85,5.89,3.45,Surat


## 2. Data preparation (training set)

### 2.1 Defining numerical and categorical features

In [6]:
# Excluding city (with trees sometimes is better to keep variables even if they are correlated)
num_features_list=['x','y','z','depth','table','carat']
cat_features_list=['cut','color','clarity']
target='price'
features_list=['x','y','z','depth','table','carat','cut','color','clarity']

### 2.2 Checking if zero/null values

In [7]:
df_diamonds_train[num_features_list].isna().sum()

x        0
y        0
z        0
depth    0
table    0
carat    0
dtype: int64

In [8]:
if 0 in df_diamonds_train[num_features_list]:
    print('0 values')
else:
    print('No 0 values')

No 0 values


### 2.3 Remove outliers

In [9]:
def remove_outliers(df,feature):
    # IQR
    Q1 = np.percentile(df[feature], 25,
                   interpolation = 'midpoint')
    Q3 = np.percentile(df[feature], 75,
                   interpolation = 'midpoint')
    IQR = Q3 - Q1
    # Upper and lower
    upper_limit=Q3+1.5*IQR
    lower_limit=Q1-1.5*IQR
    # Removing the Outliers
    return df[(df[feature]>=lower_limit) & (df[feature]<=upper_limit)]

In [10]:
df_diamonds_train=remove_outliers(df_diamonds_train,'x')
df_diamonds_train=remove_outliers(df_diamonds_train,'y')
df_diamonds_train=remove_outliers(df_diamonds_train,'z')
df_diamonds_train=remove_outliers(df_diamonds_train,'depth')
df_diamonds_train=remove_outliers(df_diamonds_train,'table')
df_diamonds_train=remove_outliers(df_diamonds_train,'carat')

In [11]:
df_diamonds_train.shape

(37264, 12)

### 3. Feature engineering (training set)

### 3.5 One hot encoding

In [12]:
# 3. Defining features y target
#features_list_volume=['x', 'y', 'z', 'depth', 'table', 'carat', 'cut', 'color', 'clarity','volume']
X=df_diamonds_train[features_list]
y=df_diamonds_train['price']

In [13]:
# 4.One-hot encoding for categorical variables
X=pd.get_dummies(X,columns=cat_features_list,drop_first=True)

In [14]:
X.head()

Unnamed: 0,x,y,z,depth,table,carat,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_E,color_F,color_G,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,6.83,6.79,4.25,62.4,58.0,1.21,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0
1,4.35,4.38,2.75,63.0,57.0,0.32,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0
3,4.68,4.72,3.0,63.8,56.0,0.41,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,6.55,6.51,3.95,60.5,59.0,1.02,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0
5,7.45,7.39,4.54,61.2,57.0,1.52,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0


### 3.6 Define train and validation

In [15]:
# 5.Splitting train and test
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
X_train.shape

(29811, 23)

In [17]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29811 entries, 11656 to 17132
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   x              29811 non-null  float64
 1   y              29811 non-null  float64
 2   z              29811 non-null  float64
 3   depth          29811 non-null  float64
 4   table          29811 non-null  float64
 5   carat          29811 non-null  float64
 6   cut_Good       29811 non-null  uint8  
 7   cut_Ideal      29811 non-null  uint8  
 8   cut_Premium    29811 non-null  uint8  
 9   cut_Very Good  29811 non-null  uint8  
 10  color_E        29811 non-null  uint8  
 11  color_F        29811 non-null  uint8  
 12  color_G        29811 non-null  uint8  
 13  color_H        29811 non-null  uint8  
 14  color_I        29811 non-null  uint8  
 15  color_J        29811 non-null  uint8  
 16  clarity_IF     29811 non-null  uint8  
 17  clarity_SI1    29811 non-null  uint8  
 18  cl

## 4. Data preparation (test set)

## 5. Feature engineering (test set) 

In [18]:
# 0. Adapting categorical features for validation model
X_test=df_diamonds_test[features_list]

### 5.5 One hot encoding

In [19]:
# 3. One-hot encoding for categorical variables
X_test=pd.get_dummies(X_test,columns=cat_features_list,drop_first=True)

In [20]:
X_test.shape

(13485, 23)

In [21]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13485 entries, 0 to 13484
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   x              13485 non-null  float64
 1   y              13485 non-null  float64
 2   z              13485 non-null  float64
 3   depth          13485 non-null  float64
 4   table          13485 non-null  float64
 5   carat          13485 non-null  float64
 6   cut_Good       13485 non-null  uint8  
 7   cut_Ideal      13485 non-null  uint8  
 8   cut_Premium    13485 non-null  uint8  
 9   cut_Very Good  13485 non-null  uint8  
 10  color_E        13485 non-null  uint8  
 11  color_F        13485 non-null  uint8  
 12  color_G        13485 non-null  uint8  
 13  color_H        13485 non-null  uint8  
 14  color_I        13485 non-null  uint8  
 15  color_J        13485 non-null  uint8  
 16  clarity_IF     13485 non-null  uint8  
 17  clarity_SI1    13485 non-null  uint8  
 18  clarit

## Model definition - RandomForestRegressor - with Random Hyperparameter Grid

RandomForestRegressor: multiple trees in paralel changing samples and convining diferrent features (overfitting when the tree is big and good to reduce error variance)

Main Parameters:
   - bootstrap -> method for sampling data points (TRUE bagging and FALSE pasting, with/without replacement)
   - n_estimators -> number of trees in the foreset
   - max_depth -> max number of levels in each decision tree
   - max_features -> max number of features considered for splitting a node
   - ccp_alpha ->
   - criterion ->
   - max_leaf_nodes -> max number of solution nodes 
   - max_samples ->
   - min_impurity_decrease ->
   - min_samples_leaf -> min number of data points allowed in a leaf node
   - min_samples_split -> min number of data points placed in a node before the node is split
   - min_weight_fraction_leaf
   - n_estimators -> number of trees in the foreset
   - n_jobs 
   - oob_score 
   - random_state
   - verbose
   - warm_start   

In [21]:
# 0. Random Hyperparameter Grid - Grid definition

#n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)] # Number of trees in random forest
#max_features = ['auto', 'sqrt'] # Number of features to consider at every split
#max_depth = [int(x) for x in np.linspace(10, 110, num = 11)] # Maximum number of levels in tree
#max_depth.append(None)
#min_samples_split = [2, 5, 10] # Minimum number of samples required to split a node
#min_samples_leaf = [1, 2, 4] # Minimum number of samples required at each leaf node
#bootstrap = [True, False] # Method of selecting samples for training each tree

# Create the random grid
#random_grid = {'n_estimators': n_estimators,
#               'max_features': max_features,
#               'max_depth': max_depth,
#               'min_samples_split': min_samples_split,
#               'min_samples_leaf': min_samples_leaf,
#               'bootstrap': bootstrap}
#print(random_grid)

In [22]:
#%%time
#1. Random Hyperparameter Grid - Use the random grid to search for best hyperparameters

# First create the base model to tune
#rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
#search across 100 different combinations, and use all available cores
#rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100,
#                               cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
#rf_random.fit(X_train, y_train)

In [23]:
#rf_random.best_params_

In [24]:
#best_params={'n_estimators': 1600,
# 'min_samples_split': 5,
# 'min_samples_leaf': 1,
# 'max_features': 'auto',
# 'max_depth': 90,
# 'bootstrap': True}

#(n_estimators=100,min_samples_split=5,min_samples_leaf=1,
#                              max_features='auto',max_depth=10,bootstrap=True)

#n_estimators=100,min_samples_split=5,min_samples_leaf=1,
#                              max_features='auto',max_depth=40,bootstrap=True

In [30]:
# 2. RandomForestRegressor definition
model = RandomForestRegressor()
hyperparameters = model.get_params()
print(type(model), '\n')
print('Model hyperparameters:', hyperparameters, '\n')

<class 'sklearn.ensemble._forest.RandomForestRegressor'> 

Model hyperparameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False} 



In [223]:
#Importance
#feats = {} # a dict to hold feature_name: feature_importance
#for feature, importance in zip(X_train.columns, model.feature_importances_):
#    feats[feature] = importance #add the name/value pair 

In [224]:
#Importance=pd.DataFrame(list(feats.items()),columns = ['feature','importance']).sort_values('importance',ascending=False)

In [18]:
# 1. XGBRegressor 
model = XGBRegressor(n_estimators=200,colsample_bylevel=1,colsample_bynode=1,
                     colsample_bytree=0.8,reg_alpha=1, reg_lambda=1,gamma=0,learning_rate=0.1)
hyperparameters = model.get_params()
print(type(model), '\n')
print('Model hyperparameters:', hyperparameters, '\n')

<class 'xgboost.sklearn.XGBRegressor'> 

Model hyperparameters: {'objective': 'reg:squarederror', 'base_score': None, 'booster': None, 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 0.8, 'enable_categorical': False, 'gamma': 0, 'gpu_id': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.1, 'max_delta_step': None, 'max_depth': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'n_estimators': 200, 'n_jobs': None, 'num_parallel_tree': None, 'predictor': None, 'random_state': None, 'reg_alpha': 1, 'reg_lambda': 1, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None} 



In [551]:
#model = KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
#                    metric_params=None, n_jobs=None, n_neighbors=4, p=2,
#                    weights='uniform')

## Model training with validation

In [22]:
model = RANSACRegressor()

In [26]:
model = SVR()

In [31]:
model.fit(X_train, y_train)

RandomForestRegressor()

In [29]:
model.fit(X_train, y_train)

RandomForestRegressor()

In [19]:
%%time
# 1. XGBRegressor 
# Model training
model.fit(X_train, y_train,eval_set=[(X_train,y_train),(X_val,y_val)],early_stopping_rounds=40)

[0]	validation_0-rmse:5058.52978	validation_1-rmse:5095.90723
[1]	validation_0-rmse:4580.20312	validation_1-rmse:4614.45703
[2]	validation_0-rmse:4152.52734	validation_1-rmse:4184.85938
[3]	validation_0-rmse:3769.29272	validation_1-rmse:3801.58594
[4]	validation_0-rmse:3425.26416	validation_1-rmse:3456.74414
[5]	validation_0-rmse:3116.66162	validation_1-rmse:3147.00928
[6]	validation_0-rmse:2842.10474	validation_1-rmse:2871.09399
[7]	validation_0-rmse:2595.35742	validation_1-rmse:2622.51392
[8]	validation_0-rmse:2376.74512	validation_1-rmse:2402.80054
[9]	validation_0-rmse:2179.80981	validation_1-rmse:2206.65356
[10]	validation_0-rmse:2005.90430	validation_1-rmse:2031.19312
[11]	validation_0-rmse:1851.63721	validation_1-rmse:1878.36792
[12]	validation_0-rmse:1713.63660	validation_1-rmse:1740.35815
[13]	validation_0-rmse:1591.86853	validation_1-rmse:1618.89917
[14]	validation_0-rmse:1482.77454	validation_1-rmse:1510.45825
[15]	validation_0-rmse:1387.25098	validation_1-rmse:1415.16663
[1

[134]	validation_0-rmse:513.72613	validation_1-rmse:624.11987
[135]	validation_0-rmse:512.85113	validation_1-rmse:623.77478
[136]	validation_0-rmse:511.78796	validation_1-rmse:623.22815
[137]	validation_0-rmse:511.42145	validation_1-rmse:623.16656
[138]	validation_0-rmse:508.99515	validation_1-rmse:621.25214
[139]	validation_0-rmse:507.34613	validation_1-rmse:619.82977
[140]	validation_0-rmse:506.71045	validation_1-rmse:619.62329
[141]	validation_0-rmse:505.91361	validation_1-rmse:619.12457
[142]	validation_0-rmse:505.44211	validation_1-rmse:618.75452
[143]	validation_0-rmse:503.46289	validation_1-rmse:618.14435
[144]	validation_0-rmse:502.05789	validation_1-rmse:617.00220
[145]	validation_0-rmse:501.13965	validation_1-rmse:617.21002
[146]	validation_0-rmse:500.66193	validation_1-rmse:617.05237
[147]	validation_0-rmse:500.25485	validation_1-rmse:616.72296
[148]	validation_0-rmse:498.96271	validation_1-rmse:616.20892
[149]	validation_0-rmse:498.27301	validation_1-rmse:615.92578
[150]	va

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=200, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=1,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [40]:
%%time
# Model predictions
y_pred_val = model.predict(X_val)

CPU times: user 201 ms, sys: 4.53 ms, total: 206 ms
Wall time: 205 ms


In [41]:
%%time
# Model predictions
y_pred_train = model.predict(X_train)

CPU times: user 619 ms, sys: 4.27 ms, total: 623 ms
Wall time: 624 ms


## Training set error

In [42]:
# Model predictions
rmse_train = mean_squared_error(y_train, y_pred_train)**0.5
rmse_train

220.43783397146822

In [43]:
mae_train=mean_absolute_error(y_train, y_pred_train)
mae_train

99.6204692195754

In [44]:
r2r = r2_score(y_val, y_pred_val)
r2r

0.9781443037547919

## Model validation

In [45]:
#432
rmse_val = mean_squared_error(y_val, y_pred_val)**0.5
rmse_val

511.4902744716955

In [46]:
mae_val=mean_absolute_error(y_val, y_pred_val)
mae_val

247.48197874457713

In [47]:
r2r = r2_score(y_val, y_pred_val)
r2r

0.9781443037547919

## Model training without validation

In [48]:
%%time
# Model training
model.fit(X, y,eval_set=[(X_train,y_train),(X_val,y_val)],early_stopping_rounds=40)

TypeError: fit() got an unexpected keyword argument 'eval_set'

## Test Preditions

In [49]:
predictions = model.predict(X_test)

In [50]:
predictions=pd.DataFrame(predictions)

In [51]:
predictions.reset_index(inplace=True)

In [463]:
predictions=predictions.rename({0: 'price','index': 'id'}, axis=1)

## Save Preditions

In [464]:
predictions.to_csv('../data/diamonds_predictions_XGBRegressor_Cross_Target.csv',index=False)