In [45]:
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
import warnings
warnings.filterwarnings(action='ignore')

In [46]:
# imports 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# datasets (NOTE: load_boston was removed in scikit-learn 1.2 and is not used in the cells shown here)
from sklearn.datasets import load_boston
#train test split
from sklearn.model_selection import train_test_split
# Hyperparameters selection
from sklearn.model_selection import RandomizedSearchCV
# Importing cross validation function from sklearn
from sklearn.model_selection import cross_val_score
# models
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
# K-Nearest Neighbor (KNN)
from sklearn.neighbors import KNeighborsRegressor
# NOTE(review): a KNN regressor with 4 neighbours is instantiated inside the
# imports cell; RegModel is not referenced by any cell visible in this view —
# confirm whether it is still needed.
RegModel = KNeighborsRegressor(n_neighbors=4)
# error
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error

In [47]:
# Import train data and discard the "Unnamed: 0" column read from the CSV
df_diamonds_train = pd.read_csv('../data/diamonds_train.csv')
del df_diamonds_train["Unnamed: 0"]
df_diamonds_train

Unnamed: 0,index_id,depth,table,x,y,z,price,carat,cut,color,clarity,city
0,5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91...,62.4,58.0,6.83,6.79,4.25,4268,1.21,Premium,J,VS2,Dubai
1,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d...,63.0,57.0,4.35,4.38,2.75,505,0.32,Very Good,H,VS2,Kimberly
2,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,65.5,55.0,5.62,5.53,3.65,2686,0.71,Fair,G,VS1,Las Vegas
3,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,63.8,56.0,4.68,4.72,3.00,738,0.41,Good,D,SI1,Kimberly
4,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328c...,60.5,59.0,6.55,6.51,3.95,4882,1.02,Ideal,G,SI1,Dubai
...,...,...,...,...,...,...,...,...,...,...,...,...
40450,f0bc79169405ebeb24e308055156b946ffd819db9b4f75...,62.7,57.0,7.10,7.04,4.43,10070,1.34,Ideal,G,VS1,Antwerp
40451,339916a23bf22b052b54cb2a9b36ee8418c1c68b46acad...,57.1,60.0,8.31,8.25,4.73,12615,2.02,Good,F,SI2,Madrid
40452,46957922b99954654c1deb8d854c3f069bf118b2ce9415...,62.7,56.0,6.37,6.42,4.01,5457,1.01,Ideal,H,SI1,Kimberly
40453,9d733392d362d5c6f1d9b9659b601c7d4b5a1c1c8df579...,61.9,54.3,4.45,4.47,2.76,456,0.33,Ideal,J,VS1,Kimberly


In [48]:
# Import test data (kept as read; the feature columns are selected in a later cell)
df_diamonds_test = pd.read_csv(
    '../data/diamonds_test.csv'
)
df_diamonds_test

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.20,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.90,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.90,Kimberly
4,4,0.50,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam
...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.57,Ideal,E,SI1,61.9,56.0,5.35,5.32,3.30,Amsterdam
13481,13481,0.71,Ideal,I,VS2,62.2,55.0,5.71,5.73,3.56,New York City
13482,13482,0.70,Ideal,F,VS1,61.6,55.0,5.75,5.71,3.53,Tel Aviv
13483,13483,0.70,Very Good,F,SI2,58.8,57.0,5.85,5.89,3.45,Surat


In [49]:
# 0. Feature groups fed to the model
# 'city' is excluded. The numeric columns are all kept even if they are
# correlated, because with tree models it is sometimes better to keep them.
num_features_list = ['x', 'y', 'z', 'depth', 'table', 'carat']
cat_features_list = ['cut', 'color', 'clarity']
# Full feature set: numeric columns first, then the categorical ones
features_list = num_features_list + cat_features_list

In [50]:
# 3.remove outliers
def remove_outliers(df, feature, whisker=1.5):
    """Return the rows of ``df`` whose ``feature`` value lies within the IQR fences.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR`` (both
    inclusive), where Q1/Q3 are the 25th/75th percentiles computed with the
    'midpoint' interpolation rule.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    feature : str
        Name of the numeric column to filter on.
    whisker : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` inside the fences. Rows where ``feature`` is NaN
        are dropped, because NaN fails both comparisons.

    Raises
    ------
    KeyError
        If ``feature`` is not a column of ``df``.
    """
    values = df[feature]
    # Series.quantile ignores NaN, so one missing value no longer turns both
    # quartiles into NaN (which made the filter return an empty frame). It also
    # avoids the deprecated ``interpolation=`` keyword of np.percentile.
    q1 = values.quantile(0.25, interpolation='midpoint')
    q3 = values.quantile(0.75, interpolation='midpoint')
    iqr = q3 - q1
    # Upper and lower fences
    lower_limit = q1 - whisker * iqr
    upper_limit = q3 + whisker * iqr
    # Keep only the rows inside the fences
    inside = (values >= lower_limit) & (values <= upper_limit)
    return df[inside]

In [51]:
# Filter the training data one numeric feature at a time, in the order
# x, y, z, depth, table, carat (the order of num_features_list). Each pass
# recomputes the fences on the rows that survived the previous pass.
for feature_name in num_features_list:
    df_diamonds_train = remove_outliers(df_diamonds_train, feature_name)

In [52]:
# 3. Split the cleaned training frame into target and feature matrix
y = df_diamonds_train['price']
X = df_diamonds_train.loc[:, features_list]

In [53]:
# 4. One-hot encode the categorical columns (cut, color, clarity) of the training features
X = pd.get_dummies(data=X, columns=cat_features_list)

In [54]:
# 5. Hold out 20% of the training data as a validation set (fixed seed)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [55]:
# 0. Select from the test data the same feature columns used for training
X_test = df_diamonds_test.loc[:, features_list]

In [56]:
# 3. One-hot encoding for categorical variables
X_test = pd.get_dummies(X_test, columns=cat_features_list)
# Train and test are encoded independently, so a category present in only one
# of them would give the two frames different columns. Align the test frame to
# the encoded training columns: add missing dummies as 0, drop unseen ones, and
# keep the same column order the model is fitted on.
X_test = X_test.reindex(columns=X.columns, fill_value=0)

In [75]:
# 1. XGBRegressor
# Settings are collected in one mapping so they are easy to read and tweak.
xgb_settings = dict(
    n_estimators=200,
    learning_rate=0.1,
    gamma=0,
    colsample_bytree=0.8,
    colsample_bylevel=1,
    colsample_bynode=1,
    reg_alpha=1,
    reg_lambda=1,
)
model = XGBRegressor(**xgb_settings)
# Snapshot of the estimator parameters at construction time
hyperparameters = model.get_params()
print(type(model), '\n')
print('Model hyperparameters:', hyperparameters, '\n')

<class 'xgboost.sklearn.XGBRegressor'> 

Model hyperparameters: {'objective': 'reg:squarederror', 'base_score': None, 'booster': None, 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 0.8, 'enable_categorical': False, 'gamma': 0.5, 'gpu_id': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.1, 'max_delta_step': None, 'max_depth': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'n_estimators': 200, 'n_jobs': None, 'num_parallel_tree': None, 'predictor': None, 'random_state': None, 'reg_alpha': 1, 'reg_lambda': 1, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None} 



In [76]:
%%time
# 1. XGBRegressor 
# Model training
model.fit(X_train, y_train,eval_set=[(X_train,y_train),(X_val,y_val)],early_stopping_rounds=40)

[0]	validation_0-rmse:4611.95410	validation_1-rmse:4460.72900
[1]	validation_0-rmse:4174.63281	validation_1-rmse:4031.69824
[2]	validation_0-rmse:3782.40405	validation_1-rmse:3650.27417
[3]	validation_0-rmse:3429.27979	validation_1-rmse:3305.98047
[4]	validation_0-rmse:3114.79370	validation_1-rmse:2998.91553
[5]	validation_0-rmse:2830.80664	validation_1-rmse:2722.45215
[6]	validation_0-rmse:2579.02686	validation_1-rmse:2476.26733
[7]	validation_0-rmse:2353.92334	validation_1-rmse:2257.95752
[8]	validation_0-rmse:2150.58520	validation_1-rmse:2059.06030
[9]	validation_0-rmse:1969.42432	validation_1-rmse:1883.13159
[10]	validation_0-rmse:1809.23828	validation_1-rmse:1727.04236
[11]	validation_0-rmse:1662.29700	validation_1-rmse:1584.23242
[12]	validation_0-rmse:1533.32398	validation_1-rmse:1458.09119
[13]	validation_0-rmse:1418.23108	validation_1-rmse:1346.66125
[14]	validation_0-rmse:1317.13538	validation_1-rmse:1249.98096
[15]	validation_0-rmse:1225.01209	validation_1-rmse:1160.62012
[1

[134]	validation_0-rmse:379.47073	validation_1-rmse:445.88409
[135]	validation_0-rmse:379.19287	validation_1-rmse:445.90082
[136]	validation_0-rmse:379.11883	validation_1-rmse:445.91547
[137]	validation_0-rmse:378.63089	validation_1-rmse:445.67242
[138]	validation_0-rmse:378.02786	validation_1-rmse:445.68811
[139]	validation_0-rmse:377.34665	validation_1-rmse:445.27744
[140]	validation_0-rmse:376.29071	validation_1-rmse:445.24316
[141]	validation_0-rmse:376.21649	validation_1-rmse:445.27402
[142]	validation_0-rmse:374.98770	validation_1-rmse:444.76672
[143]	validation_0-rmse:374.59879	validation_1-rmse:444.72266
[144]	validation_0-rmse:374.22934	validation_1-rmse:444.55133
[145]	validation_0-rmse:373.18085	validation_1-rmse:444.70892
[146]	validation_0-rmse:373.05039	validation_1-rmse:444.61795
[147]	validation_0-rmse:371.79294	validation_1-rmse:444.57535
[148]	validation_0-rmse:371.09369	validation_1-rmse:444.70148
[149]	validation_0-rmse:370.17850	validation_1-rmse:444.75052
[150]	va

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, enable_categorical=False,
             gamma=0.5, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=200, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=1,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [68]:
%%time
# Predict the target for the validation split and show the type of the result
y_pred_val = model.predict(X_val)
print(type(y_pred_val))

<class 'numpy.ndarray'>
CPU times: user 103 ms, sys: 3.24 ms, total: 106 ms
Wall time: 21.8 ms


In [69]:
%%time
# Predict the target for the training split (used for the train-side metrics)
y_pred_train = model.predict(X_train)
print(type(y_pred_train))

<class 'numpy.ndarray'>
CPU times: user 306 ms, sys: 6.28 ms, total: 312 ms
Wall time: 51.6 ms


In [70]:
%%time
# Training RMSE: square root of the mean squared error on the training split.
# Experiment log — presumably training RMSE from earlier runs (confirm):
#   with outliers 406, without outliers 356
#   with volume 344.8066667454747
#   without volume, with mean target 346.63
#   without volume, with mean target and mean cross target 329.2980362680159
#   without volume, with mean cross target 347.51
#   without volume, with mean and std target 381
#   with volume, with std target 353
#   with volume, without std 332.33917739765695
rmse_train = mean_squared_error(y_train, y_pred_train)**0.5
rmse_train

CPU times: user 1.9 ms, sys: 1.23 ms, total: 3.12 ms
Wall time: 2.09 ms


315.9886847591365

In [71]:
# Mean absolute error on the training split
mae_train = mean_absolute_error(y_true=y_train, y_pred=y_pred_train)
mae_train

184.43244143118739

In [72]:
%%time
# Recompute the validation predictions (same call as the earlier
# validation-prediction cell) so the metrics below use the current model
y_pred_val = model.predict(X_val)
print(type(y_pred_val))

<class 'numpy.ndarray'>
CPU times: user 111 ms, sys: 3.93 ms, total: 115 ms
Wall time: 20.1 ms


In [73]:
# Validation RMSE: square root of the mean squared error on the validation split.
# Experiment log — presumably validation RMSE from earlier runs (confirm):
#   432
#   with outliers 541, without outliers 442
#   with volume 447
#   without volume, with mean target 429
#   without volume, with mean target and mean cross target 436
#   without volume, with mean cross target 445.953
#   without volume, with mean and std target 431
#   with volume, with std target 430
#   with volume and with mean target 432
rmse_val = mean_squared_error(y_val, y_pred_val)**0.5
rmse_val

438.165452712901

In [74]:
# Mean absolute error on the validation split
mae_val = mean_absolute_error(y_true=y_val, y_pred=y_pred_val)
mae_val

232.91249201784962

In [153]:
%%time
# Model training
# Refit the same estimator on the full encoded training data (X, y), with no
# eval_set and no early stopping.
model.fit(X, y)
print('Model:', model, '\n')
# NOTE(review): `hyperparameters` is the dict captured earlier with
# model.get_params(); it is not refreshed after fitting.
print('Model hyperparameters:', hyperparameters, '\n')

Model: XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=200, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=1,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None) 

Model hyperparameters: {'objective': 'reg:squarederror', 'base_score': None, 'booster': None, 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 0.8, 'enable_categorical': False, 'gamma': 0, 'gpu_id': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.1, 'max_delta_step': None, 'max_depth': None, 'min_child_weight': No