In [1]:
import pandas as pd



from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error


## Import training and test datasets

In [2]:
# Read the training frame and the test frame from their CSV files
df = pd.read_csv(filepath_or_buffer="../data/df_baseline.csv")
test = pd.read_csv(filepath_or_buffer="../data/tests/diamonds_test.csv")

## Analysis of both datasets

In [3]:
df.describe()

Unnamed: 0,price,carat,depth,table,x,y,z
count,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0
mean,3928.444469,0.797706,61.752841,57.446133,5.729392,5.732819,3.537154
std,3992.416147,0.475544,1.431725,2.233535,1.124453,1.14665,0.697062
min,326.0,0.2,43.0,43.0,0.0,0.0,0.0
25%,945.0,0.4,61.0,56.0,4.71,4.72,2.91
50%,2397.0,0.7,61.8,57.0,5.69,5.71,3.52
75%,5331.0,1.04,62.5,59.0,6.54,6.54,4.035
max,18823.0,4.5,79.0,95.0,10.23,58.9,8.06


In [4]:
test.describe()

Unnamed: 0,id,carat,depth,table,x,y,z
count,13485.0,13485.0,13485.0,13485.0,13485.0,13485.0,13485.0
mean,6742.0,0.798642,61.739095,57.490337,5.736454,5.739648,3.543474
std,3892.928525,0.469399,1.43531,2.237109,1.113671,1.128507,0.731005
min,0.0,0.2,50.8,51.0,0.0,0.0,0.0
25%,3371.0,0.4,61.0,56.0,4.73,4.73,2.92
50%,6742.0,0.7,61.9,57.0,5.7,5.72,3.53
75%,10113.0,1.04,62.5,59.0,6.53,6.53,4.04
max,13484.0,5.01,79.0,73.0,10.74,31.8,31.8


In [5]:
df

Unnamed: 0,price,carat,city,depth,table,x,y,z,clarity,color,cut
0,505,0.32,Kimberly,63.0,57.0,4.35,4.38,2.75,VS2,H,Very Good
1,2686,0.71,Las Vegas,65.5,55.0,5.62,5.53,3.65,VS1,G,Fair
2,738,0.41,Kimberly,63.8,56.0,4.68,4.72,3.00,SI1,D,Good
3,9057,1.52,Tel Aviv,61.2,57.0,7.45,7.39,4.54,SI2,F,Ideal
4,557,0.31,Zurich,61.8,54.2,4.33,4.37,2.69,VS1,H,Ideal
...,...,...,...,...,...,...,...,...,...,...,...
40450,1116,0.35,Amsterdam,62.2,58.0,4.52,4.45,2.79,IF,G,Premium
40451,975,0.40,Luxembourg,62.4,55.0,4.74,4.71,2.95,VS2,E,Ideal
40452,8572,1.57,Tel Aviv,61.1,61.0,7.46,7.43,4.55,SI2,G,Premium
40453,5154,0.73,Paris,61.4,56.0,5.80,5.81,3.56,VVS1,F,Ideal


In [6]:
df.head(10)

Unnamed: 0,price,carat,city,depth,table,x,y,z,clarity,color,cut
0,505,0.32,Kimberly,63.0,57.0,4.35,4.38,2.75,VS2,H,Very Good
1,2686,0.71,Las Vegas,65.5,55.0,5.62,5.53,3.65,VS1,G,Fair
2,738,0.41,Kimberly,63.8,56.0,4.68,4.72,3.0,SI1,D,Good
3,9057,1.52,Tel Aviv,61.2,57.0,7.45,7.39,4.54,SI2,F,Ideal
4,557,0.31,Zurich,61.8,54.2,4.33,4.37,2.69,VS1,H,Ideal
5,6741,1.14,Antwerp,61.7,56.0,6.71,6.75,4.15,SI1,G,Ideal
6,1040,0.44,Las Vegas,62.3,59.0,4.88,4.84,3.03,SI1,E,Premium
7,4284,1.02,Tel Aviv,58.0,58.0,6.84,6.6,3.9,SI1,I,Premium
8,3101,0.93,Madrid,62.3,57.0,6.2,6.22,3.87,SI2,I,Ideal
9,14507,1.51,Paris,60.1,58.0,7.48,7.45,4.49,VS1,G,Premium


In [7]:
test

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.20,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.90,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.90,Kimberly
4,4,0.50,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam
...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.57,Ideal,E,SI1,61.9,56.0,5.35,5.32,3.30,Amsterdam
13481,13481,0.71,Ideal,I,VS2,62.2,55.0,5.71,5.73,3.56,New York City
13482,13482,0.70,Ideal,F,VS1,61.6,55.0,5.75,5.71,3.53,Tel Aviv
13483,13483,0.70,Very Good,F,SI2,58.8,57.0,5.85,5.89,3.45,Surat


## Preprocessing

In [8]:
# Eliminate 'x' outliers: keep only rows whose 'x' lies strictly between 0.05 and 90
df = df[(df['x'] > 0.05) & (df['x'] < 90)]

In [9]:
# Eliminate 'y' outliers: keep only rows whose 'y' lies strictly between 0.05 and 50
df = df[(df['y'] > 0.05) & (df['y'] < 50)]

In [10]:
# Eliminate 'z' outliers: keep only rows whose 'z' is greater than 2
df = df[df['z'] > 2]


In [11]:
# Eliminate 'table' outliers: keep only rows whose 'table' is below 80
df = df.loc[df['table'] < 80]

In [12]:
# Test rows are overwritten rather than dropped: 'x' values below 0.2 (treated
# as input errors) become 4.73, the 25% quantile of 'x' shown by test.describe()
test.loc[test['x'].lt(0.2), 'x'] = 4.73

In [13]:
# 'y' values below 0.2 in the test set (treated as input errors) become 4.73,
# the 25% quantile of 'y' shown by test.describe()
test.loc[test['y'].lt(0.2), 'y'] = 4.73

In [14]:
# 'z' values below 0.2 in the test set (treated as input errors) become 2.91,
# the 25% quantile of 'z' shown by df.describe() for the training frame
test.loc[test['z'].lt(0.2), 'z'] = 2.91

## Feature engineering

In [15]:
# Length / Width ratio of each training row: 'x' divided by 'y'
df['L/W ratio'] = df['x'].div(df['y'])

In [16]:
# Length / Width ratio of each test row: 'x' divided by 'y'
test['L/W ratio'] = test['x'].div(test['y'])

In [17]:
# Combined feature for the training rows: carat divided by table, then multiplied by depth
df['V'] = df['carat'].div(df['table']).mul(df['depth'])

In [18]:
# Combined feature for the test rows: carat divided by table, then multiplied by depth
test['V'] = test['carat'].div(test['table']).mul(test['depth'])

In [19]:
# Function used to build a new column: bins combining depth and table
def custom_sort(row):
    """Return a bin number from 1 to 5 for one row.

    ``row`` must support ``row['depth']`` and ``row['table']``. The nested
    windows below are tried from the narrowest to the widest, with strict
    (exclusive) bounds on both values; the first window that contains both
    gives the bin. A row that fits no window gets 1.
    """
    # (bin, depth_low, depth_high, table_low, table_high)
    windows = (
        (5, 60, 62, 53, 57),
        (4, 58.5, 64, 52, 60),
        (3, 57.5, 64.5, 51, 65),
        (2, 56.5, 67, 50, 67),
    )
    for category, depth_lo, depth_hi, table_lo, table_hi in windows:
        if depth_lo < row['depth'] < depth_hi and table_lo < row['table'] < table_hi:
            return category
    return 1


In [20]:
# Bin every training row with custom_sort, then preview the frame ordered by that bin
df['sorting_category'] = df.apply(custom_sort, axis='columns')

df.sort_values(by='sorting_category')

Unnamed: 0,price,carat,city,depth,table,x,y,z,clarity,color,cut,L/W ratio,V,sorting_category
25373,4398,1.02,Kimberly,70.6,57.0,6.08,6.01,4.27,SI1,D,Fair,1.011647,1.263368,1
10962,4320,1.45,Surat,64.8,67.0,7.16,7.00,4.64,I1,F,Fair,1.022857,1.402388,1
21180,474,0.29,Paris,62.8,44.0,4.20,4.24,2.65,VS1,E,Very Good,0.990566,0.413909,1
36365,829,0.47,Madrid,67.9,56.0,4.84,4.65,3.24,SI1,E,Fair,1.040860,0.569875,1
2392,3142,1.00,Dubai,68.3,54.0,6.21,6.12,4.20,SI2,G,Fair,1.014706,1.264815,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5572,2917,0.71,Dubai,61.2,56.0,5.77,5.73,3.52,VS2,F,Ideal,1.006981,0.775929,5
24137,3360,0.70,Las Vegas,61.4,55.0,5.71,5.75,3.52,VS2,D,Ideal,0.993043,0.781455,5
17070,1034,0.31,Tel Aviv,61.1,56.0,4.38,4.39,2.68,VVS2,D,Ideal,0.997722,0.338232,5
5560,2455,0.71,Zurich,61.9,56.0,5.68,5.72,3.53,SI1,E,Very Good,0.993007,0.784804,5


In [21]:
# Bin every test row with custom_sort, then preview the frame ordered by that bin
test['sorting_category'] = test.apply(custom_sort, axis='columns')

test.sort_values(by='sorting_category')

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city,L/W ratio,V,sorting_category
5813,5813,1.01,Fair,D,VS2,59.1,68.0,6.56,6.44,3.84,Paris,1.018634,0.877809,1
11225,11225,1.91,Fair,I,VS1,59.5,68.0,8.10,7.91,4.78,Tel Aviv,1.024020,1.671250,1
8180,8180,0.97,Fair,J,SI2,60.8,67.0,6.41,6.32,3.87,Las Vegas,1.014241,0.880239,1
6062,6062,0.46,Fair,G,SI1,63.4,68.0,4.88,4.80,3.07,Antwerp,1.016667,0.428882,1
7193,7193,0.50,Good,H,VS2,56.3,65.0,5.21,5.24,2.94,Surat,0.994275,0.433077,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10446,10446,0.40,Ideal,I,VVS2,61.8,56.0,4.76,4.79,2.95,London,0.993737,0.441429,5
1895,1895,1.21,Ideal,E,SI1,61.8,56.0,6.82,6.86,4.23,Antwerp,0.994169,1.335321,5
10452,10452,0.42,Ideal,G,VVS2,61.3,56.0,4.82,4.81,2.95,Zurich,1.002079,0.459750,5
4919,4919,0.54,Ideal,G,VVS1,61.3,56.0,5.28,5.23,3.22,Kimberly,1.009560,0.591107,5


In [22]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13485 entries, 0 to 13484
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                13485 non-null  int64  
 1   carat             13485 non-null  float64
 2   cut               13485 non-null  object 
 3   color             13485 non-null  object 
 4   clarity           13485 non-null  object 
 5   depth             13485 non-null  float64
 6   table             13485 non-null  float64
 7   x                 13485 non-null  float64
 8   y                 13485 non-null  float64
 9   z                 13485 non-null  float64
 10  city              13485 non-null  object 
 11  L/W ratio         13485 non-null  float64
 12  V                 13485 non-null  float64
 13  sorting_category  13485 non-null  int64  
dtypes: float64(8), int64(2), object(4)
memory usage: 1.4+ MB


In [23]:
test.describe()

Unnamed: 0,id,carat,depth,table,x,y,z,L/W ratio,V,sorting_category
count,13485.0,13485.0,13485.0,13485.0,13485.0,13485.0,13485.0,13485.0,13485.0,13485.0
mean,6742.0,0.798642,61.739095,57.490337,5.737506,5.7407,3.544337,0.999519,0.85613,3.904783
std,3892.928525,0.469399,1.43531,2.237109,1.11048,1.125355,0.728534,0.012935,0.501066,0.712734
min,0.0,0.2,50.8,51.0,3.73,3.68,1.41,0.16195,0.196667,1.0
25%,3371.0,0.4,61.0,56.0,4.73,4.73,2.92,0.992669,0.430345,4.0
50%,6742.0,0.7,61.9,57.0,5.7,5.72,3.53,0.995781,0.756491,4.0
75%,10113.0,1.04,62.5,59.0,6.53,6.53,4.04,1.006993,1.138259,4.0
max,13484.0,5.01,79.0,73.0,10.74,31.8,31.8,1.601415,5.561949,5.0


## Encode categorical variables

In [24]:

# Ordinal-encode 'color' (J -> 1 ... D -> 7) with a single lookup instead of
# seven masked assignments, and store the result as integers rather than
# leaving an object-dtype column behind.
color_rank = {'J': 1, 'I': 2, 'H': 3, 'G': 4, 'F': 5, 'E': 6, 'D': 7}
# Values that are already encoded pass through unchanged, so re-running the
# cell is harmless; an unexpected label makes astype(int) raise instead of
# silently staying in the column.
df['color'] = df['color'].map(lambda label: color_rank.get(label, label)).astype(int)

In [25]:
# Ordinal-encode 'color' in the test set (J -> 1 ... D -> 7) with a single
# lookup instead of seven masked assignments, and store the result as
# integers rather than leaving an object-dtype column behind.
color_rank = {'J': 1, 'I': 2, 'H': 3, 'G': 4, 'F': 5, 'E': 6, 'D': 7}
# Values that are already encoded pass through unchanged, so re-running the
# cell is harmless; an unexpected label makes astype(int) raise instead of
# silently staying in the column.
test['color'] = test['color'].map(lambda label: color_rank.get(label, label)).astype(int)

In [26]:
# Ordinal-encode 'clarity' (I1 -> 1 ... IF -> 8) with a single lookup instead
# of eight masked assignments, and store the result as integers rather than
# leaving an object-dtype column behind.
clarity_rank = {'I1': 1, 'SI2': 2, 'SI1': 3, 'VS2': 4,
                'VS1': 5, 'VVS2': 6, 'VVS1': 7, 'IF': 8}
# Values that are already encoded pass through unchanged, so re-running the
# cell is harmless; an unexpected label makes astype(int) raise instead of
# silently staying in the column.
df['clarity'] = df['clarity'].map(lambda label: clarity_rank.get(label, label)).astype(int)

In [27]:
# Ordinal-encode 'clarity' in the test set (I1 -> 1 ... IF -> 8) with a single
# lookup instead of eight masked assignments, and store the result as
# integers rather than leaving an object-dtype column behind.
clarity_rank = {'I1': 1, 'SI2': 2, 'SI1': 3, 'VS2': 4,
                'VS1': 5, 'VVS2': 6, 'VVS1': 7, 'IF': 8}
# Values that are already encoded pass through unchanged, so re-running the
# cell is harmless; an unexpected label makes astype(int) raise instead of
# silently staying in the column.
test['clarity'] = test['clarity'].map(lambda label: clarity_rank.get(label, label)).astype(int)

In [28]:
# Ordinal-encode 'cut' (Fair -> 1 ... Ideal -> 5) with a single lookup instead
# of five masked assignments, and store the result as integers rather than
# leaving an object-dtype column behind.
cut_rank = {'Fair': 1, 'Good': 2, 'Very Good': 3, 'Premium': 4, 'Ideal': 5}
# Values that are already encoded pass through unchanged, so re-running the
# cell is harmless; an unexpected label makes astype(int) raise instead of
# silently staying in the column.
df['cut'] = df['cut'].map(lambda label: cut_rank.get(label, label)).astype(int)

In [29]:
# Ordinal-encode 'cut' in the test set (Fair -> 1 ... Ideal -> 5) with a
# single lookup instead of five masked assignments, and store the result as
# integers rather than leaving an object-dtype column behind.
cut_rank = {'Fair': 1, 'Good': 2, 'Very Good': 3, 'Premium': 4, 'Ideal': 5}
# Values that are already encoded pass through unchanged, so re-running the
# cell is harmless; an unexpected label makes astype(int) raise instead of
# silently staying in the column.
test['cut'] = test['cut'].map(lambda label: cut_rank.get(label, label)).astype(int)

## Training the model with GradientBoosting and GridSearch

In [30]:
# Feature matrix used for training: the engineered columns plus the encoded and raw ones
X = df.loc[:, ['L/W ratio', 'V', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'sorting_category']]
X

Unnamed: 0,L/W ratio,V,carat,cut,color,clarity,depth,table,sorting_category
0,0.993151,0.353684,0.32,3,3,4,63.0,57.0,4
1,1.016275,0.845545,0.71,1,4,5,65.5,55.0,2
2,0.991525,0.467107,0.41,2,7,3,63.8,56.0,4
3,1.008119,1.632000,1.52,5,5,2,61.2,57.0,4
4,0.990847,0.353469,0.31,5,3,5,61.8,54.2,5
...,...,...,...,...,...,...,...,...,...
40450,1.015730,0.375345,0.35,4,4,8,62.2,58.0,4
40451,1.006369,0.453818,0.40,5,6,4,62.4,55.0,4
40452,1.004038,1.572574,1.57,4,4,2,61.1,61.0,3
40453,0.998279,0.800393,0.73,5,5,7,61.4,56.0,5


In [31]:
# Training target: the 'price' column
y = df.loc[:, 'price']

In [32]:
# GradientBoost model. random_state is pinned so that repeated runs of the
# notebook fit the same trees and report the same cross-validation scores.
model = GradientBoostingRegressor(random_state=42)

In [33]:
# Candidate values for the grid search, to check which combination fits this case best
param_grid = dict(
    n_estimators=[16, 32, 64, 128, 256, 512],
    max_depth=[2, 4, 8, 16],
)

In [34]:
# 5-fold grid search over param_grid, scored with negative RMSE;
# n_jobs=-1 requests parallel fits and verbose=3 logs each one
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=3,
)

In [35]:
# Run the grid search: fit the GradientBoost model for every candidate in param_grid
grid_search.fit(X=X, y=y)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [36]:
# Report the winning hyperparameters and their score; the score is negated
# because the search was configured with 'neg_root_mean_squared_error'
print('\n')
for label, value in (('Best hyperparameters: ', grid_search.best_params_),
                     ('Best score: ', -grid_search.best_score_)):
    print(label, value, '\n')



Best hyperparameters:  {'max_depth': 8, 'n_estimators': 64} 

Best score:  517.3956640958106 



In [37]:
# Keep the estimator selected by the grid search and display it
best_model = grid_search.best_estimator_
best_model

In [38]:
# Restrict the test frame to the same feature columns, in the same order, as X.
# Note: this rebinds `test`, so 'id' and the other unused columns are no longer
# reachable through that name afterwards.
test = test.loc[:, ['L/W ratio', 'V', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'sorting_category']]

In [39]:
# Predict a price for every row of the test feature frame
predictions = best_model.predict(test)


In [40]:
# Submission table: a running id starting at 0 next to each predicted price
result = pd.DataFrame(dict(id=range(len(predictions)), price=predictions))
result

Unnamed: 0,id,price
0,0,2882.817738
1,1,5505.842918
2,2,10010.836519
3,3,3965.504462
4,4,1606.019377
...,...,...
13480,13480,1677.143853
13481,13481,2457.311350
13482,13482,2981.521924
13483,13483,2083.791885


In [41]:
# Write the submission table to disk without the DataFrame index
result.to_csv(path_or_buf="../submissions/submision_final_model.csv", index=False)

[CV 3/5] END ..max_depth=2, n_estimators=32;, score=-1215.345 total time=   1.3s
[CV 1/5] END ..max_depth=2, n_estimators=128;, score=-735.269 total time=   4.4s
[CV 4/5] END ..max_depth=2, n_estimators=256;, score=-689.463 total time=   8.7s
[CV 3/5] END ..max_depth=4, n_estimators=16;, score=-1142.235 total time=   1.1s
[CV 1/5] END ...max_depth=4, n_estimators=32;, score=-696.436 total time=   2.0s
[CV 4/5] END ...max_depth=4, n_estimators=32;, score=-707.159 total time=   2.1s
[CV 2/5] END ...max_depth=4, n_estimators=64;, score=-549.006 total time=   3.9s
[CV 5/5] END ...max_depth=4, n_estimators=64;, score=-551.923 total time=   3.8s
[CV 1/5] END ..max_depth=4, n_estimators=256;, score=-526.628 total time=  15.6s
[CV 4/5] END ..max_depth=4, n_estimators=512;, score=-539.947 total time=  30.2s
[CV 2/5] END ..max_depth=8, n_estimators=256;, score=-517.562 total time=  29.2s
[CV 5/5] END ..max_depth=8, n_estimators=512;, score=-528.208 total time=  59.8s
[CV 2/5] END .max_depth=16, 

[CV 4/5] END ..max_depth=2, n_estimators=16;, score=-1590.346 total time=   0.6s
[CV 1/5] END ...max_depth=2, n_estimators=64;, score=-890.957 total time=   2.2s
[CV 5/5] END ..max_depth=2, n_estimators=128;, score=-711.948 total time=   4.5s
[CV 3/5] END ..max_depth=2, n_estimators=512;, score=-646.416 total time=  17.6s
[CV 4/5] END ..max_depth=4, n_estimators=128;, score=-557.480 total time=   7.9s
[CV 1/5] END ..max_depth=4, n_estimators=512;, score=-528.682 total time=  31.3s
[CV 5/5] END ...max_depth=8, n_estimators=64;, score=-516.356 total time=   7.5s
[CV 1/5] END ..max_depth=8, n_estimators=256;, score=-518.458 total time=  29.6s
[CV 4/5] END ..max_depth=8, n_estimators=512;, score=-542.344 total time= 1.0min
[CV 3/5] END .max_depth=16, n_estimators=256;, score=-614.376 total time=  57.1s
[CV 5/5] END .max_depth=16, n_estimators=512;, score=-621.635 total time= 1.5min
