In [264]:
import pandas as pd
import numpy as np
import math
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn. preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMRegressor

In [265]:
diamonds = pd.read_csv('diamonds_train.csv')
predict = pd.read_csv('diamonds_predict.csv')

In [266]:
diamonds = diamonds[(diamonds['x'] > 0) & (diamonds['y'] > 0) & (diamonds['z'] > 0)]

In [267]:
# Transformation which relates x, y and z with diamond volume: CONE

diameter = (diamonds['x'] + diamonds['y']) / 2
diamonds['volume cone'] = (1/3) * np.pi * ((diameter / 2) ** 2) * diamonds['z']
diamonds['volume semisphere'] = (4/6) * np.pi * ((diameter / 2)) **2
diamonds['volume'] = diamonds['volume cone'] + diamonds['volume semisphere']

In [268]:
diameter_predict = (predict['x'] + predict['y']) / 2
predict['volume cone'] = (1/3) * np.pi * ((diameter_predict / 2) ** 2) * predict['z']
predict['volume semisphere'] = (4/6) * np.pi * ((diameter_predict / 2)) **2
predict['volume'] = predict['volume cone'] + predict['volume semisphere']

In [269]:
# Transformation which calculate the price due to price per carat * carat weight

diamonds_price = []

for i in diamonds['carat']:
    
    if i < 0.5:
        diamonds_price.append(600)        
    elif i >= 0.5 and  i < 1:
        diamonds_price.append(2400)
    elif i >= 1 and i < 1.5:
        diamonds_price.append(4280)
    elif i>= 1.5 and i < 2:
        diamonds_price.append(9360)
    else: 
        diamonds_price.append(16000)

In [270]:
diamonds['carat price'] = diamonds_price

In [271]:
predict_price = []

for i in predict['carat']:
    
    if i < 0.5:
        predict_price.append(600)        
    elif i >= 0.5 and  i < 1:
        predict_price.append(2400)
    elif i >= 1 and i < 1.5:
        predict_price.append(4280)
    elif i>= 1.5 and i < 2:
        predict_price.append(9360)
    else: 
        predict_price.append(16000)

In [272]:
predict['carat price'] = predict_price

In [273]:
diamonds['predicted price'] = diamonds['carat'] * diamonds['carat price']

In [274]:
predict['predicted price'] = predict['carat'] * predict['carat price']

In [275]:
diamonds['cut'] = diamonds['cut'].map({'Ideal': 1, 'Good': 2, 'Very Good': 3, 'Fair': 4, 'Premium': 5})

diamonds['color'] = diamonds['color'].map({'E': 1, 'D': 2, 'F': 3, 'G': 4, 'H': 5, 'I': 6, 'J': 7})

diamonds['clarity'] = diamonds['clarity'].map({'VVS1': 1, 'IF': 2, 'VVS2': 3, 'VS1': 4, 'I1': 5, 'VS2': 6, 'SI1': 7, 'SI2': 8})

In [276]:
predict['cut'] = predict['cut'].map({'Ideal': 1, 'Good': 2, 'Very Good': 3, 'Fair': 4, 'Premium': 5})

predict['color'] = predict['color'].map({'E': 1, 'D': 2, 'F': 3, 'G': 4, 'H': 5, 'I': 6, 'J': 7})

predict['clarity'] = predict['clarity'].map({'VVS1': 1, 'IF': 2, 'VVS2': 3, 'VS1': 4, 'I1': 5, 'VS2': 6, 'SI1': 7, 'SI2': 8})

In [277]:
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,volume cone,volume semisphere,volume,carat price,predicted price
0,1.21,5,7,6,62.4,58.0,4268,6.83,6.79,4.25,51.600247,24.282469,75.882716,4280,5178.8
1,0.32,3,5,6,63.0,57.0,505,4.35,4.38,2.75,13.717337,9.976245,23.693583,600,192.0
2,0.71,4,4,4,65.5,55.0,2686,5.62,5.53,3.65,29.699643,16.273777,45.973421,2400,1704.0
3,0.41,2,2,7,63.8,56.0,738,4.68,4.72,3.0,17.349445,11.566297,28.915742,600,246.0
4,1.02,1,4,7,60.5,59.0,4882,6.55,6.51,3.95,44.095278,22.326723,66.422001,4280,4365.6


In [278]:
diamonds.describe()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,volume cone,volume semisphere,volume,carat price,predicted price
count,40439.0,40439.0,40439.0,40439.0,40439.0,40439.0,40439.0,40439.0,40439.0,40439.0,40439.0,40439.0,40439.0,40439.0,40439.0
mean,0.797425,2.644477,3.542719,5.513366,61.753013,57.445543,3926.535448,5.729616,5.732897,3.538553,34.030119,17.863019,51.893139,3329.492816,4208.743342
std,0.475257,1.616288,1.768642,2.094771,1.431306,2.233055,3990.024501,1.122384,1.145002,0.693639,23.189237,7.556108,30.661288,3492.792584,7269.65612
min,0.2,1.0,1.0,1.0,43.0,43.0,326.0,3.77,3.72,1.07,8.446508,7.343486,15.825212,600.0,120.0
25%,0.4,1.0,2.0,4.0,61.0,56.0,945.0,4.71,4.72,2.91,17.002457,11.640242,28.670175,600.0,240.0
50%,0.7,3.0,4.0,6.0,61.8,57.0,2396.0,5.69,5.71,3.52,30.033021,17.011724,47.049659,2400.0,1680.0
75%,1.04,5.0,5.0,7.0,62.5,59.0,5329.5,6.54,6.54,4.04,44.728256,22.360927,67.053253,4280.0,4451.2
max,4.5,5.0,7.0,8.0,79.0,95.0,18823.0,10.23,58.9,8.06,2367.356334,587.433333,2954.789667,16000.0,72000.0


In [279]:
# The first step is defining the Data Cleaning and Feature Engineering 

target = 'price'
num_features = ['cut', 'color', 'clarity', 'carat', 'x', 'y', 'z', 'table', 'volume']
features = num_features

In [280]:
diamonds.corr()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,volume cone,volume semisphere,volume,carat price,predicted price
carat,1.0,0.170432,0.291149,0.323837,0.026585,0.183347,0.921881,0.977901,0.953386,0.976326,0.871154,0.924408,0.886666,0.936014,0.880541
cut,0.170432,1.0,0.04191,0.165621,-0.086092,0.496035,0.1175,0.175228,0.158387,0.157636,0.14632,0.159653,0.150007,0.148554,0.136301
color,0.291149,0.04191,1.0,-0.018635,0.051424,0.024461,0.175498,0.270812,0.263929,0.275598,0.25354,0.261998,0.256319,0.268168,0.268092
clarity,0.323837,0.165621,-0.018635,1.0,0.050223,0.14171,0.151534,0.347645,0.335612,0.349403,0.280993,0.311758,0.289345,0.277544,0.236229
depth,0.026585,-0.086092,0.051424,0.050223,1.0,-0.293093,-0.014923,-0.02619,-0.030803,0.094575,0.002931,-0.03277,-0.005859,0.020361,0.018076
table,0.183347,0.496035,0.024461,0.14171,-0.293093,1.0,0.13017,0.197229,0.185826,0.157398,0.150247,0.177305,0.157327,0.160986,0.138712
price,0.921881,0.1175,0.175498,0.151534,-0.014923,0.13017,1.0,0.88785,0.867756,0.882131,0.80508,0.848914,0.81809,0.896509,0.847348
x,0.977901,0.175228,0.270812,0.347645,-0.02619,0.197229,0.88785,1.0,0.974044,0.990932,0.852997,0.926163,0.873366,0.872369,0.780231
y,0.953386,0.158387,0.263929,0.335612,-0.030803,0.185826,0.867756,0.974044,1.0,0.971019,0.940028,0.98376,0.953382,0.851802,0.761968
z,0.976326,0.157636,0.275598,0.349403,0.094575,0.157398,0.882131,0.990932,0.971019,1.0,0.860871,0.925572,0.879175,0.870722,0.778808


## **LightBMRegressor**

In [281]:
# Generamos una primera etapa del pipeline que transforma los datos categoricos y los datos numéricos del modelo

# En los datos numéricos fill los nulos con la mediana y estandarizamos los resultados
numeric_transformer = \
Pipeline(steps = [('imputer', SimpleImputer(strategy = 'mean')), ('scalar', StandardScaler())])

In [282]:
# Generamos una segunda etapa donde definimos las variables numericas a las que aplicar numeric_transformer y las variables categoricas a las que aplicar categorical_transformer

preprocessor = \
ColumnTransformer(transformers = [('num', numeric_transformer, num_features)])

In [283]:
# Generamos una última etapa donde definimos el modelo

model = \
Pipeline(steps = [('preprocessor', preprocessor), ('regressor', LGBMRegressor(n_stimators = 500))])

In [284]:
# Dividimos el dataset en train y en test

diamonds_train, diamonds_test = train_test_split(diamonds)

In [285]:
model.fit(diamonds_train[features], diamonds_train[target])



Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scalar',
                                                                   StandardScaler())]),
                                                  ['cut', 'color', 'clarity',
                                                   'carat', 'x', 'y', 'z',
                                                   'table', 'volume'])])),
                ('regressor', LGBMRegressor(n_stimators=500))])

In [286]:
# Validamos el modelo con diamonds_test

y_test_pred = model.predict(diamonds_test[features])
y_test_real = diamonds_test[target]

mean_squared_error(y_test_real, y_test_pred, squared = False)

565.23849008753

In [287]:
# Validamos el modelo con diamonds_train

y_train_pred = model.predict(diamonds_train[features])
y_train_real = diamonds_train[target]

mean_squared_error(y_train_real, y_train_pred, squared = False)

468.96001734364313

In [288]:
model.fit(diamonds[features], diamonds[target])



Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scalar',
                                                                   StandardScaler())]),
                                                  ['cut', 'color', 'clarity',
                                                   'carat', 'x', 'y', 'z',
                                                   'table', 'volume'])])),
                ('regressor', LGBMRegressor(n_stimators=500))])

In [289]:
predict.describe()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,volume cone,volume semisphere,volume,carat price,predicted price
count,13485.0,13485.0,13485.0,13485.0,13485.0,13485.0,13485.0,13485.0,13485.0,13485.0,13485.0,13485.0,13485.0,13485.0,13485.0
mean,6742.0,0.798642,2.670004,3.523767,5.510567,61.739095,57.490337,5.736454,5.739648,3.543474,34.050023,17.890323,51.940346,3312.605117,4144.211405
std,3892.928525,0.469399,1.625213,1.766091,2.08432,1.43531,2.237109,1.113671,1.128507,0.731005,20.226776,7.064738,27.238206,3423.178746,7096.530428
min,0.0,0.2,1.0,1.0,1.0,50.8,51.0,0.0,0.0,0.0,0.0,0.0,0.0,600.0,120.0
25%,3371.0,0.4,1.0,2.0,4.0,61.0,56.0,4.73,4.73,2.92,17.18893,11.714423,28.88449,600.0,240.0
50%,6742.0,0.7,3.0,4.0,6.0,61.9,57.0,5.7,5.72,3.53,30.142859,17.101377,47.279318,2400.0,1680.0
75%,10113.0,1.04,5.0,5.0,7.0,62.5,59.0,6.53,6.53,4.04,44.713365,22.326723,67.006168,4280.0,4451.2
max,13484.0,5.01,5.0,7.0,8.0,79.0,73.0,10.74,31.8,31.8,457.517259,178.717679,636.234938,16000.0,80160.0


In [295]:
y_pred = model.predict(predict[features])

## **Submission**

In [296]:
submission_df = pd.DataFrame({'id': predict['id'], 'price': y_pred})

In [297]:
submission_df.price.clip(300, 20000, inplace = True)

In [298]:
submission_df

Unnamed: 0,id,price
0,0,2859.128840
1,1,5707.984255
2,2,9613.821617
3,3,3946.427091
4,4,1556.108304
...,...,...
13480,13480,1619.068629
13481,13481,2318.562306
13482,13482,3249.150399
13483,13483,2264.420452


In [294]:
submission_df.to_csv('./data/LightGBM2.csv', index = False)

## **Hyperparameter Optimization**

In [17]:
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'regressor__n_estimators': [16, 32, 64, 128, 256, 512],
    'regressor__max_depth': [2, 4, 8, 16, 20],
}

grid_search = RandomizedSearchCV(model, 
                                 param_grid, 
                                 cv=5, 
                                 verbose=10, 
                                 scoring='neg_root_mean_squared_error', 
                                 n_jobs=-1,
                                 n_iter=32)

grid_search.fit(diamonds[features], diamonds[target])

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   18.3s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   44.1s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed: 10.1min finished


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('imputer',
                                                                                                SimpleImputer(strategy='median')),
                                                                                               ('scalar',
                                                                                                StandardScaler())]),
                                                                               ['carat',
                                                                                'depth',
                                                                                'table',
                                                                                'x',
              

In [18]:
grid_search.best_params_

{'regressor__n_estimators': 512,
 'regressor__max_depth': 16,
 'preprocessor__num__imputer__strategy': 'mean'}

In [19]:
grid_search.best_score_

-557.7769864705173

In [22]:
y_pred = grid_search.predict(predict[features])

In [23]:
# Validamos el modelo con diamonds_test

y_test_pred = model.predict(diamonds_test[features])
y_test_real = diamonds_test[target]

mean_squared_error(y_test_real, y_test_pred, squared = False)

623.4305379967942

## **Submission**

In [24]:
submission_df = pd.DataFrame({'id': predict['id'], 'price': y_pred})

In [25]:
submission_df.to_csv('./data/RandomForest.csv', index = False)