In [72]:
import pandas as pd
import numpy as np
import math
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn. preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

In [73]:
diamonds = pd.read_csv('diamonds_train.csv')
predict = pd.read_csv('diamonds_predict.csv')

In [74]:
diamonds = diamonds[(diamonds['x'] > 0) & (diamonds['y'] > 0) & (diamonds['z'] > 0)]

In [75]:
# Transformation which relates x, y and z with diamond volume: CONE

diameter = (diamonds['x'] + diamonds['y']) / 2
diamonds['volume cone'] = (1/3) * np.pi * ((diameter / 2) ** 2) * diamonds['z']
diamonds['volume semisphere'] = (4/6) * np.pi * ((diameter / 2)) **2
diamonds['volume'] = diamonds['volume cone'] + diamonds['volume semisphere']

In [76]:
diameter_predict = (predict['x'] + predict['y']) / 2
predict['volume cone'] = (1/3) * np.pi * ((diameter_predict / 2) ** 2) * predict['z']
predict['volume semisphere'] = (4/6) * np.pi * ((diameter_predict / 2)) **2
predict['volume'] = predict['volume cone'] + predict['volume semisphere']

In [77]:
# Transformation which calculate the price due to price per carat * carat weight

diamonds_price = []

for i in diamonds['carat']:
    
    if i < 0.5:
        diamonds_price.append(600)        
    elif i >= 0.5 and  i < 1:
        diamonds_price.append(2400)
    elif i >= 1 and i < 1.5:
        diamonds_price.append(4280)
    elif i>= 1.5 and i < 2:
        diamonds_price.append(9360)
    else: 
        diamonds_price.append(16000)

In [78]:
diamonds['carat price'] = diamonds_price

In [79]:
predict_price = []

for i in predict['carat']:
    
    if i < 0.5:
        predict_price.append(600)        
    elif i >= 0.5 and  i < 1:
        predict_price.append(2400)
    elif i >= 1 and i < 1.5:
        predict_price.append(4280)
    elif i>= 1.5 and i < 2:
        predict_price.append(9360)
    else: 
        predict_price.append(16000)

In [80]:
predict['carat price'] = predict_price

In [81]:
diamonds['predicted price'] = diamonds['carat'] * diamonds['carat price']

In [82]:
predict['predicted price'] = predict['carat'] * predict['carat price']

In [83]:
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,volume cone,volume semisphere,volume,carat price,predicted price
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25,51.600247,24.282469,75.882716,4280,5178.8
1,0.32,Very Good,H,VS2,63.0,57.0,505,4.35,4.38,2.75,13.717337,9.976245,23.693583,600,192.0
2,0.71,Fair,G,VS1,65.5,55.0,2686,5.62,5.53,3.65,29.699643,16.273777,45.973421,2400,1704.0
3,0.41,Good,D,SI1,63.8,56.0,738,4.68,4.72,3.0,17.349445,11.566297,28.915742,600,246.0
4,1.02,Ideal,G,SI1,60.5,59.0,4882,6.55,6.51,3.95,44.095278,22.326723,66.422001,4280,4365.6


In [84]:
# The first step is defining the Data Cleaning and Feature Engineering 

target = 'price'
cat_features = ['cut', 'color', 'clarity']
num_features = ['carat', 'depth', 'table', 'x', 'y', 'z', 'volume', 'predicted price']
features = cat_features + num_features

## **Gradient Boosting**

In [85]:
# Generamos una primera etapa del pipeline que transforma los datos categoricos y los datos numéricos del modelo

# En los datos numéricos fill los nulos con la mediana y estandarizamos los resultados
numeric_transformer = \
Pipeline(steps = [('imputer', SimpleImputer(strategy = 'median')), ('scalar', StandardScaler())])

# En los datos categóricos fill los nulos con una constante y los transformamos con OneHotEncoding
categorical_transformer = \
Pipeline(steps = [('imputer', SimpleImputer(strategy = 'constant', fill_value = 'missin')), 
                  ('onehot', OneHotEncoder(handle_unknown = 'ignore'))])

In [86]:
# Generamos una segunda etapa donde definimos las variables numericas a las que aplicar numeric_transformer y las variables categoricas a las que aplicar categorical_transformer

preprocessor = \
ColumnTransformer(transformers = [('num', numeric_transformer, num_features), ('cat', categorical_transformer, cat_features)])

In [87]:
# Generamos una última etapa donde definimos el modelo

model = \
Pipeline(steps = [('preprocessor', preprocessor), ('regressor', GradientBoostingRegressor(n_estimators = 150, min_samples_split = 10, min_samples_leaf = 10))])

In [88]:
# Dividimos el dataset en train y en test

diamonds_train, diamonds_test = train_test_split(diamonds)

In [89]:
# Entrenamos el modelo

model.fit(diamonds_train[features], diamonds_train[target])

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scalar',
                                                                   StandardScaler())]),
                                                  ['carat', 'depth', 'table',
                                                   'x', 'y', 'z', 'volume',
                                                   'predicted price']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missin',
                                                                                 strategy='constant'))

In [90]:
# Validamos el modelo con diamonds_test

y_test_pred = model.predict(diamonds_test[features])
y_test_real = diamonds_test[target]

mean_squared_error(y_test_real, y_test_pred, squared = False)

665.0764910120075

In [91]:
# Validamos el modelo con diamonds_train

y_train_pred = model.predict(diamonds_train[features])
y_train_real = diamonds_train[target]

mean_squared_error(y_train_real, y_train_pred, squared = False)

654.0837706767136

In [92]:
y_pred = model.predict(predict).clip(300, 18000)



In [17]:
y_pred

array([ 2786.69986412,  6271.96775874, 10599.98675178, ...,
        3092.25957829,  2242.12895579,   720.43281526])

## **Submission**

In [53]:
submission_df = pd.DataFrame({'id': predict['id'], 'price': y_pred})

In [54]:
submission_df.to_csv('./data/GradientBoosting.csv', index = False)

## **Hyperparameter Optimization**

In [23]:
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'regressor__n_estimators': [16, 32, 64, 128, 256, 512],
    'regressor__min_samples_split': [2, 4, 8, 16, 20],
    'regressor__min_samples_leaf': [1, 2, 4, 16, 32]
}

grid_search = RandomizedSearchCV(model, 
                                 param_grid, 
                                 cv=5, 
                                 verbose=10, 
                                 scoring='neg_root_mean_squared_error', 
                                 n_jobs=-1,
                                 n_iter=32)

grid_search.fit(diamonds[features], diamonds[target])

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   36.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 12.9min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed: 13.9min
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed: 15.0min finished


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('imputer',
                                                                                                SimpleImputer(strategy='median')),
                                                                                               ('scalar',
                                                                                                StandardScaler())]),
                                                                               ['carat',
                                                                                'depth',
                                                                                'table',
                                                                                'x',
              

In [24]:
grid_search.best_params_

{'regressor__n_estimators': 512,
 'regressor__min_samples_split': 16,
 'regressor__min_samples_leaf': 1,
 'preprocessor__num__imputer__strategy': 'mean'}

In [25]:
grid_search.best_score_

-603.0176734884086

In [26]:
y_pred = grid_search.predict(predict[features])

In [27]:
# Validamos el modelo con diamonds_test

y_test_pred = model.predict(diamonds_test[features])
y_test_real = diamonds_test[target]

mean_squared_error(y_test_real, y_test_pred, squared = False)

751.4533498881235

## **Submission**

In [28]:
submission_df = pd.DataFrame({'id': predict['id'], 'price': y_pred})

In [29]:
submission_df.to_csv('./data/GradientBoosting.csv', index = False)