In [1]:
import pandas as pd
import numpy as np

In [2]:
diamonds = pd.read_csv('../data/data_clean.csv')
diamonds_predict = pd.read_csv('../data/diamonds_predict.csv')

In [3]:
diamonds.columns

Index(['Unnamed: 0', 'carat', 'cut', 'color', 'clarity', 'depth', 'table',
       'price', 'x', 'y', 'z', 'depth_calc', 'cut_calc', 'color_calc',
       'clarity_calc', 'volume_calc', 'density_calc', 'score'],
      dtype='object')

In [4]:
NUM_FEATS = ['carat', 'depth', 'table', 'x', 'y', 'z']
CAT_FEATS = ['cut', 'color', 'clarity']
FEATS = NUM_FEATS + CAT_FEATS  #FEATS es features
TARGET = 'price'

In [5]:
FEATS

['carat', 'depth', 'table', 'x', 'y', 'z', 'cut', 'color', 'clarity']

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [7]:
"""
imputer -> para valores nulos cuando los hay en el df cambiandolo por un valor 
que puede ser la media, mediana...
"""

'\nimputer -> para valores nulos cuando los hay en el df cambiandolo por un valor \nque puede ser la media, mediana...\n'

In [8]:
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), 
                                      ('sacler', StandardScaler())])  
#Los pasos se dan mediante una lista de tuplas, en los que se pone primero el nombre y luego la acción a realizar

In [9]:
categorical_transformer = Pipeline(steps = [('imputer', SimpleImputer(strategy='constant', 
                                                                      fill_value='missing')),
                                             ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [10]:
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, NUM_FEATS), 
                                               ('cat', categorical_transformer, CAT_FEATS)])

In [11]:
preprocessor

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('sacler', StandardScaler())]),
                                 ['carat', 'depth', 'table', 'x', 'y', 'z']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value='missing',
                                                                strategy='constant')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['cut', 'color', 'clarity'])])

In [12]:
pd.DataFrame(preprocessor.fit_transform(diamonds)).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,0.868311,0.453801,0.24819,0.980592,0.950149,1.026498,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-1.004471,0.875598,-0.19965,-1.229082,-1.213093,-1.137626,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.183813,2.633086,-1.095332,-0.097515,-0.180841,0.160848,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-0.815088,1.437994,-0.647491,-0.935053,-0.907906,-0.776939,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.468504,-0.88189,0.696031,0.731113,0.698817,0.593673,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [13]:
from sklearn.model_selection import train_test_split

In [14]:
diamonds_train, diamonds_test = train_test_split(diamonds)

In [15]:
diamonds_train.shape

(30324, 18)

In [16]:
diamonds_test.shape

(10109, 18)

In [17]:
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor

In [18]:
# model = Pipeline(steps = [('preprocessor', preprocessor), 
#                           ('regressor', ExtraTreesRegressor(n_jobs=-1, max_depth=16, 
#                                                            min_samples_split=500))])

In [19]:
model= Pipeline(steps=[('preprocessor', preprocessor),
                       ('regressor', RandomForestRegressor())])

In [20]:
model.fit(diamonds_train[FEATS], diamonds_train[TARGET])

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('sacler',
                                                                   StandardScaler())]),
                                                  ['carat', 'depth', 'table',
                                                   'x', 'y', 'z']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',


In [21]:
pd.set_option('display.max_rows', 10)

In [22]:
diamonds_train

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,depth_calc,cut_calc,color_calc,clarity_calc,volume_calc,density_calc,score
26644,26660,0.30,Very Good,E,VS2,61.7,61.0,658,4.28,4.31,2.65,0.616997,0.5,0.833333,0.428571,48.884020,0.006137,6.0
34937,34959,0.30,Ideal,G,VVS1,62.0,54.0,764,4.35,4.36,2.70,0.619977,1.0,0.500000,0.857143,51.208200,0.005858,8.0
11583,11589,0.35,Very Good,F,VVS1,62.3,56.0,909,4.53,4.57,2.83,0.621978,0.5,0.666667,0.857143,58.586943,0.005974,7.0
2181,2183,0.54,Ideal,E,VS2,62.8,55.0,1713,5.21,5.24,3.28,0.627751,1.0,0.833333,0.428571,89.545312,0.006030,7.0
12791,12797,0.30,Ideal,J,IF,61.5,56.0,489,4.32,4.33,2.66,0.615029,1.0,0.000000,1.000000,49.756896,0.006029,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23178,23192,1.38,Ideal,I,SI2,61.8,57.0,5796,7.14,7.10,4.40,0.617978,1.0,0.166667,0.142857,223.053600,0.006187,4.0
23770,23784,1.00,Very Good,F,SI1,61.8,58.0,5058,6.37,6.44,3.96,0.618267,0.5,0.666667,0.285714,162.450288,0.006156,5.0
40285,40307,1.03,Ideal,F,VS1,61.7,56.0,7137,6.49,6.54,4.02,0.617038,1.0,0.666667,0.571429,170.627292,0.006037,7.0
8839,8845,1.04,Ideal,G,SI2,61.7,56.0,4455,6.57,6.52,4.04,0.617265,1.0,0.500000,0.142857,173.059056,0.006010,5.0


In [23]:
from sklearn.metrics import mean_squared_error, r2_score

In [24]:
y_train_pred = model.predict(diamonds_train[FEATS])

In [25]:
y_train_real = diamonds_train[TARGET]

In [26]:
mean_squared_error(y_pred=y_train_pred, y_true=y_train_real, squared=False)

211.93066798727543

In [27]:
y_test_real = diamonds_test[TARGET]

In [28]:
y_test_pred = model.predict(diamonds_test[FEATS])

In [29]:
mean_squared_error(y_pred=y_test_pred, y_true=y_test_real, squared=False)

533.9275796653094

In [30]:
r2_score(y_test_real, y_test_pred)

0.9820275236564218

In [31]:
"""
Claro ejemplo de SOBREAJUSTE (si en ExtraTreeProgressor no ponemos nada en max_depth)
si vamos ajustando el max_depth podemos ir evitando y corrigiendo el sobreajuste
"""

'\nClaro ejemplo de SOBREAJUSTE (si en ExtraTreeProgressor no ponemos nada en max_depth)\nsi vamos ajustando el max_depth podemos ir evitando y corrigiendo el sobreajuste\n'

In [32]:
from sklearn.model_selection import cross_val_score

In [33]:
scores = cross_val_score(model,
                        diamonds[FEATS],
                        diamonds[TARGET],
                        scoring='neg_root_mean_squared_error',
                        cv=5, n_jobs=-1)

In [34]:
np.mean(-scores)

554.7216935007827

In [35]:
model

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('sacler',
                                                                   StandardScaler())]),
                                                  ['carat', 'depth', 'table',
                                                   'x', 'y', 'z']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',


In [36]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [37]:
param_grid = {
    'preprocessor__num__imputer__strategy':['mean', 'median'],
    'regressor__n_estimators': [16, 32, 64, 128, 256, 512],
    'regressor__max_depth': [2, 4, 8, 16],
    }

In [38]:
grid_search = RandomizedSearchCV(model, 
                                 param_grid, 
                                 cv=8, 
                                 verbose=10, 
                                scoring='neg_root_mean_squared_error',
                                n_jobs=-1,
                                n_iter=30
                                )

In [39]:
grid_search.fit(diamonds[FEATS], diamonds[TARGET])

Fitting 8 folds for each of 30 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  8

RandomizedSearchCV(cv=8,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('imputer',
                                                                                                SimpleImputer(strategy='median')),
                                                                                               ('sacler',
                                                                                                StandardScaler())]),
                                                                               ['carat',
                                                                                'depth',
                                                                                'table',
                                                                                'x',
              

In [40]:
from joblib import dump, load
dump(grid_search, 'model.joblib')

['model.joblib']

In [41]:
grid_search.best_params_

{'regressor__n_estimators': 256,
 'regressor__max_depth': 16,
 'preprocessor__num__imputer__strategy': 'mean'}

In [42]:
grid_search.best_score_

-547.7918607362465

In [43]:
submission_df = pd.DataFrame({'id': diamonds_predict.id,
                             'price': grid_search.predict(diamonds_predict[FEATS]).clip(300, 18000)})

In [44]:
submission_df.head()

Unnamed: 0,id,price
0,0,2968.844271
1,1,5411.805836
2,2,9265.574816
3,3,4142.337713
4,4,1700.896463


In [45]:
submission_df.to_csv('../export/prueba_modelo_clase_3.csv', index=False)

In [46]:
"""
Echarle un ojo a la base de datos "limpia".... no está tan bien limpiada
"""

'\nEcharle un ojo a la base de datos "limpia".... no está tan bien limpiada\n'

In [47]:
"""
Hacer notebook de validación
"""

'\nHacer notebook de validación\n'