In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the cleaned training frame and the frame that is scored later for the submission.
diamonds, diamonds_predict = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/data_clean.csv', '../data/diamonds_predict.csv')
)

In [3]:
# List every column available in the cleaned training frame.
diamonds.columns

Index(['Unnamed: 0', 'carat', 'cut', 'color', 'clarity', 'depth', 'table',
       'price', 'x', 'y', 'z', 'depth_calc', 'cut_calc', 'color_calc',
       'clarity_calc', 'volume_calc', 'density_calc', 'score'],
      dtype='object')

In [4]:
# Column roles used by the preprocessing and modelling cells.
TARGET = 'price'
CAT_FEATS = ['cut', 'color', 'clarity']
NUM_FEATS = ['carat', 'depth', 'table', 'x', 'y', 'z']
# FEATS is short for "features": numeric columns first, then categorical ones.
FEATS = [*NUM_FEATS, *CAT_FEATS]

In [5]:
# Show the combined feature list (numeric columns followed by categorical ones).
FEATS

['carat', 'depth', 'table', 'x', 'y', 'z', 'cut', 'color', 'clarity']

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [7]:
"""
imputer -> sustituye los valores nulos del df por un valor calculado,
como la media o la mediana.
"""

'\nimputer -> sustituye los valores nulos del df por un valor calculado,\ncomo la media o la mediana.\n'

In [8]:
# Pipeline steps are given as a list of (name, action) tuples: the name comes
# first, then the transformer to apply.
# NOTE(review): the step name 'sacler' looks like a typo of 'scaler'; it is kept
# as-is because the step name is part of the pipeline's parameter keys.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('sacler', StandardScaler()),
    ]
)

In [9]:
# Categorical branch: fill gaps with the constant 'missing', then one-hot encode.
# handle_unknown='ignore' keeps transform from raising on categories that were
# not present when the encoder was fitted.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [10]:
# Route each column group to its own sub-pipeline: NUM_FEATS go through the
# numeric branch and CAT_FEATS through the categorical branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, NUM_FEATS),
        ('cat', categorical_transformer, CAT_FEATS),
    ]
)

In [11]:
# Display the assembled ColumnTransformer to check its configuration.
preprocessor

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('sacler', StandardScaler())]),
                                 ['carat', 'depth', 'table', 'x', 'y', 'z']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value='missing',
                                                                strategy='constant')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['cut', 'color', 'clarity'])])

In [12]:
# Preview of the transformed feature matrix. fit_transform fits the preprocessor
# on the whole `diamonds` frame here; this cell is only a visual sanity check.
pd.DataFrame(preprocessor.fit_transform(diamonds)).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,0.868311,0.453801,0.24819,0.980592,0.950149,1.026498,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-1.004471,0.875598,-0.19965,-1.229082,-1.213093,-1.137626,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.183813,2.633086,-1.095332,-0.097515,-0.180841,0.160848,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-0.815088,1.437994,-0.647491,-0.935053,-0.907906,-0.776939,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.468504,-0.88189,0.696031,0.731113,0.698817,0.593673,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out a test split using train_test_split's default proportions.
# random_state pins the shuffle so the split, and every metric computed from
# it, is identical on each Restart & Run All.
diamonds_train, diamonds_test = train_test_split(diamonds, random_state=42)

In [15]:
# (rows, columns) of the training split.
diamonds_train.shape

(30324, 18)

In [16]:
# (rows, columns) of the held-out test split.
diamonds_test.shape

(10109, 18)

In [17]:
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor

In [18]:
# model = Pipeline(steps = [('preprocessor', preprocessor), 
#                           ('regressor', ExtraTreesRegressor(n_jobs=-1, max_depth=16, 
#                                                            min_samples_split=500))])

In [19]:
# Full model: preprocessing followed by a random forest with default
# hyper-parameters. random_state fixes the forest's internal randomness so
# repeated fits on the same data give the same scores.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(random_state=42)),
    ]
)

In [20]:
# Fit preprocessing and regressor together, using only the training split.
model.fit(diamonds_train[FEATS], diamonds_train[TARGET])

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('sacler',
                                                                   StandardScaler())]),
                                                  ['carat', 'depth', 'table',
                                                   'x', 'y', 'z']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',


In [21]:
from sklearn.metrics import mean_squared_error, r2_score

In [22]:
# Predictions on the same rows the model was fitted on.
y_train_pred = model.predict(diamonds_train[FEATS])

In [23]:
# Observed prices for the training rows.
y_train_real = diamonds_train[TARGET]

In [24]:
# Training RMSE. The square root is taken explicitly because the `squared`
# argument of mean_squared_error is deprecated since scikit-learn 1.4 and
# removed in later releases; np.sqrt(MSE) works on every version.
np.sqrt(mean_squared_error(y_true=y_train_real, y_pred=y_train_pred))

209.96739853455372

In [25]:
# Observed prices for the held-out rows.
y_test_real = diamonds_test[TARGET]

In [26]:
# Predictions for the held-out rows, which were not used in model.fit.
y_test_pred = model.predict(diamonds_test[FEATS])

In [27]:
# Test RMSE. The square root is taken explicitly because the `squared`
# argument of mean_squared_error is deprecated since scikit-learn 1.4 and
# removed in later releases; np.sqrt(MSE) works on every version.
np.sqrt(mean_squared_error(y_true=y_test_real, y_pred=y_test_pred))

562.5163123024463

In [28]:
# Coefficient of determination (R^2) on the held-out split.
r2_score(y_test_real, y_test_pred)

0.9800748024276287

In [28]:
"""
Claro ejemplo de SOBREAJUSTE (si en ExtraTreesRegressor no ponemos nada en max_depth):
si vamos ajustando el max_depth podemos ir evitando y corrigiendo el sobreajuste.
"""

'\nClaro ejemplo de SOBREAJUSTE (si en ExtraTreesRegressor no ponemos nada en max_depth):\nsi vamos ajustando el max_depth podemos ir evitando y corrigiendo el sobreajuste.\n'

In [29]:
from sklearn.model_selection import cross_val_score

In [30]:
# 5-fold cross-validation of the whole pipeline on the full dataset.
# The scorer is the negated RMSE, so the values stored in `scores` are <= 0.
scores = cross_val_score(
    estimator=model,
    X=diamonds[FEATS],
    y=diamonds[TARGET],
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)

In [31]:
# Flip the sign back to get the average RMSE across the folds.
(-scores).mean()

555.348762446698

In [32]:
# Display the pipeline definition.
model

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('sacler',
                                                                   StandardScaler())]),
                                                  ['carat', 'depth', 'table',
                                                   'x', 'y', 'z']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',


In [33]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [34]:
# Search space. Keys follow the <step>__<param> convention so they reach into
# the nested pipeline (preprocessor -> num -> imputer, and the regressor step).
param_grid = dict(
    preprocessor__num__imputer__strategy=['mean', 'median'],
    regressor__n_estimators=[2 ** exp for exp in range(4, 10)],  # 16, 32, ..., 512
    regressor__max_depth=[2 ** exp for exp in range(1, 5)],  # 2, 4, 8, 16
)

In [35]:
# Randomized search over `param_grid`: 30 sampled candidates, each evaluated
# with 8-fold CV and scored by negated RMSE. The variable keeps the name
# `grid_search` even though it holds a RandomizedSearchCV.
# random_state pins which candidates are sampled, so best_params_ and
# best_score_ can be reproduced on a re-run.
grid_search = RandomizedSearchCV(
    model,
    param_grid,
    n_iter=30,
    cv=8,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=10,
    random_state=42,
)

In [36]:
# Run the search on the full `diamonds` frame. This includes the rows of
# `diamonds_test`, so that split is not an independent hold-out for the tuned model.
grid_search.fit(diamonds[FEATS], diamonds[TARGET])

Fitting 8 folds for each of 30 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   45.4s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:  9.7min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 10

RandomizedSearchCV(cv=8,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('imputer',
                                                                                                SimpleImputer(strategy='median')),
                                                                                               ('sacler',
                                                                                                StandardScaler())]),
                                                                               ['carat',
                                                                                'depth',
                                                                                'table',
                                                                                'x',
              

In [37]:
from joblib import dump, load
# Persist the whole fitted search object (not only the best estimator) to
# 'model.joblib' in the current working directory.
dump(grid_search, 'model.joblib')

['model.joblib']

In [38]:
# Hyper-parameter combination with the best mean cross-validated score.
grid_search.best_params_

{'regressor__n_estimators': 512,
 'regressor__max_depth': 16,
 'preprocessor__num__imputer__strategy': 'median'}

In [39]:
# Best mean CV score; it is negative because the scorer is the negated RMSE.
grid_search.best_score_

-548.0836281957418

In [40]:
# Build the submission: one predicted price per id, limited to [300, 18000].
# NOTE(review): the clip bounds are presumably the plausible price range of the
# training data — confirm.
submission_df = pd.DataFrame(
    {
        'id': diamonds_predict['id'],
        'price': np.clip(grid_search.predict(diamonds_predict[FEATS]), 300, 18000),
    }
)

In [41]:
# Preview the first rows of the submission.
submission_df.head()

Unnamed: 0,id,price
0,0,2959.996984
1,1,5401.806979
2,2,9299.386448
3,3,4148.631928
4,4,1702.086938


In [43]:
# Write the submission to disk without the DataFrame index column.
submission_df.to_csv('../export/prueba_modelo_clase_3.csv', index=False)

In [None]:
"""
TODO: echarle un ojo a la base de datos "limpia"... no está tan bien limpiada.
"""

In [None]:
"""
Hacer notebook de validación
"""