In [4]:
import pandas as pd
import numpy as np
from hyperopt import fmin, tpe, hp
import ipynb.fs.full.utils as utils
import ipynb.fs.full.features as features
import ipynb.fs.full.features_distancias as f_distancias


df_train = pd.read_csv('./data/train_filtrado.csv')
# Para usarse con el submit a Kaggle
df_eval = pd.read_csv('./data/test.csv')

df_train, df_eval = features.features_de_csvs(df_train, df_eval)

df_train_idf = pd.read_csv('./data/train_idf.csv')
df_eval_idf = pd.read_csv('./data/test_idf.csv')

df_train = pd.merge(df_train, df_train_idf, on= 'id', how= 'left')
df_eval = pd.merge(df_eval, df_eval_idf, on= 'id', how= 'left')

df_train, df_test = utils.dividir_df_testeo(df_train, test_size=0.25)

df_test = features.llenar_nulls(df_test, hgb_mean=True, df_fill=df_train)
df_train = features.llenar_nulls(df_train, hgb_mean=True)

In [5]:
# df_train = df_train.sample(frac=1).reset_index(drop=True)

df_test_f = features.features_independientes_precio(df_test)
df_test_f = features.features_dependientes_precio(df_test_f, df_train)

df_train_f = features.features_independientes_precio(df_train)
df_train_f = features.features_dependientes_precio(df_train_f, df_train)

df_test_f, cols_tipodepropiedad_ohe = features.columna_a_ohe(df_test_f, 'tipodepropiedad', N=100, df_aux=df_train, devolver_cols=True)
df_test_f, cols_provincia_ohe = features.columna_a_ohe(df_test_f, 'provincia', N=100, df_aux=df_train, devolver_cols=True)
df_test_f, cols_zona_ohe = features.columna_a_ohe(df_test_f, 'zona', df_aux=df_train_f, devolver_cols=True)

df_train_f = features.columna_a_ohe(df_train_f, 'tipodepropiedad', N=100, df_aux=df_test)
df_train_f = features.columna_a_ohe(df_train_f, 'provincia', N=100, df_aux=df_test)
df_train_f = features.columna_a_ohe(df_train_f, 'zona', df_aux=df_test_f)


df_train_f['fecha'] = pd.to_datetime(df_train_f['fecha']).astype(int)
df_test_f['fecha'] = pd.to_datetime(df_test_f['fecha']).astype(int)

# df_train_f = df_train_f.sample(frac=1).reset_index(drop=True)

df_train_f = f_distancias.feature_distancias(df_train_f)
df_test_f = f_distancias.feature_distancias(df_test_f, df_train_f)

## GridSearchCV

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

from sklearn.model_selection import GridSearchCV

features = ['habitaciones', 'garages','banos','antiguedad', 'metroscubiertos',  'metrostotales','lat_norm', 
           'lng_norm', 'gimnasio', 'usosmultiples', 'piscina']

features_test = ['top_provincia', 'promedio_precio_ciudad', 'anio', 'promedio_id_zona', 
                 'promedio_precio_tipo_propiedad', 'count_id_zona', 'count_ciudad', 'puntaje', 
               'count_tipo_propiedad_ciudad', 'promedio_precio_tipo_propiedad_ciudad_gen','count_id_zona'
           'dias_desde_datos','meses_desde_datos','porcentaje_metros','distancia_ciudad_centrica', 'puntaje', 'distancia_centro_mexico']

features += features_test

extratrees_params = { 
    "n_estimators": [10],
    "max_features": ["auto", "sqrt", "log2"],
    "min_samples_split": [2,4,8],
    "min_impurity_decrease": [0, 0.1, 0.2],
    "bootstrap": [False]
}

x_train, x_test, y_train, y_test = utils.dividir_dataset(df_train_f, 'precio', features, test_size=1)

rs_cv = GridSearchCV(estimator=ExtraTreesRegressor(), 
                           param_grid=extratrees_params, 
                           cv=4, scoring='neg_mean_absolute_error', verbose=1)

rs_cv.fit(x_train, y_train)

## Evaluación modelo final

In [6]:
from sklearn.ensemble import ExtraTreesRegressor


params = {'criterion': 'mse',
 'max_features': 'sqrt',
 'min_samples_split': 4}

extratrees = ExtraTreesRegressor(n_estimators=20, verbose=3, **params)

features = ['habitaciones', 'garages','banos','antiguedad', 'metroscubiertos',  'metrostotales','lat_norm', 
           'lng_norm', 'gimnasio', 'usosmultiples', 'piscina']

features_test = ['top_provincia', 'promedio_precio_ciudad', 'anio', 'promedio_id_zona', 
                 'promedio_precio_tipo_propiedad', 'count_id_zona', 'count_ciudad', 'puntaje', 
               'count_tipo_propiedad_ciudad', 'promedio_precio_tipo_propiedad_ciudad_gen','count_id_zona'
           'dias_desde_datos','meses_desde_datos','porcentaje_metros','distancia_ciudad_centrica', 
                'distancia_centro_mexico', 'distancia_ciudad_cara', 'label']

features += features_test

x_train = utils.filtrar_features(df_train_f, features)
y_train = df_train_f['precio']
x_eval = utils.filtrar_features(df_test_f, features)
y_eval = df_test_f['precio']

extratrees.fit(x_train, y_train)

y_pred_train = extratrees.predict(x_train)
y_pred_eval = extratrees.predict(x_eval)

mae_train = utils.MAE(y_train, y_pred_train)
mae_eval = utils.MAE(y_eval, y_pred_eval)

print(f"MAE AdaBoost (train): {mae_train:.5f}")
print(f"MAE AdaBoost (eval): {mae_eval:.5f}")

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 20


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


building tree 2 of 20


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.7s remaining:    0.0s


building tree 3 of 20
building tree 4 of 20
building tree 5 of 20
building tree 6 of 20
building tree 7 of 20
building tree 8 of 20
building tree 9 of 20
building tree 10 of 20
building tree 11 of 20
building tree 12 of 20
building tree 13 of 20
building tree 14 of 20
building tree 15 of 20
building tree 16 of 20
building tree 17 of 20
building tree 18 of 20
building tree 19 of 20
building tree 20 of 20


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    8.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    1.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


MAE AdaBoost (train): 142320.43568
MAE AdaBoost (eval): 531897.23870


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.6s finished
