In [34]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from hyperopt import fmin, tpe, hp
import ipynb.fs.full.utils as utils
import ipynb.fs.full.features as features
import ipynb.fs.full.features_distancias as f_distancias

df_train = pd.read_csv('./data/train_filtrado.csv')
# df_train = utils.dolarizar_df(df_train)
# Para usarse con el submit a Kaggle
df_test = pd.read_csv('./data/test.csv')

df_train = features.llenar_nulls(df_train)
df_test = features.llenar_nulls(df_test, hgb_mean=True, df_fill=df_train)

df_train_cluster = pd.read_csv('./data/clustering_train.csv').rename(columns={'label': 'clustering_label'})
df_test_cluster = pd.read_csv('./data/clustering_test.csv').rename(columns={'label': 'clustering_label'})

df_train = pd.merge(df_train, df_train_cluster, on='id')
df_test = pd.merge(df_test, df_test_cluster, on='id')

df_train_idf = pd.read_csv('./data/train_idf.csv')
df_test_idf = pd.read_csv('./data/test_idf.csv')

df_train = pd.merge(df_train, df_train_idf, on= 'id', how= 'left')
df_test = pd.merge(df_test, df_test_idf, on= 'id', how= 'left')

# df_train, df_test = utils.dividir_df_testeo(df_train, test_size=0.3)

# df_train, df_test = features_de_csvs(df_train, df_test)

In [35]:
# df_train, df_test = utils.dividir_df_testeo(df_train, test_size=0.20)

In [36]:
df_test_f = features.features_independientes_precio(df_test)
df_test_f = features.features_dependientes_precio(df_test_f, df_train)

df_train_f = features.features_independientes_precio(df_train)
df_train_f = features.features_dependientes_precio(df_train_f, df_train)

df_test_f, cols_tipodepropiedad_ohe = features.columna_a_ohe(df_test_f, 'tipodepropiedad', N=100, df_aux=df_train, devolver_cols=True)
df_test_f, cols_provincia_ohe = features.columna_a_ohe(df_test_f, 'provincia', N=100, df_aux=df_train, devolver_cols=True)
df_test_f, cols_zona_ohe = features.columna_a_ohe(df_test_f, 'zona', df_aux=df_train_f, devolver_cols=True)

df_train_f = features.columna_a_ohe(df_train_f, 'tipodepropiedad', N=100, df_aux=df_test)
df_train_f = features.columna_a_ohe(df_train_f, 'provincia', N=100, df_aux=df_test)
df_train_f = features.columna_a_ohe(df_train_f, 'zona', df_aux=df_test_f)


df_train_f['fecha'] = pd.to_datetime(df_train_f['fecha']).astype(int)
df_test_f['fecha'] = pd.to_datetime(df_test_f['fecha']).astype(int)


df_train_f = f_distancias.feature_distancias(df_train_f)
df_test_f = f_distancias.feature_distancias(df_test_f, df_train_f)

df_train_f = features.rankings(df_train_f, df_train_f)
df_test_f = features.rankings(df_test_f, df_train_f)


# df_train_f = features.KD_feature(df_train_f)
# df_test_f =  features.KD_feature(df_test_f)

## Selector de Features

In [37]:
from sklearn.base import BaseEstimator

class FeatureSelector(BaseEstimator):
    
    base_features = list(df_train_f.drop('precio', axis=1, errors='ignore').columns)
    
    def __init__(self, features):
        base_features = FeatureSelector.base_features

        self.features = features
        self.features_index = [i for i in range(len(base_features)) if base_features[i] in features]
        
    def fit(self, x, y=None):
        return self

    def transform(self, x):
        return x[:, self.features_index]

## LightGBM model

In [38]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

class LightGBMWrapper(lgb.LGBMRegressor):
    
    def fit(self, x, y):
        x_train, x_eval, y_train, y_eval = train_test_split(x, y, test_size=0.01, random_state=0)
        eval_set = [(x_eval, y_eval)]
        if self.boosting_type == 'dart':
            x_train = x
            y_train = y
            eval_set = None

        return super(LightGBMWrapper, self).fit(x_train, y_train, 
                                                early_stopping_rounds=15, eval_set=eval_set)
    
    def predict(self, X):
        return super(LightGBMWrapper, self).predict(X)

params = {
#  'bagging_freq': 0,
#  'max_bin': 1392,
#  'max_depth': 24,
#  'min_data_in_leaf': 19,
#  'min_split_gain': 0.3525463651166809,
#  'num_leaves': 734,
#  'reg_lambda': 14.58736307435159,
 'bagging_fraction': 0.7863825296731695,
 'bagging_freq': 0,
 'max_bin': 100,
 'max_depth': 15,
 'min_data_in_leaf': 20,
 'min_split_gain': 0.48874667957166357,
 'num_leaves': 486,
 'reg_lambda': 12.823774492528074,
 'boosting_type': 'gbdt',
 'metric': 'mae',
 'n_jobs': 4,
 'objective': 'regression',
 'n_estimators': 1500,
}
# params = {'boosting_type': 'gbdt',
#  'feature_fraction': 0.8,
#  'learning_rate': 0.25,
#  'max_bin': 255,
#  'max_depth': 15,
#  'metric': 'mae',
#  'min_data_in_leaf': 40,
#  'min_split_gain': 0.7,
#  'n_jobs': 4,
#  'num_leaves': 300,
#  'objective': 'regression',
#  'reg_lambda': 10,
#  'verbose': -1,
#  'n_estimators': 1500,
#  'silent': True}

lgb_m = Pipeline(steps=[
    ('feature_selector', FeatureSelector(['habitaciones', 
            'garages', 
            'banos',
            'antiguedad',
           'metroscubiertos', 
            'metrostotales',
            'lat_norm', 'lng_norm'
            'gimnasio', 'usosmultiples', 'piscina','prop_frecuente', 'top_provincia', 'promedio_precio_ciudad', 
                 'anio', 'promedio_id_zona', 'promedio_precio_tipo_propiedad', 
                 'count_id_zona', 'count_ciudad', 'puntaje', 
                     'count_tipo_propiedad_ciudad', 
                 'promedio_precio_tipo_propiedad_ciudad_gen',
                 'count_id_zona'
                 'dias_desde_datos',
                 'meses_desde_datos',
                 'porcentaje_metros',
                 'distancia_ciudad_centrica', 'peso_descripcion'])),
    ('lightgbm', LightGBMWrapper(**params))
])

## XGBoost

In [39]:
import xgboost as xgb

class XGBoostWrapper(xgb.XGBRegressor):
    
    def fit(self, x, y):
        x_train, x_eval, y_train, y_eval = train_test_split(x, y, test_size=0.05, random_state=0)
        eval_set = [(x_eval, y_eval)]

        return super(xgb.XGBRegressor, self).fit(x_train, y_train, 
                                                early_stopping_rounds=10, eval_set=eval_set)
    
    def predict(self, X):
        return super(xgb.XGBRegressor, self).predict(X)

hps = {
 'verbosity': 0,
 'objective': 'reg:squarederror',
 'eval_metric': 'mae',
 'n_estimators': 1000,
 'n_jobs': 4,
 'reg_alpha': 20.91434940058063,
 'colsample_bytree': 0.65,
 'learning_rate': 0.14,
 'max_depth': int(16.0),
#  'learning_rate': 0.2,
#  'colsample_bytree': 0.7999417329959986,
#  'max_depth': 15,
#  'reg_alpha': 34.228487127404584,
#  'scale_pos_weight': 5,
#  'subsample': 0.9569019167459879
}

# hps = {'verbosity': 0,
# 'subsample': 0.9,
# 'scale_pos_weight': 2,
# 'reg_alpha': 4, 
# 'objective': 'reg:squarederror',
# 'n_jobs': 4,
# 'n_estimators': 1000,
# 'max_depth': 15,
# 'learning_rate': 0.2,
# 'eval_metric': 'mae',
# 'colsample_bytree': 0.7,
#  'silent': True 
#  }

xgb_m = Pipeline([
    ('feature_selector', FeatureSelector(['habitaciones', 
            'garages', 
            'banos',
            'antiguedad',
           'metroscubiertos', 
            'metrostotales',
            'lat_norm', 'lng_norm'
            'gimnasio', 'usosmultiples', 'piscina','prop_frecuente', 'top_provincia', 'promedio_precio_ciudad', 
                 'anio', 'promedio_id_zona', 'promedio_precio_tipo_propiedad', 
                 'count_id_zona', 'count_ciudad', 'puntaje', 
                     'count_tipo_propiedad_ciudad', 
                 'promedio_precio_tipo_propiedad_ciudad_gen',
                 'count_id_zona'
                 'dias_desde_datos',
                 'meses_desde_datos',
                 'porcentaje_metros',
                 'distancia_ciudad_centrica', 'distancia_centro_mexico'])),
    ('xgboost', XGBoostWrapper(**hps))
])

## RandomForest

In [40]:
from sklearn.ensemble import RandomForestRegressor

params = {'bootstrap': False,
          'max_features': 'sqrt',
          'min_samples_split': 4,
          'n_jobs': 2,
          'n_estimators': 100}

forest_m = Pipeline([
    ('feature_selector', FeatureSelector(['garages', 'banos', 'metroscubiertos', 'metrostotales',
       'lat', 'lng', 'fecha', 'porcentaje_metros', 'diferencia_metros',
       'metroscubiertos_bins_unif', 'metroscubiertos_bins_perc',
       'metros_totales_normalizados', 'metros_cubiertos_normalizados',
       'promedio_metros_tipo_propiedad',
       'promedio_metros_cubiertos_provincia', 'mes', 'dias_desde_datos', 'tam_ambientes',
       'promedio_precio_provincia', 'promedio_precio_ciudad', 'count_ciudad',
       'promedio_id_zona', 'count_id_zona', 'promedio_precio_tipo_propiedad',
       'promedio_precio_tipo_propiedad_ciudad',
       'count_tipo_propiedad_ciudad', 'promedio_por_mes', 'promedio_precio_hbg_tipo_propiedad',
       'puntaje', 'distancia_ciudad_centrica',
       'distancia_centro_mexico',
       'ranking_en_provincia_tipodepropiedad_precio',
       'ranking_en_provincia_tipodepropiedad_cantidad'])),
    ('random_forest', RandomForestRegressor(**params))
])

## ExtraTrees

In [41]:
from sklearn.ensemble import ExtraTreesRegressor

params = {'criterion': 'mse',
 'max_features': 'sqrt',
 'min_samples_split': 4}

extratrees_m =  Pipeline([
    ('feature_selector', FeatureSelector(['antiguedad', 'garages', 'banos', 'metroscubiertos', 'metrostotales',
       'lat', 'lng', 'fecha', 'idf_titulo', 'idf_descripcion',
       'peso_descripcion', 'porcentaje_metros', 'diferencia_metros',
       'metroscubiertos_bins_unif', 'metroscubiertos_bins_perc',
       'metros_totales_normalizados', 'metros_cubiertos_normalizados',
       'promedio_metros_tipo_propiedad', 'promedio_metros_cub_tipo_propiedad',
       'promedio_metros_cubiertos_provincia', 'mes', 'dia', 'dias_desde_datos',
       'meses_desde_datos', 'antiguedad_bins_perc', 'tam_ambientes',
       'promedio_precio_provincia', 'promedio_precio_ciudad',
       'promedio_precio_ciudad_gen', 'varianza_precio_ciudad', 'count_ciudad',
       'promedio_id_zona', 'promedio_id_zona_gen', 'varianza_id_zona',
       'count_id_zona', 'promedio_precio_tipo_propiedad',
       'promedio_precio_tipo_propiedad_ciudad',
       'promedio_precio_tipo_propiedad_ciudad_gen',
       'count_tipo_propiedad_ciudad', 'promedio_por_mes', 'varianza_por_mes',
       'promedio_precio_habitaciones_banos_garages',
       'promedio_precio_banos_garages', 'promedio_precio_hbg_tipo_propiedad',
       'lat_norm', 'lng_norm', 'puntaje', 'distancia_ciudad_centrica',
       'distancia_centro_mexico', 'distancia_ciudad_cara',
       'ranking_en_provincia_tipodepropiedad_precio',
       'ranking_en_provincia_tipodepropiedad_cantidad'])),
    ('extra_trees', ExtraTreesRegressor(n_estimators=50, **params))
])

## CatBoost

In [42]:
from catboost import CatBoostRegressor


class CatBoostWrapper(CatBoostRegressor):
    
    def fit(self, x, y):
        x_train, x_eval, y_train, y_eval = train_test_split(x, y, test_size=0.05)
        eval_set = [(x_eval, y_eval)]

        # posiciones de features categoricas encontradas a mano
        cat_features = [0, 1, 2]
        return super(CatBoostWrapper, self).fit(x, y, cat_features=cat_features, eval_set=eval_set, early_stopping_rounds=25)


f_s = FeatureSelector(['garages', 'banos', 'metroscubiertos', 'metrostotales',
       'lat', 'lng', 'fecha', 'porcentaje_metros', 'diferencia_metros',
       'promedio_metros_tipo_propiedad', 'promedio_metros_cubiertos_provincia', 'mes', 
       'dias_desde_datos', 'tam_ambientes',
       'promedio_precio_provincia', 'promedio_precio_ciudad',
       'count_ciudad', 'promedio_id_zona',
       'count_id_zona', 'promedio_precio_tipo_propiedad',
       'promedio_precio_tipo_propiedad_ciudad',
       'count_tipo_propiedad_ciudad', 'promedio_por_mes', 'promedio_precio_hbg_tipo_propiedad',
       'puntaje', 'distancia_ciudad_centrica',
       'distancia_centro_mexico', 'provincia', 'ciudad', 'tipodepropiedad',
       'ranking_en_provincia_tipodepropiedad_precio',
       'ranking_en_provincia_tipodepropiedad_cantidad'] + ['provincia', 'ciudad', 'tipodepropiedad'])

params = {
    'od_wait': 50,
    'od_type': 'Iter',
    'learning_rate': 0.15,
    'l2_leaf_reg': 3,
    'depth': 12,
    'colsample_bylevel': 0.5,
    'border_count': 128,
    'iterations': 800,
    'silent': True,
    'eval_metric': 'MAE'
}


catboost_m = Pipeline([
    ('feature_selector', f_s),
    ('catboost', CatBoostWrapper(**params))
])

## Keras model

In [43]:
from keras.wrappers.scikit_learn import KerasRegressor
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Activation
from sklearn.preprocessing import MinMaxScaler

def keras_modelo():    
    model = Sequential()
    model.add(Dense(units=512, activation='relu'))
    model.add(Dropout(0.13))
    model.add(Dense(units=250, activation='relu'))
    model.add(Dropout(0.13))
    model.add(Dense(units=128, activation='relu'))
    model.add(Dropout(0.13))
    model.add(Dense(units=1, activation='linear'))

    model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_squared_error'])  
    return model

keras_m = Pipeline([
    ('feature_selector', FeatureSelector(['antiguedad', 'habitaciones', 'garages', 'banos', 'metroscubiertos',
       'metrostotales', 'lat', 'lng', 'gimnasio', 'usosmultiples',
       'piscina', 'clustering_label', 'idf_descripcion', 'porcentaje_metros', 'diferencia_metros',
       'promedio_metros_tipo_propiedad', 'prop_frecuente', 'top_provincia',
       'promedio_metros_totales_provincia', 'anio', 'mes',
       'trimestre', 'dias_desde_datos', 'cantidad_inquilinos', 'tam_ambientes', 'promedio_precio_provincia',
       'promedio_precio_ciudad', 'count_ciudad', 'promedio_id_zona',
       'count_id_zona','promedio_precio_tipo_propiedad',
       'promedio_precio_tipo_propiedad_ciudad', 'count_tipo_propiedad',
       'count_tipo_propiedad_ciudad', 'promedio_por_mes', 
       'promedio_precio_banos_garages', 'promedio_precio_hbg_tipo_propiedad',
       'promedio_precio_booleanos', 'puntaje',
       'distancia_ciudad_centrica', 'distancia_centro_mexico',
       'ranking_en_provincia_tipodepropiedad_precio',
       'ranking_en_provincia_tipodepropiedad_cantidad'])),
    ('scaler', MinMaxScaler()),
    ('keras', KerasRegressor(build_fn=keras_modelo, epochs=5))
])

## Stacking

In [44]:
from sklearn.preprocessing import MinMaxScaler
from vecstack import stacking
from vecstack_sk import StackingTransformer

modelos = [
#            ('keras', keras_m),
           ('lightgbm', lgb_m), 
           ('xgboost', xgb_m),
#            ('randomforest', forest_m),
#            ('extratrees', extratrees_m),
#            ('catboost', catboost_m)
          ]

stack = StackingTransformer(modelos, 
                          regression=True, verbose=2, n_folds=5)

stack.fit(df_train_f.drop('precio', axis=1).values, 
                          df_train_f['precio'].values)

task:         [regression]
metric:       [mean_absolute_error]
variant:      [A]
n_estimators: [2]

estimator  0: [lightgbm: Pipeline]
[1]	valid_0's l1: 1.43709e+06
Training until validation scores don't improve for 15 rounds
[2]	valid_0's l1: 1.32538e+06
[3]	valid_0's l1: 1.22747e+06
[4]	valid_0's l1: 1.1386e+06
[5]	valid_0's l1: 1.06021e+06
[6]	valid_0's l1: 991960
[7]	valid_0's l1: 932127
[8]	valid_0's l1: 879604
[9]	valid_0's l1: 832312
[10]	valid_0's l1: 791115
[11]	valid_0's l1: 755415
[12]	valid_0's l1: 723621
[13]	valid_0's l1: 695988
[14]	valid_0's l1: 673283
[15]	valid_0's l1: 652295
[16]	valid_0's l1: 635957
[17]	valid_0's l1: 621298
[18]	valid_0's l1: 608042
[19]	valid_0's l1: 595851
[20]	valid_0's l1: 584383
[21]	valid_0's l1: 575044
[22]	valid_0's l1: 567037
[23]	valid_0's l1: 559290
[24]	valid_0's l1: 552610
[25]	valid_0's l1: 546984
[26]	valid_0's l1: 541422
[27]	valid_0's l1: 536636
[28]	valid_0's l1: 532120
[29]	valid_0's l1: 528608
[30]	valid_0's l1: 524592
[31]	vali

[301]	valid_0's l1: 469402
[302]	valid_0's l1: 469483
[303]	valid_0's l1: 469399
[304]	valid_0's l1: 469367
[305]	valid_0's l1: 469370
[306]	valid_0's l1: 469301
[307]	valid_0's l1: 469184
[308]	valid_0's l1: 468915
[309]	valid_0's l1: 469043
[310]	valid_0's l1: 468953
[311]	valid_0's l1: 468915
[312]	valid_0's l1: 468909
[313]	valid_0's l1: 468956
[314]	valid_0's l1: 468904
[315]	valid_0's l1: 468792
[316]	valid_0's l1: 468788
[317]	valid_0's l1: 468707
[318]	valid_0's l1: 468640
[319]	valid_0's l1: 468538
[320]	valid_0's l1: 468436
[321]	valid_0's l1: 468514
[322]	valid_0's l1: 468489
[323]	valid_0's l1: 468447
[324]	valid_0's l1: 468502
[325]	valid_0's l1: 468289
[326]	valid_0's l1: 468393
[327]	valid_0's l1: 468462
[328]	valid_0's l1: 468451
[329]	valid_0's l1: 468436
[330]	valid_0's l1: 468305
[331]	valid_0's l1: 468227
[332]	valid_0's l1: 468161
[333]	valid_0's l1: 468077
[334]	valid_0's l1: 468064
[335]	valid_0's l1: 468092
[336]	valid_0's l1: 468134
[337]	valid_0's l1: 468091
[

[245]	valid_0's l1: 465987
[246]	valid_0's l1: 466367
[247]	valid_0's l1: 466619
[248]	valid_0's l1: 466505
[249]	valid_0's l1: 466400
[250]	valid_0's l1: 466208
[251]	valid_0's l1: 466198
[252]	valid_0's l1: 466235
[253]	valid_0's l1: 466130
[254]	valid_0's l1: 465921
[255]	valid_0's l1: 465762
[256]	valid_0's l1: 465789
[257]	valid_0's l1: 465631
[258]	valid_0's l1: 465617
[259]	valid_0's l1: 465616
[260]	valid_0's l1: 465679
[261]	valid_0's l1: 465555
[262]	valid_0's l1: 465434
[263]	valid_0's l1: 465406
[264]	valid_0's l1: 465255
[265]	valid_0's l1: 465186
[266]	valid_0's l1: 465349
[267]	valid_0's l1: 465275
[268]	valid_0's l1: 465214
[269]	valid_0's l1: 465143
[270]	valid_0's l1: 465184
[271]	valid_0's l1: 465198
[272]	valid_0's l1: 465214
[273]	valid_0's l1: 465124
[274]	valid_0's l1: 465278
[275]	valid_0's l1: 465174
[276]	valid_0's l1: 465170
[277]	valid_0's l1: 465109
[278]	valid_0's l1: 465070
[279]	valid_0's l1: 465005
[280]	valid_0's l1: 465054
[281]	valid_0's l1: 465133
[

[174]	valid_0's l1: 461746
[175]	valid_0's l1: 461729
[176]	valid_0's l1: 461494
[177]	valid_0's l1: 461530
[178]	valid_0's l1: 461606
[179]	valid_0's l1: 461581
[180]	valid_0's l1: 461603
[181]	valid_0's l1: 461680
[182]	valid_0's l1: 461706
[183]	valid_0's l1: 461597
[184]	valid_0's l1: 461373
[185]	valid_0's l1: 461183
[186]	valid_0's l1: 461179
[187]	valid_0's l1: 461191
[188]	valid_0's l1: 461252
[189]	valid_0's l1: 461213
[190]	valid_0's l1: 460823
[191]	valid_0's l1: 460745
[192]	valid_0's l1: 460731
[193]	valid_0's l1: 460577
[194]	valid_0's l1: 460689
[195]	valid_0's l1: 460665
[196]	valid_0's l1: 460458
[197]	valid_0's l1: 460292
[198]	valid_0's l1: 460505
[199]	valid_0's l1: 460450
[200]	valid_0's l1: 460265
[201]	valid_0's l1: 460031
[202]	valid_0's l1: 459920
[203]	valid_0's l1: 459868
[204]	valid_0's l1: 459934
[205]	valid_0's l1: 459828
[206]	valid_0's l1: 459803
[207]	valid_0's l1: 459841
[208]	valid_0's l1: 459984
[209]	valid_0's l1: 459990
[210]	valid_0's l1: 460005
[

[8]	valid_0's l1: 935962
[9]	valid_0's l1: 886053
[10]	valid_0's l1: 844309
[11]	valid_0's l1: 805714
[12]	valid_0's l1: 772503
[13]	valid_0's l1: 741506
[14]	valid_0's l1: 714516
[15]	valid_0's l1: 691482
[16]	valid_0's l1: 671055
[17]	valid_0's l1: 652837
[18]	valid_0's l1: 636996
[19]	valid_0's l1: 623926
[20]	valid_0's l1: 612615
[21]	valid_0's l1: 602717
[22]	valid_0's l1: 594060
[23]	valid_0's l1: 585241
[24]	valid_0's l1: 578355
[25]	valid_0's l1: 571713
[26]	valid_0's l1: 565996
[27]	valid_0's l1: 561583
[28]	valid_0's l1: 557498
[29]	valid_0's l1: 553492
[30]	valid_0's l1: 549398
[31]	valid_0's l1: 546650
[32]	valid_0's l1: 544015
[33]	valid_0's l1: 541495
[34]	valid_0's l1: 539217
[35]	valid_0's l1: 537029
[36]	valid_0's l1: 535014
[37]	valid_0's l1: 533322
[38]	valid_0's l1: 532212
[39]	valid_0's l1: 530698
[40]	valid_0's l1: 529713
[41]	valid_0's l1: 528081
[42]	valid_0's l1: 526674
[43]	valid_0's l1: 526060
[44]	valid_0's l1: 524314
[45]	valid_0's l1: 523524
[46]	valid_0's

[120]	validation_0-mae:485316
[121]	validation_0-mae:485328
[122]	validation_0-mae:485286
[123]	validation_0-mae:485242
[124]	validation_0-mae:485242
[125]	validation_0-mae:485236
[126]	validation_0-mae:485227
[127]	validation_0-mae:485221
[128]	validation_0-mae:485219
[129]	validation_0-mae:485168
[130]	validation_0-mae:485147
[131]	validation_0-mae:485089
[132]	validation_0-mae:485080
[133]	validation_0-mae:485069
[134]	validation_0-mae:485051
[135]	validation_0-mae:484980
[136]	validation_0-mae:484972
[137]	validation_0-mae:484994
[138]	validation_0-mae:484993
[139]	validation_0-mae:484971
[140]	validation_0-mae:484959
[141]	validation_0-mae:484895
[142]	validation_0-mae:484900
[143]	validation_0-mae:484905
[144]	validation_0-mae:484896
[145]	validation_0-mae:484914
[146]	validation_0-mae:484890
[147]	validation_0-mae:484923
[148]	validation_0-mae:484927
[149]	validation_0-mae:484928
[150]	validation_0-mae:484921
[151]	validation_0-mae:484929
[152]	validation_0-mae:484916
[153]	vali

[32]	validation_0-mae:488602
[33]	validation_0-mae:488149
[34]	validation_0-mae:488045
[35]	validation_0-mae:487706
[36]	validation_0-mae:487235
[37]	validation_0-mae:486792
[38]	validation_0-mae:486593
[39]	validation_0-mae:486269
[40]	validation_0-mae:486091
[41]	validation_0-mae:485767
[42]	validation_0-mae:485794
[43]	validation_0-mae:485673
[44]	validation_0-mae:485352
[45]	validation_0-mae:485203
[46]	validation_0-mae:485068
[47]	validation_0-mae:484762
[48]	validation_0-mae:484520
[49]	validation_0-mae:484238
[50]	validation_0-mae:484258
[51]	validation_0-mae:484126
[52]	validation_0-mae:483909
[53]	validation_0-mae:483847
[54]	validation_0-mae:483692
[55]	validation_0-mae:483668
[56]	validation_0-mae:483577
[57]	validation_0-mae:483500
[58]	validation_0-mae:483460
[59]	validation_0-mae:483331
[60]	validation_0-mae:483316
[61]	validation_0-mae:483231
[62]	validation_0-mae:483162
[63]	validation_0-mae:483013
[64]	validation_0-mae:483044
[65]	validation_0-mae:483061
[66]	validatio

[308]	validation_0-mae:479379
[309]	validation_0-mae:479387
[310]	validation_0-mae:479380
[311]	validation_0-mae:479380
[312]	validation_0-mae:479374
[313]	validation_0-mae:479369
[314]	validation_0-mae:479378
[315]	validation_0-mae:479382
[316]	validation_0-mae:479403
[317]	validation_0-mae:479403
[318]	validation_0-mae:479387
[319]	validation_0-mae:479394
[320]	validation_0-mae:479335
[321]	validation_0-mae:479351
[322]	validation_0-mae:479342
[323]	validation_0-mae:479340
[324]	validation_0-mae:479347
[325]	validation_0-mae:479347
[326]	validation_0-mae:479353
[327]	validation_0-mae:479334
[328]	validation_0-mae:479325
[329]	validation_0-mae:479316
[330]	validation_0-mae:479310
[331]	validation_0-mae:479306
[332]	validation_0-mae:479306
[333]	validation_0-mae:479314
[334]	validation_0-mae:479304
[335]	validation_0-mae:479308
[336]	validation_0-mae:479304
[337]	validation_0-mae:479300
[338]	validation_0-mae:479299
[339]	validation_0-mae:479312
[340]	validation_0-mae:479300
[341]	vali

[203]	validation_0-mae:486594
[204]	validation_0-mae:486591
[205]	validation_0-mae:486540
[206]	validation_0-mae:486504
[207]	validation_0-mae:486479
[208]	validation_0-mae:486480
[209]	validation_0-mae:486474
[210]	validation_0-mae:486422
[211]	validation_0-mae:486398
[212]	validation_0-mae:486394
[213]	validation_0-mae:486392
[214]	validation_0-mae:486366
[215]	validation_0-mae:486325
[216]	validation_0-mae:486313
[217]	validation_0-mae:486313
[218]	validation_0-mae:486302
[219]	validation_0-mae:486286
[220]	validation_0-mae:486283
[221]	validation_0-mae:486252
[222]	validation_0-mae:486290
[223]	validation_0-mae:486285
[224]	validation_0-mae:486283
[225]	validation_0-mae:486249
[226]	validation_0-mae:486268
[227]	validation_0-mae:486261
[228]	validation_0-mae:486277
[229]	validation_0-mae:486275
[230]	validation_0-mae:486262
[231]	validation_0-mae:486265
[232]	validation_0-mae:486237
[233]	validation_0-mae:486224
[234]	validation_0-mae:486199
[235]	validation_0-mae:486211
[236]	vali

[93]	validation_0-mae:489137
[94]	validation_0-mae:489141
[95]	validation_0-mae:489148
[96]	validation_0-mae:489120
[97]	validation_0-mae:489122
[98]	validation_0-mae:489014
[99]	validation_0-mae:489015
[100]	validation_0-mae:488991
[101]	validation_0-mae:488904
[102]	validation_0-mae:488852
[103]	validation_0-mae:488867
[104]	validation_0-mae:488829
[105]	validation_0-mae:488842
[106]	validation_0-mae:488834
[107]	validation_0-mae:488814
[108]	validation_0-mae:488805
[109]	validation_0-mae:488747
[110]	validation_0-mae:488704
[111]	validation_0-mae:488690
[112]	validation_0-mae:488643
[113]	validation_0-mae:488629
[114]	validation_0-mae:488614
[115]	validation_0-mae:488565
[116]	validation_0-mae:488571
[117]	validation_0-mae:488571
[118]	validation_0-mae:488569
[119]	validation_0-mae:488474
[120]	validation_0-mae:488442
[121]	validation_0-mae:488454
[122]	validation_0-mae:488402
[123]	validation_0-mae:488401
[124]	validation_0-mae:488411
[125]	validation_0-mae:488333
[126]	validation_

StackingTransformer(estimators=[('lightgbm',
                                 Pipeline(memory=None,
                                          steps=[('feature_selector',
                                                  FeatureSelector(features=['habitaciones',
                                                                            'garages',
                                                                            'banos',
                                                                            'antiguedad',
                                                                            'metroscubiertos',
                                                                            'metrostotales',
                                                                            'lat_norm',
                                                                            'lng_normgimnasio',
                                                                            'usosmultiples',
           

In [45]:
s_train = stack.transform(df_train_f.drop('precio', axis=1).values)
s_test = stack.transform(df_test_f.drop('precio', axis=1, errors='ignore').values)

Train set was detected.
Transforming...

estimator  0: [lightgbm: Pipeline]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    model from fold  4: done
    ----
    DONE

estimator  1: [xgboost: Pipeline]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    model from fold  4: done
    ----
    DONE

Transforming...

estimator  0: [lightgbm: Pipeline]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    model from fold  4: done
    ----
    DONE

estimator  1: [xgboost: Pipeline]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    model from fold  4: done
    ----
    DONE



## Prediccion con todos los features + stacking

In [50]:
df_train_s = df_train_f.copy()
df_test_s = df_test_f.copy()

df_train_s['stack01'], df_train_s['stack02'] = zip(*s_train)
df_test_s['stack01'], df_test_s['stack02'] = zip(*s_test)

features_stacking = ['stack01', 'stack02']

In [51]:
df_train_s['id'] = df_train['id']
df_test_s['id'] = df_test['id']

In [52]:
import xgboost as xgb
import pandas as pd
import numpy as np
from hyperopt import fmin, tpe, hp
from hyperopt.pyll.base import scope
import ipynb.fs.full.utils as utils
import ipynb.fs.full.features as features
import ipynb.fs.full.features_distancias as f_distancias


from sklearn.model_selection import cross_val_score
from hyperopt import STATUS_OK


features = ['habitaciones', 
            'garages', 
            'banos',
            'antiguedad',
           'metroscubiertos', 
            'metrostotales',
            'lat_norm', 'lng_norm'
            'gimnasio', 'usosmultiples', 'piscina','prop_frecuente', 'top_provincia', 'promedio_precio_ciudad', 
                 'anio', 'promedio_id_zona', 'promedio_precio_tipo_propiedad', 
                 'count_id_zona', 'count_ciudad', 'puntaje', 
                     'count_tipo_propiedad_ciudad', 
                 'promedio_precio_tipo_propiedad_ciudad_gen',
                 'count_id_zona'
                 'dias_desde_datos',
                 'meses_desde_datos',
                 'porcentaje_metros',
                 'distancia_ciudad_centrica']


# params_2nd = {'bagging_fraction': 0.8999882607358867,
#  'bagging_freq': int(95.0),
#  'feature_fraction': 0.2570109385381975,
#  'learning_rate': 0.13601832720254403,
#  'max_depth': int(26.0),
#  'num_leaves': int(175.0),
#  'test_size': 0.08363501292068126,
#  'boosting_type': 'dart',
#  'num_boost_round': 1200,
#  'objective': 'regression',
#  'metric': 'mae'}

params_2nd = {
 'bagging_freq': 0,
 'max_bin': 1012,
 'max_depth': 5,
 'min_data_in_leaf': 40,
 'min_split_gain': 0.2934964048344291,
 'num_leaves': 469,
 'reg_lambda': 36.47905035024398,
 'boosting_type': 'gbdt',
 'num_boost_round': 1200,
 'objective': 'regression',
 'metric': 'mae'
}


# def eval_lightgbm(params):
#     return {'loss': -np.average(cross_val_score(LightGBMWrapper(**params), utils.filtrar_features(df_train_s, features + features_stacking), 
#                                                 df_train['precio'].values, cv=3, scoring='neg_mean_absolute_error')),
#             'status': STATUS_OK}

# space = {
#     'boosting_type': 'gbdt',
#     'objective': 'regression',
#     'metric': 'mae', # Si se deja vacio se toma el ideal para llegar al 'objective'
# #     'bagging_fraction': hp.uniform('bagging_fraction', 0.1, 1),
# #     'bagging_freq': hp.choice('bagging_freq', [0, 1, 5, 10, 20, 40]),
# #     'bagging_fraction': hp.uniform('bagging_fraction', 0.1, 1),
#     'bagging_freq': hp.choice('bagging_freq', [0]),
#     'min_data_in_leaf': scope.int(hp.quniform('min_data_in_leaf', 5, 95, 1)),
#     'max_bin': scope.int(hp.quniform('max_bin', 64, 2014, 1)),
#     'reg_lambda': hp.uniform('reg_lambda', 0, 75),
#     'min_split_gain': hp.uniform('min_split_gain', 0, 1),
#     'feature_fraction': 0.85,
#     'max_depth': scope.int(hp.quniform('max_depth', 3, 25, 1)),
#     'num_leaves': scope.int(hp.quniform('num_leaves', 100, 800, 1)),
#     'learning_rate': 0.15,
#     'n_jobs': 4,
#     'silent': True
# }

# hps = fmin(eval_lightgbm, space=space, algo=tpe.suggest, max_evals=30, verbose=1)

# display(hps)

lgb_m_2nd = LightGBMWrapper(**params_2nd)
lgb_m_2nd.fit(utils.filtrar_features(df_train_s, features + features_stacking), df_train['precio'].values)

[1]	valid_0's l1: 1.45566e+06
Training until validation scores don't improve for 15 rounds
[2]	valid_0's l1: 1.33076e+06
[3]	valid_0's l1: 1.21977e+06
[4]	valid_0's l1: 1.12148e+06
[5]	valid_0's l1: 1.03478e+06
[6]	valid_0's l1: 958730
[7]	valid_0's l1: 891900
[8]	valid_0's l1: 832938
[9]	valid_0's l1: 782213
[10]	valid_0's l1: 738341
[11]	valid_0's l1: 701354
[12]	valid_0's l1: 669392
[13]	valid_0's l1: 641753
[14]	valid_0's l1: 617704
[15]	valid_0's l1: 597078
[16]	valid_0's l1: 579329
[17]	valid_0's l1: 564325
[18]	valid_0's l1: 551188
[19]	valid_0's l1: 540121
[20]	valid_0's l1: 530634
[21]	valid_0's l1: 522571
[22]	valid_0's l1: 515517
[23]	valid_0's l1: 509547
[24]	valid_0's l1: 504351
[25]	valid_0's l1: 500124
[26]	valid_0's l1: 496545
[27]	valid_0's l1: 493460
[28]	valid_0's l1: 490860
[29]	valid_0's l1: 488701
[30]	valid_0's l1: 486955
[31]	valid_0's l1: 485384
[32]	valid_0's l1: 483912
[33]	valid_0's l1: 482856
[34]	valid_0's l1: 481881
[35]	valid_0's l1: 481236
[36]	valid_0'

LightGBMWrapper(bagging_freq=0, boosting_type='gbdt', class_weight=None,
                colsample_bytree=1.0, importance_type='split',
                learning_rate=0.1, max_bin=1012, max_depth=5, metric='mae',
                min_child_samples=20, min_child_weight=0.001,
                min_data_in_leaf=40, min_split_gain=0.2934964048344291,
                n_estimators=100, n_jobs=-1, num_boost_round=1200,
                num_leaves=469, objective='regression', random_state=None,
                reg_alpha=0.0, reg_lambda=36.47905035024398, silent=True,
                subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [53]:
df_test_s['target'] = lgb_m_2nd.predict(utils.filtrar_features(df_test_s, features + features_stacking))
# df_test_s = utils.pesificar_df(df_test_s, col_precio_in='target', col_precio_out='target')
df_test_s[['id', 'target']].to_csv('respuesta55.csv', index = False)

# print(f'MAE Stacking-full: {utils.MAE(df_test_s["precio"].values, df_test_s["target"].values)}')

## Prediccion solo con features de stacking

In [22]:
# params_2nd = {'bagging_fraction': 0.8924398062087346,
#  'bagging_freq': int(36.0),
#  'feature_fraction': 0.16167385124183287,
#  'learning_rate': 0.054693418899570134,
#  'max_depth': int(4.0),
#  'num_leaves': int(93.0),
#  'objective': 'regression',
#  'boosting_type': 'gbdt',
#  'metric': 'mae'}

params_2nd = {'bagging_fraction': 0.8243831977099841,
 'bagging_freq': int(10.0),
 'feature_fraction': 0.9228324501365147,
 'learning_rate': 0.050664243951241736,
 'max_depth': int(3.0),
 'num_leaves': int(78.0),
 'objective': 'regression',
 'boosting_type': 'dart',
 'num_boost_round': 1200,
 'metric': 'mae'}


lgb_m_2nd = LightGBMWrapper(**params_2nd)
lgb_m_2nd.fit(s_train, df_train_f['precio'].values)

df_test_f['target'] = lgb_m_2nd.predict(s_test)
df_test_f[['id', 'target']].to_csv('respuesta47.csv', index = False)



In [49]:
features = ['stack01', 'stack02', 'stack03', 'stack04']

def eval_lightgbm(args):
    num_leaves, learning_rate, feature_fraction, bagging_fraction, bagging_freq, max_depth = args

    lgb_train = lgb.Dataset(s_train, df_train['precio'].values)
    
    num_leaves = int(num_leaves)
    bagging_freq = int(bagging_freq)
    max_depth = int(max_depth)
    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'mae'}, # Si se deja vacio se toma el ideal para llegar al 'objective'
        'num_leaves': num_leaves,
        'learning_rate': learning_rate,
        'feature_fraction': feature_fraction,
        'bagging_fraction': bagging_fraction,
        'bagging_freq': bagging_freq,
        'max_depth': max_depth,
        'verbose': -1,
    }

    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=250,
                    verbose_eval=-1)
    
    y_pred_test = gbm.predict(s_test, num_iteration=gbm.best_iteration)
    return utils.MAE(df_test['precio'].values, y_pred_test)

space = [hp.quniform('num_leaves', 15, 130, 1), hp.uniform('learning_rate', 0.05, 0.9),
        hp.uniform('feature_fraction', 0.90, 1), hp.uniform('bagging_fraction', 0.70, 1),
        hp.quniform('bagging_freq', 0, 40, 1), hp.quniform('max_depth', 3, 15, 1)]

hps = fmin(eval_lightgbm, space=space, algo=tpe.suggest, max_evals=400, verbose=1)

display(hps)

100%|██████████| 400/400 [17:21<00:00,  2.60s/it, best loss: 511432.09539443516]


{'bagging_fraction': 0.8243831977099841,
 'bagging_freq': 10.0,
 'feature_fraction': 0.9228324501365147,
 'learning_rate': 0.050664243951241736,
 'max_depth': 3.0,
 'num_leaves': 78.0}

## Prediccion con promedios

In [19]:
df_test_f['target'] = np.average(s_test, axis=1)
df_test_f[['id', 'target']].to_csv('respuesta44.csv', index = False)

In [77]:
from scipy.stats import mode

y_pred_test = mode(s_test, axis=1)[0]
print(f"MAE Stacking only: {utils.MAE(y_pred_test, df_test_f['precio'].values)}")

MAE Stacking only: 519840.412728738


In [95]:
import numpy as np

y_pred_test = np.average(s_test, axis=1)
print(f"MAE Stacking only: {utils.MAE(y_pred_test, df_test_f['precio'].values)}")

MAE Stacking only: 502391.48180612386


In [91]:
from scipy.optimize import minimize

def mae_res(weights):
    y_pred_test = np.average(s_test, weights=weights, axis=1)
    return utils.MAE(y_pred_test, df_test_f['precio'].values)

x0 = [1] * len(s_test.T)
minimize(mae_res, x0)

      fun: 497375.416620218
 hess_inv: array([[ 7.64527106e-05, -6.84363088e-05,  5.18102889e-04,
         1.30966978e-03],
       [-6.84363088e-05,  6.84021400e-05, -4.91721763e-04,
        -1.17091868e-03],
       [ 5.18102889e-04, -4.91721763e-04,  4.00738244e-03,
         9.31303274e-03],
       [ 1.30966978e-03, -1.17091868e-03,  9.31303274e-03,
         2.30019160e-02]])
      jac: array([5.8359375 , 2.96484375, 0.3671875 , 1.5859375 ])
  message: 'Desired error not necessarily achieved due to precision loss.'
     nfev: 702
      nit: 15
     njev: 115
   status: 2
  success: False
        x: array([ 0.40410989, -0.16217251,  2.30245548,  0.39816892])

In [92]:
from scipy.optimize import minimize, differential_evolution

def mae_res(weights):
    y_pred_test = np.average(s_test, weights=weights, axis=1)
    return utils.MAE(y_pred_test, df_test_f['precio'].values)

x0 = [(-3, 4)] * len(s_test.T)
differential_evolution(mae_res, bounds=x0)

     fun: 497375.4169548869
     jac: array([-0.58207661, -0.83819032, -0.23283064,  1.65309757])
 message: 'Optimization terminated successfully.'
    nfev: 425
     nit: 5
 success: True
       x: array([ 0.49248228, -0.19790651,  2.80637364,  0.48548681])

In [23]:
df_test_s[['id'] + features_stacking].to_csv('data/stacking5_test.csv', index=False)
df_train_s[['id'] + features_stacking].to_csv('data/stacking5_train.csv', index=False)

## Blending

In [54]:
from sklearn.base import clone
from sklearn.model_selection import train_test_split


class BlendingTransformer():

    def __init__(self, models, model_blend, val_split=0.15, verbose=True):
        self.models = models
        self.val_split = val_split
        self.model_blend = model_blend
        self.verbose = verbose

        
    def simple_fit(self, x, y):
        x_cat = FeatureSelector(['provincia']).transform(x)
        
        x_train, x_eval, y_train, y_eval = train_test_split(x, y, 
                                                            test_size=self.val_split,
                                                            random_state=1,
                                                            stratify=x_cat)

        self._fit_first(x_train, y_train)
        self._fit_second(x_eval, y_eval)
        


    def fit(self, x, y, x_test):

        x_cat = FeatureSelector(['provincia']).transform(x)
        
        x_train, x_eval, y_train, y_eval = train_test_split(x, y, 
                                                            test_size=self.val_split,
                                                            random_state=0,
                                                            stratify=x_cat)

        self._fit_first(x_train, y_train)
        self._fit_second(x_eval, y_eval)

        self._fit_first(x, y)
        y_test = self._predict_second(x_test)

        x = np.concatenate((x, x_test))
        y = np.concatenate((y, y_test))
        x_cat = FeatureSelector(['provincia', 'intervalo_metros_cubiertos']).transform(x)
        
        x_train, x_eval, y_train, y_eval = train_test_split(x, y, 
                                                            test_size=self.val_split, 
                                                            random_state=0,
                                                            stratify=x_cat)
        self._fit_first(x_train, y_train)
        self._fit_second(x_eval, y_eval)

        self._fit_first(x, y)


    def _fit_first(self, x, y):

        for i in range(len(self.models)):   
            if self.verbose:
                print(f'Fitting {i}...')
            self.models[i] = clone(self.models[i])
            self.models[i].fit(x, y)


    def _fit_second(self, x, y):
        if self.verbose:
            print(f'Fitting blender...')
        self.model_blend = clone(self.model_blend)
        self.model_blend.fit(self._predict_first(x), y)


            
    def _predict_first(self, x):
        preds = []
        for i in range(len(self.models)):
            y_pred = self.models[i].predict(x)
            preds.append(y_pred)

        return np.array(preds).T


    def _predict_second(self, x):
        return self.model_blend.predict(self._predict_first(x))


    def predict(self, x):
        return self._predict_second(x)
    
    
modelos = [
           lgb_m,
#            catboost_m,
           xgb_m,
#            forest_m
          ]

params_2nd = {'bagging_fraction': 0.8243831977099841,
 'bagging_freq': int(10.0),
 'feature_fraction': 0.9228324501365147,
 'learning_rate': 0.050664243951241736,
 'max_depth': int(3.0),
 'num_leaves': int(78.0),
 'objective': 'regression',
 'boosting_type': 'dart',
 'num_boost_round': 1200,
 'metric': 'mae'}

lgb_m_2nd = LightGBMWrapper(**params_2nd)

blend = BlendingTransformer(modelos, lgb_m_2nd)

### Simple blend

In [34]:
blend.simple_fit(df_train_f.drop('precio', axis=1).values, df_train_f['precio'].values)

Fitting 0...
[1]	valid_0's l1: 1.26295e+06
Training until validation scores don't improve for 25 rounds
[2]	valid_0's l1: 1.03402e+06
[3]	valid_0's l1: 873763
[4]	valid_0's l1: 763554
[5]	valid_0's l1: 688818
[6]	valid_0's l1: 636144
[7]	valid_0's l1: 600321
[8]	valid_0's l1: 575559
[9]	valid_0's l1: 557892
[10]	valid_0's l1: 546389
[11]	valid_0's l1: 538399
[12]	valid_0's l1: 532480
[13]	valid_0's l1: 526075
[14]	valid_0's l1: 522024
[15]	valid_0's l1: 518883
[16]	valid_0's l1: 516157
[17]	valid_0's l1: 514013
[18]	valid_0's l1: 512474
[19]	valid_0's l1: 510622
[20]	valid_0's l1: 509510
[21]	valid_0's l1: 508198
[22]	valid_0's l1: 507296
[23]	valid_0's l1: 506631
[24]	valid_0's l1: 506177
[25]	valid_0's l1: 505580
[26]	valid_0's l1: 504593
[27]	valid_0's l1: 503853
[28]	valid_0's l1: 503037
[29]	valid_0's l1: 502674
[30]	valid_0's l1: 502127
[31]	valid_0's l1: 501628
[32]	valid_0's l1: 501740
[33]	valid_0's l1: 500688
[34]	valid_0's l1: 500524
[35]	valid_0's l1: 500018
[36]	valid_0's 

[154]	validation_0-mae:466271
[155]	validation_0-mae:466272
[156]	validation_0-mae:466276
[157]	validation_0-mae:466247
[158]	validation_0-mae:466219
[159]	validation_0-mae:466178
[160]	validation_0-mae:466152
[161]	validation_0-mae:466171
[162]	validation_0-mae:466175
[163]	validation_0-mae:466178
[164]	validation_0-mae:466194
[165]	validation_0-mae:466187
[166]	validation_0-mae:466134
[167]	validation_0-mae:466131
[168]	validation_0-mae:466147
[169]	validation_0-mae:466028
[170]	validation_0-mae:465988
[171]	validation_0-mae:465965
[172]	validation_0-mae:465957
[173]	validation_0-mae:465914
[174]	validation_0-mae:465898
[175]	validation_0-mae:465944
[176]	validation_0-mae:465920
[177]	validation_0-mae:465924
[178]	validation_0-mae:465917
[179]	validation_0-mae:465914
[180]	validation_0-mae:465902
[181]	validation_0-mae:465921
[182]	validation_0-mae:465866
[183]	validation_0-mae:465858
[184]	validation_0-mae:465848
[185]	validation_0-mae:465831
[186]	validation_0-mae:465814
[187]	vali



In [24]:
utils.MAE(blend.predict(df_test_f.drop('precio', axis=1).values), df_test_f['precio'].values)

508087.59099414723

### Long blend

In [55]:
blend.fit(df_train_f.drop('precio', axis=1).values, 
                          df_train_f['precio'].values, df_test_f.drop('precio', axis=1, errors='ignore').values)

Fitting 0...
[1]	valid_0's l1: 1.48803e+06
Training until validation scores don't improve for 15 rounds
[2]	valid_0's l1: 1.37088e+06
[3]	valid_0's l1: 1.26584e+06
[4]	valid_0's l1: 1.17282e+06
[5]	valid_0's l1: 1.09168e+06
[6]	valid_0's l1: 1.01912e+06
[7]	valid_0's l1: 953872
[8]	valid_0's l1: 897402
[9]	valid_0's l1: 848430
[10]	valid_0's l1: 804754
[11]	valid_0's l1: 764916
[12]	valid_0's l1: 731757
[13]	valid_0's l1: 702461
[14]	valid_0's l1: 676961
[15]	valid_0's l1: 652696
[16]	valid_0's l1: 632558
[17]	valid_0's l1: 614795
[18]	valid_0's l1: 600288
[19]	valid_0's l1: 586600
[20]	valid_0's l1: 574868
[21]	valid_0's l1: 564166
[22]	valid_0's l1: 555127
[23]	valid_0's l1: 546370
[24]	valid_0's l1: 539778
[25]	valid_0's l1: 532657
[26]	valid_0's l1: 526886
[27]	valid_0's l1: 522491
[28]	valid_0's l1: 518000
[29]	valid_0's l1: 513561
[30]	valid_0's l1: 509275
[31]	valid_0's l1: 505579
[32]	valid_0's l1: 503344
[33]	valid_0's l1: 500487
[34]	valid_0's l1: 497850
[35]	valid_0's l1: 49

[116]	validation_0-mae:458599
[117]	validation_0-mae:458563
[118]	validation_0-mae:458554
[119]	validation_0-mae:458578
[120]	validation_0-mae:458585
[121]	validation_0-mae:458600
[122]	validation_0-mae:458523
[123]	validation_0-mae:458521
[124]	validation_0-mae:458516
[125]	validation_0-mae:458535
[126]	validation_0-mae:458522
[127]	validation_0-mae:458523
[128]	validation_0-mae:458443
[129]	validation_0-mae:458428
[130]	validation_0-mae:458412
[131]	validation_0-mae:458398
[132]	validation_0-mae:458356
[133]	validation_0-mae:458380
[134]	validation_0-mae:458378
[135]	validation_0-mae:458352
[136]	validation_0-mae:458352
[137]	validation_0-mae:458296
[138]	validation_0-mae:458263
[139]	validation_0-mae:458224
[140]	validation_0-mae:458202
[141]	validation_0-mae:458211
[142]	validation_0-mae:458199
[143]	validation_0-mae:458193
[144]	validation_0-mae:458169
[145]	validation_0-mae:458182
[146]	validation_0-mae:458177
[147]	validation_0-mae:458100
[148]	validation_0-mae:458114
[149]	vali



Fitting 0...
[1]	valid_0's l1: 1.46973e+06
Training until validation scores don't improve for 15 rounds
[2]	valid_0's l1: 1.35579e+06
[3]	valid_0's l1: 1.25643e+06
[4]	valid_0's l1: 1.16642e+06
[5]	valid_0's l1: 1.08666e+06
[6]	valid_0's l1: 1.01657e+06
[7]	valid_0's l1: 955247
[8]	valid_0's l1: 899829
[9]	valid_0's l1: 852697
[10]	valid_0's l1: 810806
[11]	valid_0's l1: 773406
[12]	valid_0's l1: 741113
[13]	valid_0's l1: 712658
[14]	valid_0's l1: 688434
[15]	valid_0's l1: 665116
[16]	valid_0's l1: 646528
[17]	valid_0's l1: 628780
[18]	valid_0's l1: 614887
[19]	valid_0's l1: 603017
[20]	valid_0's l1: 592334
[21]	valid_0's l1: 582363
[22]	valid_0's l1: 574094
[23]	valid_0's l1: 565942
[24]	valid_0's l1: 558678
[25]	valid_0's l1: 552833
[26]	valid_0's l1: 548066
[27]	valid_0's l1: 543487
[28]	valid_0's l1: 539538
[29]	valid_0's l1: 535582
[30]	valid_0's l1: 532360
[31]	valid_0's l1: 530170
[32]	valid_0's l1: 527804
[33]	valid_0's l1: 526079
[34]	valid_0's l1: 523702
[35]	valid_0's l1: 52

[79]	validation_0-mae:478211
[80]	validation_0-mae:478170
[81]	validation_0-mae:478134
[82]	validation_0-mae:478090
[83]	validation_0-mae:478072
[84]	validation_0-mae:478034
[85]	validation_0-mae:477937
[86]	validation_0-mae:477935
[87]	validation_0-mae:477841
[88]	validation_0-mae:477837
[89]	validation_0-mae:477794
[90]	validation_0-mae:477808
[91]	validation_0-mae:477799
[92]	validation_0-mae:477790
[93]	validation_0-mae:477723
[94]	validation_0-mae:477734
[95]	validation_0-mae:477756
[96]	validation_0-mae:477763
[97]	validation_0-mae:477708
[98]	validation_0-mae:477663
[99]	validation_0-mae:477658
[100]	validation_0-mae:477653
[101]	validation_0-mae:477617
[102]	validation_0-mae:477626
[103]	validation_0-mae:477596
[104]	validation_0-mae:477538
[105]	validation_0-mae:477482
[106]	validation_0-mae:477494
[107]	validation_0-mae:477444
[108]	validation_0-mae:477422
[109]	validation_0-mae:477391
[110]	validation_0-mae:477339
[111]	validation_0-mae:477322
[112]	validation_0-mae:477308
[

[143]	valid_0's l1: 408445
[144]	valid_0's l1: 408236
[145]	valid_0's l1: 408260
[146]	valid_0's l1: 408283
[147]	valid_0's l1: 408324
[148]	valid_0's l1: 408171
[149]	valid_0's l1: 408208
[150]	valid_0's l1: 408274
[151]	valid_0's l1: 408216
[152]	valid_0's l1: 408167
[153]	valid_0's l1: 408165
[154]	valid_0's l1: 408186
[155]	valid_0's l1: 408085
[156]	valid_0's l1: 408003
[157]	valid_0's l1: 407921
[158]	valid_0's l1: 407926
[159]	valid_0's l1: 407724
[160]	valid_0's l1: 407515
[161]	valid_0's l1: 407315
[162]	valid_0's l1: 407039
[163]	valid_0's l1: 406823
[164]	valid_0's l1: 406606
[165]	valid_0's l1: 406562
[166]	valid_0's l1: 406604
[167]	valid_0's l1: 406284
[168]	valid_0's l1: 406237
[169]	valid_0's l1: 406269
[170]	valid_0's l1: 406175
[171]	valid_0's l1: 405956
[172]	valid_0's l1: 406008
[173]	valid_0's l1: 405826
[174]	valid_0's l1: 405713
[175]	valid_0's l1: 405667
[176]	valid_0's l1: 405611
[177]	valid_0's l1: 405558
[178]	valid_0's l1: 405458
[179]	valid_0's l1: 405397
[



Fitting 0...
[1]	valid_0's l1: 1.44731e+06
Training until validation scores don't improve for 15 rounds
[2]	valid_0's l1: 1.33058e+06
[3]	valid_0's l1: 1.22539e+06
[4]	valid_0's l1: 1.13286e+06
[5]	valid_0's l1: 1.0513e+06
[6]	valid_0's l1: 980410
[7]	valid_0's l1: 918010
[8]	valid_0's l1: 862860
[9]	valid_0's l1: 814898
[10]	valid_0's l1: 771434
[11]	valid_0's l1: 733538
[12]	valid_0's l1: 700663
[13]	valid_0's l1: 670954
[14]	valid_0's l1: 645196
[15]	valid_0's l1: 622879
[16]	valid_0's l1: 602385
[17]	valid_0's l1: 586202
[18]	valid_0's l1: 570890
[19]	valid_0's l1: 557635
[20]	valid_0's l1: 546313
[21]	valid_0's l1: 536371
[22]	valid_0's l1: 527314
[23]	valid_0's l1: 519289
[24]	valid_0's l1: 512412
[25]	valid_0's l1: 506046
[26]	valid_0's l1: 501095
[27]	valid_0's l1: 496705
[28]	valid_0's l1: 492375
[29]	valid_0's l1: 488788
[30]	valid_0's l1: 485789
[31]	valid_0's l1: 482926
[32]	valid_0's l1: 480141
[33]	valid_0's l1: 478288
[34]	valid_0's l1: 476202
[35]	valid_0's l1: 474337
[

[305]	valid_0's l1: 420230
[306]	valid_0's l1: 420206
[307]	valid_0's l1: 420172
[308]	valid_0's l1: 420177
[309]	valid_0's l1: 420138
[310]	valid_0's l1: 420128
[311]	valid_0's l1: 420060
[312]	valid_0's l1: 420041
[313]	valid_0's l1: 420051
[314]	valid_0's l1: 419981
[315]	valid_0's l1: 419946
[316]	valid_0's l1: 419966
[317]	valid_0's l1: 419906
[318]	valid_0's l1: 419826
[319]	valid_0's l1: 419803
[320]	valid_0's l1: 419652
[321]	valid_0's l1: 419730
[322]	valid_0's l1: 419675
[323]	valid_0's l1: 419570
[324]	valid_0's l1: 419441
[325]	valid_0's l1: 419494
[326]	valid_0's l1: 419466
[327]	valid_0's l1: 419496
[328]	valid_0's l1: 419378
[329]	valid_0's l1: 419416
[330]	valid_0's l1: 419368
[331]	valid_0's l1: 419281
[332]	valid_0's l1: 419318
[333]	valid_0's l1: 419384
[334]	valid_0's l1: 419306
[335]	valid_0's l1: 419156
[336]	valid_0's l1: 419014
[337]	valid_0's l1: 418950
[338]	valid_0's l1: 418979
[339]	valid_0's l1: 418902
[340]	valid_0's l1: 418614
[341]	valid_0's l1: 418527
[

[612]	valid_0's l1: 412577
[613]	valid_0's l1: 412487
[614]	valid_0's l1: 412463
[615]	valid_0's l1: 412427
[616]	valid_0's l1: 412386
[617]	valid_0's l1: 412510
[618]	valid_0's l1: 412507
[619]	valid_0's l1: 412518
[620]	valid_0's l1: 412502
[621]	valid_0's l1: 412533
[622]	valid_0's l1: 412535
[623]	valid_0's l1: 412519
Early stopping, best iteration is:
[608]	valid_0's l1: 412380
Fitting 1...
[0]	validation_0-mae:2.15129e+06
Will train until validation_0-mae hasn't improved in 10 rounds.
[1]	validation_0-mae:1.85767e+06
[2]	validation_0-mae:1.6063e+06
[3]	validation_0-mae:1.39283e+06
[4]	validation_0-mae:1.21179e+06
[5]	validation_0-mae:1.06091e+06
[6]	validation_0-mae:936484
[7]	validation_0-mae:831757
[8]	validation_0-mae:747062
[9]	validation_0-mae:675933
[10]	validation_0-mae:618525
[11]	validation_0-mae:574060
[12]	validation_0-mae:537244
[13]	validation_0-mae:508150
[14]	validation_0-mae:485526
[15]	validation_0-mae:468176
[16]	validation_0-mae:455240
[17]	validation_0-mae:444

In [56]:
df_test_s = df_test_f.copy()
df_test_s['target'] = blend.predict(df_test_s.values)
# df_test_s = utils.pesificar_df(df_test_s, col_precio_in='target', col_precio_out='target')
df_test_s[['id', 'target']].to_csv('respuesta56.csv', index = False)

In [14]:
df_test_f['precio']

0        6350000.0
1         650000.0
2        1790000.0
3        2323500.0
4        5600000.0
           ...    
70886    4000000.0
70887     450000.0
70888    3400000.0
70889     368790.0
70890    2220000.0
Name: precio, Length: 70891, dtype: float64