In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from hyperopt import fmin, tpe, hp
import ipynb.fs.full.utils as utils
from ipynb.fs.full.features import features_independientes_precio, features_dependientes_precio

# Labelled training data.
df_train = pd.read_csv('./data/train.csv')

# Carve a local holdout out of the labelled data (test_size=0.15) and rebind df_train
# to the first value returned by the helper.
# NOTE(review): presumably (train part, holdout part) — confirm against utils.dividir_df_testeo.
df_train, df_test = utils.dividir_df_testeo(df_train, test_size=0.15)

# For use with the Kaggle submission
df_eval = pd.read_csv('./data/test.csv')

In [2]:
# Raw input columns ('lat' / 'lng' are commented out of this list).
features_base = ['antiguedad', 'habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 
#                  'lat', 'lng',
       'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas', 'centroscomercialescercanos']

# Fixed seed so the shuffles below produce the same row order on every
# Restart & Run All instead of a new permutation per run.
SHUFFLE_SEED = 42

df_train = df_train.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Holdout frame: price-independent features first, then the price-dependent ones,
# which receive df_train as their second argument.
df_test_f = features_independientes_precio(df_test)
df_test_f = features_dependientes_precio(df_test_f, df_train)

# Training frame goes through the same two steps.
# NOTE(review): here the price-dependent features are derived from the very rows they
# are attached to — confirm this does not leak the target into the training features.
df_train_f = features_independientes_precio(df_train)
df_train_f = features_dependientes_precio(df_train_f, df_train)

df_train_f = df_train_f.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

## Búsqueda de hiperparámetros

In [33]:
# Engineered columns considered during the hyperparameter search.
features_test = [
    'prop_frecuente', 'top_provincia', 'porcentaje_metros', 'diferencia_metros',
    'promedio_precio_ciudad', 'promedio_por_mes', 'anio', 'promedio_id_zona',
    'promedio_precio_tipo_propiedad', 'promedio_precio_habitaciones',
]

# Raw columns ('lat' / 'lng' stay disabled for the search) followed by the engineered ones.
features = [
    'antiguedad', 'habitaciones', 'garages', 'banos',
    'metroscubiertos', 'metrostotales',
    # 'lat', 'lng',
    'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas', 'centroscomercialescercanos',
] + features_test


def eval_lightgbm(args):
    """Hyperopt objective: train one LightGBM model and score it on df_test_f.

    Parameters
    ----------
    args : sequence of 7 numbers
        In order: num_leaves, learning_rate, feature_fraction, bagging_fraction,
        bagging_freq, max_depth, test_size. num_leaves, bagging_freq and max_depth
        are truncated to int before being handed to LightGBM.

    Returns
    -------
    The value of utils.MAE computed between df_test_f['precio'] and the model's
    predictions for df_test_f.

    Notes
    -----
    Reads the notebook globals df_train_f, df_test_f and features.
    NOTE(review): the returned score comes from df_test_f, not from the internal
    validation split, so the search tunes hyperparameters against the holdout —
    confirm this is intended.
    """
    (leaves, lr, feat_frac, bag_frac, bag_freq, depth, holdout_frac) = args

    # Fresh shuffle on every evaluation before carving out the validation split.
    shuffled = df_train_f.sample(frac=1).reset_index(drop=True)
    x_fit, x_val, y_fit, y_val = utils.dividir_dataset(
        shuffled, 'precio', features, test_size=holdout_frac
    )

    train_set = lgb.Dataset(x_fit, y_fit)
    valid_set = lgb.Dataset(x_val, y_val, reference=train_set)

    booster_params = dict(
        boosting_type='gbdt',
        objective='regression',
        metric={'mae'},  # if left empty, the metric matching 'objective' is used
        num_leaves=int(leaves),
        learning_rate=lr,
        feature_fraction=feat_frac,
        bagging_fraction=bag_frac,
        bagging_freq=int(bag_freq),
        max_depth=int(depth),
        verbose=-1,
    )

    # The validation split is only used for early stopping.
    booster = lgb.train(
        booster_params,
        train_set,
        valid_sets=valid_set,
        num_boost_round=150,
        early_stopping_rounds=15,
        verbose_eval=-1,
    )

    holdout_inputs = utils.filtrar_features(df_test_f, features)
    holdout_pred = booster.predict(holdout_inputs, num_iteration=booster.best_iteration)
    return utils.MAE(df_test_f['precio'].values, holdout_pred)

# Search space. The entries are consumed positionally, so their order must match the
# tuple unpacking on the first line of eval_lightgbm. The quniform entries come back as
# floats (see the displayed result) and are cast to int inside eval_lightgbm.
space = [hp.quniform('num_leaves', 30, 130, 1), hp.uniform('learning_rate', 0.05, 0.9),
        hp.uniform('feature_fraction', 0.10, 0.90), hp.uniform('bagging_fraction', 0.10, 0.90),
        hp.quniform('bagging_freq', 1, 130, 1), hp.quniform('max_depth', 1, 20, 1),
        hp.uniform('test_size', 0.01, 0.8)]

# Minimise eval_lightgbm with the TPE algorithm over 100 evaluations.
hps = fmin(eval_lightgbm, space=space, algo=tpe.suggest, max_evals=100, verbose=1)

# Show the best hyperparameter assignment found.
display(hps)

Training until validation scores don't improve for 15 rounds
Early stopping, best iteration is:                   
[51]	valid_0's l1: 593738
Training until validation scores don't improve for 15 rounds                 
Did not meet early stopping. Best iteration is:                              
[150]	valid_0's l1: 545767
Training until validation scores don't improve for 15 rounds                 
Did not meet early stopping. Best iteration is:                              
[142]	valid_0's l1: 545367
Training until validation scores don't improve for 15 rounds                 
Early stopping, best iteration is:                                           
[72]	valid_0's l1: 557767
Training until validation scores don't improve for 15 rounds                 
Did not meet early stopping. Best iteration is:                              
[142]	valid_0's l1: 580397
Training until validation scores don't improve for 15 rounds                 
Did not meet early stopping. Best iteration is:   

Early stopping, best iteration is:                                            
[66]	valid_0's l1: 561626
Training until validation scores don't improve for 15 rounds                  
Did not meet early stopping. Best iteration is:                               
[150]	valid_0's l1: 544072
Training until validation scores don't improve for 15 rounds                  
Did not meet early stopping. Best iteration is:                               
[150]	valid_0's l1: 584510
Training until validation scores don't improve for 15 rounds                  
Did not meet early stopping. Best iteration is:                               
[146]	valid_0's l1: 538862
Training until validation scores don't improve for 15 rounds                  
Did not meet early stopping. Best iteration is:                               
[150]	valid_0's l1: 547875
Training until validation scores don't improve for 15 rounds                  
Did not meet early stopping. Best iteration is:                             

{'bagging_fraction': 0.867964514027621,
 'bagging_freq': 69.0,
 'feature_fraction': 0.8985719006599164,
 'learning_rate': 0.15007007911553005,
 'max_depth': 9.0,
 'num_leaves': 105.0,
 'test_size': 0.30442962737370866}

## Evaluación de features

In [None]:
# Raw columns only: the evaluation loop adds one engineered candidate at a time on top.
features = [
    'antiguedad', 'habitaciones', 'garages', 'banos',
    'metroscubiertos', 'metrostotales',
    # 'lat', 'lng',
    'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas', 'centroscomercialescercanos',
]

# Wider candidate list, currently disabled. While it stays commented out, features_test
# keeps whatever value an earlier cell assigned to it.
# features_test = ['tipo_propiedad_compartida',
#                  'prop_frecuente', 'top_provincia', 'es_ciudad_centrica',
#                  'promedio_metros_totales_provincia',
#                  'promedio_metros_cubiertos_provincia', 'porcentaje_metros',
#                  'diferencia_metros', 'delincuencia', 'turismo',
#                  'promedio_precio_ciudad', 'promedio_id_zona',
#                  'promedio_precio_tipo_propiedad',
#                  'promedio_por_mes', 'promedio_precio_habitaciones',
#                  'promedio_precio_habitaciones_banos_garages',
#                  'promedio_precio_banos_garages', 'promedio_precio_booleanos',
#                  'metros_totales_normalizados', 'metros_cubiertos_normalizados', 'anio']

# Hyperparameters as displayed by the hyperopt search cell.
hps = dict(
    bagging_fraction=0.867964514027621,
    bagging_freq=69.0,
    feature_fraction=0.8985719006599164,
    learning_rate=0.15007007911553005,
    max_depth=9.0,
    num_leaves=105.0,
    test_size=0.30442962737370866,
)

# Continuous hyperparameters are used as-is.
bagging_fraction, feature_fraction, learning_rate, test_size = (
    hps[key] for key in ('bagging_fraction', 'feature_fraction', 'learning_rate', 'test_size')
)
# The search stores these three as floats; they are cast to int before use.
bagging_freq, num_leaves, max_depth = (
    int(hps[key]) for key in ('bagging_freq', 'num_leaves', 'max_depth')
)

params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric={'mae'},  # if left empty, the metric matching 'objective' is used
    num_leaves=num_leaves,
    learning_rate=learning_rate,
    feature_fraction=feature_fraction,
    bagging_fraction=bagging_fraction,
    bagging_freq=bagging_freq,
    max_depth=max_depth,
    verbose=0,
)


# Compare every candidate in features_test against a baseline trained on `features` only.
# The first pass (candidate is None) is the baseline; its three MAE values are stored and
# every later pass prints its differences against them.
# NOTE(review): each pass calls utils.dividir_dataset again; if that helper splits
# randomly, part of the reported differences is split noise — confirm.
base_train = 0
base_test = 0
base_eval = 0
for candidate in [None] + features_test:
    # A real None sentinel (compared with `is`) replaces the former 'None' string, which
    # was compared by identity against a literal and passed on as a column name.
    is_baseline = candidate is None
    columns = features if is_baseline else features + [candidate]

    x_train, x_test, y_train, y_test = utils.dividir_dataset(df_train_f, 'precio', columns, test_size=test_size)

    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)

    gbm = lgb.train(params,
                    lgb_train,
                    valid_sets=lgb_eval,
                    num_boost_round=1000,
                    early_stopping_rounds=15,
                    verbose_eval=-1)

    # Same prediction call for all three sets so the MAE values are directly comparable.
    y_pred_test = gbm.predict(x_test, num_iteration=gbm.best_iteration)
    y_pred_train = gbm.predict(x_train, num_iteration=gbm.best_iteration)
    y_pred_eval = gbm.predict(utils.filtrar_features(df_test_f, columns), num_iteration=gbm.best_iteration)

    gbm_mae_train = utils.MAE(y_train, y_pred_train)
    gbm_mae = utils.MAE(y_test, y_pred_test)
    gbm_mae_eval = utils.MAE(df_test_f['precio'].values, y_pred_eval)

    print(f"MAE LightGBM (train): {gbm_mae_train:.5f}")
    print(f"MAE LightGBM (test): {gbm_mae:.5f}")
    print(f"MAE LightGBM (eval): {gbm_mae_eval:.5f}")
    if is_baseline:
        # Remember the baseline scores for the comparisons printed on later passes.
        base_train = gbm_mae_train
        base_test = gbm_mae
        base_eval = gbm_mae_eval
    else:
        print(f"Overfitting (base_eval - base_test) - (eval - test) - {candidate}: {(base_eval - base_test) - (gbm_mae_eval - gbm_mae)}")
        print(f"Diff evaluation (base_eval - eval)                  - {candidate}: {base_eval - gbm_mae_eval}")
        print(f"Diff train (base_train - train)                     - {candidate}: {base_train - gbm_mae_train}")

In [28]:
# Inspect the columns of the engineered training frame.
df_train_f.columns

Index(['id', 'titulo', 'descripcion', 'tipodepropiedad', 'direccion', 'ciudad',
       'provincia', 'antiguedad', 'habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 'idzona', 'lat', 'lng', 'fecha',
       'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas',
       'centroscomercialescercanos', 'precio', 'porcentaje_metros',
       'diferencia_metros', 'intervalo_metros_totales',
       'intervalo_metros_cubiertos', 'escomercial',
       'promedio_metros_tipo_propiedad', 'promedio_metros_cub_tipo_propiedad',
       'tipo_propiedad_compartida', 'prop_frecuente', 'zona', 'top_provincia',
       'es_ciudad_centrica', 'promedio_metros_totales_provincia',
       'promedio_metros_cubiertos_provincia', 'anio', 'mes', 'dia',
       'trimestre', 'escualas_centros_cercanos', 'delincuencia', 'turismo',
       'es_antigua', 'cantidad_inquilinos', 'metros_totales_normalizados',
       'metros_cubiertos_normalizados', 'promedio_precio_ciudad',
       'varianza_preci

## Evaluación del modelo final

In [5]:
# Engineered columns used by the final model.
features_test = [
    'prop_frecuente', 'top_provincia', 'porcentaje_metros', 'diferencia_metros',
    'promedio_precio_ciudad', 'promedio_por_mes', 'anio', 'promedio_id_zona',
    'promedio_precio_tipo_propiedad', 'promedio_precio_habitaciones',
    'count_idzona', 'count_ciudad',
]

# Reduced alternative kept for quick experiments:
# features_test = ['promedio_precio_ciudad',
#                 'promedio_id_zona']

# Raw columns (this time including 'lat' / 'lng') followed by the engineered ones.
features = [
    'antiguedad', 'habitaciones', 'garages', 'banos',
    'metroscubiertos', 'metrostotales', 'lat', 'lng',
    'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas', 'centroscomercialescercanos',
] + features_test


# Hyperparameters chosen for the final model.
hps = dict(
    bagging_fraction=0.7514713723096431,
    bagging_freq=97.0,
    feature_fraction=0.7403311365563734,
    learning_rate=0.11573281219786402,
    max_depth=13.0,
    num_leaves=87.0,
    test_size=0.12116948662473831,
)

# Continuous hyperparameters are used as-is.
bagging_fraction, feature_fraction, learning_rate, test_size = (
    hps[key] for key in ('bagging_fraction', 'feature_fraction', 'learning_rate', 'test_size')
)
# Stored as floats; cast to int before being placed in params.
bagging_freq, num_leaves, max_depth = (
    int(hps[key]) for key in ('bagging_freq', 'num_leaves', 'max_depth')
)

params = dict(
    boosting_type='dart',
    objective='regression',
    metric={'mae'},  # if left empty, the metric matching 'objective' is used
    num_leaves=num_leaves,
    learning_rate=learning_rate,
    feature_fraction=feature_fraction,
    bagging_fraction=bagging_fraction,
    bagging_freq=bagging_freq,
    max_depth=max_depth,
    verbose=0,
)

# Split the engineered training frame into fit / validation parts using the tuned test_size.
x_train, x_test, y_train, y_test = utils.dividir_dataset(df_train_f, 'precio', features, test_size=test_size)

lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)

# NOTE(review): params sets boosting_type='dart'. LightGBM documents early stopping as not
# available in dart mode, so early_stopping_rounds and gbm.best_iteration may have no
# effect here — confirm against the installed LightGBM version.
gbm = lgb.train(params,
                lgb_train,
                valid_sets=lgb_eval,
                num_boost_round=800,
                early_stopping_rounds=15,
                verbose_eval=1)

# Predictions for the validation split, the fit split and the local holdout frame.
y_pred_test = gbm.predict(x_test, num_iteration=gbm.best_iteration)
y_pred_train = gbm.predict(x_train, num_iteration=gbm.best_iteration)
y_pred_eval = gbm.predict(utils.filtrar_features(df_test_f, features), num_iteration=gbm.best_iteration)

# Adds a 'target' column to df_test_f in place.
df_test_f['target'] = y_pred_eval
# df_test_f = utils.pesificar_df(df_test_f, 'target', 'target')

gbm_mae_train = utils.MAE(y_train, y_pred_train)
gbm_mae = utils.MAE(y_test, y_pred_test)
gbm_mae_eval = utils.MAE(df_test_f['precio'].values, df_test_f['target'].values)

print(f"MAE LightGBM (train): {gbm_mae_train:.5f}")
print(f"MAE LightGBM (test): {gbm_mae:.5f}")
print(f"MAE LightGBM (eval): {gbm_mae_eval:.5f}")

[1]	valid_0's l1: 1.46581e+06
[2]	valid_0's l1: 1.34665e+06




[3]	valid_0's l1: 1.23877e+06
[4]	valid_0's l1: 1.15094e+06
[5]	valid_0's l1: 1.09075e+06
[6]	valid_0's l1: 1.01339e+06
[7]	valid_0's l1: 953840
[8]	valid_0's l1: 986125
[9]	valid_0's l1: 924442
[10]	valid_0's l1: 873220
[11]	valid_0's l1: 832246
[12]	valid_0's l1: 845712
[13]	valid_0's l1: 804607
[14]	valid_0's l1: 769391
[15]	valid_0's l1: 740420
[16]	valid_0's l1: 715359
[17]	valid_0's l1: 694053
[18]	valid_0's l1: 676126
[19]	valid_0's l1: 660209
[20]	valid_0's l1: 646051
[21]	valid_0's l1: 653267
[22]	valid_0's l1: 640843
[23]	valid_0's l1: 629604
[24]	valid_0's l1: 619565
[25]	valid_0's l1: 611052
[26]	valid_0's l1: 603886
[27]	valid_0's l1: 597716
[28]	valid_0's l1: 602207
[29]	valid_0's l1: 596732
[30]	valid_0's l1: 591387
[31]	valid_0's l1: 595342
[32]	valid_0's l1: 590316
[33]	valid_0's l1: 584243
[34]	valid_0's l1: 579520
[35]	valid_0's l1: 583492
[36]	valid_0's l1: 593528
[37]	valid_0's l1: 587922
[38]	valid_0's l1: 582813
[39]	valid_0's l1: 577758
[40]	valid_0's l1: 579868

[314]	valid_0's l1: 514518
[315]	valid_0's l1: 515030
[316]	valid_0's l1: 514604
[317]	valid_0's l1: 514190
[318]	valid_0's l1: 513855
[319]	valid_0's l1: 514089
[320]	valid_0's l1: 513801
[321]	valid_0's l1: 514040
[322]	valid_0's l1: 513542
[323]	valid_0's l1: 513291
[324]	valid_0's l1: 513465
[325]	valid_0's l1: 513233
[326]	valid_0's l1: 513374
[327]	valid_0's l1: 513145
[328]	valid_0's l1: 513356
[329]	valid_0's l1: 513080
[330]	valid_0's l1: 513221
[331]	valid_0's l1: 513429
[332]	valid_0's l1: 513874
[333]	valid_0's l1: 514365
[334]	valid_0's l1: 514915
[335]	valid_0's l1: 513937
[336]	valid_0's l1: 514396
[337]	valid_0's l1: 513703
[338]	valid_0's l1: 513403
[339]	valid_0's l1: 513709
[340]	valid_0's l1: 514142
[341]	valid_0's l1: 513682
[342]	valid_0's l1: 513226
[343]	valid_0's l1: 512923
[344]	valid_0's l1: 513145
[345]	valid_0's l1: 512901
[346]	valid_0's l1: 512620
[347]	valid_0's l1: 512825
[348]	valid_0's l1: 512760
[349]	valid_0's l1: 512714
[350]	valid_0's l1: 512865
[

[619]	valid_0's l1: 502114
[620]	valid_0's l1: 502044
[621]	valid_0's l1: 501934
[622]	valid_0's l1: 501270
[623]	valid_0's l1: 501106
[624]	valid_0's l1: 501107
[625]	valid_0's l1: 500973
[626]	valid_0's l1: 500921
[627]	valid_0's l1: 500872
[628]	valid_0's l1: 500844
[629]	valid_0's l1: 500793
[630]	valid_0's l1: 500764
[631]	valid_0's l1: 500759
[632]	valid_0's l1: 500758
[633]	valid_0's l1: 500722
[634]	valid_0's l1: 500716
[635]	valid_0's l1: 500788
[636]	valid_0's l1: 500774
[637]	valid_0's l1: 500873
[638]	valid_0's l1: 500833
[639]	valid_0's l1: 500796
[640]	valid_0's l1: 500748
[641]	valid_0's l1: 500711
[642]	valid_0's l1: 500672
[643]	valid_0's l1: 500682
[644]	valid_0's l1: 500602
[645]	valid_0's l1: 500611
[646]	valid_0's l1: 500464
[647]	valid_0's l1: 500469
[648]	valid_0's l1: 500410
[649]	valid_0's l1: 499734
[650]	valid_0's l1: 499726
[651]	valid_0's l1: 499694
[652]	valid_0's l1: 499650
[653]	valid_0's l1: 499642
[654]	valid_0's l1: 499667
[655]	valid_0's l1: 499832
[

In [13]:
# Check which feature columns actually reached the model.
x_train.columns

Index(['antiguedad', 'habitaciones', 'garages', 'banos', 'metroscubiertos',
       'metrostotales', 'lat', 'lng', 'gimnasio', 'usosmultiples', 'piscina',
       'escuelascercanas', 'centroscomercialescercanos', 'porcentaje_metros',
       'diferencia_metros', 'prop_frecuente', 'top_provincia', 'anio',
       'promedio_precio_ciudad', 'promedio_id_zona',
       'promedio_precio_tipo_propiedad', 'promedio_por_mes',
       'promedio_precio_habitaciones'],
      dtype='object')

## Evaluación del df_test completo

In [7]:
# Reload the labelled data; no local holdout is carved out in this cell.
# NOTE(review): this rebinds df_train, df_test, df_test_f and df_train_f, so earlier cells
# that read df_test_f['precio'] should not be re-run after this one — confirm.
df_train = pd.read_csv('./data/train.csv')

# For use with the Kaggle submission
df_test = pd.read_csv('./data/test.csv')

# Build the feature frames; the price-dependent step receives df_train as second argument.
df_test_f = features_independientes_precio(df_test)
df_test_f = features_dependientes_precio(df_test_f, df_train)

df_train_f = features_independientes_precio(df_train)
df_train_f = features_dependientes_precio(df_train_f, df_train)

# Engineered columns used for the submission model.
features_test = [
    'prop_frecuente', 'top_provincia', 'porcentaje_metros', 'diferencia_metros',
    'promedio_precio_ciudad', 'promedio_por_mes', 'anio', 'promedio_id_zona',
    'promedio_precio_tipo_propiedad', 'promedio_precio_habitaciones',
    'count_idzona', 'count_ciudad',
]

# Raw columns (including 'lat' / 'lng') followed by the engineered ones.
features = [
    'antiguedad', 'habitaciones', 'garages', 'banos',
    'metroscubiertos', 'metrostotales', 'lat', 'lng',
    'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas', 'centroscomercialescercanos',
] + features_test


# Hyperparameters for the submission model.
hps = dict(
    bagging_fraction=0.7514713723096431,
    bagging_freq=97.0,
    feature_fraction=0.7403311365563734,
    learning_rate=0.11573281219786402,
    max_depth=13.0,
    num_leaves=87.0,
    test_size=0.12116948662473831,
)

# Continuous values are taken verbatim from hps.
bagging_fraction, feature_fraction, learning_rate, test_size = (
    hps[key] for key in ('bagging_fraction', 'feature_fraction', 'learning_rate', 'test_size')
)
# Float-valued entries that are cast to int before going into params.
bagging_freq, num_leaves, max_depth = (
    int(hps[key]) for key in ('bagging_freq', 'num_leaves', 'max_depth')
)

params = dict(
    boosting_type='dart',
    objective='regression',
    metric={'mae'},  # if left empty, the metric matching 'objective' is used
    num_leaves=num_leaves,
    learning_rate=learning_rate,
    feature_fraction=feature_fraction,
    bagging_fraction=bagging_fraction,
    bagging_freq=bagging_freq,
    max_depth=max_depth,
    verbose=0,
)

# A validation split is still carved out of the full training frame to monitor the metric.
x_train, x_test, y_train, y_test = utils.dividir_dataset(df_train_f, 'precio', features, test_size=test_size)

lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)


# NOTE(review): params sets boosting_type='dart'. LightGBM documents early stopping as not
# available in dart mode, so early_stopping_rounds and gbm.best_iteration may have no
# effect here — confirm against the installed LightGBM version.
gbm = lgb.train(params,
                lgb_train,
                valid_sets=lgb_eval,
                num_boost_round=1000,
                early_stopping_rounds=15,
                verbose_eval=1)

# NOTE(review): the first two predictions request raw_score=True while the third does not;
# presumably equivalent for the plain 'regression' objective — confirm.
y_pred_test = gbm.predict(x_test, raw_score=True, num_iteration=gbm.best_iteration)
y_pred_train = gbm.predict(x_train, raw_score=True, num_iteration=gbm.best_iteration)
y_pred_eval = gbm.predict(utils.filtrar_features(df_test_f, features), num_iteration=gbm.best_iteration)

gbm_mae_train = utils.MAE(y_train, y_pred_train)
gbm_mae = utils.MAE(y_test, y_pred_test)

print(f"MAE LightGBM (train): {gbm_mae_train:.5f}")
print(f"MAE LightGBM (test): {gbm_mae:.5f}")

[1]	valid_0's l1: 1.44703e+06
[2]	valid_0's l1: 1.32944e+06
[3]	valid_0's l1: 1.22213e+06
[4]	valid_0's l1: 1.13533e+06
[5]	valid_0's l1: 1.07334e+06
[6]	valid_0's l1: 997849
[7]	valid_0's l1: 939953




[8]	valid_0's l1: 971411
[9]	valid_0's l1: 911518
[10]	valid_0's l1: 860994
[11]	valid_0's l1: 820270
[12]	valid_0's l1: 834268
[13]	valid_0's l1: 794846
[14]	valid_0's l1: 761189
[15]	valid_0's l1: 732508
[16]	valid_0's l1: 708370
[17]	valid_0's l1: 687280
[18]	valid_0's l1: 669390
[19]	valid_0's l1: 653973
[20]	valid_0's l1: 639949
[21]	valid_0's l1: 647417
[22]	valid_0's l1: 634618
[23]	valid_0's l1: 623200
[24]	valid_0's l1: 613223
[25]	valid_0's l1: 604640
[26]	valid_0's l1: 597160
[27]	valid_0's l1: 590596
[28]	valid_0's l1: 595211
[29]	valid_0's l1: 589742
[30]	valid_0's l1: 584232
[31]	valid_0's l1: 587937
[32]	valid_0's l1: 582633
[33]	valid_0's l1: 577463
[34]	valid_0's l1: 572653
[35]	valid_0's l1: 576690
[36]	valid_0's l1: 586336
[37]	valid_0's l1: 580867
[38]	valid_0's l1: 575314
[39]	valid_0's l1: 570526
[40]	valid_0's l1: 572429
[41]	valid_0's l1: 578628
[42]	valid_0's l1: 573102
[43]	valid_0's l1: 577375
[44]	valid_0's l1: 571779
[45]	valid_0's l1: 567056
[46]	valid_0's

[315]	valid_0's l1: 509787
[316]	valid_0's l1: 509102
[317]	valid_0's l1: 508654
[318]	valid_0's l1: 508359
[319]	valid_0's l1: 508598
[320]	valid_0's l1: 508197
[321]	valid_0's l1: 508404
[322]	valid_0's l1: 508084
[323]	valid_0's l1: 507950
[324]	valid_0's l1: 508142
[325]	valid_0's l1: 507849
[326]	valid_0's l1: 507971
[327]	valid_0's l1: 507693
[328]	valid_0's l1: 507849
[329]	valid_0's l1: 507563
[330]	valid_0's l1: 507712
[331]	valid_0's l1: 507917
[332]	valid_0's l1: 508465
[333]	valid_0's l1: 508945
[334]	valid_0's l1: 509499
[335]	valid_0's l1: 508784
[336]	valid_0's l1: 509268
[337]	valid_0's l1: 508500
[338]	valid_0's l1: 508206
[339]	valid_0's l1: 508547
[340]	valid_0's l1: 508967
[341]	valid_0's l1: 508419
[342]	valid_0's l1: 507977
[343]	valid_0's l1: 507444
[344]	valid_0's l1: 507674
[345]	valid_0's l1: 507317
[346]	valid_0's l1: 507212
[347]	valid_0's l1: 507419
[348]	valid_0's l1: 507322
[349]	valid_0's l1: 507169
[350]	valid_0's l1: 507330
[351]	valid_0's l1: 507070
[

[621]	valid_0's l1: 495531
[622]	valid_0's l1: 494903
[623]	valid_0's l1: 494861
[624]	valid_0's l1: 494872
[625]	valid_0's l1: 494905
[626]	valid_0's l1: 494905
[627]	valid_0's l1: 494902
[628]	valid_0's l1: 494856
[629]	valid_0's l1: 494830
[630]	valid_0's l1: 494825
[631]	valid_0's l1: 494729
[632]	valid_0's l1: 494679
[633]	valid_0's l1: 494644
[634]	valid_0's l1: 494634
[635]	valid_0's l1: 494657
[636]	valid_0's l1: 494662
[637]	valid_0's l1: 494740
[638]	valid_0's l1: 494718
[639]	valid_0's l1: 494701
[640]	valid_0's l1: 494726
[641]	valid_0's l1: 494712
[642]	valid_0's l1: 494685
[643]	valid_0's l1: 494706
[644]	valid_0's l1: 494658
[645]	valid_0's l1: 494703
[646]	valid_0's l1: 494736
[647]	valid_0's l1: 494658
[648]	valid_0's l1: 494504
[649]	valid_0's l1: 493901
[650]	valid_0's l1: 493901
[651]	valid_0's l1: 493886
[652]	valid_0's l1: 493840
[653]	valid_0's l1: 493849
[654]	valid_0's l1: 493879
[655]	valid_0's l1: 493961
[656]	valid_0's l1: 493990
[657]	valid_0's l1: 494052
[

[927]	valid_0's l1: 488932
[928]	valid_0's l1: 488947
[929]	valid_0's l1: 488889
[930]	valid_0's l1: 488985
[931]	valid_0's l1: 488958
[932]	valid_0's l1: 488953
[933]	valid_0's l1: 489002
[934]	valid_0's l1: 489008
[935]	valid_0's l1: 489131
[936]	valid_0's l1: 489070
[937]	valid_0's l1: 489021
[938]	valid_0's l1: 489115
[939]	valid_0's l1: 489122
[940]	valid_0's l1: 489170
[941]	valid_0's l1: 488679
[942]	valid_0's l1: 488771
[943]	valid_0's l1: 488752
[944]	valid_0's l1: 488820
[945]	valid_0's l1: 488789
[946]	valid_0's l1: 488747
[947]	valid_0's l1: 488796
[948]	valid_0's l1: 488903
[949]	valid_0's l1: 488939
[950]	valid_0's l1: 488939
[951]	valid_0's l1: 488921
[952]	valid_0's l1: 488855
[953]	valid_0's l1: 488378
[954]	valid_0's l1: 488369
[955]	valid_0's l1: 488403
[956]	valid_0's l1: 488422
[957]	valid_0's l1: 488504
[958]	valid_0's l1: 488466
[959]	valid_0's l1: 488450
[960]	valid_0's l1: 488438
[961]	valid_0's l1: 488442
[962]	valid_0's l1: 488326
[963]	valid_0's l1: 488264
[

In [8]:
# Predict for the Kaggle frame (same call as at the end of the training cell), attach the
# result as a 'target' column in place, and write the id/target pairs to the submission file.
y_pred_eval = gbm.predict(utils.filtrar_features(df_test_f, features), num_iteration=gbm.best_iteration)
df_test_f['target'] = y_pred_eval
df_test_f[['id', 'target']].to_csv('respuesta9.csv', index=False)

In [9]:
# Check which feature columns the submission model was trained on.
x_train.columns

Index(['antiguedad', 'habitaciones', 'garages', 'banos', 'metroscubiertos',
       'metrostotales', 'lat', 'lng', 'gimnasio', 'usosmultiples', 'piscina',
       'escuelascercanas', 'centroscomercialescercanos', 'porcentaje_metros',
       'diferencia_metros', 'prop_frecuente', 'top_provincia', 'anio',
       'promedio_precio_ciudad', 'count_ciudad', 'promedio_id_zona',
       'count_idzona', 'promedio_precio_tipo_propiedad', 'promedio_por_mes',
       'promedio_precio_habitaciones'],
      dtype='object')

In [10]:
# Size of the validation split (rows, feature columns).
x_test.shape

(29081, 25)

In [11]:
# Size of the engineered Kaggle frame (rows, columns).
df_test_f.shape

(60000, 62)

In [None]:
# Columns that utils.filtrar_features keeps from df_test_f for the current feature list.
utils.filtrar_features(df_test_f, features).columns