In [15]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from hyperopt import fmin, tpe, hp
import ipynb.fs.full.utils as utils
from ipynb.fs.full.features import features_independientes_precio, features_dependientes_precio

df_train = pd.read_csv('./data/train.csv')

# df_train = df_train.sample(frac=1).reset_index(drop=True)

df_train, df_test = utils.dividir_df_testeo(df_train, test_size=0.15)

# Para usarse con el submit a Kaggle
df_eval = pd.read_csv('./data/test.csv')

In [16]:
features_base = ['antiguedad', 'habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 
#                  'lat', 'lng',
       'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas', 'centroscomercialescercanos']

# df_train = df_train.sample(frac=1).reset_index(drop=True)

df_test_f = features_independientes_precio(df_test)
df_test_f = features_dependientes_precio(df_test_f, df_train)

df_train_f = features_independientes_precio(df_train)
df_train_f = features_dependientes_precio(df_train_f, df_train)

# df_train_f = df_train_f.sample(frac=1).reset_index(drop=True)

## Búsqueda hiperparámetros

In [17]:
features = ['antiguedad', 'habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 
#             'lat', 'lng',
       'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas', 'centroscomercialescercanos']

features_test = ['prop_frecuente', 'top_provincia', 'porcentaje_metros', 'diferencia_metros', 'promedio_precio_ciudad', 
                 'promedio_por_mes', 'anio', 'promedio_id_zona', 'promedio_precio_tipo_propiedad', 
                 'promedio_precio_habitaciones', 'count_idzona', 'count_ciudad', 'puntaje', 
                 'count_tipodepropiedad', 'count_tipodepropiedad_ciudad']

features += features_test


def eval_lightgbm(args):
    num_leaves, learning_rate, feature_fraction, bagging_fraction, bagging_freq, max_depth, test_size = args
    
    df_train_h = df_train_f.sample(frac=1).reset_index(drop=True)
    
    x_train, x_test, y_train, y_test = utils.dividir_dataset(df_train_h, 'precio', features, test_size=test_size)

    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)
    
    num_leaves = int(num_leaves)
    bagging_freq = int(bagging_freq)
    max_depth = int(max_depth)
    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'mae'}, # Si se deja vacio se toma el ideal para llegar al 'objective'
        'num_leaves': num_leaves,
        'learning_rate': learning_rate,
        'feature_fraction': feature_fraction,
        'bagging_fraction': bagging_fraction,
        'bagging_freq': bagging_freq,
        'max_depth': max_depth,
        'verbose': -1,
    }

    gbm = lgb.train(params,
                    lgb_train,
                    valid_sets=lgb_eval,
                    num_boost_round=150,
                    early_stopping_rounds=15,
                    verbose_eval=-1)
    
    y_pred_test = gbm.predict(utils.filtrar_features(df_test_f, features), num_iteration=gbm.best_iteration)
    return utils.MAE(df_test_f['precio'].values, y_pred_test)

space = [hp.quniform('num_leaves', 30, 130, 1), hp.uniform('learning_rate', 0.05, 0.9),
        hp.uniform('feature_fraction', 0.10, 0.90), hp.uniform('bagging_fraction', 0.10, 0.90),
        hp.quniform('bagging_freq', 1, 130, 1), hp.quniform('max_depth', 1, 20, 1),
        hp.uniform('test_size', 0.01, 0.4)]

hps = fmin(eval_lightgbm, space=space, algo=tpe.suggest, max_evals=100, verbose=1)

display(hps)

Training until validation scores don't improve for 15 rounds
Did not meet early stopping. Best iteration is:      
[148]	valid_0's l1: 529719
Training until validation scores don't improve for 15 rounds                 
Early stopping, best iteration is:                                           
[31]	valid_0's l1: 682366
Training until validation scores don't improve for 15 rounds                 
Early stopping, best iteration is:                                           
[25]	valid_0's l1: 612269
Training until validation scores don't improve for 15 rounds                 
Early stopping, best iteration is:                                           
[58]	valid_0's l1: 605493
Training until validation scores don't improve for 15 rounds                 
Did not meet early stopping. Best iteration is:                              
[150]	valid_0's l1: 543865
Training until validation scores don't improve for 15 rounds                 
Early stopping, best iteration is:                 

Did not meet early stopping. Best iteration is:                               
[150]	valid_0's l1: 577852
Training until validation scores don't improve for 15 rounds                  
Early stopping, best iteration is:                                            
[49]	valid_0's l1: 576377
Training until validation scores don't improve for 15 rounds                  
Early stopping, best iteration is:                                            
[105]	valid_0's l1: 547338
Training until validation scores don't improve for 15 rounds                  
Early stopping, best iteration is:                                            
[68]	valid_0's l1: 552932
Training until validation scores don't improve for 15 rounds                  
Did not meet early stopping. Best iteration is:                               
[145]	valid_0's l1: 706921
Training until validation scores don't improve for 15 rounds                  
Did not meet early stopping. Best iteration is:                              

Did not meet early stopping. Best iteration is:                               
[150]	valid_0's l1: 530648
Training until validation scores don't improve for 15 rounds                  
Did not meet early stopping. Best iteration is:                               
[150]	valid_0's l1: 526754
Training until validation scores don't improve for 15 rounds                  
Did not meet early stopping. Best iteration is:                               
[150]	valid_0's l1: 553924
Training until validation scores don't improve for 15 rounds                  
Did not meet early stopping. Best iteration is:                               
[146]	valid_0's l1: 526343
Training until validation scores don't improve for 15 rounds                  
Did not meet early stopping. Best iteration is:                               
[150]	valid_0's l1: 545020
Training until validation scores don't improve for 15 rounds                  
Did not meet early stopping. Best iteration is:                            

{'bagging_fraction': 0.806451877022587,
 'bagging_freq': 62.0,
 'feature_fraction': 0.5379925983440028,
 'learning_rate': 0.1363027714646826,
 'max_depth': 11.0,
 'num_leaves': 113.0,
 'test_size': 0.09575190901892519}

## Evaluación features

In [11]:
features = ['antiguedad', 'habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 
#             'lat', 'lng',
       'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas', 'centroscomercialescercanos']

# features_test = ['tipo_propiedad_compartida',
#                  'prop_frecuente', 'top_provincia', 'es_ciudad_centrica',
#                  'promedio_metros_totales_provincia',
#                  'promedio_metros_cubiertos_provincia', 'porcentaje_metros',
#                  'diferencia_metros', 'delincuencia', 'turismo',
#                  'promedio_precio_ciudad', 'promedio_id_zona',
#                  'promedio_precio_tipo_propiedad',
#                  'promedio_por_mes', 'promedio_precio_habitaciones',
#                  'promedio_precio_habitaciones_banos_garages',
#                  'promedio_precio_banos_garages', 'promedio_precio_booleanos',
#                  'metros_totales_normalizados', 'metros_cubiertos_normalizados', 'anio']

features_test = ['puntaje', 'count_tipodepropiedad', 'count_tipodepropiedad_ciudad']

hps = {'bagging_fraction': 0.806451877022587,
 'bagging_freq': 62.0,
 'feature_fraction': 0.5379925983440028,
 'learning_rate': 0.1363027714646826,
 'max_depth': 11.0,
 'num_leaves': 113.0,
 'test_size': 0.09575190901892519}

# hps = {'bagging_fraction': 0.7514713723096431,
#  'bagging_freq': 97.0,
#  'feature_fraction': 0.7403311365563734,
#  'learning_rate': 0.11573281219786402,
#  'max_depth': 13.0,
#  'num_leaves': 87.0,
#  'test_size': 0.12116948662473831}


bagging_fraction = hps['bagging_fraction']
bagging_freq = int(hps['bagging_freq'])
feature_fraction = hps['feature_fraction']
learning_rate = hps['learning_rate']
num_leaves = int(hps['num_leaves'])
max_depth = int(hps['max_depth'])
test_size = hps['test_size']

params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'mae'}, # Si se deja vacio se toma el ideal para llegar al 'objective'
    'num_leaves': num_leaves,
    'learning_rate': learning_rate,
    'feature_fraction': feature_fraction,
    'bagging_fraction': bagging_fraction,
    'bagging_freq': bagging_freq,
    'max_depth': max_depth,
    'verbose': 0
}


base_train = 0
base_test = 0
base_eval = 0
for i in ['None'] + features_test:
    x_train, x_test, y_train, y_test = utils.dividir_dataset(df_train_f, 'precio', features + [i], test_size=test_size)

    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)

    gbm = lgb.train(params,
                    lgb_train,
                    valid_sets=lgb_eval,
                    num_boost_round=1000,
                    early_stopping_rounds=15,
                    verbose_eval=-1)

    y_pred_test = gbm.predict(x_test, raw_score=True, num_iteration=gbm.best_iteration)
    y_pred_train = gbm.predict(x_train, raw_score=True, num_iteration=gbm.best_iteration)
    y_pred_eval = gbm.predict(utils.filtrar_features(df_test_f, features + [i]), num_iteration=gbm.best_iteration)

    gbm_mae_train = utils.MAE(y_train, y_pred_train)
    gbm_mae = utils.MAE(y_test, y_pred_test)
    gbm_mae_eval = utils.MAE(df_test_f['precio'].values, y_pred_eval)

    print(f"MAE LightGBM (train): {gbm_mae_train:.5f}")
    print(f"MAE LightGBM (test): {gbm_mae:.5f}")
    print(f"MAE LightGBM (eval): {gbm_mae_eval:.5f}")
    if i is not 'None':
        print(f"Overfitting (base_eval - base_test) - (eval - test) - {i}: {(base_eval - base_test) - (gbm_mae_eval - gbm_mae)}")
        print(f"Diff evaluation (base_eval - eval)                  - {i}: {base_eval - gbm_mae_eval}")
        print(f"Diff train (base_train - train)                     - {i}: {base_train - gbm_mae_train}")
    else:
        base_train = gbm_mae_train
        base_test = gbm_mae
        base_eval = gbm_mae_eval

Training until validation scores don't improve for 15 rounds
Early stopping, best iteration is:
[111]	valid_0's l1: 917555


KeyError: 'precio'

In [28]:
df_train_f.columns

Index(['id', 'titulo', 'descripcion', 'tipodepropiedad', 'direccion', 'ciudad',
       'provincia', 'antiguedad', 'habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 'idzona', 'lat', 'lng', 'fecha',
       'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas',
       'centroscomercialescercanos', 'precio', 'porcentaje_metros',
       'diferencia_metros', 'intervalo_metros_totales',
       'intervalo_metros_cubiertos', 'escomercial',
       'promedio_metros_tipo_propiedad', 'promedio_metros_cub_tipo_propiedad',
       'tipo_propiedad_compartida', 'prop_frecuente', 'zona', 'top_provincia',
       'es_ciudad_centrica', 'promedio_metros_totales_provincia',
       'promedio_metros_cubiertos_provincia', 'anio', 'mes', 'dia',
       'trimestre', 'escualas_centros_cercanos', 'delincuencia', 'turismo',
       'es_antigua', 'cantidad_inquilinos', 'metros_totales_normalizados',
       'metros_cubiertos_normalizados', 'promedio_precio_ciudad',
       'varianza_preci

## Evaluación modelo final

In [18]:
features = ['antiguedad', 'habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 'lat', 'lng',
       'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas', 'centroscomercialescercanos']


features_test = ['prop_frecuente', 'top_provincia', 'porcentaje_metros', 'diferencia_metros', 'promedio_precio_ciudad', 
                 'promedio_por_mes', 'anio', 'promedio_id_zona', 'promedio_precio_tipo_propiedad', 
                 'promedio_precio_habitaciones', 'count_idzona', 'count_ciudad', 'puntaje', 
                 'count_tipodepropiedad', 'count_tipodepropiedad_ciudad']

# features_test = ['promedio_precio_ciudad',
#                 'promedio_id_zona']

features += features_test


hps = {'bagging_fraction': 0.806451877022587,
 'bagging_freq': 62.0,
 'feature_fraction': 0.5379925983440028,
 'learning_rate': 0.1363027714646826,
 'max_depth': 11.0,
 'num_leaves': 113.0,
 'test_size': 0.09575190901892519}


bagging_fraction = hps['bagging_fraction']
bagging_freq = int(hps['bagging_freq'])
feature_fraction = hps['feature_fraction']
learning_rate = hps['learning_rate']
num_leaves = int(hps['num_leaves'])
max_depth = int(hps['max_depth'])
test_size = hps['test_size']

params = {
    'boosting_type': 'dart',
    'objective': 'regression',
    'metric': {'mae'}, # Si se deja vacio se toma el ideal para llegar al 'objective'
    'num_leaves': num_leaves,
    'learning_rate': learning_rate,
    'feature_fraction': feature_fraction,
    'bagging_fraction': bagging_fraction,
    'bagging_freq': bagging_freq,
    'max_depth': max_depth,
    'verbose': 0
}

x_train, x_test, y_train, y_test = utils.dividir_dataset(df_train_f, 'precio', features, test_size=test_size)

lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)

gbm = lgb.train(params,
                lgb_train,
                valid_sets=lgb_eval,
                num_boost_round=800,
                early_stopping_rounds=15,
                verbose_eval=1)

y_pred_test = gbm.predict(x_test, num_iteration=gbm.best_iteration)
y_pred_train = gbm.predict(x_train, num_iteration=gbm.best_iteration)
y_pred_eval = gbm.predict(utils.filtrar_features(df_test_f, features), num_iteration=gbm.best_iteration)

df_test_f['target'] = y_pred_eval
# df_test_f = utils.pesificar_df(df_test_f, 'target', 'target')

gbm_mae_train = utils.MAE(y_train, y_pred_train)
gbm_mae = utils.MAE(y_test, y_pred_test)
gbm_mae_eval = utils.MAE(df_test_f['precio'].values, df_test_f['target'].values)

print(f"MAE LightGBM (train): {gbm_mae_train:.5f}")
print(f"MAE LightGBM (test): {gbm_mae:.5f}")
print(f"MAE LightGBM (eval): {gbm_mae_eval:.5f}")

[1]	valid_0's l1: 1.44605e+06
[2]	valid_0's l1: 1.30082e+06
[3]	valid_0's l1: 1.18665e+06
[4]	valid_0's l1: 1.08927e+06
[5]	valid_0's l1: 1.00277e+06
[6]	valid_0's l1: 937702
[7]	valid_0's l1: 875096
[8]	valid_0's l1: 910620




[9]	valid_0's l1: 852666
[10]	valid_0's l1: 805955
[11]	valid_0's l1: 766563
[12]	valid_0's l1: 787479
[13]	valid_0's l1: 748215
[14]	valid_0's l1: 718285
[15]	valid_0's l1: 694108
[16]	valid_0's l1: 670719
[17]	valid_0's l1: 650046
[18]	valid_0's l1: 633552
[19]	valid_0's l1: 620230
[20]	valid_0's l1: 608625
[21]	valid_0's l1: 617107
[22]	valid_0's l1: 607125
[23]	valid_0's l1: 598535
[24]	valid_0's l1: 590101
[25]	valid_0's l1: 583679
[26]	valid_0's l1: 578609
[27]	valid_0's l1: 573782
[28]	valid_0's l1: 577403
[29]	valid_0's l1: 573012
[30]	valid_0's l1: 568356
[31]	valid_0's l1: 571189
[32]	valid_0's l1: 567100
[33]	valid_0's l1: 563515
[34]	valid_0's l1: 560560
[35]	valid_0's l1: 563842
[36]	valid_0's l1: 572432
[37]	valid_0's l1: 568314
[38]	valid_0's l1: 563333
[39]	valid_0's l1: 559464
[40]	valid_0's l1: 561332
[41]	valid_0's l1: 566441
[42]	valid_0's l1: 561247
[43]	valid_0's l1: 564428
[44]	valid_0's l1: 559925
[45]	valid_0's l1: 556186
[46]	valid_0's l1: 575678
[47]	valid_0'

[319]	valid_0's l1: 505059
[320]	valid_0's l1: 504654
[321]	valid_0's l1: 504831
[322]	valid_0's l1: 504500
[323]	valid_0's l1: 504225
[324]	valid_0's l1: 504416
[325]	valid_0's l1: 504095
[326]	valid_0's l1: 504196
[327]	valid_0's l1: 503992
[328]	valid_0's l1: 504174
[329]	valid_0's l1: 503875
[330]	valid_0's l1: 504025
[331]	valid_0's l1: 504189
[332]	valid_0's l1: 504501
[333]	valid_0's l1: 504900
[334]	valid_0's l1: 505404
[335]	valid_0's l1: 504719
[336]	valid_0's l1: 505159
[337]	valid_0's l1: 504408
[338]	valid_0's l1: 504021
[339]	valid_0's l1: 504288
[340]	valid_0's l1: 504705
[341]	valid_0's l1: 504174
[342]	valid_0's l1: 503781
[343]	valid_0's l1: 503316
[344]	valid_0's l1: 503567
[345]	valid_0's l1: 503329
[346]	valid_0's l1: 503162
[347]	valid_0's l1: 503435
[348]	valid_0's l1: 503230
[349]	valid_0's l1: 502983
[350]	valid_0's l1: 503178
[351]	valid_0's l1: 502957
[352]	valid_0's l1: 502852
[353]	valid_0's l1: 502402
[354]	valid_0's l1: 502490
[355]	valid_0's l1: 502563
[

[624]	valid_0's l1: 493113
[625]	valid_0's l1: 492973
[626]	valid_0's l1: 492829
[627]	valid_0's l1: 492730
[628]	valid_0's l1: 492676
[629]	valid_0's l1: 492658
[630]	valid_0's l1: 492674
[631]	valid_0's l1: 492627
[632]	valid_0's l1: 492741
[633]	valid_0's l1: 492733
[634]	valid_0's l1: 492750
[635]	valid_0's l1: 492785
[636]	valid_0's l1: 492629
[637]	valid_0's l1: 492564
[638]	valid_0's l1: 492554
[639]	valid_0's l1: 492570
[640]	valid_0's l1: 492525
[641]	valid_0's l1: 492534
[642]	valid_0's l1: 492559
[643]	valid_0's l1: 492432
[644]	valid_0's l1: 492396
[645]	valid_0's l1: 492467
[646]	valid_0's l1: 492450
[647]	valid_0's l1: 492222
[648]	valid_0's l1: 492105
[649]	valid_0's l1: 491517
[650]	valid_0's l1: 491541
[651]	valid_0's l1: 491604
[652]	valid_0's l1: 491594
[653]	valid_0's l1: 491645
[654]	valid_0's l1: 491709
[655]	valid_0's l1: 491655
[656]	valid_0's l1: 491693
[657]	valid_0's l1: 491774
[658]	valid_0's l1: 491663
[659]	valid_0's l1: 491689
[660]	valid_0's l1: 491753
[

In [13]:
x_train.columns

Index(['antiguedad', 'habitaciones', 'garages', 'banos', 'metroscubiertos',
       'metrostotales', 'lat', 'lng', 'gimnasio', 'usosmultiples', 'piscina',
       'escuelascercanas', 'centroscomercialescercanos', 'porcentaje_metros',
       'diferencia_metros', 'prop_frecuente', 'top_provincia', 'anio',
       'promedio_precio_ciudad', 'promedio_id_zona',
       'promedio_precio_tipo_propiedad', 'promedio_por_mes',
       'promedio_precio_habitaciones'],
      dtype='object')

## Evaluación df_test completo

In [20]:
df_train = pd.read_csv('./data/train.csv')

# Para usarse con el submit a Kaggle
df_test = pd.read_csv('./data/test.csv')

df_test_f = features_independientes_precio(df_test)
df_test_f = features_dependientes_precio(df_test_f, df_train)

df_train_f = features_independientes_precio(df_train)
df_train_f = features_dependientes_precio(df_train_f, df_train)

features = ['antiguedad', 'habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 'lat', 'lng',
       'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas', 'centroscomercialescercanos']

features_test = ['prop_frecuente', 'top_provincia', 'porcentaje_metros', 'diferencia_metros', 'promedio_precio_ciudad', 
                 'promedio_por_mes', 'anio', 'promedio_id_zona', 'promedio_precio_tipo_propiedad', 
                 'promedio_precio_habitaciones', 'count_idzona', 'count_ciudad', 'puntaje', 
                 'count_tipodepropiedad', 'count_tipodepropiedad_ciudad']

features += features_test


hps = {'bagging_fraction': 0.806451877022587,
 'bagging_freq': 62.0,
 'feature_fraction': 0.5379925983440028,
 'learning_rate': 0.1363027714646826,
 'max_depth': 11.0,
 'num_leaves': 113.0,
 'test_size': 0.09575190901892519}

bagging_fraction = hps['bagging_fraction']
bagging_freq = int(hps['bagging_freq'])
feature_fraction = hps['feature_fraction']
learning_rate = hps['learning_rate']
num_leaves = int(hps['num_leaves'])
max_depth = int(hps['max_depth'])
test_size = hps['test_size']

params = {
    'boosting_type': 'dart',
    'objective': 'regression',
    'metric': {'mae'}, # Si se deja vacio se toma el ideal para llegar al 'objective'
    'num_leaves': num_leaves,
    'learning_rate': learning_rate,
    'feature_fraction': feature_fraction,
    'bagging_fraction': bagging_fraction,
    'bagging_freq': bagging_freq,
    'max_depth': max_depth,
    'verbose': 0
}

x_train, x_test, y_train, y_test = utils.dividir_dataset(df_train_f, 'precio', features, test_size=test_size)

lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)


gbm = lgb.train(params,
                lgb_train,
                valid_sets=lgb_eval,
                num_boost_round=1750,
                early_stopping_rounds=15,
                verbose_eval=1)

y_pred_test = gbm.predict(x_test, raw_score=True, num_iteration=gbm.best_iteration)
y_pred_train = gbm.predict(x_train, raw_score=True, num_iteration=gbm.best_iteration)
y_pred_eval = gbm.predict(utils.filtrar_features(df_test_f, features), num_iteration=gbm.best_iteration)

gbm_mae_train = utils.MAE(y_train, y_pred_train)
gbm_mae = utils.MAE(y_test, y_pred_test)

print(f"MAE LightGBM (train): {gbm_mae_train:.5f}")
print(f"MAE LightGBM (test): {gbm_mae:.5f}")

[1]	valid_0's l1: 1.41859e+06
[2]	valid_0's l1: 1.27751e+06
[3]	valid_0's l1: 1.16333e+06
[4]	valid_0's l1: 1.06811e+06
[5]	valid_0's l1: 982701
[6]	valid_0's l1: 917630
[7]	valid_0's l1: 856565
[8]	valid_0's l1: 890812
[9]	valid_0's l1: 834393
[10]	valid_0's l1: 789090
[11]	valid_0's l1: 751103
[12]	valid_0's l1: 771253
[13]	valid_0's l1: 733600
[14]	valid_0's l1: 704313
[15]	valid_0's l1: 681030
[16]	valid_0's l1: 657960
[17]	valid_0's l1: 638312
[18]	valid_0's l1: 622212
[19]	valid_0's l1: 608974
[20]	valid_0's l1: 598014
[21]	valid_0's l1: 606025
[22]	valid_0's l1: 596264
[23]	valid_0's l1: 588157
[24]	valid_0's l1: 580151
[25]	valid_0's l1: 574416
[26]	valid_0's l1: 569887
[27]	valid_0's l1: 565181
[28]	valid_0's l1: 568398
[29]	valid_0's l1: 564023
[30]	valid_0's l1: 559325
[31]	valid_0's l1: 562273
[32]	valid_0's l1: 558609
[33]	valid_0's l1: 555091
[34]	valid_0's l1: 552484
[35]	valid_0's l1: 555439
[36]	valid_0's l1: 563271
[37]	valid_0's l1: 559726
[38]	valid_0's l1: 555515
[

[309]	valid_0's l1: 500717
[310]	valid_0's l1: 500205
[311]	valid_0's l1: 499645
[312]	valid_0's l1: 499384
[313]	valid_0's l1: 499274
[314]	valid_0's l1: 499152
[315]	valid_0's l1: 499473
[316]	valid_0's l1: 498885
[317]	valid_0's l1: 498765
[318]	valid_0's l1: 498665
[319]	valid_0's l1: 498740
[320]	valid_0's l1: 498398
[321]	valid_0's l1: 498492
[322]	valid_0's l1: 498408
[323]	valid_0's l1: 498143
[324]	valid_0's l1: 498214
[325]	valid_0's l1: 498128
[326]	valid_0's l1: 498145
[327]	valid_0's l1: 498070
[328]	valid_0's l1: 498101
[329]	valid_0's l1: 497979
[330]	valid_0's l1: 498013
[331]	valid_0's l1: 498114
[332]	valid_0's l1: 498285
[333]	valid_0's l1: 498608
[334]	valid_0's l1: 499010
[335]	valid_0's l1: 498370
[336]	valid_0's l1: 498686
[337]	valid_0's l1: 498138
[338]	valid_0's l1: 497956
[339]	valid_0's l1: 498122
[340]	valid_0's l1: 498437
[341]	valid_0's l1: 497956
[342]	valid_0's l1: 497746
[343]	valid_0's l1: 497517
[344]	valid_0's l1: 497653
[345]	valid_0's l1: 497377
[

[613]	valid_0's l1: 488035
[614]	valid_0's l1: 488144
[615]	valid_0's l1: 488210
[616]	valid_0's l1: 488404
[617]	valid_0's l1: 488332
[618]	valid_0's l1: 488270
[619]	valid_0's l1: 488270
[620]	valid_0's l1: 488467
[621]	valid_0's l1: 488319
[622]	valid_0's l1: 487623
[623]	valid_0's l1: 487678
[624]	valid_0's l1: 487653
[625]	valid_0's l1: 487663
[626]	valid_0's l1: 487595
[627]	valid_0's l1: 487428
[628]	valid_0's l1: 487448
[629]	valid_0's l1: 487368
[630]	valid_0's l1: 487317
[631]	valid_0's l1: 487414
[632]	valid_0's l1: 487388
[633]	valid_0's l1: 487324
[634]	valid_0's l1: 487284
[635]	valid_0's l1: 487462
[636]	valid_0's l1: 487642
[637]	valid_0's l1: 487642
[638]	valid_0's l1: 487571
[639]	valid_0's l1: 487520
[640]	valid_0's l1: 487444
[641]	valid_0's l1: 487386
[642]	valid_0's l1: 487321
[643]	valid_0's l1: 487354
[644]	valid_0's l1: 487258
[645]	valid_0's l1: 487187
[646]	valid_0's l1: 487122
[647]	valid_0's l1: 487190
[648]	valid_0's l1: 487226
[649]	valid_0's l1: 486536
[

[917]	valid_0's l1: 482783
[918]	valid_0's l1: 482721
[919]	valid_0's l1: 482686
[920]	valid_0's l1: 482671
[921]	valid_0's l1: 482644
[922]	valid_0's l1: 482652
[923]	valid_0's l1: 482633
[924]	valid_0's l1: 482729
[925]	valid_0's l1: 482801
[926]	valid_0's l1: 482881
[927]	valid_0's l1: 482840
[928]	valid_0's l1: 482840
[929]	valid_0's l1: 482868
[930]	valid_0's l1: 482993
[931]	valid_0's l1: 482922
[932]	valid_0's l1: 482895
[933]	valid_0's l1: 482919
[934]	valid_0's l1: 483050
[935]	valid_0's l1: 483144
[936]	valid_0's l1: 483056
[937]	valid_0's l1: 482997
[938]	valid_0's l1: 483057
[939]	valid_0's l1: 483039
[940]	valid_0's l1: 483112
[941]	valid_0's l1: 482562
[942]	valid_0's l1: 482416
[943]	valid_0's l1: 482357
[944]	valid_0's l1: 482556
[945]	valid_0's l1: 482496
[946]	valid_0's l1: 482438
[947]	valid_0's l1: 482614
[948]	valid_0's l1: 482652
[949]	valid_0's l1: 482657
[950]	valid_0's l1: 482766
[951]	valid_0's l1: 482822
[952]	valid_0's l1: 482716
[953]	valid_0's l1: 482179
[

[1215]	valid_0's l1: 479202
[1216]	valid_0's l1: 479113
[1217]	valid_0's l1: 479237
[1218]	valid_0's l1: 479204
[1219]	valid_0's l1: 479119
[1220]	valid_0's l1: 479154
[1221]	valid_0's l1: 479113
[1222]	valid_0's l1: 479173
[1223]	valid_0's l1: 479093
[1224]	valid_0's l1: 479110
[1225]	valid_0's l1: 479031
[1226]	valid_0's l1: 479101
[1227]	valid_0's l1: 479025
[1228]	valid_0's l1: 479096
[1229]	valid_0's l1: 479087
[1230]	valid_0's l1: 478983
[1231]	valid_0's l1: 478886
[1232]	valid_0's l1: 478845
[1233]	valid_0's l1: 478777
[1234]	valid_0's l1: 478725
[1235]	valid_0's l1: 478655
[1236]	valid_0's l1: 478667
[1237]	valid_0's l1: 478728
[1238]	valid_0's l1: 478703
[1239]	valid_0's l1: 478649
[1240]	valid_0's l1: 478544
[1241]	valid_0's l1: 478506
[1242]	valid_0's l1: 478581
[1243]	valid_0's l1: 478532
[1244]	valid_0's l1: 478698
[1245]	valid_0's l1: 478803
[1246]	valid_0's l1: 478922
[1247]	valid_0's l1: 478851
[1248]	valid_0's l1: 478788
[1249]	valid_0's l1: 478756
[1250]	valid_0's l1:

[1508]	valid_0's l1: 476911
[1509]	valid_0's l1: 476857
[1510]	valid_0's l1: 476831
[1511]	valid_0's l1: 476852
[1512]	valid_0's l1: 476946
[1513]	valid_0's l1: 476915
[1514]	valid_0's l1: 476869
[1515]	valid_0's l1: 476847
[1516]	valid_0's l1: 476803
[1517]	valid_0's l1: 476779
[1518]	valid_0's l1: 476819
[1519]	valid_0's l1: 476794
[1520]	valid_0's l1: 476847
[1521]	valid_0's l1: 476944
[1522]	valid_0's l1: 476894
[1523]	valid_0's l1: 476919
[1524]	valid_0's l1: 476876
[1525]	valid_0's l1: 476850
[1526]	valid_0's l1: 476904
[1527]	valid_0's l1: 476940
[1528]	valid_0's l1: 476881
[1529]	valid_0's l1: 476849
[1530]	valid_0's l1: 476828
[1531]	valid_0's l1: 476790
[1532]	valid_0's l1: 476844
[1533]	valid_0's l1: 476797
[1534]	valid_0's l1: 476889
[1535]	valid_0's l1: 476924
[1536]	valid_0's l1: 476884
[1537]	valid_0's l1: 476926
[1538]	valid_0's l1: 476868
[1539]	valid_0's l1: 476864
[1540]	valid_0's l1: 476788
[1541]	valid_0's l1: 476860
[1542]	valid_0's l1: 476838
[1543]	valid_0's l1:

In [21]:
y_pred_eval = gbm.predict(utils.filtrar_features(df_test_f, features), num_iteration=gbm.best_iteration)
df_test_f['target'] = y_pred_eval
df_test_f[['id', 'target']].to_csv('respuesta10.csv', index=False)

In [9]:
x_train.columns

Index(['antiguedad', 'habitaciones', 'garages', 'banos', 'metroscubiertos',
       'metrostotales', 'lat', 'lng', 'gimnasio', 'usosmultiples', 'piscina',
       'escuelascercanas', 'centroscomercialescercanos', 'porcentaje_metros',
       'diferencia_metros', 'prop_frecuente', 'top_provincia', 'anio',
       'promedio_precio_ciudad', 'count_ciudad', 'promedio_id_zona',
       'count_idzona', 'promedio_precio_tipo_propiedad', 'promedio_por_mes',
       'promedio_precio_habitaciones'],
      dtype='object')

In [10]:
x_test.shape

(29081, 25)

In [11]:
df_test_f.shape

(60000, 62)

In [None]:
utils.filtrar_features(df_test_f, features).columns