In [24]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from hyperopt import fmin, tpe, hp
import ipynb.fs.full.utils as utils
from ipynb.fs.full.features import features_independientes_precio, features_dependientes_precio

# Read the labelled data and split it with utils.dividir_df_testeo
# (test_size=0.15); both parts are copied before any further use.
df_train, df_test = utils.dividir_df_testeo(pd.read_csv('./data/train.csv'), test_size=0.15)
df_train, df_test = df_train.copy(), df_test.copy()

# To be used for the Kaggle submission
df_eval = pd.read_csv('./data/test.csv')

In [25]:
# Raw listing attributes; lat/lng are currently disabled.
features_base = [
    'antiguedad', 'habitaciones', 'garages', 'banos',
    'metroscubiertos', 'metrostotales',
    # 'lat', 'lng',
    'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas', 'centroscomercialescercanos',
]

# Shuffle the training rows (no random_state is given, so the order differs per run).
df_train = df_train.sample(frac=1).reset_index(drop=True)

# Hold-out frame: price-independent features first, then the price-dependent
# ones, which receive df_train as their second argument.
df_test_f = features_dependientes_precio(features_independientes_precio(df_test), df_train)

# Same two steps for the training frame, followed by another unseeded shuffle.
df_train_f = (
    features_dependientes_precio(features_independientes_precio(df_train), df_train)
    .sample(frac=1)
    .reset_index(drop=True)
)

## Búsqueda de hiperparámetros

In [33]:
# Engineered columns used during the hyperparameter search.
features_test = [
    'prop_frecuente', 'top_provincia', 'porcentaje_metros', 'diferencia_metros',
    'promedio_precio_ciudad', 'promedio_por_mes', 'anio', 'promedio_id_zona',
    'promedio_precio_tipo_propiedad', 'promedio_precio_habitaciones',
]

# Raw listing attributes (lat/lng disabled) followed by the engineered columns.
features = [
    'antiguedad', 'habitaciones', 'garages', 'banos',
    'metroscubiertos', 'metrostotales',
    # 'lat', 'lng',
    'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas', 'centroscomercialescercanos',
] + features_test


def eval_lightgbm(args):
    """Hyperopt objective: train one LightGBM regressor and return its MAE.

    Parameters
    ----------
    args : sequence of 7 values, in this order:
        num_leaves, learning_rate, feature_fraction, bagging_fraction,
        bagging_freq, max_depth, test_size.
        num_leaves, bagging_freq and max_depth are converted with int().

    Returns
    -------
    The value of utils.MAE between df_test_f['precio'] and the predictions
    made at the booster's best iteration.

    Reads the notebook-level names df_train_f, df_test_f and features.

    NOTE(review): the loss is measured on df_test_f, the same frame other
    cells report as "eval"; tuning against it probably makes that figure
    optimistic -- confirm this is intended.
    """
    (num_leaves, learning_rate, feature_fraction,
     bagging_fraction, bagging_freq, max_depth, test_size) = args

    # Fresh unseeded shuffle on every trial before splitting.
    shuffled = df_train_f.sample(frac=1).reset_index(drop=True)
    x_fit, x_val, y_fit, y_val = utils.dividir_dataset(shuffled, 'precio', features, test_size=test_size)

    fit_set = lgb.Dataset(x_fit, y_fit)
    val_set = lgb.Dataset(x_val, y_val, reference=fit_set)

    booster_params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'mae'},  # If left empty, the metric matching the 'objective' is used
        'num_leaves': int(num_leaves),
        'learning_rate': learning_rate,
        'feature_fraction': feature_fraction,
        'bagging_fraction': bagging_fraction,
        'bagging_freq': int(bagging_freq),
        'max_depth': int(max_depth),
        'verbose': -1,
    }

    booster = lgb.train(
        booster_params,
        fit_set,
        valid_sets=val_set,
        num_boost_round=150,
        early_stopping_rounds=15,
        verbose_eval=-1,
    )

    holdout_inputs = utils.filtrar_features(df_test_f, features)
    holdout_pred = booster.predict(holdout_inputs, num_iteration=booster.best_iteration)
    return utils.MAE(df_test_f['precio'].values, holdout_pred)

# Search space; the entry order must match the unpacking order in eval_lightgbm.
space = [
    hp.quniform('num_leaves', 30, 130, 1),
    hp.uniform('learning_rate', 0.05, 0.9),
    hp.uniform('feature_fraction', 0.10, 0.90),
    hp.uniform('bagging_fraction', 0.10, 0.90),
    hp.quniform('bagging_freq', 1, 130, 1),
    hp.quniform('max_depth', 1, 20, 1),
    hp.uniform('test_size', 0.01, 0.8),
]

# Run 100 TPE trials and show the best assignment found.
hps = fmin(
    eval_lightgbm,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    verbose=1,
)

display(hps)

Training until validation scores don't improve for 15 rounds
Early stopping, best iteration is:                   
[51]	valid_0's l1: 593738
Training until validation scores don't improve for 15 rounds                 
Did not meet early stopping. Best iteration is:                              
[150]	valid_0's l1: 545767
Training until validation scores don't improve for 15 rounds                 
Did not meet early stopping. Best iteration is:                              
[142]	valid_0's l1: 545367
Training until validation scores don't improve for 15 rounds                 
Early stopping, best iteration is:                                           
[72]	valid_0's l1: 557767
Training until validation scores don't improve for 15 rounds                 
Did not meet early stopping. Best iteration is:                              
[142]	valid_0's l1: 580397
Training until validation scores don't improve for 15 rounds                 
Did not meet early stopping. Best iteration is:   

Early stopping, best iteration is:                                            
[66]	valid_0's l1: 561626
Training until validation scores don't improve for 15 rounds                  
Did not meet early stopping. Best iteration is:                               
[150]	valid_0's l1: 544072
Training until validation scores don't improve for 15 rounds                  
Did not meet early stopping. Best iteration is:                               
[150]	valid_0's l1: 584510
Training until validation scores don't improve for 15 rounds                  
Did not meet early stopping. Best iteration is:                               
[146]	valid_0's l1: 538862
Training until validation scores don't improve for 15 rounds                  
Did not meet early stopping. Best iteration is:                               
[150]	valid_0's l1: 547875
Training until validation scores don't improve for 15 rounds                  
Did not meet early stopping. Best iteration is:                             

{'bagging_fraction': 0.867964514027621,
 'bagging_freq': 69.0,
 'feature_fraction': 0.8985719006599164,
 'learning_rate': 0.15007007911553005,
 'max_depth': 9.0,
 'num_leaves': 105.0,
 'test_size': 0.30442962737370866}

## Evaluación de features

In [32]:
# Baseline feature set: raw listing attributes (lat/lng are disabled here).
features = ['antiguedad', 'habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales',
#             'lat', 'lng',
       'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas', 'centroscomercialescercanos']

# Engineered features to evaluate; each one is added on its own to the baseline.
features_test = ['tipo_propiedad_compartida',
                 'prop_frecuente', 'top_provincia', 'es_ciudad_centrica',
                 'promedio_metros_totales_provincia',
                 'promedio_metros_cubiertos_provincia', 'porcentaje_metros',
                 'diferencia_metros', 'delincuencia', 'turismo',
                 'promedio_precio_ciudad', 'promedio_id_zona',
                 'promedio_precio_tipo_propiedad',
                 'promedio_por_mes', 'promedio_precio_habitaciones',
                 'promedio_precio_habitaciones_banos_garages',
                 'promedio_precio_banos_garages', 'promedio_precio_booleanos',
                 'metros_totales_normalizados', 'metros_cubiertos_normalizados', 'anio']

# Hyperparameters hard-coded from a hyperopt search result.
hps = {'bagging_fraction': 0.867964514027621,
 'bagging_freq': 69.0,
 'feature_fraction': 0.8985719006599164,
 'learning_rate': 0.15007007911553005,
 'max_depth': 9.0,
 'num_leaves': 105.0,
 'test_size': 0.30442962737370866}


bagging_fraction = hps['bagging_fraction']
bagging_freq = int(hps['bagging_freq'])  # hyperopt's quniform yields floats
feature_fraction = hps['feature_fraction']
learning_rate = hps['learning_rate']
num_leaves = int(hps['num_leaves'])
max_depth = int(hps['max_depth'])
test_size = hps['test_size']

params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'mae'}, # If left empty, the metric matching the 'objective' is used
    'num_leaves': num_leaves,
    'learning_rate': learning_rate,
    'feature_fraction': feature_fraction,
    'bagging_fraction': bagging_fraction,
    'bagging_freq': bagging_freq,
    'max_depth': max_depth,
    'verbose': 0
}


# MAEs of the baseline run; overwritten on the first loop iteration.
base_train = 0
base_test = 0
base_eval = 0

# None marks the baseline run (no extra feature). A real None sentinel is used
# instead of the string 'None' so that the check below does not rely on string
# interning (`is not 'None'`) and no bogus column name is passed to the helpers.
for candidate in [None] + features_test:
    is_baseline = candidate is None
    run_features = features if is_baseline else features + [candidate]

    x_train, x_test, y_train, y_test = utils.dividir_dataset(df_train_f, 'precio', run_features, test_size=test_size)

    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)

    gbm = lgb.train(params,
                    lgb_train,
                    valid_sets=lgb_eval,
                    num_boost_round=1000,
                    early_stopping_rounds=15,
                    verbose_eval=-1)

    # All three predictions are requested the same way (no raw_score) so the
    # three MAEs are computed on the same kind of output.
    y_pred_test = gbm.predict(x_test, num_iteration=gbm.best_iteration)
    y_pred_train = gbm.predict(x_train, num_iteration=gbm.best_iteration)
    y_pred_eval = gbm.predict(utils.filtrar_features(df_test_f, run_features), num_iteration=gbm.best_iteration)

    gbm_mae_train = utils.MAE(y_train, y_pred_train)
    gbm_mae = utils.MAE(y_test, y_pred_test)
    gbm_mae_eval = utils.MAE(df_test_f['precio'].values, y_pred_eval)

    print(f"MAE LightGBM (train): {gbm_mae_train:.5f}")
    print(f"MAE LightGBM (test): {gbm_mae:.5f}")
    print(f"MAE LightGBM (eval): {gbm_mae_eval:.5f}")
    if is_baseline:
        # Remember the baseline scores that every candidate is compared against.
        base_train = gbm_mae_train
        base_test = gbm_mae
        base_eval = gbm_mae_eval
    else:
        print(f"Overfitting (base_eval - base_test) - (eval - test) - {candidate}: {(base_eval - base_test) - (gbm_mae_eval - gbm_mae)}")
        print(f"Diff evaluation (base_eval - eval)                  - {candidate}: {base_eval - gbm_mae_eval}")
        print(f"Diff train (base_train - train)                     - {candidate}: {base_train - gbm_mae_train}")

Training until validation scores don't improve for 15 rounds
Early stopping, best iteration is:
[299]	valid_0's l1: 935092
MAE LightGBM (train): 877498.78919
MAE LightGBM (test): 935092.36429
MAE LightGBM (eval): 930254.65186
Training until validation scores don't improve for 15 rounds
Early stopping, best iteration is:
[213]	valid_0's l1: 928271
MAE LightGBM (train): 885868.83314
MAE LightGBM (test): 928271.32694
MAE LightGBM (eval): 925980.07447
Overfitting (base_eval - base_test) - (eval - test) - tipo_propiedad_compartida: -2546.45995675202
Diff evaluation (base_eval - eval)                  - tipo_propiedad_compartida: 4274.577387943282
Diff train (base_train - train)                     - tipo_propiedad_compartida: -8370.043948679813
Training until validation scores don't improve for 15 rounds
Early stopping, best iteration is:
[276]	valid_0's l1: 924180
MAE LightGBM (train): 866622.03927
MAE LightGBM (test): 924180.44104
MAE LightGBM (eval): 920015.36471
Overfitting (base_eval -

Training until validation scores don't improve for 15 rounds
Early stopping, best iteration is:
[273]	valid_0's l1: 935757
MAE LightGBM (train): 881651.88182
MAE LightGBM (test): 935757.04351
MAE LightGBM (eval): 949817.72880
Overfitting (base_eval - base_test) - (eval - test) - promedio_precio_banos_garages: -18898.39772575954
Diff evaluation (base_eval - eval)                  - promedio_precio_banos_garages: -19563.07694704458
Diff train (base_train - train)                     - promedio_precio_banos_garages: -4153.092634318396
Training until validation scores don't improve for 15 rounds
Early stopping, best iteration is:
[186]	valid_0's l1: 939151
MAE LightGBM (train): 899033.69424
MAE LightGBM (test): 939151.02552
MAE LightGBM (eval): 933573.91387
Overfitting (base_eval - base_test) - (eval - test) - promedio_precio_booleanos: 739.3992270349991
Diff evaluation (base_eval - eval)                  - promedio_precio_booleanos: -3319.262011286104
Diff train (base_train - train)      

In [28]:
# Show the column names of the engineered training frame.
df_train_f.columns

Index(['id', 'titulo', 'descripcion', 'tipodepropiedad', 'direccion', 'ciudad',
       'provincia', 'antiguedad', 'habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 'idzona', 'lat', 'lng', 'fecha',
       'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas',
       'centroscomercialescercanos', 'precio', 'porcentaje_metros',
       'diferencia_metros', 'intervalo_metros_totales',
       'intervalo_metros_cubiertos', 'escomercial',
       'promedio_metros_tipo_propiedad', 'promedio_metros_cub_tipo_propiedad',
       'tipo_propiedad_compartida', 'prop_frecuente', 'zona', 'top_provincia',
       'es_ciudad_centrica', 'promedio_metros_totales_provincia',
       'promedio_metros_cubiertos_provincia', 'anio', 'mes', 'dia',
       'trimestre', 'escualas_centros_cercanos', 'delincuencia', 'turismo',
       'es_antigua', 'cantidad_inquilinos', 'metros_totales_normalizados',
       'metros_cubiertos_normalizados', 'promedio_precio_ciudad',
       'varianza_preci

## Evaluación modelo final

In [37]:
# Engineered columns kept for the final model.
features_test = [
    'prop_frecuente', 'top_provincia', 'porcentaje_metros', 'diferencia_metros',
    'promedio_precio_ciudad', 'promedio_por_mes', 'anio', 'promedio_id_zona',
    'promedio_precio_tipo_propiedad', 'promedio_precio_habitaciones',
]
# Disabled alternative:
# features_test = ['promedio_precio_ciudad', 'promedio_id_zona']

# Raw listing attributes (lat/lng included in this cell) plus the engineered columns.
features = [
    'antiguedad', 'habitaciones', 'garages', 'banos',
    'metroscubiertos', 'metrostotales', 'lat', 'lng',
    'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas', 'centroscomercialescercanos',
] + features_test

# Hard-coded hyperparameter assignment.
hps = {
    'bagging_fraction': 0.7514713723096431,
    'bagging_freq': 97.0,
    'feature_fraction': 0.7403311365563734,
    'learning_rate': 0.11573281219786402,
    'max_depth': 13.0,
    'num_leaves': 87.0,
    'test_size': 0.12116948662473831,
}

# Float-valued settings are used as stored; the rest are converted with int().
bagging_fraction, feature_fraction, learning_rate, test_size = (
    hps[key] for key in ('bagging_fraction', 'feature_fraction', 'learning_rate', 'test_size')
)
bagging_freq, num_leaves, max_depth = (
    int(hps[key]) for key in ('bagging_freq', 'num_leaves', 'max_depth')
)

params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'mae'},  # If left empty, the metric matching the 'objective' is used
    'num_leaves': num_leaves,
    'learning_rate': learning_rate,
    'feature_fraction': feature_fraction,
    'bagging_fraction': bagging_fraction,
    'bagging_freq': bagging_freq,
    'max_depth': max_depth,
    'verbose': 0,
}

x_train, x_test, y_train, y_test = utils.dividir_dataset(
    df_train_f, 'precio', features, test_size=test_size
)

lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)

# Up to 10000 rounds; stops once the validation metric has not improved for 15 rounds.
gbm = lgb.train(
    params,
    lgb_train,
    valid_sets=lgb_eval,
    num_boost_round=10000,
    early_stopping_rounds=15,
    verbose_eval=1,
)

# Predict on the validation split, the training split and the hold-out frame.
y_pred_test = gbm.predict(x_test, num_iteration=gbm.best_iteration)
y_pred_train = gbm.predict(x_train, num_iteration=gbm.best_iteration)
y_pred_eval = gbm.predict(
    utils.filtrar_features(df_test_f, features),
    num_iteration=gbm.best_iteration,
)

# Store the hold-out predictions next to the true prices.
df_test_f['target'] = y_pred_eval
# Disabled: df_test_f = utils.pesificar_df(df_test_f, 'target', 'target')

gbm_mae_train = utils.MAE(y_train, y_pred_train)
gbm_mae = utils.MAE(y_test, y_pred_test)
gbm_mae_eval = utils.MAE(df_test_f['precio'].values, df_test_f['target'].values)

print(f"MAE LightGBM (train): {gbm_mae_train:.5f}")
print(f"MAE LightGBM (test): {gbm_mae:.5f}")
print(f"MAE LightGBM (eval): {gbm_mae_eval:.5f}")

[1]	valid_0's l1: 1.46873e+06
Training until validation scores don't improve for 15 rounds
[2]	valid_0's l1: 1.34546e+06
[3]	valid_0's l1: 1.23907e+06
[4]	valid_0's l1: 1.15131e+06
[5]	valid_0's l1: 1.07277e+06
[6]	valid_0's l1: 1.00438e+06
[7]	valid_0's l1: 945164
[8]	valid_0's l1: 893837
[9]	valid_0's l1: 850512
[10]	valid_0's l1: 813715
[11]	valid_0's l1: 785014
[12]	valid_0's l1: 757025
[13]	valid_0's l1: 733278
[14]	valid_0's l1: 712299
[15]	valid_0's l1: 694056
[16]	valid_0's l1: 679470
[17]	valid_0's l1: 666208
[18]	valid_0's l1: 654792
[19]	valid_0's l1: 644129
[20]	valid_0's l1: 635402
[21]	valid_0's l1: 628279
[22]	valid_0's l1: 621222
[23]	valid_0's l1: 615223
[24]	valid_0's l1: 609921
[25]	valid_0's l1: 604206
[26]	valid_0's l1: 599646
[27]	valid_0's l1: 595631
[28]	valid_0's l1: 592608
[29]	valid_0's l1: 589507
[30]	valid_0's l1: 586459
[31]	valid_0's l1: 584286
[32]	valid_0's l1: 582148
[33]	valid_0's l1: 579376
[34]	valid_0's l1: 577145
[35]	valid_0's l1: 575260
[36]	val

[316]	valid_0's l1: 521871
[317]	valid_0's l1: 521858
[318]	valid_0's l1: 521768
[319]	valid_0's l1: 521774
[320]	valid_0's l1: 521858
[321]	valid_0's l1: 521934
[322]	valid_0's l1: 521896
[323]	valid_0's l1: 521906
[324]	valid_0's l1: 521890
[325]	valid_0's l1: 521874
[326]	valid_0's l1: 521848
[327]	valid_0's l1: 521811
[328]	valid_0's l1: 521759
[329]	valid_0's l1: 521852
[330]	valid_0's l1: 521808
[331]	valid_0's l1: 521794
[332]	valid_0's l1: 521693
[333]	valid_0's l1: 521626
[334]	valid_0's l1: 521524
[335]	valid_0's l1: 521512
[336]	valid_0's l1: 521593
[337]	valid_0's l1: 521558
[338]	valid_0's l1: 521500
[339]	valid_0's l1: 521459
[340]	valid_0's l1: 521413
[341]	valid_0's l1: 521386
[342]	valid_0's l1: 521274
[343]	valid_0's l1: 521276
[344]	valid_0's l1: 521314
[345]	valid_0's l1: 521229
[346]	valid_0's l1: 521143
[347]	valid_0's l1: 521119
[348]	valid_0's l1: 521143
[349]	valid_0's l1: 521162
[350]	valid_0's l1: 521117
[351]	valid_0's l1: 521123
[352]	valid_0's l1: 521066
[

## Evaluación df_test completo

In [38]:
# Reload the full labelled data: no hold-out is carved out in this cell.
df_train = pd.read_csv('./data/train.csv')

# To be used for the Kaggle submission
df_test = pd.read_csv('./data/test.csv')

# Price-independent features first, then the price-dependent ones, which
# receive df_train as their second argument.
df_test_f = features_independientes_precio(df_test)
df_test_f = features_dependientes_precio(df_test_f, df_train)

df_train_f = features_independientes_precio(df_train)
df_train_f = features_dependientes_precio(df_train_f, df_train)

# Raw listing attributes (lat/lng included).
features = ['antiguedad', 'habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 'lat', 'lng',
       'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas', 'centroscomercialescercanos']

# Engineered columns used for the submission model.
features_test = ['prop_frecuente',
                'promedio_precio_ciudad',
                'promedio_precio_tipo_propiedad',
                'promedio_por_mes',
                'promedio_precio_habitaciones',
                'top_provincia',
                'promedio_id_zona',
                'puntaje']

features += features_test


# Hard-coded hyperparameter assignment.
hps = {'bagging_fraction': 0.7514713723096431,
 'bagging_freq': 97.0,
 'feature_fraction': 0.7403311365563734,
 'learning_rate': 0.11573281219786402,
 'max_depth': 13.0,
 'num_leaves': 87.0,
 'test_size': 0.12116948662473831}

bagging_fraction = hps['bagging_fraction']
bagging_freq = int(hps['bagging_freq'])
feature_fraction = hps['feature_fraction']
learning_rate = hps['learning_rate']
num_leaves = int(hps['num_leaves'])
max_depth = int(hps['max_depth'])
test_size = hps['test_size']

params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'mae'}, # If left empty, the metric matching the 'objective' is used
    'num_leaves': num_leaves,
    'learning_rate': learning_rate,
    'feature_fraction': feature_fraction,
    'bagging_fraction': bagging_fraction,
    'bagging_freq': bagging_freq,
    'max_depth': max_depth,
    'verbose': 0
}

# A validation split is still taken; it is passed as valid_sets for early stopping.
x_train, x_test, y_train, y_test = utils.dividir_dataset(df_train_f, 'precio', features, test_size=test_size)

lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)


gbm = lgb.train(params,
                lgb_train,
                valid_sets=lgb_eval,
                num_boost_round=10000,
                early_stopping_rounds=15,
                verbose_eval=1)

# All predictions are requested the same way (no raw_score), matching the
# submission prediction below and the final-model evaluation cell.
y_pred_test = gbm.predict(x_test, num_iteration=gbm.best_iteration)
y_pred_train = gbm.predict(x_train, num_iteration=gbm.best_iteration)
y_pred_eval = gbm.predict(utils.filtrar_features(df_test_f, features), num_iteration=gbm.best_iteration)

gbm_mae_train = utils.MAE(y_train, y_pred_train)
gbm_mae = utils.MAE(y_test, y_pred_test)

print(f"MAE LightGBM (train): {gbm_mae_train:.5f}")
print(f"MAE LightGBM (test): {gbm_mae:.5f}")

[1]	valid_0's l1: 1.44726e+06
Training until validation scores don't improve for 15 rounds
[2]	valid_0's l1: 1.32312e+06
[3]	valid_0's l1: 1.21994e+06
[4]	valid_0's l1: 1.12565e+06
[5]	valid_0's l1: 1.04717e+06
[6]	valid_0's l1: 978504
[7]	valid_0's l1: 920843
[8]	valid_0's l1: 869550
[9]	valid_0's l1: 829619
[10]	valid_0's l1: 792316
[11]	valid_0's l1: 760419
[12]	valid_0's l1: 735955
[13]	valid_0's l1: 711957
[14]	valid_0's l1: 690955
[15]	valid_0's l1: 674872
[16]	valid_0's l1: 659938
[17]	valid_0's l1: 647171
[18]	valid_0's l1: 635514
[19]	valid_0's l1: 625536
[20]	valid_0's l1: 617241
[21]	valid_0's l1: 610250
[22]	valid_0's l1: 603833
[23]	valid_0's l1: 597598
[24]	valid_0's l1: 592616
[25]	valid_0's l1: 587830
[26]	valid_0's l1: 584070
[27]	valid_0's l1: 580595
[28]	valid_0's l1: 577888
[29]	valid_0's l1: 574663
[30]	valid_0's l1: 572557
[31]	valid_0's l1: 570131
[32]	valid_0's l1: 568511
[33]	valid_0's l1: 566736
[34]	valid_0's l1: 564402
[35]	valid_0's l1: 562916
[36]	valid_0'

[312]	valid_0's l1: 515160
[313]	valid_0's l1: 515124
[314]	valid_0's l1: 515102
[315]	valid_0's l1: 514981
[316]	valid_0's l1: 515000
[317]	valid_0's l1: 514928
[318]	valid_0's l1: 514755
[319]	valid_0's l1: 514732
[320]	valid_0's l1: 514819
[321]	valid_0's l1: 514757
[322]	valid_0's l1: 514690
[323]	valid_0's l1: 514659
[324]	valid_0's l1: 514677
[325]	valid_0's l1: 514687
[326]	valid_0's l1: 514590
[327]	valid_0's l1: 514575
[328]	valid_0's l1: 514554
[329]	valid_0's l1: 514570
[330]	valid_0's l1: 514575
[331]	valid_0's l1: 514485
[332]	valid_0's l1: 514419
[333]	valid_0's l1: 514450
[334]	valid_0's l1: 514402
[335]	valid_0's l1: 514336
[336]	valid_0's l1: 514385
[337]	valid_0's l1: 514314
[338]	valid_0's l1: 514253
[339]	valid_0's l1: 514124
[340]	valid_0's l1: 514061
[341]	valid_0's l1: 514087
[342]	valid_0's l1: 514100
[343]	valid_0's l1: 514058
[344]	valid_0's l1: 514069
[345]	valid_0's l1: 514102
[346]	valid_0's l1: 514078
[347]	valid_0's l1: 514050
[348]	valid_0's l1: 514020
[

In [40]:
# Predict on df_test_f with the current gbm, store the result as 'target'
# and write the id/target pairs to respuesta8.csv.
submission_inputs = utils.filtrar_features(df_test_f, features)
y_pred_eval = gbm.predict(submission_inputs, num_iteration=gbm.best_iteration)
df_test_f['target'] = y_pred_eval
df_test_f.loc[:, ['id', 'target']].to_csv('respuesta8.csv', index=False)

In [None]:
# Show the column names of x_train.
x_train.columns

In [None]:
# Show the shape of x_test.
x_test.shape

In [None]:
# Show the shape of df_test_f.
df_test_f.shape

In [None]:
# Show which columns utils.filtrar_features returns for df_test_f given `features`.
utils.filtrar_features(df_test_f, features).columns