In [5]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from hyperopt import fmin, tpe, hp
import ipynb.fs.full.utils as utils
import ipynb.fs.full.features as features
import ipynb.fs.full.features_distancias as f_distancias


df_train = pd.read_csv('./data/train_filtrado.csv')
# Para usarse con el submit a Kaggle
df_eval = pd.read_csv('./data/test.csv')

df_train, df_eval = features.features_de_csvs(df_train, df_eval)

# Randoms solo para asegurarse que los features esten bien hechos
# df_train = df_train.sample(frac=1).reset_index(drop=True)
# df_train = utils.dolarizar_df(df_train)
# df_train = df_train.sample(frac=1).reset_index(drop=True)

df_train, df_test = utils.dividir_df_testeo(df_train, test_size=0.01)

df_test = features.llenar_nulls(df_test, hgb_mean=True, df_fill=df_train)
df_train = features.llenar_nulls(df_train, hgb_mean=True)


# df_train = df_train.sample(frac=1).reset_index(drop=True)
# df_test = utils.pesificar_df(df_test)
# df_train = df_train.sample(frac=1).reset_index(drop=True)


In [None]:
# df_train = df_train.sample(frac=1).reset_index(drop=True)

df_test_f = features.features_independientes_precio(df_test)
df_test_f = features.features_dependientes_precio(df_test_f, df_train)

df_train_f = features.features_independientes_precio(df_train)
df_train_f = features.features_dependientes_precio(df_train_f, df_train)

df_test_f, cols_tipodepropiedad_ohe = features.columna_a_ohe(df_test_f, 'tipodepropiedad', N=100, df_aux=df_train, devolver_cols=True)
df_test_f, cols_provincia_ohe = features.columna_a_ohe(df_test_f, 'provincia', N=100, df_aux=df_train, devolver_cols=True)
df_test_f, cols_zona_ohe = features.columna_a_ohe(df_test_f, 'zona', df_aux=df_train_f, devolver_cols=True)

df_train_f = features.columna_a_ohe(df_train_f, 'tipodepropiedad', N=100, df_aux=df_test)
df_train_f = features.columna_a_ohe(df_train_f, 'provincia', N=100, df_aux=df_test)
df_train_f = features.columna_a_ohe(df_train_f, 'zona', df_aux=df_test_f)


df_train_f['fecha'] = pd.to_datetime(df_train_f['fecha']).astype(int)
df_test_f['fecha'] = pd.to_datetime(df_test_f['fecha']).astype(int)

# df_train_f = df_train_f.sample(frac=1).reset_index(drop=True)

df_train_idf = pd.read_csv('./data/train_idf.csv')
df_test_idf = pd.read_csv('./data/test_idf.csv')

df_train_f = pd.merge(df_train_f, df_train_idf, on= 'id', how= 'left')
df_test_f = pd.merge(df_test_f, df_test_idf, on= 'id', how= 'left')

df_train_f = f_distancias.feature_distancias(df_train_f)
df_test_f = f_distancias.feature_distancias(df_test_f, df_train_f)

df_train_f = features.KD_feature(df_train_f)
df_test_f =  features.KD_feature(df_test_f)

## Búsqueda hiperparámetros

In [None]:
# features = ['habitaciones', 
#             'garages', 
#             'banos',
#             'antiguedad',
#             'metroscubiertos', 
#             'metrostotales', 
#             'lat', 'lng',
#             'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas', 'centroscomercialescercanos']

# features_test = ['prop_frecuente', 'top_provincia', 'promedio_precio_ciudad', 
#                  'anio', 'promedio_id_zona', 'promedio_precio_tipo_propiedad', 
#                  'count_id_zona', 'count_ciudad', 'puntaje', 
#                      'count_tipo_propiedad_ciudad', 
#                  'promedio_precio_tipo_propiedad_ciudad_gen',
#                  'count_id_zona'
#                  'dias_desde_datos',
#                  'meses_desde_datos',
#                  'porcentaje_metros',
#                  'promedio_precio_hbg_tipo_propiedad']

# features += features_test

# features += cols_tipodepropiedad_ohe + cols_provincia_ohe + cols_zona_ohe

features =['habitaciones', 
            'garages', 
            'banos',
            'antiguedad',
           'metroscubiertos', 
            'metrostotales',
            'lat_norm', 'lng_norm'
            'gimnasio', 'usosmultiples', 'piscina','prop_frecuente', 'top_provincia', 'promedio_precio_ciudad', 
                 'anio', 'promedio_id_zona', 'promedio_precio_tipo_propiedad', 
                 'count_id_zona', 'count_ciudad', 'puntaje', 
                     'count_tipo_propiedad_ciudad', 
                 'promedio_precio_tipo_propiedad_ciudad_gen',
                 'count_id_zona'
                 'dias_desde_datos',
                 'meses_desde_datos',
                 'porcentaje_metros',
                 'distancia_ciudad_centrica']

def eval_lightgbm(args):
    num_leaves, learning_rate, feature_fraction, bagging_fraction, bagging_freq, max_depth, test_size = args
    
    df_train_h = df_train_f.sample(frac=1).reset_index(drop=True)
    
    x_train, x_test, y_train, y_test = utils.dividir_dataset(df_train_h, 'precio', features, test_size=test_size)

    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)
    
    num_leaves = int(num_leaves)
    bagging_freq = int(bagging_freq)
    max_depth = int(max_depth)
    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'mae'}, # Si se deja vacio se toma el ideal para llegar al 'objective'
        'num_leaves': num_leaves,
        'learning_rate': learning_rate,
        'feature_fraction': feature_fraction,
        'bagging_fraction': bagging_fraction,
        'bagging_freq': bagging_freq,
        'max_depth': max_depth,
        'verbose': -1,
    }

    gbm = lgb.train(params,
                    lgb_train,
                    valid_sets=lgb_eval,
                    num_boost_round=150,
                    early_stopping_rounds=15,
                    verbose_eval=-1)
    
    y_pred_test = gbm.predict(utils.filtrar_features(df_test_f, features), num_iteration=gbm.best_iteration)
    return utils.MAE(df_test_f['precio'].values, y_pred_test)

space = [hp.quniform('num_leaves', 30, 180, 1), hp.uniform('learning_rate', 0.05, 0.9),
        hp.uniform('feature_fraction', 0.10, 0.90), hp.uniform('bagging_fraction', 0.10, 0.90),
        hp.quniform('bagging_freq', 1, 130, 1), hp.quniform('max_depth', 1, 30, 1),
        hp.uniform('test_size', 0.05, 0.25)]

hps = fmin(eval_lightgbm, space=space, algo=tpe.suggest, max_evals=200, verbose=1)

display(hps)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

features = ['habitaciones', 'garages','banos','antiguedad', 'metroscubiertos',  'metrostotales','lat_norm', 
           'lng_norm', 'gimnasio', 'usosmultiples', 'piscina']

features_test = ['top_provincia', 'promedio_precio_ciudad', 'anio', 'promedio_id_zona', 
                 'promedio_precio_tipo_propiedad', 'count_id_zona', 'count_ciudad', 'puntaje', 
               'count_tipo_propiedad_ciudad', 'promedio_precio_tipo_propiedad_ciudad_gen','count_id_zona'
           'dias_desde_datos','meses_desde_datos','porcentaje_metros','distancia_ciudad_centrica', 'puntaje', 'distancia_centro_mexico']

features += features_test

lgb_params = {
    'bagging_fraction': np.arange(0.1, 0.5, 0.2),
    'bagging_frequency': np.arange(1, 20, 5),
    'feature_fraction': np.arange(0.5, 1, 0.1),
    'max_depth': np.arange(5, 40, 10),
    'min_data_in_leaf': np.arange(30, 90, 30),
    'num_leaves': np.arange(30, 430, 100),
    'learning_rate': np.arange(0.1, 1, 0.2)
}


x_train, x_test, y_train, y_test = utils.dividir_dataset(df_train_f, 'precio', features, test_size=1)

rs_cv = RandomizedSearchCV(estimator=lgb.LGBMRegressor(), 
                           param_distributions=lgb_params, 
                           cv=4, 
                           n_iter=400,
                           verbose=1)

rs_cv.fit(x_train, y_train, verbose=1)


## Evaluación features

In [None]:
#features = ['antiguedad', 'habitaciones', 'garages', 'banos',
#       'metroscubiertos', 'metrostotales', 
#            'lat', 'lng',
#       'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas', 'centroscomercialescercanos']

#features_test = ['prop_frecuente', 'top_provincia', 'porcentaje_metros', 'diferencia_metros', 'promedio_precio_ciudad', 
#                 'promedio_por_mes', 'anio', 'promedio_id_zona', 'promedio_precio_tipo_propiedad', 
#                 'promedio_precio_hbg_tipo_propiedad', 'count_id_zona', 'count_ciudad', 'puntaje', 
#                 'count_tipo_propiedad', 'count_tipo_propiedad_ciudad', 
                 'promedio_precio_tipo_propiedad_ciudad_gen', 'promedio_precio_hbg_tipo_propiedad_provincia',
                 'varianza_id_zona', 'promedio_id_zona_log', 'tam_ambientes', 'metros_cubiertos_normalizados', 
                 'dias_desde_datos', 'meses_desde_datos']

features += features_test

features += cols_tipodepropiedad_ohe + cols_provincia_ohe + cols_zona_ohe

features_remove = [['lat', 'lng'], ['antiguedad'], ['habitaciones'], ['garages'], ['banos'], ['metroscubiertos'], ['metrostotales'],
                   ['prop_frecuente'], ['top_provincia'], ['porcentaje_metros'], ['diferencia_metros'], 
                   ['promedio_precio_ciudad'],  ['promedio_por_mes'], ['anio'], ['promedio_id_zona'], ['promedio_id_zona_log'],
                   ['promedio_id_zona', 'promedio_id_zona_log'], ['promedio_precio_tipo_propiedad'],  
                   ['promedio_precio_hbg_tipo_propiedad'], ['count_id_zona'], ['count_ciudad'], 
                   ['puntaje'],  ['count_tipo_propiedad'], ['count_tipo_propiedad_ciudad'],  
                   ['promedio_precio_tipo_propiedad_ciudad_gen'], ['promedio_precio_hbg_tipo_propiedad_provincia'], 
                   ['varianza_id_zona'], ['tam_ambientes'], ['metros_cubiertos_normalizados'], ['dias_desde_datos'], 
                   ['meses_desde_datos']]

hps = {'bagging_fraction': 0.806451877022587,
 'bagging_freq': 62.0,
 'feature_fraction': 0.5379925983440028,
 'learning_rate': 0.1363027714646826,
 'max_depth': 11.0,
 'num_leaves': 113.0,
 'test_size': 0.09575190901892519}

bagging_fraction = hps['bagging_fraction']
bagging_freq = int(hps['bagging_freq'])
feature_fraction = hps['feature_fraction']
learning_rate = hps['learning_rate']
num_leaves = int(hps['num_leaves'])
max_depth = int(hps['max_depth'])
test_size = hps['test_size']

params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'mae'}, # Si se deja vacio se toma el ideal para llegar al 'objective'
    'num_leaves': num_leaves,
    'learning_rate': learning_rate,
    'feature_fraction': feature_fraction,
    'bagging_fraction': bagging_fraction,
    'bagging_freq': bagging_freq,
    'max_depth': max_depth,
    'verbose': 0
}


base_train = 0
base_test = 0
base_eval = 0
for i in [['None']] + features_remove:
    
    features_new = [f for f in features if f not in i]
    if len(features_new) == len(features) and i != ['None']:
        print(f'{i} no encontrado')
        conitnue
    
    x_train, x_test, y_train, y_test = utils.dividir_dataset(df_train_f, 'precio', features_new, test_size=test_size)

    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)

    gbm = lgb.train(params,
                    lgb_train,
                    valid_sets=lgb_eval,
                    num_boost_round=1000,
                    early_stopping_rounds=15,
                    verbose_eval=-1)

    y_pred_test = gbm.predict(x_test, raw_score=True, num_iteration=gbm.best_iteration)
    y_pred_train = gbm.predict(x_train, raw_score=True, num_iteration=gbm.best_iteration)
    y_pred_eval = gbm.predict(utils.filtrar_features(df_test_f, features_new), num_iteration=gbm.best_iteration)

    gbm_mae_train = utils.MAE(y_train, y_pred_train)
    gbm_mae = utils.MAE(y_test, y_pred_test)
    gbm_mae_eval = utils.MAE(df_test_f['precio'].values, y_pred_eval)

    print(f"MAE LightGBM (train): {gbm_mae_train:.5f}")
    print(f"MAE LightGBM (test): {gbm_mae:.5f}")
    print(f"MAE LightGBM (eval): {gbm_mae_eval:.5f}")
    if i != ['None']:
        print(f"Overfitting (base_eval - base_test) - (eval - test) - {i}: {(base_eval - base_test) - (gbm_mae_eval - gbm_mae)}")
        print(f"Diff evaluation (base_eval - eval)                  - {i}: {base_eval - gbm_mae_eval}")
        print(f"Diff train (base_train - train)                     - {i}: {base_train - gbm_mae_train}")
    else:
        base_train = gbm_mae_train
        base_test = gbm_mae
        base_eval = gbm_mae_eval

## Evaluación modelo final

In [None]:
features = ['habitaciones', 'garages','banos','antiguedad', 'metroscubiertos',  'metrostotales','lat_norm', 
           'lng_norm', 'gimnasio', 'usosmultiples', 'piscina']

features_test = ['top_provincia', 'promedio_precio_ciudad', 'anio', 'promedio_id_zona', 
                 'promedio_precio_tipo_propiedad', 'count_id_zona', 'count_ciudad', 'puntaje', 
               'count_tipo_propiedad_ciudad', 'promedio_precio_tipo_propiedad_ciudad_gen','count_id_zona'
           'dias_desde_datos','meses_desde_datos','porcentaje_metros','distancia_ciudad_centrica', 
                'distancia_centro_mexico', 'distancia_ciudad_cara', 'promedio_precio_hbg_tipo_propiedad_provincia']

features += features_test

# # features += cols_tipodepropiedad_ohe + cols_provincia_ohe + cols_zona_ohe

# features = ['antiguedad', 'metroscubiertos', 'metrostotales', 'lat', 'lng',
#        'promedio_metros_tipo_propiedad', 'dias_desde_datos', 'tam_ambientes',
#        'promedio_id_zona', 'promedio_id_zona_gen', 'varianza_id_zona',
#        'count_id_zona', 'promedio_precio_tipo_propiedad_ciudad',
#        'count_tipo_propiedad_ciudad', 'varianza_por_mes',
#        'promedio_precio_habitaciones_banos_garages',
#        'promedio_precio_hbg_tipo_propiedad',
#        'promedio_precio_hbg_tipo_propiedad_provincia', 'puntaje', 'idf_titulo',
#        'idf_descripcion', 'distancia_ciudad_centrica',
#        'distancia_centro_mexico', 'distancia_ciudad_cara', 'similares_count']
# # hps = {'bagging_fraction': 0.8988911725316586,
#  'bagging_freq': 22.0,
#  'feature_fraction': 0.6622442122619671,
#  'learning_rate': 0.16422725363286422,
#  'max_depth': 22.0,
#  'num_leaves': 180.0,
#  'test_size': 0.20892455926004772}

hps = {'bagging_fraction': 0.8667885775824707,
 'bagging_freq': 72.0,
 'feature_fraction': 0.5369072488159948,
 'learning_rate': 0.13480325449634387,
 'max_depth': 15.0,
 'num_leaves': 174.0,
 'test_size': 0.1050550407163082}


bagging_fraction = hps['bagging_fraction']
bagging_freq = int(hps['bagging_freq'])
feature_fraction = hps['feature_fraction']
learning_rate = hps['learning_rate']
num_leaves = int(hps['num_leaves'])
max_depth = int(hps['max_depth'])
test_size = hps['test_size']

params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'mae'}, # Si se deja vacio se toma el ideal para llegar al 'objective'
    'num_leaves': num_leaves,
    'learning_rate': learning_rate,
    'feature_fraction': feature_fraction,
    'bagging_fraction': bagging_fraction,
    'bagging_freq': bagging_freq,
    'max_depth': max_depth,
    'verbose': 0
}

x_train, x_test, y_train, y_test = utils.dividir_dataset(df_train_f, 'precio', features, test_size=test_size)

lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)

gbm = lgb.train(params,
                lgb_train,
                valid_sets=lgb_eval,
                num_boost_round=1000,
                early_stopping_rounds=25,
                verbose_eval=1)

y_pred_test = gbm.predict(x_test, num_iteration=gbm.best_iteration, plot=True)
y_pred_train = gbm.predict(x_train, num_iteration=gbm.best_iteration, plot=True)

y_pred_eval = gbm.predict(utils.filtrar_features(df_test_f, features), num_iteration=gbm.best_iteration)

df_test_f['target'] = y_pred_eval
# df_test_f = utils.pesificar_df(df_test_f, 'target', 'target')

gbm_mae_train = utils.MAE(y_train, y_pred_train)
gbm_mae = utils.MAE(y_test, y_pred_test)
gbm_mae_eval = utils.MAE(df_test_f['precio'].values, df_test_f['target'].values)

print(f"MAE LightGBM (train): {gbm_mae_train:.5f}")
print(f"MAE LightGBM (test): {gbm_mae:.5f}")
print(f"MAE LightGBM (eval): {gbm_mae_eval:.5f}")

In [None]:
import shap 
# shap.initjs()

df_test_shap = utils.filtrar_features(df_test_f, features)

explainer = shap.TreeExplainer(gbm)
shap_values = explainer.shap_values(df_test_shap)

In [None]:
# shap.force_plot(explainer.expected_value, shap_values, df_test_shap)

In [None]:
shap.summary_plot(shap_values, df_test_shap)

## Evaluación df_test completo

In [17]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from hyperopt import fmin, tpe, hp
import ipynb.fs.full.utils as utils
import ipynb.fs.full.features as features
import ipynb.fs.full.features_distancias as f_distancias

df_train = pd.read_csv('./data/train_filtrado.csv',)
# Para usarse con el submit a Kaggle
df_test = pd.read_csv('./data/test.csv')

df_test = features.llenar_nulls(df_test, hgb_mean=True, df_fill=df_train)
df_train = features.llenar_nulls(df_train, hgb_mean=True)

In [18]:
df_test_f = features.features_independientes_precio(df_test)
df_test_f = features.features_dependientes_precio(df_test_f, df_train)

df_train_f = features.features_independientes_precio(df_train)
df_train_f = features.features_dependientes_precio(df_train_f, df_train)

df_test_f, cols_tipodepropiedad_ohe = features.columna_a_ohe(df_test_f, 'tipodepropiedad', N=250, df_aux=df_train, devolver_cols=True)
df_test_f, cols_provincia_ohe = features.columna_a_ohe(df_test_f, 'provincia', N=250, df_aux=df_train, devolver_cols=True)
df_test_f, cols_zona_ohe = features.columna_a_ohe(df_test_f, 'zona', N=250, df_aux=df_train_f, devolver_cols=True)

df_train_f = features.columna_a_ohe(df_train_f, 'tipodepropiedad', N=250, df_aux=df_test)
df_train_f = features.columna_a_ohe(df_train_f, 'provincia', N=250, df_aux=df_test)
df_train_f = features.columna_a_ohe(df_train_f, 'zona', N=250, df_aux=df_test_f)


df_train_idf = pd.read_csv('./data/train_idf.csv')
df_test_idf = pd.read_csv('./data/test_idf.csv')

df_train_f = pd.merge(df_train_f, df_train_idf, on= 'id', how= 'left')
df_test_f = pd.merge(df_test_f, df_test_idf, on= 'id', how= 'left')

df_train_f = f_distancias.feature_distancias(df_train_f)
df_test_f = f_distancias.feature_distancias(df_test_f, df_train_f)

# df_train_f['fecha'] = pd.to_datetime(df_train_f['fecha']).astype(int)
# df_test_f['fecha'] = pd.to_datetime(df_test_f['fecha']).astype(int)

features = ['habitaciones', 'garages','banos','antiguedad', 'metroscubiertos',  'metrostotales','lat_norm', 
           'lng_norm', 'gimnasio', 'usosmultiples', 'piscina']

features_test = ['top_provincia', 'promedio_precio_ciudad', 'anio', 'promedio_id_zona', 
                 'promedio_precio_tipo_propiedad', 'count_id_zona', 'count_ciudad', 'puntaje', 
               'count_tipo_propiedad_ciudad', 'promedio_precio_tipo_propiedad_ciudad_gen','count_id_zona'
           'dias_desde_datos','meses_desde_datos','porcentaje_metros','distancia_ciudad_centrica', 
                'distancia_centro_mexico', 'distancia_ciudad_cara', 'promedio_precio_hbg_tipo_propiedad_provincia']

features += features_test

# hps = {'bagging_fraction': 0.806451877022587,
#  'bagging_freq': 62.0,
#  'feature_fraction': 0.5379925983440028,
#  'learning_rate': 0.1363027714646826,
#  'max_depth': 11.0,
#  'num_leaves': 113.0,
#  'test_size': 0.09575190901892519}


hps = {'bagging_fraction': 0.5,
 'feature_fraction': 0.9,
 'learning_rate': 0.25,
 'max_depth': 10,
 'n_jobs': 2,
 'num_leaves': 200
      }

bagging_fraction = hps['bagging_fraction']
feature_fraction = hps['feature_fraction']
learning_rate = hps['learning_rate']
num_leaves = int(hps['num_leaves'])
max_depth = int(hps['max_depth'])

params = {
    'boosting_type': 'dart',
    'objective': 'regression',
    'metric': {'mae'}, # Si se deja vacio se toma el ideal para llegar al 'objective'
    'num_leaves': num_leaves,
    'learning_rate': learning_rate,
    'feature_fraction': feature_fraction,
    'bagging_fraction': bagging_fraction,
#    'bagging_freq': bagging_freq,
    'max_depth': max_depth,
    'verbose': 0
}

x_train, x_test, y_train, y_test = utils.dividir_dataset(df_train_f, 'precio', features,0.15)

lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)


gbm = lgb.train(params,
                lgb_train,
                valid_sets=lgb_eval,
                num_boost_round=1000,
                early_stopping_rounds=15,
                verbose_eval=1)

y_pred_test = gbm.predict(x_test, raw_score=True, num_iteration=gbm.best_iteration)
y_pred_train = gbm.predict(x_train, raw_score=True, num_iteration=gbm.best_iteration)
y_pred_eval = gbm.predict(utils.filtrar_features(df_test_f, features), num_iteration=gbm.best_iteration)

gbm_mae_train = utils.MAE(y_train, y_pred_train)
gbm_mae = utils.MAE(y_test, y_pred_test)

print(f"MAE LightGBM (train): {gbm_mae_train:.5f}")
print(f"MAE LightGBM (test): {gbm_mae:.5f}")

[1]	valid_0's l1: 1.28602e+06
[2]	valid_0's l1: 1.05421e+06
[3]	valid_0's l1: 893167
[4]	valid_0's l1: 784396




[5]	valid_0's l1: 706998
[6]	valid_0's l1: 652841
[7]	valid_0's l1: 615765
[8]	valid_0's l1: 650449
[9]	valid_0's l1: 613703
[10]	valid_0's l1: 587720
[11]	valid_0's l1: 568986
[12]	valid_0's l1: 580126
[13]	valid_0's l1: 563601
[14]	valid_0's l1: 551713
[15]	valid_0's l1: 542856
[16]	valid_0's l1: 535826
[17]	valid_0's l1: 530570
[18]	valid_0's l1: 526483
[19]	valid_0's l1: 522248
[20]	valid_0's l1: 520376
[21]	valid_0's l1: 522608
[22]	valid_0's l1: 519561
[23]	valid_0's l1: 516963
[24]	valid_0's l1: 515178
[25]	valid_0's l1: 513535
[26]	valid_0's l1: 512190
[27]	valid_0's l1: 510917
[28]	valid_0's l1: 511653
[29]	valid_0's l1: 509910
[30]	valid_0's l1: 508001
[31]	valid_0's l1: 508440
[32]	valid_0's l1: 507162
[33]	valid_0's l1: 506217
[34]	valid_0's l1: 504922
[35]	valid_0's l1: 505550
[36]	valid_0's l1: 508177
[37]	valid_0's l1: 506251
[38]	valid_0's l1: 504164
[39]	valid_0's l1: 503143
[40]	valid_0's l1: 504187
[41]	valid_0's l1: 506309
[42]	valid_0's l1: 504166
[43]	valid_0's l1

[314]	valid_0's l1: 471367
[315]	valid_0's l1: 471397
[316]	valid_0's l1: 471438
[317]	valid_0's l1: 471468
[318]	valid_0's l1: 471517
[319]	valid_0's l1: 471378
[320]	valid_0's l1: 471471
[321]	valid_0's l1: 471437
[322]	valid_0's l1: 471411
[323]	valid_0's l1: 471368
[324]	valid_0's l1: 471256
[325]	valid_0's l1: 471278
[326]	valid_0's l1: 471164
[327]	valid_0's l1: 471264
[328]	valid_0's l1: 471161
[329]	valid_0's l1: 471242
[330]	valid_0's l1: 471137
[331]	valid_0's l1: 471075
[332]	valid_0's l1: 470777
[333]	valid_0's l1: 470819
[334]	valid_0's l1: 470939
[335]	valid_0's l1: 470629
[336]	valid_0's l1: 470700
[337]	valid_0's l1: 470751
[338]	valid_0's l1: 470757
[339]	valid_0's l1: 470665
[340]	valid_0's l1: 470703
[341]	valid_0's l1: 470707
[342]	valid_0's l1: 470774
[343]	valid_0's l1: 470792
[344]	valid_0's l1: 470714
[345]	valid_0's l1: 470633
[346]	valid_0's l1: 470648
[347]	valid_0's l1: 470563
[348]	valid_0's l1: 470674
[349]	valid_0's l1: 470725
[350]	valid_0's l1: 470653
[

[621]	valid_0's l1: 463657
[622]	valid_0's l1: 463187
[623]	valid_0's l1: 463374
[624]	valid_0's l1: 463323
[625]	valid_0's l1: 463373
[626]	valid_0's l1: 463421
[627]	valid_0's l1: 463366
[628]	valid_0's l1: 463400
[629]	valid_0's l1: 463331
[630]	valid_0's l1: 463256
[631]	valid_0's l1: 463291
[632]	valid_0's l1: 463302
[633]	valid_0's l1: 463231
[634]	valid_0's l1: 463183
[635]	valid_0's l1: 463292
[636]	valid_0's l1: 463369
[637]	valid_0's l1: 463402
[638]	valid_0's l1: 463333
[639]	valid_0's l1: 463273
[640]	valid_0's l1: 463309
[641]	valid_0's l1: 463246
[642]	valid_0's l1: 463208
[643]	valid_0's l1: 463226
[644]	valid_0's l1: 463114
[645]	valid_0's l1: 463207
[646]	valid_0's l1: 463232
[647]	valid_0's l1: 463215
[648]	valid_0's l1: 463316
[649]	valid_0's l1: 462808
[650]	valid_0's l1: 462773
[651]	valid_0's l1: 462770
[652]	valid_0's l1: 462677
[653]	valid_0's l1: 462640
[654]	valid_0's l1: 462632
[655]	valid_0's l1: 462800
[656]	valid_0's l1: 462896
[657]	valid_0's l1: 462873
[

[926]	valid_0's l1: 460231
[927]	valid_0's l1: 460192
[928]	valid_0's l1: 460177
[929]	valid_0's l1: 460233
[930]	valid_0's l1: 460209
[931]	valid_0's l1: 460156
[932]	valid_0's l1: 460116
[933]	valid_0's l1: 460194
[934]	valid_0's l1: 460252
[935]	valid_0's l1: 460299
[936]	valid_0's l1: 460214
[937]	valid_0's l1: 460162
[938]	valid_0's l1: 460253
[939]	valid_0's l1: 460238
[940]	valid_0's l1: 460228
[941]	valid_0's l1: 459854
[942]	valid_0's l1: 459961
[943]	valid_0's l1: 459920
[944]	valid_0's l1: 460073
[945]	valid_0's l1: 459999
[946]	valid_0's l1: 459931
[947]	valid_0's l1: 459994
[948]	valid_0's l1: 460056
[949]	valid_0's l1: 460037
[950]	valid_0's l1: 460109
[951]	valid_0's l1: 460186
[952]	valid_0's l1: 460101
[953]	valid_0's l1: 459754
[954]	valid_0's l1: 459723
[955]	valid_0's l1: 459897
[956]	valid_0's l1: 459957
[957]	valid_0's l1: 460014
[958]	valid_0's l1: 459948
[959]	valid_0's l1: 460048
[960]	valid_0's l1: 460076
[961]	valid_0's l1: 460034
[962]	valid_0's l1: 460096
[

In [24]:
#y_pred_eval = gbm.predict(utils.filtrar_features(df_train_f, features), num_iteration=gbm.best_iteration)
df_train_f['target'] = y_pred_train

# df_test_f = utils.pesificar_df(df_test_f, 'target', 'target')
df_train_f[['id', 'target']].to_csv('respuesta38c.csv', index=False)

ValueError: Length of values does not match length of index

In [30]:
len(y_pred_eval)

236303

In [None]:
df_test_f.shape