In [2]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from hyperopt import fmin, tpe, hp
import ipynb.fs.full.utils as utils
import ipynb.fs.full.features as features
import ipynb.fs.full.features_distancias as f_distancias

df_train = pd.read_csv('./data/train_filtrado.csv')
# Para usarse con el submit a Kaggle
df_test = pd.read_csv('./data/test.csv')

df_train = features.llenar_nulls(df_train)
df_test = features.llenar_nulls(df_test, hgb_mean=True, df_fill=df_train)

# df_train, df_test = features_de_csvs(df_train, df_test)

# df_train, df_test = utils.dividir_df_testeo(df_train, test_size=0.15)

In [3]:
df_test_f = features.features_independientes_precio(df_test)
df_test_f = features.features_dependientes_precio(df_test_f, df_train)

df_train_f = features.features_independientes_precio(df_train)
df_train_f = features.features_dependientes_precio(df_train_f, df_train)

df_test_f, cols_tipodepropiedad_ohe = features.columna_a_ohe(df_test_f, 'tipodepropiedad', N=100, df_aux=df_train, devolver_cols=True)
df_test_f, cols_provincia_ohe = features.columna_a_ohe(df_test_f, 'provincia', N=100, df_aux=df_train, devolver_cols=True)
df_test_f, cols_zona_ohe = features.columna_a_ohe(df_test_f, 'zona', df_aux=df_train_f, devolver_cols=True)

df_train_f = features.columna_a_ohe(df_train_f, 'tipodepropiedad', N=100, df_aux=df_test)
df_train_f = features.columna_a_ohe(df_train_f, 'provincia', N=100, df_aux=df_test)
df_train_f = features.columna_a_ohe(df_train_f, 'zona', df_aux=df_test_f)


df_train_f['fecha'] = pd.to_datetime(df_train_f['fecha']).astype(int)
df_test_f['fecha'] = pd.to_datetime(df_test_f['fecha']).astype(int)

df_train_idf = pd.read_csv('./data/train_idf.csv')
df_test_idf = pd.read_csv('./data/test_idf.csv')

df_train_f = pd.merge(df_train_f, df_train_idf, on= 'id', how= 'left')
df_test_f = pd.merge(df_test_f, df_test_idf, on= 'id', how= 'left')

df_train_f = f_distancias.feature_distancias(df_train_f)
df_test_f = f_distancias.feature_distancias(df_test_f, df_train_f)


# df_train_f = features.KD_feature(df_train_f)
# df_test_f =  features.KD_feature(df_test_f)

## LightGBM model

In [4]:
from sklearn.model_selection import train_test_split

class LightGBMWrapper(lgb.LGBMRegressor):
    
    def fit(self, x, y):        
        return super(LightGBMWrapper, self).fit(x, y)
    
    def predict(self, X):
        return super(LightGBMWrapper, self).predict(X,num_iteration=self.best_iteration_)

hps = {'bagging_fraction': 0.5,
 'boosting_type': 'gbdt',
 'feature_fraction': 0.9,
 'learning_rate': 0.25,
 'max_depth': 10,
 'metric': 'mae',
 'n_jobs': 2,
 'num_leaves': 200,
 'objective': 'regression'}

params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mae', # Si se deja vacio se toma el ideal para llegar al 'objective'
    'num_leaves': int(hps['num_leaves']),
    'learning_rate': hps['learning_rate'],
    'feature_fraction': hps['feature_fraction'],
    'bagging_fraction': hps['bagging_fraction'],
#     'bagging_freq': int(hps['bagging_freq']),
    'max_depth': int(hps['max_depth']),
    'verbose': 0
}

lgb_m = LightGBMWrapper(**params)

## Keras model

In [5]:
from keras.wrappers.scikit_learn import KerasRegressor
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Activation

def keras_modelo():    
    model = Sequential()
    model.add(Dense(units=200, activation='selu'))
    model.add(Dropout(0.1))
    model.add(Dense(units=200, activation='selu'))
    model.add(Dropout(0.1))
    model.add(Dense(units=200, activation='selu'))
    model.add(Dropout(0.1))
    model.add(Dense(units=1, activation='linear'))

    model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_squared_error'])  
    return model

keras_m = KerasRegressor(build_fn=keras_modelo, epochs=15)

Using TensorFlow backend.


## XGBoost

In [6]:
import xgboost as xgb

class XGBoostWrapper(xgb.XGBRegressor):
    
    def fit(self, x, y):
        return super(xgb.XGBRegressor, self).fit(x, y, early_stopping_rounds=2, eval_metric='mae', eval_set=[(x, y)])
    
    def predict(self, X):
        return super(xgb.XGBRegressor, self).predict(X)


# hps = {'alpha': 9.616105489494071,
#      'colsample_bytree': 0.8500000000000001,
#      'learning_rate': 0.14,
#      'max_depth': int(16.0),
#      'n_estimators': int(450.0),
#      'test_size': 0.1,
#      'early_stopping_rounds': 5,
#      'n_jobs': 4}

hps = {'colsample_bytree': 0.9,
 'eval_metric': 'mae',
 'learning_rate': 0.1,
 'max_depth': 10,
 'n_estimators': 120,
 'n_jobs': 4,
 'objective': 'reg:squarederror',
 'scale_pos_weight': 1,
 'verbosity': 0}


# n_estimators = int(hps['n_estimators'])
# max_depth = int(hps['max_depth'])

xgb_m = XGBoostWrapper(**hps)

## RandomForest

In [7]:
from sklearn.ensemble import RandomForestRegressor

params = {'bootstrap': False,
          'max_features': 'sqrt',
          'min_samples_split': 4,
          'n_jobs': 2}

forest_m = RandomForestRegressor(verbose=1, n_estimators=100, **params)

## Stacking

In [8]:
from sklearn.preprocessing import MinMaxScaler
from vecstack import StackingTransformer

# from sklearn.preprocessing import StandardScaler
# from keras.models import Sequential
# from keras.layers import Dense, Dropout, AlphaDropout

# def normalizar_df(df, features):
#     min_max = StandardScaler()
#     df[features] = pd.DataFrame(min_max.fit_transform(df[features]), columns=features)
#     return df
    

features = ['habitaciones', 'garages','banos','antiguedad', 'metroscubiertos',  'metrostotales','lat_norm', 
           'lng_norm', 'gimnasio', 'usosmultiples', 'piscina']

features_test = ['prop_frecuente', 'top_provincia', 'promedio_precio_ciudad', 'anio', 'promedio_id_zona', 
                 'promedio_precio_tipo_propiedad', 'count_id_zona', 'count_ciudad', 'puntaje', 
               'count_tipo_propiedad_ciudad', 'promedio_precio_tipo_propiedad_ciudad_gen',
           'dias_desde_datos','meses_desde_datos','porcentaje_metros','distancia_ciudad_centrica']

# features_cat = ['provincia', 'tipodepropiedad', 'intervalo_metros_totales', 'intervalo_metros_cubiertos',
#                'zona']

features += features_test #+ features_cat

# df_train_g = utils.filtrar_features(df_train_f, features, 'precio')

# features_a_normalizar = [f for f in features if f not in features_cat + ['precio']]

# df_train_n = pd.get_dummies(df_train_g, columns=features_cat)
# df_train_n = normalizar_df(df_train_n, features_a_normalizar)

x_train, x_test, y_train, y_test = utils.dividir_dataset(df_train_f, 'precio', features, test_size=0.001)

modelos = [
           ('lightgbm', lgb_m), 
           ('randomforest', forest_m),
         #   ('keras', keras_m), 
           ('xgboost', xgb_m)
          ]

stack = StackingTransformer(modelos, regression=True, verbose=2, n_folds=7)

stack = stack.fit(x_train, y_train)

# s_train = stack.transform(x_train)
# s_test = stack.transform(x_test)

task:         [regression]
metric:       [mean_absolute_error]
variant:      [A]
n_estimators: [3]

estimator  0: [lightgbm: LightGBMWrapper]
    fold  0:  [500048.82303763]
    fold  1:  [498234.18788983]
    fold  2:  [498223.18022674]
    fold  3:  [497179.28424232]
    fold  4:  [492265.26021487]
    fold  5:  [495102.86257527]
    fold  6:  [503024.22269548]
    ----
    MEAN:     [497725.40298316] + [3184.14685319]

estimator  1: [randomforest: RandomForestRegressor]


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   22.9s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   52.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.5s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    1.2s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


    fold  0:  [472004.87398085]


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   22.5s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   48.8s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.4s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    1.0s finished


    fold  1:  [469794.05911717]


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   22.4s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   50.0s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.7s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    1.2s finished


    fold  2:  [470264.31916355]


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   22.2s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   48.9s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    1.2s finished


    fold  3:  [467056.51030406]


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   22.7s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   49.4s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.7s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    1.2s finished


    fold  4:  [463233.69109204]


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   24.3s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   50.8s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    1.2s finished


    fold  5:  [465555.70108495]


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   22.8s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   53.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    1.3s finished


    fold  6:  [474039.07696581]
    ----
    MEAN:     [468849.74738692] + [3492.50691459]

estimator  2: [xgboost: XGBoostWrapper]
[0]	validation_0-mae:2.27271e+06
Will train until validation_0-mae hasn't improved in 2 rounds.
[1]	validation_0-mae:2.04691e+06
[2]	validation_0-mae:1.84426e+06
[3]	validation_0-mae:1.6632e+06
[4]	validation_0-mae:1.50182e+06
[5]	validation_0-mae:1.35878e+06
[6]	validation_0-mae:1.23265e+06
[7]	validation_0-mae:1.12156e+06
[8]	validation_0-mae:1.02431e+06
[9]	validation_0-mae:939562
[10]	validation_0-mae:866289
[11]	validation_0-mae:802850
[12]	validation_0-mae:748454
[13]	validation_0-mae:701966
[14]	validation_0-mae:662463
[15]	validation_0-mae:628900
[16]	validation_0-mae:600523
[17]	validation_0-mae:576299
[18]	validation_0-mae:556182
[19]	validation_0-mae:538576
[20]	validation_0-mae:524085
[21]	validation_0-mae:511865
[22]	validation_0-mae:501569
[23]	validation_0-mae:492768
[24]	validation_0-mae:485297
[25]	validation_0-mae:478752
[26]	validation_0

[25]	validation_0-mae:479279
[26]	validation_0-mae:473464
[27]	validation_0-mae:468668
[28]	validation_0-mae:463909
[29]	validation_0-mae:460074
[30]	validation_0-mae:456376
[31]	validation_0-mae:452966
[32]	validation_0-mae:450146
[33]	validation_0-mae:447488
[34]	validation_0-mae:444983
[35]	validation_0-mae:442601
[36]	validation_0-mae:440547
[37]	validation_0-mae:438326
[38]	validation_0-mae:436424
[39]	validation_0-mae:434089
[40]	validation_0-mae:432262
[41]	validation_0-mae:430251
[42]	validation_0-mae:428391
[43]	validation_0-mae:426776
[44]	validation_0-mae:425518
[45]	validation_0-mae:424186
[46]	validation_0-mae:422943
[47]	validation_0-mae:421400
[48]	validation_0-mae:420114
[49]	validation_0-mae:418999
[50]	validation_0-mae:417907
[51]	validation_0-mae:416430
[52]	validation_0-mae:415674
[53]	validation_0-mae:414969
[54]	validation_0-mae:413791
[55]	validation_0-mae:413088
[56]	validation_0-mae:411802
[57]	validation_0-mae:410931
[58]	validation_0-mae:410098
[59]	validatio

[58]	validation_0-mae:410849
[59]	validation_0-mae:409567
[60]	validation_0-mae:408743
[61]	validation_0-mae:407992
[62]	validation_0-mae:407458
[63]	validation_0-mae:406228
[64]	validation_0-mae:405624
[65]	validation_0-mae:404358
[66]	validation_0-mae:403161
[67]	validation_0-mae:402412
[68]	validation_0-mae:401791
[69]	validation_0-mae:400981
[70]	validation_0-mae:400573
[71]	validation_0-mae:400121
[72]	validation_0-mae:399359
[73]	validation_0-mae:397860
[74]	validation_0-mae:397575
[75]	validation_0-mae:396533
[76]	validation_0-mae:395307
[77]	validation_0-mae:394442
[78]	validation_0-mae:393655
[79]	validation_0-mae:392784
[80]	validation_0-mae:391803
[81]	validation_0-mae:391038
[82]	validation_0-mae:390515
[83]	validation_0-mae:390127
[84]	validation_0-mae:389258
[85]	validation_0-mae:388558
[86]	validation_0-mae:387805
[87]	validation_0-mae:387084
[88]	validation_0-mae:386132
[89]	validation_0-mae:385428
[90]	validation_0-mae:384989
[91]	validation_0-mae:384459
[92]	validatio

[91]	validation_0-mae:382402
[92]	validation_0-mae:381722
[93]	validation_0-mae:381367
[94]	validation_0-mae:380993
[95]	validation_0-mae:380048
[96]	validation_0-mae:379374
[97]	validation_0-mae:378852
[98]	validation_0-mae:378305
[99]	validation_0-mae:377744
[100]	validation_0-mae:377406
[101]	validation_0-mae:376842
[102]	validation_0-mae:376253
[103]	validation_0-mae:375766
[104]	validation_0-mae:375316
[105]	validation_0-mae:374721
[106]	validation_0-mae:373836
[107]	validation_0-mae:372951
[108]	validation_0-mae:372364
[109]	validation_0-mae:371932
[110]	validation_0-mae:371325
[111]	validation_0-mae:370749
[112]	validation_0-mae:370204
[113]	validation_0-mae:369459
[114]	validation_0-mae:369171
[115]	validation_0-mae:368632
[116]	validation_0-mae:368314
[117]	validation_0-mae:367784
[118]	validation_0-mae:367345
[119]	validation_0-mae:366753
    fold  6:  [486817.99595741]
    ----
    MEAN:     [482688.62400320] + [3089.83582980]



In [9]:
s_train = stack.transform(utils.filtrar_features(df_train_f.drop('precio', axis=1), features))
s_test = stack.transform(utils.filtrar_features(df_test_f, features))

Transforming...

estimator  0: [lightgbm: LightGBMWrapper]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    model from fold  4: done
    model from fold  5: done
    model from fold  6: done
    ----
    DONE

estimator  1: [randomforest: RandomForestRegressor]


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   14.9s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   33.1s finished


    model from fold  0: done


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   17.7s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   35.5s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


    model from fold  1: done


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   16.5s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   34.9s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


    model from fold  2: done


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   19.1s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   35.3s finished


    model from fold  3: done


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   14.6s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   30.3s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


    model from fold  4: done


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   13.3s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   28.8s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


    model from fold  5: done


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   18.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   37.9s finished


    model from fold  6: done
    ----
    DONE

estimator  2: [xgboost: XGBoostWrapper]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    model from fold  4: done
    model from fold  5: done
    model from fold  6: done
    ----
    DONE

Transforming...

estimator  0: [lightgbm: LightGBMWrapper]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    model from fold  4: done
    model from fold  5: done
    model from fold  6: done
    ----
    DONE

estimator  1: [randomforest: RandomForestRegressor]


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   11.8s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   25.9s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


    model from fold  0: done


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   14.7s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   29.3s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


    model from fold  1: done


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   12.9s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   27.4s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


    model from fold  2: done


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   15.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   26.9s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


    model from fold  3: done


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   12.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   26.2s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


    model from fold  4: done


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   12.1s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   25.7s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


    model from fold  5: done


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   13.5s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   30.3s finished


    model from fold  6: done
    ----
    DONE

estimator  2: [xgboost: XGBoostWrapper]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    model from fold  4: done
    model from fold  5: done
    model from fold  6: done
    ----
    DONE



## Prediccion con todos los features + stacking

In [10]:
df_train_s = df_train_f.copy()
df_test_s = df_test_f.copy()

df_train_s['stack01'], df_train_s['stack02'], df_train_s['stack03'] = zip(*s_train)
df_test_s['stack01'], df_test_s['stack02'], df_test_s['stack03'] = zip(*s_test)

In [11]:
df_train_s['id'] = df_train['id']
df_test_s['id'] = df_test['id']

In [12]:
params_2nd = {'bagging_fraction': 0.8999882607358867,
 'bagging_freq': int(95.0),
 'feature_fraction': 0.2570109385381975,
 'learning_rate': 0.13601832720254403,
 'max_depth': int(26.0),
 'num_leaves': int(175.0),
 'test_size': 0.08363501292068126,
 'boosting_type': 'dart',
 'num_boost_round': 1500,
 'objective': 'regression',
 'metric': 'mae'}

lgb_m_2nd = LightGBMWrapper(**params_2nd)
lgb_m_2nd.fit(utils.filtrar_features(df_train_s, features + ['stack01', 'stack02', 'stack03']), df_train['precio'].values)
# hps = {'bagging_fraction': 0.8999882607358867,
#  'bagging_freq': int(95.0),
#  'feature_fraction': 0.2570109385381975,
#  'learning_rate': 0.13601832720254403,
#  'max_depth': int(26.0),
#  'num_leaves': int(175.0),
#  'test_size': 0.08363501292068126}

# params_2nd = {
#     'boosting_type': 'dart',
#     'num_boost_round': 1500,
#     'objective': 'regression',
#     'metric': 'mae', # Si se deja vacio se toma el ideal para llegar al 'objective'
#     'num_leaves': int(hps['num_leaves']),
#     'learning_rate': hps['learning_rate'],
#     'feature_fraction': hps['feature_fraction'],
#     'bagging_fraction': hps['bagging_fraction'],
#     'bagging_freq': int(hps['bagging_freq']),
#     'max_depth': int(hps['max_depth']),
#     'verbose': 0
# }

# lgb_m_2nd = LightGBMWrapper(**params_2nd)
# lgb_m_2nd.fit(utils.filtrar_features(df_train_s, features + ['stack01', 'stack02']), df_train['precio'].values)



LightGBMWrapper(bagging_fraction=0.8999882607358867, bagging_freq=95,
                boosting_type='dart', class_weight=None, colsample_bytree=1.0,
                feature_fraction=0.2570109385381975, importance_type='split',
                learning_rate=0.13601832720254403, max_depth=26, metric='mae',
                min_child_samples=20, min_child_weight=0.001,
                min_split_gain=0.0, n_estimators=100, n_jobs=-1,
                num_boost_round=1500, num_leaves=175, objective='regression',
                random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
                subsample=1.0, subsample_for_bin=200000, subsample_freq=0,
                test_size=0.08363501292068126)

In [13]:
df_test_s['target'] = lgb_m_2nd.predict(utils.filtrar_features(df_test_s, features + ['stack01', 'stack02', 'stack03']))
df_test_s[['id', 'target']].to_csv('respuesta41.csv', index = False)

In [18]:
print(utils.filtrar_features(df_test_s, features + ['stack01', 'stack02', 'stack03']).shape)
print(utils.filtrar_features(df_train_s, features + ['stack01', 'stack02', 'stack03']).shape)
print(df_train_s.shape)

(60000, 29)
(236303, 29)
(236303, 151)


## Prediccion solo con features de stacking

In [35]:
params_2nd = {'bagging_fraction': 0.8924398062087346,
 'bagging_freq': int(36.0),
 'feature_fraction': 0.16167385124183287,
 'learning_rate': 0.054693418899570134,
 'max_depth': int(4.0),
 'num_leaves': int(93.0)}
keras_mae_train = utils.MAE(y_train, lgb_m_2nd.predict(stack.transform(x_train)))
keras_mae_test = utils.MAE(y_test, lgb_m_2nd.predict(stack.transform(x_test)))
print(f"MAE Stacking (train): {keras_mae_train:.5f}")
print(f"MAE Stacking (test): {keras_mae_test:.5f}")
lgb_m_2nd = LightGBMWrapper(**params_2nd)
lgb_m_2nd.fit(stack.transform(utils.filtrar_features(df_train_f.drop('precio', axis=1), features)), df_train_f['precio'].values)

Train set was detected.
Transforming...

estimator  0: [lightgbm: LightGBMWrapper]


TypeError: super(type, obj): obj must be an instance or subtype of type

In [36]:
keras_mae_train = utils.MAE(y_train, lgb_m_2nd.predict(stack.transform(x_train)))
keras_mae_test = utils.MAE(y_test, lgb_m_2nd.predict(stack.transform(x_test)))
print(f"MAE Stacking (train): {keras_mae_train:.5f}")
print(f"MAE Stacking (test): {keras_mae_test:.5f}")

Train set was detected.
Transforming...

estimator  0: [lightgbm: LightGBMWrapper]


TypeError: super(type, obj): obj must be an instance or subtype of type

In [None]:
s_test_f = stack.transform(utils.filtrar_features(df_test_f, features))
y_pred_test_f = lgb_m_2nd.predict(s_test_f)
df_test_f['target'] = y_pred_test_f
df_test_f[['id', 'target']].to_csv('respuesta35.csv', index = False)

In [None]:
features = ['stack01', 'stack02', 'stack03']

def eval_lightgbm(args):
    num_leaves, learning_rate, feature_fraction, bagging_fraction, bagging_freq, max_depth = args

    lgb_train = lgb.Dataset(s_train, y_train)
#     lgb_eval = lgb.Dataset(s_test, y_test, reference=lgb_train)
    
    num_leaves = int(num_leaves)
    bagging_freq = int(bagging_freq)
    max_depth = int(max_depth)
    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'mae'}, # Si se deja vacio se toma el ideal para llegar al 'objective'
        'num_leaves': num_leaves,
        'learning_rate': learning_rate,
        'feature_fraction': feature_fraction,
        'bagging_fraction': bagging_fraction,
        'bagging_freq': bagging_freq,
        'max_depth': max_depth,
        'verbose': -1,
    }

    gbm = lgb.train(params,
                    lgb_train,
#                     valid_sets=lgb_eval,
                    num_boost_round=250,
#                     early_stopping_rounds=15,
                    verbose_eval=-1)
    
    y_pred_test = gbm.predict(s_test, num_iteration=gbm.best_iteration)
    return utils.MAE(y_test, y_pred_test)

space = [hp.quniform('num_leaves', 30, 130, 1), hp.uniform('learning_rate', 0.05, 0.9),
        hp.uniform('feature_fraction', 0.10, 0.90), hp.uniform('bagging_fraction', 0.10, 0.90),
        hp.quniform('bagging_freq', 1, 130, 1), hp.quniform('max_depth', 1, 20, 1)]

hps = fmin(eval_lightgbm, space=space, algo=tpe.suggest, max_evals=100, verbose=1)

display(hps)

In [None]:

# keras_mae_train = utils.MAE(y_test, lgb_m.predict(x_test_s))
# print(f"MAE Keras (train): {keras_mae_train:.5f}")