In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from hyperopt import fmin, tpe, hp
import ipynb.fs.full.utils as utils
import ipynb.fs.full.features as features
import ipynb.fs.full.features_distancias as f_distancias

df_train = pd.read_csv('./data/train_filtrado.csv')
# df_train = utils.dolarizar_df(df_train)
# Para usarse con el submit a Kaggle
df_test = pd.read_csv('./data/test.csv')

df_train = features.llenar_nulls(df_train)
df_test = features.llenar_nulls(df_test, hgb_mean=True, df_fill=df_train)

df_train_cluster = pd.read_csv('./data/clustering_train.csv').rename(columns={'label': 'clustering_label'})
df_test_cluster = pd.read_csv('./data/clustering_test.csv').rename(columns={'label': 'clustering_label'})

df_train = pd.merge(df_train, df_train_cluster, on='id')
df_test = pd.merge(df_test, df_test_cluster, on='id')

df_train_idf = pd.read_csv('./data/train_idf.csv')
df_test_idf = pd.read_csv('./data/test_idf.csv')

df_train = pd.merge(df_train, df_train_idf, on= 'id', how= 'left')
df_test = pd.merge(df_test, df_test_idf, on= 'id', how= 'left')

df_train, df_test = utils.dividir_df_testeo(df_train, test_size=0.3)

# df_train, df_test = features_de_csvs(df_train, df_test)

In [2]:
# df_train, df_test = utils.dividir_df_testeo(df_train, test_size=0.20)

In [3]:
df_test_f = features.features_independientes_precio(df_test)
df_test_f = features.features_dependientes_precio(df_test_f, df_train)

df_train_f = features.features_independientes_precio(df_train)
df_train_f = features.features_dependientes_precio(df_train_f, df_train)

df_test_f, cols_tipodepropiedad_ohe = features.columna_a_ohe(df_test_f, 'tipodepropiedad', N=100, df_aux=df_train, devolver_cols=True)
df_test_f, cols_provincia_ohe = features.columna_a_ohe(df_test_f, 'provincia', N=100, df_aux=df_train, devolver_cols=True)
df_test_f, cols_zona_ohe = features.columna_a_ohe(df_test_f, 'zona', df_aux=df_train_f, devolver_cols=True)

df_train_f = features.columna_a_ohe(df_train_f, 'tipodepropiedad', N=100, df_aux=df_test)
df_train_f = features.columna_a_ohe(df_train_f, 'provincia', N=100, df_aux=df_test)
df_train_f = features.columna_a_ohe(df_train_f, 'zona', df_aux=df_test_f)


df_train_f['fecha'] = pd.to_datetime(df_train_f['fecha']).astype(int)
df_test_f['fecha'] = pd.to_datetime(df_test_f['fecha']).astype(int)


df_train_f = f_distancias.feature_distancias(df_train_f)
df_test_f = f_distancias.feature_distancias(df_test_f, df_train_f)


# df_train_f = features.KD_feature(df_train_f)
# df_test_f =  features.KD_feature(df_test_f)

## Selector de Features

In [4]:
from sklearn.base import BaseEstimator

class FeatureSelector(BaseEstimator):
    
    base_features = list(df_train_f.drop('precio', axis=1, errors='ignore').columns)
    
    def __init__(self, features):
        base_features = FeatureSelector.base_features

        self.features = features
        self.features_index = [i for i in range(len(base_features)) if base_features[i] in features]
        
    def fit(self, x, y=None):
        return self

    def transform(self, x):
        return x[:, self.features_index]

## LightGBM model

In [5]:
from sklearn.pipeline import Pipeline

class LightGBMWrapper(lgb.LGBMRegressor):
    
    def fit(self, x, y):
        return super(LightGBMWrapper, self).fit(x, y)
    
    def predict(self, X):
        return super(LightGBMWrapper, self).predict(X)

params = {'boosting_type': 'gbdt',
 'feature_fraction': 0.8,
 'learning_rate': 0.25,
 'max_bin': 255,
 'max_depth': 15,
 'metric': 'mae',
 'min_data_in_leaf': 40,
 'min_split_gain': 0.7,
 'n_jobs': 4,
 'num_leaves': 300,
 'objective': 'regression',
 'reg_lambda': 10,
 'verbose': 0}

lgb_m = Pipeline(steps=[
    ('feature_selector', FeatureSelector(['antiguedad', 'habitaciones', 'garages', 'banos', 'metroscubiertos',
       'metrostotales', 'lat', 'lng', 'fecha', 'piscina',
       'clustering_label', 'idf_descripcion',
       'porcentaje_metros', 'diferencia_metros', 'promedio_metros_tipo_propiedad', 
       'prop_frecuente', 'top_provincia', 'es_ciudad_centrica',
       'mes', 'trimestre', 'dias_desde_datos',
       'tam_ambientes', 'promedio_precio_provincia',
       'promedio_precio_ciudad', 'count_ciudad',
       'promedio_id_zona', 'count_id_zona', 'promedio_precio_tipo_propiedad',
       'promedio_precio_tipo_propiedad_ciudad', 'count_tipo_propiedad',
       'count_tipo_propiedad_ciudad', 'promedio_por_mes',
       'promedio_precio_habitaciones_banos_garages',
       'promedio_precio_hbg_tipo_propiedad', 'puntaje',
       'distancia_ciudad_centrica', 'distancia_centro_mexico'])),
    ('lightgbm', LightGBMWrapper(**params))
])

## Keras model

In [6]:
from keras.wrappers.scikit_learn import KerasRegressor
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Activation

def keras_modelo():    
    model = Sequential()
    model.add(Dense(units=200, activation='selu'))
    model.add(Dropout(0.1))
    model.add(Dense(units=200, activation='selu'))
    model.add(Dropout(0.1))
    model.add(Dense(units=200, activation='selu'))
    model.add(Dropout(0.1))
    model.add(Dense(units=1, activation='linear'))

    model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_squared_error'])  
    return model

keras_m = KerasRegressor(build_fn=keras_modelo, epochs=15)

Using TensorFlow backend.


## XGBoost

In [7]:
import xgboost as xgb

class XGBoostWrapper(xgb.XGBRegressor):
    
    def fit(self, x, y):
        return super(xgb.XGBRegressor, self).fit(x, y)
    
    def predict(self, X):
        return super(xgb.XGBRegressor, self).predict(X)

hps = {'verbosity': 0,
'subsample': 0.9,
'scale_pos_weight': 2,
'reg_alpha': 4,
'objective': 'reg:squarederror',
'n_jobs': 4,
'n_estimators': 75,
'max_depth': 15,
'learning_rate': 0.1,
'eval_metric': 'mae',
'colsample_bytree': 0.7}

xgb_m = Pipeline([
    ('feature_selector', FeatureSelector(['antiguedad', 'habitaciones', 'garages', 'banos', 'metroscubiertos',
       'metrostotales', 'lat', 'lng', 'gimnasio', 'usosmultiples',
       'piscina', 'clustering_label', 'idf_descripcion', 'porcentaje_metros', 'diferencia_metros',
       'promedio_metros_tipo_propiedad', 'prop_frecuente', 'top_provincia',
       'promedio_metros_totales_provincia', 'anio', 'mes',
       'trimestre', 'dias_desde_datos', 'cantidad_inquilinos', 'tam_ambientes', 'promedio_precio_provincia',
       'promedio_precio_ciudad', 'count_ciudad', 'promedio_id_zona',
       'count_id_zona','promedio_precio_tipo_propiedad',
       'promedio_precio_tipo_propiedad_ciudad', 'count_tipo_propiedad',
       'count_tipo_propiedad_ciudad', 'promedio_por_mes', 
       'promedio_precio_banos_garages', 'promedio_precio_hbg_tipo_propiedad',
       'promedio_precio_booleanos', 'puntaje',
       'distancia_ciudad_centrica', 'distancia_centro_mexico'])),
    ('xgboost', XGBoostWrapper(**hps))
])

## RandomForest

In [8]:
from sklearn.ensemble import RandomForestRegressor

params = {'bootstrap': False,
          'max_features': 'sqrt',
          'min_samples_split': 4,
          'n_jobs': 2,
          'n_estimators': 100}

forest_m = Pipeline([
    ('feature_selector', FeatureSelector(['antiguedad', 'garages', 'banos', 'metroscubiertos', 'metrostotales',
       'lat', 'lng', 'fecha', 'idf_descripcion', 'porcentaje_metros', 'diferencia_metros',
       'metroscubiertos_bins_unif', 'metroscubiertos_bins_perc',
       'metros_totales_normalizados', 'metros_cubiertos_normalizados',
       'promedio_metros_tipo_propiedad',
       'promedio_metros_cubiertos_provincia', 'mes', 'dias_desde_datos', 'tam_ambientes',
       'promedio_precio_provincia', 'promedio_precio_ciudad', 'count_ciudad',
       'promedio_id_zona', 'count_id_zona', 'promedio_precio_tipo_propiedad',
       'promedio_precio_tipo_propiedad_ciudad',
       'count_tipo_propiedad_ciudad', 'promedio_por_mes', 'promedio_precio_hbg_tipo_propiedad',
       'puntaje', 'distancia_ciudad_centrica',
       'distancia_centro_mexico'])),
    ('random_forest', RandomForestRegressor(**params))
])

## ExtraTrees

In [9]:
from sklearn.ensemble import ExtraTreesRegressor

params = {'criterion': 'mse',
 'max_features': 'sqrt',
 'min_samples_split': 4}

extratrees_m =  Pipeline([
    ('feature_selector', FeatureSelector(['antiguedad', 'garages', 'banos', 'metroscubiertos', 'metrostotales',
       'lat', 'lng', 'fecha', 'idf_titulo', 'idf_descripcion',
       'peso_descripcion', 'porcentaje_metros', 'diferencia_metros',
       'metroscubiertos_bins_unif', 'metroscubiertos_bins_perc',
       'metros_totales_normalizados', 'metros_cubiertos_normalizados',
       'promedio_metros_tipo_propiedad', 'promedio_metros_cub_tipo_propiedad',
       'promedio_metros_cubiertos_provincia', 'mes', 'dia', 'dias_desde_datos',
       'meses_desde_datos', 'antiguedad_bins_perc', 'tam_ambientes',
       'promedio_precio_provincia', 'promedio_precio_ciudad',
       'promedio_precio_ciudad_gen', 'varianza_precio_ciudad', 'count_ciudad',
       'promedio_id_zona', 'promedio_id_zona_gen', 'varianza_id_zona',
       'count_id_zona', 'promedio_precio_tipo_propiedad',
       'promedio_precio_tipo_propiedad_ciudad',
       'promedio_precio_tipo_propiedad_ciudad_gen',
       'count_tipo_propiedad_ciudad', 'promedio_por_mes', 'varianza_por_mes',
       'promedio_precio_habitaciones_banos_garages',
       'promedio_precio_banos_garages', 'promedio_precio_hbg_tipo_propiedad',
       'lat_norm', 'lng_norm', 'puntaje', 'distancia_ciudad_centrica',
       'distancia_centro_mexico', 'distancia_ciudad_cara'])),
    ('extra_trees', ExtraTreesRegressor(n_estimators=50, **params))
])

## CatBoost

In [10]:
from catboost import CatBoostRegressor


class CatBoostWrapper(CatBoostRegressor):
    
    def fit(self, x, y):
        # posiciones de features categoricas encontradas a mano
        cat_features = [0, 1, 2]
        return super(CatBoostWrapper, self).fit(x, y, cat_features=cat_features)


f_s = FeatureSelector(['antiguedad', 'garages', 'banos', 'metroscubiertos', 'metrostotales',
       'lat', 'lng', 'fecha', 'idf_descripcion', 'porcentaje_metros', 'diferencia_metros',
       'promedio_metros_tipo_propiedad', 'promedio_metros_cubiertos_provincia', 'mes', 
       'dias_desde_datos', 'tam_ambientes',
       'promedio_precio_provincia', 'promedio_precio_ciudad',
       'count_ciudad', 'promedio_id_zona',
       'count_id_zona', 'promedio_precio_tipo_propiedad',
       'promedio_precio_tipo_propiedad_ciudad',
       'count_tipo_propiedad_ciudad', 'promedio_por_mes', 'promedio_precio_hbg_tipo_propiedad',
       'puntaje', 'distancia_ciudad_centrica',
       'distancia_centro_mexico', 'provincia', 'ciudad', 'tipodepropiedad'])

# params = {'od_wait': 50,
#  'od_type': 'IncToDec',
#  'learning_rate': 0.2,
#  'l2_leaf_reg': 0,
#  'depth': 10,
#  'iterations': 300,
#  'silent': True,
#  'eval_metric': 'MAE',
# }

params = {
    'od_wait': 50,
    'od_type': 'Iter',
    'learning_rate': 0.15,
    'l2_leaf_reg': 3,
    'depth': 12,
    'colsample_bylevel': 0.5,
    'border_count': 128,
    'iterations': 300,
    'silent': True,
    'eval_metric': 'MAE'
}


catboost_m = Pipeline([
    ('feature_selector', f_s),
    ('catboost', CatBoostWrapper(**params))
])

## Stacking

In [12]:
from sklearn.preprocessing import MinMaxScaler
from vecstack import stacking
from vecstack_sk import StackingTransformer

modelos = [
           ('lightgbm', lgb_m), 
           ('randomforest', forest_m),
           ('extratrees', extratrees_m),
           ('catboost', catboost_m), 
           ('xgboost', xgb_m)
          ]

stack = StackingTransformer(modelos, 
                          regression=True, verbose=2, n_folds=4)

stack.fit(df_train_f.drop('precio', axis=1).values, 
                          df_train_f['precio'].values)

task:         [regression]
metric:       [mean_absolute_error]
variant:      [A]
n_estimators: [5]

estimator  0: [lightgbm: Pipeline]
    fold  0:  [484282.99385412]
    fold  1:  [483709.40052008]
    fold  2:  [486961.11769344]
    fold  3:  [480640.37270216]
    ----
    MEAN:     [483898.47119245] + [2246.03065200]

estimator  1: [randomforest: Pipeline]


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/javier/Documents/FIUBA/Datos/.venv/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3326, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-12-2af6ee0b717e>", line 17, in <module>
    df_train_f['precio'].values)
  File "/home/javier/Documents/FIUBA/Datos/datos-tp2/vecstack_sk.py", line 506, in fit
    transform=self.transform_target)
  File "/home/javier/Documents/FIUBA/Datos/datos-tp2/vecstack_sk.py", line 825, in _estimator_action
    return estimator.fit(X_train, self._transformer(y_train, func=transform))
  File "/home/javier/Documents/FIUBA/Datos/.venv/lib/python3.7/site-packages/sklearn/pipeline.py", line 356, in fit
    self._final_estimator.fit(Xt, y, **fit_params)
  File "/home/javier/Documents/FIUBA/Datos/.venv/lib/python3.7/site-packages/sklearn/ensemble/forest.py", line 330, in fit
    for i, t in enumerate(trees))
  File "/home/javier/Documents/FIUBA/Datos/.venv/lib/

KeyboardInterrupt: 

In [18]:
s_train = stack.transform(df_train_f.drop('precio', axis=1).values)
s_test = stack.transform(df_test_f.values)

Train set was detected.
Transforming...

estimator  0: [lightgbm: Pipeline]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    ----
    DONE

estimator  1: [catboost: Pipeline]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    ----
    DONE

estimator  2: [randomforest: Pipeline]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    ----
    DONE

estimator  3: [extratrees: Pipeline]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    ----
    DONE

estimator  4: [xgboost: Pipeline]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    ----
    DONE

Transforming...

estimator  0: [lightgbm: Pipeline]
    model from fold  0: done
    model from fold  1:

## Prediccion con todos los features + stacking

In [19]:
df_train_s = df_train_f.copy()
df_test_s = df_test_f.copy()

df_train_s['stack01'], df_train_s['stack02'], df_train_s['stack03'], df_train_s['stack04'], df_train_s['stack05'] = zip(*s_train)
df_test_s['stack01'], df_test_s['stack02'], df_test_s['stack03'], df_test_s['stack04'], df_test_s['stack05'] = zip(*s_test)

features_stacking = ['stack01', 'stack02', 'stack03', 'stack04', 'stack05']

In [15]:
df_train_s['id'] = df_train['id']
df_test_s['id'] = df_test['id']

In [20]:
features = ['habitaciones', 'garages','banos','antiguedad', 'metroscubiertos',  'metrostotales','lat_norm', 
           'lng_norm', 'gimnasio', 'usosmultiples', 'piscina']

features_test = ['prop_frecuente', 'top_provincia', 'promedio_precio_ciudad', 'anio', 'promedio_id_zona', 
                 'promedio_precio_tipo_propiedad', 'count_id_zona', 'count_ciudad', 'puntaje', 
                 'count_tipo_propiedad_ciudad', 'promedio_precio_tipo_propiedad_ciudad_gen',
                 'dias_desde_datos','meses_desde_datos','porcentaje_metros','distancia_ciudad_centrica', 
                 'clustering_label', 'distancia_centro_mexico'
                ]

features += features_test


params_2nd = {'bagging_fraction': 0.8999882607358867,
 'bagging_freq': int(95.0),
 'feature_fraction': 0.2570109385381975,
 'learning_rate': 0.13601832720254403,
 'max_depth': int(26.0),
 'num_leaves': int(175.0),
 'test_size': 0.08363501292068126,
 'boosting_type': 'dart',
 'num_boost_round': 1200,
 'objective': 'regression',
 'metric': 'mae'}

lgb_m_2nd = LightGBMWrapper(**params_2nd)
lgb_m_2nd.fit(utils.filtrar_features(df_train_s, features + features_stacking), df_train['precio'].values)



LightGBMWrapper(bagging_fraction=0.8999882607358867, bagging_freq=95,
                boosting_type='dart', class_weight=None, colsample_bytree=1.0,
                feature_fraction=0.2570109385381975, importance_type='split',
                learning_rate=0.13601832720254403, max_depth=26, metric='mae',
                min_child_samples=20, min_child_weight=0.001,
                min_split_gain=0.0, n_estimators=100, n_jobs=-1,
                num_boost_round=1200, num_leaves=175, objective='regression',
                random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
                subsample=1.0, subsample_for_bin=200000, subsample_freq=0,
                test_size=0.08363501292068126)

In [21]:
df_test_s['target'] = lgb_m_2nd.predict(utils.filtrar_features(df_test_s, features + features_stacking))
# df_test_s = utils.pesificar_df(df_test_s, col_precio_in='target', col_precio_out='target')
df_test_s[['id', 'target']].to_csv('respuesta46.csv', index = False)

# print(f'MAE Stacking-full: {utils.MAE(df_test_s["precio"].values, df_test_s["target"].values)}')

## Prediccion solo con features de stacking

In [22]:
# params_2nd = {'bagging_fraction': 0.8924398062087346,
#  'bagging_freq': int(36.0),
#  'feature_fraction': 0.16167385124183287,
#  'learning_rate': 0.054693418899570134,
#  'max_depth': int(4.0),
#  'num_leaves': int(93.0),
#  'objective': 'regression',
#  'boosting_type': 'gbdt',
#  'metric': 'mae'}

params_2nd = {'bagging_fraction': 0.8243831977099841,
 'bagging_freq': int(10.0),
 'feature_fraction': 0.9228324501365147,
 'learning_rate': 0.050664243951241736,
 'max_depth': int(3.0),
 'num_leaves': int(78.0),
 'objective': 'regression',
 'boosting_type': 'dart',
 'num_boost_round': 1200,
 'metric': 'mae'}


lgb_m_2nd = LightGBMWrapper(**params_2nd)
lgb_m_2nd.fit(s_train, df_train_f['precio'].values)

df_test_f['target'] = lgb_m_2nd.predict(s_test)
df_test_f[['id', 'target']].to_csv('respuesta47.csv', index = False)



In [49]:
features = ['stack01', 'stack02', 'stack03', 'stack04']

def eval_lightgbm(args):
    num_leaves, learning_rate, feature_fraction, bagging_fraction, bagging_freq, max_depth = args

    lgb_train = lgb.Dataset(s_train, df_train['precio'].values)
    
    num_leaves = int(num_leaves)
    bagging_freq = int(bagging_freq)
    max_depth = int(max_depth)
    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'mae'}, # Si se deja vacio se toma el ideal para llegar al 'objective'
        'num_leaves': num_leaves,
        'learning_rate': learning_rate,
        'feature_fraction': feature_fraction,
        'bagging_fraction': bagging_fraction,
        'bagging_freq': bagging_freq,
        'max_depth': max_depth,
        'verbose': -1,
    }

    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=250,
                    verbose_eval=-1)
    
    y_pred_test = gbm.predict(s_test, num_iteration=gbm.best_iteration)
    return utils.MAE(df_test['precio'].values, y_pred_test)

space = [hp.quniform('num_leaves', 15, 130, 1), hp.uniform('learning_rate', 0.05, 0.9),
        hp.uniform('feature_fraction', 0.90, 1), hp.uniform('bagging_fraction', 0.70, 1),
        hp.quniform('bagging_freq', 0, 40, 1), hp.quniform('max_depth', 3, 15, 1)]

hps = fmin(eval_lightgbm, space=space, algo=tpe.suggest, max_evals=400, verbose=1)

display(hps)

100%|██████████| 400/400 [17:21<00:00,  2.60s/it, best loss: 511432.09539443516]


{'bagging_fraction': 0.8243831977099841,
 'bagging_freq': 10.0,
 'feature_fraction': 0.9228324501365147,
 'learning_rate': 0.050664243951241736,
 'max_depth': 3.0,
 'num_leaves': 78.0}

## Prediccion con promedios

In [19]:
df_test_f['target'] = np.average(s_test, axis=1)
df_test_f[['id', 'target']].to_csv('respuesta44.csv', index = False)

In [77]:
from scipy.stats import mode

y_pred_test = mode(s_test, axis=1)[0]
print(f"MAE Stacking only: {utils.MAE(y_pred_test, df_test_f['precio'].values)}")

MAE Stacking only: 519840.412728738


In [95]:
import numpy as np

y_pred_test = np.average(s_test, axis=1)
print(f"MAE Stacking only: {utils.MAE(y_pred_test, df_test_f['precio'].values)}")

MAE Stacking only: 502391.48180612386


In [91]:
from scipy.optimize import minimize

def mae_res(weights):
    y_pred_test = np.average(s_test, weights=weights, axis=1)
    return utils.MAE(y_pred_test, df_test_f['precio'].values)

x0 = [1] * len(s_test.T)
minimize(mae_res, x0)

      fun: 497375.416620218
 hess_inv: array([[ 7.64527106e-05, -6.84363088e-05,  5.18102889e-04,
         1.30966978e-03],
       [-6.84363088e-05,  6.84021400e-05, -4.91721763e-04,
        -1.17091868e-03],
       [ 5.18102889e-04, -4.91721763e-04,  4.00738244e-03,
         9.31303274e-03],
       [ 1.30966978e-03, -1.17091868e-03,  9.31303274e-03,
         2.30019160e-02]])
      jac: array([5.8359375 , 2.96484375, 0.3671875 , 1.5859375 ])
  message: 'Desired error not necessarily achieved due to precision loss.'
     nfev: 702
      nit: 15
     njev: 115
   status: 2
  success: False
        x: array([ 0.40410989, -0.16217251,  2.30245548,  0.39816892])

In [92]:
from scipy.optimize import minimize, differential_evolution

def mae_res(weights):
    y_pred_test = np.average(s_test, weights=weights, axis=1)
    return utils.MAE(y_pred_test, df_test_f['precio'].values)

x0 = [(-3, 4)] * len(s_test.T)
differential_evolution(mae_res, bounds=x0)

     fun: 497375.4169548869
     jac: array([-0.58207661, -0.83819032, -0.23283064,  1.65309757])
 message: 'Optimization terminated successfully.'
    nfev: 425
     nit: 5
 success: True
       x: array([ 0.49248228, -0.19790651,  2.80637364,  0.48548681])

In [23]:
df_test_s[['id'] + features_stacking].to_csv('data/stacking5_test.csv', index=False)
df_train_s[['id'] + features_stacking].to_csv('data/stacking5_train.csv', index=False)

## Blending

In [11]:
from sklearn.base import clone
from sklearn.model_selection import train_test_split


class BlendingTransformer():

    def __init__(self, models, model_blend, val_split=0.10, verbose=True):
        self.models = models
        self.val_split = val_split
        self.model_blend = model_blend
        self.verbose = verbose

        
    def simple_fit(self, x, y):
        x_cat = FeatureSelector(['provincia', 'intervalo_metros_cubiertos']).transform(x)
        
        x_train, x_eval, y_train, y_eval = train_test_split(x, y, 
                                                            test_size=self.val_split, 
                                                            stratify=x_cat)

        self._fit_first(x_train, y_train)
        self._fit_second(x_eval, y_eval)
        


    def fit(self, x, y, x_test):

        x_cat = FeatureSelector(['provincia', 'intervalo_metros_cubiertos']).transform(x)
        
        x_train, x_eval, y_train, y_eval = train_test_split(x, y, 
                                                            test_size=self.val_split, 
                                                            stratify=x_cat)

        self._fit_first(x_train, y_train)
        self._fit_second(x_eval, y_eval)

        self._fit_first(x, y)
        y_test = self._predict_second(x_test)

        x = np.concatenate((x, x_test))
        y = np.concatenate((y, y_test))
        x_cat = FeatureSelector(['provincia', 'intervalo_metros_cubiertos']).transform(x)

        x_train, x_eval, y_train, y_eval = train_test_split(x, y, 
                                                            test_size=self.val_split, 
                                                            stratify=x_cat)
        self._fit_first(x_train, y_train)
        self._fit_second(x_eval, y_eval)

        self._fit_first(x, y)


    def _fit_first(self, x, y):

        for i in range(len(self.models)):   
            if self.verbose:
                print(f'Fitting {i}...')
            self.models[i] = clone(self.models[i])
            self.models[i].fit(x, y)


    def _fit_second(self, x, y):
        if self.verbose:
            print(f'Fitting blender...')
        self.model_blend = clone(self.model_blend)
        self.model_blend.fit(self._predict_first(x), y)


            
    def _predict_first(self, x):
        preds = []
        for i in range(len(self.models)):
            y_pred = self.models[i].predict(x)
            preds.append(y_pred)

        return np.array(preds).T


    def _predict_second(self, x):
        return self.model_blend.predict(self._predict_first(x))


    def predict(self, x):
        return self._predict_second(x)
    
    
modelos = [
           lgb_m,
           catboost_m, 
           forest_m,
           xgb_m
          ]

params_2nd = {'bagging_fraction': 0.8243831977099841,
 'bagging_freq': int(10.0),
 'feature_fraction': 0.9228324501365147,
 'learning_rate': 0.050664243951241736,
 'max_depth': int(3.0),
 'num_leaves': int(78.0),
 'objective': 'regression',
 'boosting_type': 'dart',
 'num_boost_round': 1500,
 'metric': 'mae'}

lgb_m_2nd = LightGBMWrapper(**params_2nd)

blend = BlendingTransformer(modelos, lgb_m_2nd)

### Simple blend

In [12]:
blend.simple_fit(df_train_f.drop('precio', axis=1).values, df_train_f['precio'].values)

Fitting 0...
Fitting 1...


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/javier/Documents/FIUBA/Datos/.venv/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3326, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-12-ff06219c7e23>", line 1, in <module>
    blend.simple_fit(df_train_f.drop('precio', axis=1).values, df_train_f['precio'].values)
  File "<ipython-input-11-5f1cad563535>", line 21, in simple_fit
    self._fit_first(x_train, y_train)
  File "<ipython-input-11-5f1cad563535>", line 59, in _fit_first
    self.models[i].fit(x, y)
  File "/home/javier/Documents/FIUBA/Datos/.venv/lib/python3.7/site-packages/sklearn/pipeline.py", line 356, in fit
    self._final_estimator.fit(Xt, y, **fit_params)
  File "<ipython-input-10-1c9d9daa278e>", line 9, in fit
    return super(CatBoostWrapper, self).fit(x, y, cat_features=cat_features)
  File "/home/javier/Documents/FIUBA/Datos/.venv/lib/python3.7/site-packages/catboost/core.py", line 4239, in fit
    save_s

KeyboardInterrupt: 

### Long blend

In [None]:
blend.fit(df_train_f.drop('precio', axis=1).values, 
                          df_train_f['precio'].values, df_test_f.drop('precio', axis=1, errors='ignore').values)

In [None]:
df_test_s = df_test_f.copy()
df_test_s['target'] = blend.predict(df_test_s.values)
# df_test_s = utils.pesificar_df(df_test_s, col_precio_in='target', col_precio_out='target')
df_test_s[['id', 'target']].to_csv('respuesta49.csv', index = False)