In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from hyperopt import fmin, tpe, hp
import ipynb.fs.full.utils as utils
import ipynb.fs.full.features as features
import ipynb.fs.full.features_distancias as f_distancias

df_train = pd.read_csv('./data/train_filtrado.csv')
# Para usarse con el submit a Kaggle
df_test = pd.read_csv('./data/test.csv')

df_train = features.llenar_nulls(df_train)
df_test = features.llenar_nulls(df_test, hgb_mean=True, df_fill=df_train)

# df_train, df_test = features_de_csvs(df_train, df_test)

# df_train, df_test = utils.dividir_df_testeo(df_train, test_size=0.15)

In [2]:
df_train_cluster = pd.read_csv('./data/clustering_train.csv').rename(columns={'label': 'clustering_label'})
df_test_cluster = pd.read_csv('./data/clustering_test.csv').rename(columns={'label': 'clustering_label'})

df_train = pd.merge(df_train, df_train_cluster, on='id')
df_test = pd.merge(df_test, df_test_cluster, on='id')

df_train_idf = pd.read_csv('./data/train_idf.csv')
df_test_idf = pd.read_csv('./data/test_idf.csv')

df_train = pd.merge(df_train, df_train_idf, on= 'id', how= 'left')
df_test = pd.merge(df_test, df_test_idf, on= 'id', how= 'left')

In [3]:
# df_train, df_test = utils.dividir_df_testeo(df_train, test_size=0.20)

In [4]:
df_test_f = features.features_independientes_precio(df_test)
df_test_f = features.features_dependientes_precio(df_test_f, df_train)

df_train_f = features.features_independientes_precio(df_train)
df_train_f = features.features_dependientes_precio(df_train_f, df_train)

df_test_f, cols_tipodepropiedad_ohe = features.columna_a_ohe(df_test_f, 'tipodepropiedad', N=100, df_aux=df_train, devolver_cols=True)
df_test_f, cols_provincia_ohe = features.columna_a_ohe(df_test_f, 'provincia', N=100, df_aux=df_train, devolver_cols=True)
df_test_f, cols_zona_ohe = features.columna_a_ohe(df_test_f, 'zona', df_aux=df_train_f, devolver_cols=True)

df_train_f = features.columna_a_ohe(df_train_f, 'tipodepropiedad', N=100, df_aux=df_test)
df_train_f = features.columna_a_ohe(df_train_f, 'provincia', N=100, df_aux=df_test)
df_train_f = features.columna_a_ohe(df_train_f, 'zona', df_aux=df_test_f)


df_train_f['fecha'] = pd.to_datetime(df_train_f['fecha']).astype(int)
df_test_f['fecha'] = pd.to_datetime(df_test_f['fecha']).astype(int)


df_train_f = f_distancias.feature_distancias(df_train_f)
df_test_f = f_distancias.feature_distancias(df_test_f, df_train_f)


# df_train_f = features.KD_feature(df_train_f)
# df_test_f =  features.KD_feature(df_test_f)

## LightGBM model

In [5]:
from sklearn.model_selection import train_test_split

class LightGBMWrapper(lgb.LGBMRegressor):
    
    def fit(self, x, y):        
        return super(LightGBMWrapper, self).fit(x, y)
    
    def predict(self, X):
        return super(LightGBMWrapper, self).predict(X,num_iteration=self.best_iteration_)

hps = {'bagging_fraction': 0.5,
 'boosting_type': 'gbdt',
 'feature_fraction': 0.9,
 'learning_rate': 0.25,
 'max_depth': 10,
 'metric': 'mae',
 'n_jobs': 2,
 'num_leaves': 200,
 'objective': 'regression'}

params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mae', # Si se deja vacio se toma el ideal para llegar al 'objective'
    'num_leaves': int(hps['num_leaves']),
    'learning_rate': hps['learning_rate'],
    'feature_fraction': hps['feature_fraction'],
    'bagging_fraction': hps['bagging_fraction'],
#     'bagging_freq': int(hps['bagging_freq']),
    'max_depth': int(hps['max_depth']),
    'verbose': 0
}

lgb_m = LightGBMWrapper(**params)

## Keras model

In [6]:
from keras.wrappers.scikit_learn import KerasRegressor
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Activation

def keras_modelo():    
    model = Sequential()
    model.add(Dense(units=200, activation='selu'))
    model.add(Dropout(0.1))
    model.add(Dense(units=200, activation='selu'))
    model.add(Dropout(0.1))
    model.add(Dense(units=200, activation='selu'))
    model.add(Dropout(0.1))
    model.add(Dense(units=1, activation='linear'))

    model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_squared_error'])  
    return model

keras_m = KerasRegressor(build_fn=keras_modelo, epochs=15)

Using TensorFlow backend.


## XGBoost

In [7]:
import xgboost as xgb

class XGBoostWrapper(xgb.XGBRegressor):
    
    def fit(self, x, y):
        return super(xgb.XGBRegressor, self).fit(x, y)
    
    def predict(self, X):
        return super(xgb.XGBRegressor, self).predict(X)

hps = {'colsample_bytree': 0.9,
 'eval_metric': 'mae',
 'learning_rate': 0.1,
 'max_depth': 10,
 'n_estimators': 120,
 'n_jobs': 4,
 'objective': 'reg:squarederror',
 'scale_pos_weight': 1,
 'verbosity': 0}


# n_estimators = int(hps['n_estimators'])
# max_depth = int(hps['max_depth'])

xgb_m = XGBoostWrapper(**hps)

## RandomForest

In [8]:
from sklearn.ensemble import RandomForestRegressor

params = {'bootstrap': False,
          'max_features': 'sqrt',
          'min_samples_split': 4,
          'n_jobs': 2}

forest_m = RandomForestRegressor(n_estimators=100, **params)

## ExtraTrees

In [9]:
from sklearn.ensemble import ExtraTreesRegressor

params = {'criterion': 'mse',
 'max_features': 'sqrt',
 'min_samples_split': 4}

extratrees_m = ExtraTreesRegressor(n_estimators=50, **params)

## Stacking

In [10]:
from sklearn.preprocessing import MinMaxScaler
from vecstack import StackingTransformer

features = ['habitaciones', 'garages','banos','antiguedad', 'metroscubiertos',  'metrostotales','lat_norm', 
           'lng_norm', 'gimnasio', 'usosmultiples', 'piscina']

features_test = ['prop_frecuente', 'top_provincia', 'promedio_precio_ciudad', 'anio', 'promedio_id_zona', 
                 'promedio_precio_tipo_propiedad', 'count_id_zona', 'count_ciudad', 'puntaje', 
                 'count_tipo_propiedad_ciudad', 'promedio_precio_tipo_propiedad_ciudad_gen',
                 'dias_desde_datos','meses_desde_datos','porcentaje_metros','distancia_ciudad_centrica', 
                 'clustering_label', 'distancia_centro_mexico'
                ]

features += features_test 

x_train, x_test, y_train, y_test = utils.dividir_dataset(df_train_f, 'precio', features, test_size=2)

modelos = [
           ('lightgbm', lgb_m), 
           ('extratrees', extratrees_m),
           ('randomforest', forest_m),
         #   ('keras', keras_m), 
           ('xgboost', xgb_m)
          ]

stack = StackingTransformer(modelos, regression=True, verbose=2, n_folds=4, variant='B')

stack = stack.fit(x_train, y_train)

# s_train = stack.transform(x_train)
# s_test = stack.transform(x_test)

task:         [regression]
metric:       [mean_absolute_error]
variant:      [B]
n_estimators: [4]

estimator  0: [lightgbm: LightGBMWrapper]
    fold  0:  [501478.01739473]
    fold  1:  [497414.73277108]
    fold  2:  [495426.11090588]
    fold  3:  [500568.06879772]
    ----
    MEAN:     [498721.73246735] + [2427.72702997]

    Fitting on full train set...

estimator  1: [extratrees: ExtraTreesRegressor]
    fold  0:  [509672.81794688]
    fold  1:  [507053.70398601]
    fold  2:  [503436.13110065]
    fold  3:  [508618.59928154]
    ----
    MEAN:     [507195.31307877] + [2361.95238448]

    Fitting on full train set...

estimator  2: [randomforest: RandomForestRegressor]
    fold  0:  [477209.60261355]
    fold  1:  [473691.24736816]
    fold  2:  [472930.47367288]
    fold  3:  [476387.60918731]
    ----
    MEAN:     [475054.73321047] + [1788.26681029]

    Fitting on full train set...

estimator  3: [xgboost: XGBoostWrapper]
    fold  0:  [486552.78820370]
    fold  1:  [48279

In [11]:
s_train = stack.transform(utils.filtrar_features(df_train_f.drop('precio', axis=1), features))
s_test = stack.transform(utils.filtrar_features(df_test_f, features))

Transforming...

estimator  0: [lightgbm: LightGBMWrapper]
    DONE

estimator  1: [extratrees: ExtraTreesRegressor]
    DONE

estimator  2: [randomforest: RandomForestRegressor]
    DONE

estimator  3: [xgboost: XGBoostWrapper]
    DONE

Transforming...

estimator  0: [lightgbm: LightGBMWrapper]
    DONE

estimator  1: [extratrees: ExtraTreesRegressor]
    DONE

estimator  2: [randomforest: RandomForestRegressor]
    DONE

estimator  3: [xgboost: XGBoostWrapper]
    DONE



## Prediccion con todos los features + stacking

In [56]:
df_train_s = df_train_f.copy()
df_test_s = df_test_f.copy()

df_train_s['stack01'], df_train_s['stack02'], df_train_s['stack03'], df_train_s['stack04'] = zip(*s_train)
df_test_s['stack01'], df_test_s['stack02'], df_test_s['stack03'], df_test_s['stack04'] = zip(*s_test)

features_stacking = ['stack01', 'stack02', 'stack03', 'stack04']

In [57]:
df_train_s['id'] = df_train['id']
df_test_s['id'] = df_test['id']

In [58]:
params_2nd = {'bagging_fraction': 0.8999882607358867,
 'bagging_freq': int(95.0),
 'feature_fraction': 0.2570109385381975,
 'learning_rate': 0.13601832720254403,
 'max_depth': int(26.0),
 'num_leaves': int(175.0),
 'test_size': 0.08363501292068126,
 'boosting_type': 'dart',
 'num_boost_round': 1500,
 'objective': 'regression',
 'metric': 'mae'}

lgb_m_2nd = LightGBMWrapper(**params_2nd)
lgb_m_2nd.fit(utils.filtrar_features(df_train_s, features + features_stacking), df_train['precio'].values)



LightGBMWrapper(bagging_fraction=0.8999882607358867, bagging_freq=95,
                boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
                feature_fraction=0.2570109385381975, importance_type='split',
                learning_rate=0.13601832720254403, max_depth=26, metric='mae',
                min_child_samples=20, min_child_weight=0.001,
                min_split_gain=0.0, n_estimators=100, n_jobs=-1,
                num_boost_round=1500, num_leaves=175, objective='regression',
                random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
                subsample=1.0, subsample_for_bin=200000, subsample_freq=0,
                test_size=0.08363501292068126)

In [59]:
df_test_s['target'] = lgb_m_2nd.predict(utils.filtrar_features(df_test_s, features + features_stacking))
df_test_s[['id', 'target']].to_csv('respuesta41.csv', index = False)

print(f'MAE Stacking-full: {utils.MAE(df_test_s["precio"].values, df_test_s["target"].values)}')

MAE Stacking-full: 514117.80862808606


## Prediccion solo con features de stacking

In [14]:
# params_2nd = {'bagging_fraction': 0.8924398062087346,
#  'bagging_freq': int(36.0),
#  'feature_fraction': 0.16167385124183287,
#  'learning_rate': 0.054693418899570134,
#  'max_depth': int(4.0),
#  'num_leaves': int(93.0),
#  'objective': 'regression',
#  'boosting_type': 'gbdt',
#  'metric': 'mae'}

params_2nd = {'bagging_fraction': 0.8243831977099841,
 'bagging_freq': int(10.0),
 'feature_fraction': 0.9228324501365147,
 'learning_rate': 0.050664243951241736,
 'max_depth': int(3.0),
 'num_leaves': int(78.0),
 'objective': 'regression',
 'boosting_type': 'dart',
 'metric': 'mae'}


lgb_m_2nd = LightGBMWrapper(**params_2nd)
lgb_m_2nd.fit(s_train, df_train_f['precio'].values)

df_test_f['target'] = lgb_m_2nd.predict(s_test)
df_test_f[['id', 'target']].to_csv('respuesta43.csv', index = False)

In [49]:
features = ['stack01', 'stack02', 'stack03', 'stack04']

def eval_lightgbm(args):
    num_leaves, learning_rate, feature_fraction, bagging_fraction, bagging_freq, max_depth = args

    lgb_train = lgb.Dataset(s_train, df_train['precio'].values)
    
    num_leaves = int(num_leaves)
    bagging_freq = int(bagging_freq)
    max_depth = int(max_depth)
    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'mae'}, # Si se deja vacio se toma el ideal para llegar al 'objective'
        'num_leaves': num_leaves,
        'learning_rate': learning_rate,
        'feature_fraction': feature_fraction,
        'bagging_fraction': bagging_fraction,
        'bagging_freq': bagging_freq,
        'max_depth': max_depth,
        'verbose': -1,
    }

    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=250,
                    verbose_eval=-1)
    
    y_pred_test = gbm.predict(s_test, num_iteration=gbm.best_iteration)
    return utils.MAE(df_test['precio'].values, y_pred_test)

space = [hp.quniform('num_leaves', 15, 130, 1), hp.uniform('learning_rate', 0.05, 0.9),
        hp.uniform('feature_fraction', 0.90, 1), hp.uniform('bagging_fraction', 0.70, 1),
        hp.quniform('bagging_freq', 0, 40, 1), hp.quniform('max_depth', 3, 15, 1)]

hps = fmin(eval_lightgbm, space=space, algo=tpe.suggest, max_evals=400, verbose=1)

display(hps)

100%|██████████| 400/400 [17:21<00:00,  2.60s/it, best loss: 511432.09539443516]


{'bagging_fraction': 0.8243831977099841,
 'bagging_freq': 10.0,
 'feature_fraction': 0.9228324501365147,
 'learning_rate': 0.050664243951241736,
 'max_depth': 3.0,
 'num_leaves': 78.0}

## Prediccion con promedios

In [12]:
df_test_f['target'] = np.average(s_test, axis=1)
df_test_f[['id', 'target']].to_csv('respuesta42.csv', index = False)

In [77]:
from scipy.stats import mode

y_pred_test = mode(s_test, axis=1)[0]
print(f"MAE Stacking only: {utils.MAE(y_pred_test, df_test_f['precio'].values)}")

MAE Stacking only: 519840.412728738


In [95]:
import numpy as np

y_pred_test = np.average(s_test, axis=1)
print(f"MAE Stacking only: {utils.MAE(y_pred_test, df_test_f['precio'].values)}")

MAE Stacking only: 502391.48180612386


In [91]:
from scipy.optimize import minimize

def mae_res(weights):
    y_pred_test = np.average(s_test, weights=weights, axis=1)
    return utils.MAE(y_pred_test, df_test_f['precio'].values)

x0 = [1] * len(s_test.T)
minimize(mae_res, x0)

      fun: 497375.416620218
 hess_inv: array([[ 7.64527106e-05, -6.84363088e-05,  5.18102889e-04,
         1.30966978e-03],
       [-6.84363088e-05,  6.84021400e-05, -4.91721763e-04,
        -1.17091868e-03],
       [ 5.18102889e-04, -4.91721763e-04,  4.00738244e-03,
         9.31303274e-03],
       [ 1.30966978e-03, -1.17091868e-03,  9.31303274e-03,
         2.30019160e-02]])
      jac: array([5.8359375 , 2.96484375, 0.3671875 , 1.5859375 ])
  message: 'Desired error not necessarily achieved due to precision loss.'
     nfev: 702
      nit: 15
     njev: 115
   status: 2
  success: False
        x: array([ 0.40410989, -0.16217251,  2.30245548,  0.39816892])

In [92]:
from scipy.optimize import minimize, differential_evolution

def mae_res(weights):
    y_pred_test = np.average(s_test, weights=weights, axis=1)
    return utils.MAE(y_pred_test, df_test_f['precio'].values)

x0 = [(-3, 4)] * len(s_test.T)
differential_evolution(mae_res, bounds=x0)

     fun: 497375.4169548869
     jac: array([-0.58207661, -0.83819032, -0.23283064,  1.65309757])
 message: 'Optimization terminated successfully.'
    nfev: 425
     nit: 5
 success: True
       x: array([ 0.49248228, -0.19790651,  2.80637364,  0.48548681])