In [2]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.model_selection import train_test_split
import ipynb.fs.full.features as features
import ipynb.fs.full.utils as utils
from hyperopt import fmin, tpe, hp
from sklearn.metrics import mean_absolute_error

df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')

df_train = features.llenar_nulls(df_train)
df_test = features.llenar_nulls(df_test)

### Agrego Features

In [3]:
df_test_f = features.features_independientes_precio(df_test)
df_test_f = features.features_dependientes_precio(df_test_f, df_train)

df_train_f = features.features_independientes_precio(df_train)
df_train_f = features.features_dependientes_precio(df_train_f, df_train)

df_test_f, cols_tipodepropiedad_ohe = features.columna_a_ohe(df_test_f, 'tipodepropiedad', N=100, df_aux=df_train, devolver_cols=True)
df_test_f, cols_provincia_ohe = features.columna_a_ohe(df_test_f, 'provincia', N=100, df_aux=df_train, devolver_cols=True)
df_test_f, cols_zona_ohe = features.columna_a_ohe(df_test_f, 'zona', df_aux=df_train_f, devolver_cols=True)

df_train_f = features.columna_a_ohe(df_train_f, 'tipodepropiedad', N=100, df_aux=df_test)
df_train_f = features.columna_a_ohe(df_train_f, 'provincia', N=100, df_aux=df_test)
df_train_f = features.columna_a_ohe(df_train_f, 'zona', df_aux=df_test_f)

df_train_f['fecha'] = pd.to_datetime(df_train_f['fecha']).astype(int)
df_test_f['fecha'] = pd.to_datetime(df_test_f['fecha']).astype(int)

### Filtro columnas

In [4]:
features = ['antiguedad', 'habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 
            'lat', 'lng',
       'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas', 'centroscomercialescercanos']

features_test = ['prop_frecuente', 'top_provincia', 'porcentaje_metros', 'diferencia_metros', 'promedio_precio_ciudad', 
                 'promedio_por_mes', 'anio', 'promedio_id_zona', 'promedio_precio_tipo_propiedad', 
                 'promedio_precio_hbg_tipo_propiedad', 'count_id_zona', 'count_ciudad', 'puntaje', 
                 'count_tipo_propiedad', 'count_tipo_propiedad_ciudad', 
                 'promedio_precio_tipo_propiedad_ciudad_gen', 'promedio_precio_hbg_tipo_propiedad_provincia',
                 'varianza_id_zona', 'tam_ambientes', 'metros_cubiertos_normalizados', 'dias_desde_datos',
                 'meses_desde_datos', 'promedio_id_zona_log']

features += features_test

features += cols_tipodepropiedad_ohe + cols_provincia_ohe + cols_zona_ohe

df_XGBoost, df_eval = utils.dividir_df_testeo(df_train_f, test_size=0.15)

df_XGBoost = utils.filtrar_features(df_XGBoost, features, 'precio')
df_eval = utils.filtrar_features(df_eval, features, 'precio')

## Busco los mejores hiperparametros

In [7]:
def xgb_eval(args):
    
    # Modelo - Ver hiperparametros
    colsample_bytree, learning_rate, max_depth, alpha, n_estimators, test_size = args
    
    # Preparacion de los datos
    x_train, x_test, y_train, y_test = utils.dividir_dataset(df_XGBoost, 'precio', features, test_size=test_size)
    
    x_eval = df_eval.drop('precio', axis=1)
    y_eval = df_eval['precio']
    
    xg_train = xgb.DMatrix(x_train, label=y_train)
    xg_test = xgb.DMatrix(x_test, label=y_test)
    dfg_test = xgb.DMatrix(x_eval, label = y_eval)

    max_depth = int(max_depth)
    n_estimators = int(n_estimators)
    params = {
        'objective':'reg:squarederror', 
        'colsample_bytree' : colsample_bytree , 
        'learning_rate' : learning_rate,
        'max_depth' : max_depth, 
        'alpha' : alpha,
        'n_estimators' : n_estimators,
        'eval_metric': 'mae'
    }

    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    num_round = 25
    
    xg_reg = xgb.train(params, 
                    xg_train, 
                    num_round, 
#                     watchlist
                   )

    y_pred_eval = xg_reg.predict(dfg_test)
    y_pred_test = xg_reg.predict(xg_test)
    return utils.MAE(y_eval, y_pred_eval)

space = [hp.quniform('colsample_bytree', 0.3, 1, 0.05), hp.quniform('learning_rate', 0.01, 0.15, 0.01),
         hp.quniform("max_depth", 1, 20, 1),hp.uniform("alpha", 0.01, 30),
        hp.quniform("n_estimators", 100, 600, 30), hp.quniform("test_size", 0.1, 0.4, 0.05)]

hps = fmin(xgb_eval, space=space, algo=tpe.suggest, max_evals=50)

display(hps)

  2%|▏         | 1/50 [00:31<26:00, 31.84s/it, best loss: 536204.5653767361]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



  4%|▍         | 2/50 [00:35<18:38, 23.29s/it, best loss: 536204.5653767361]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



  6%|▌         | 3/50 [00:43<14:45, 18.85s/it, best loss: 536204.5653767361]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



  8%|▊         | 4/50 [01:02<14:21, 18.73s/it, best loss: 534373.3772743056]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 10%|█         | 5/50 [01:08<11:11, 14.92s/it, best loss: 534373.3772743056]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 12%|█▏        | 6/50 [01:29<12:25, 16.95s/it, best loss: 534373.3772743056]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 14%|█▍        | 7/50 [01:41<10:59, 15.33s/it, best loss: 534373.3772743056]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 16%|█▌        | 8/50 [01:48<08:55, 12.74s/it, best loss: 534373.3772743056]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 18%|█▊        | 9/50 [02:01<08:48, 12.90s/it, best loss: 534373.3772743056]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 20%|██        | 10/50 [02:17<09:14, 13.85s/it, best loss: 532342.4761484375]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 22%|██▏       | 11/50 [02:55<13:42, 21.10s/it, best loss: 532342.4761484375]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 24%|██▍       | 12/50 [03:07<11:40, 18.44s/it, best loss: 532342.4761484375]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 26%|██▌       | 13/50 [03:20<10:21, 16.81s/it, best loss: 532342.4761484375]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 28%|██▊       | 14/50 [03:25<07:56, 13.23s/it, best loss: 532342.4761484375]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 30%|███       | 15/50 [03:30<06:18, 10.80s/it, best loss: 532342.4761484375]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 32%|███▏      | 16/50 [04:13<11:30, 20.31s/it, best loss: 528302.63565625]  

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 34%|███▍      | 17/50 [04:24<09:38, 17.54s/it, best loss: 528302.63565625]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 36%|███▌      | 18/50 [04:42<09:24, 17.65s/it, best loss: 495289.55463888886]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 38%|███▊      | 19/50 [04:46<06:58, 13.51s/it, best loss: 495289.55463888886]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 40%|████      | 20/50 [05:44<13:29, 26.97s/it, best loss: 495289.55463888886]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 42%|████▏     | 21/50 [06:01<11:39, 24.12s/it, best loss: 495289.55463888886]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 44%|████▍     | 22/50 [06:17<10:03, 21.54s/it, best loss: 487142.821109375]  

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 46%|████▌     | 23/50 [06:28<08:17, 18.43s/it, best loss: 487142.821109375]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 48%|████▊     | 24/50 [06:46<07:51, 18.15s/it, best loss: 487142.821109375]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 50%|█████     | 25/50 [07:22<09:49, 23.58s/it, best loss: 484432.2250295139]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 52%|█████▏    | 26/50 [07:37<08:23, 21.00s/it, best loss: 484432.2250295139]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 54%|█████▍    | 27/50 [08:18<10:21, 27.03s/it, best loss: 484432.2250295139]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 56%|█████▌    | 28/50 [08:57<11:14, 30.67s/it, best loss: 484432.2250295139]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 58%|█████▊    | 29/50 [09:53<13:23, 38.26s/it, best loss: 484432.2250295139]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 60%|██████    | 30/50 [10:54<15:01, 45.05s/it, best loss: 481652.43170746526]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 62%|██████▏   | 31/50 [12:11<17:16, 54.58s/it, best loss: 481652.43170746526]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 64%|██████▍   | 32/50 [12:52<15:08, 50.49s/it, best loss: 481652.43170746526]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 66%|██████▌   | 33/50 [14:00<15:50, 55.89s/it, best loss: 481652.43170746526]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 68%|██████▊   | 34/50 [14:36<13:18, 49.93s/it, best loss: 481652.43170746526]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 70%|███████   | 35/50 [15:05<10:51, 43.46s/it, best loss: 481652.43170746526]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 72%|███████▏  | 36/50 [15:24<08:25, 36.13s/it, best loss: 481652.43170746526]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 74%|███████▍  | 37/50 [16:17<08:57, 41.35s/it, best loss: 481494.25492100697]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 76%|███████▌  | 38/50 [17:10<08:57, 44.79s/it, best loss: 481494.25492100697]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 78%|███████▊  | 39/50 [17:44<07:38, 41.66s/it, best loss: 481494.25492100697]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 80%|████████  | 40/50 [18:36<07:27, 44.71s/it, best loss: 481494.25492100697]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 82%|████████▏ | 41/50 [19:15<06:26, 42.98s/it, best loss: 481494.25492100697]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 84%|████████▍ | 42/50 [19:50<05:24, 40.57s/it, best loss: 481494.25492100697]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 86%|████████▌ | 43/50 [20:27<04:35, 39.39s/it, best loss: 481494.25492100697]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 88%|████████▊ | 44/50 [21:05<03:54, 39.10s/it, best loss: 481494.25492100697]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 90%|█████████ | 45/50 [21:28<02:51, 34.21s/it, best loss: 481494.25492100697]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 92%|█████████▏| 46/50 [22:10<02:25, 36.44s/it, best loss: 481494.25492100697]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 94%|█████████▍| 47/50 [22:18<01:24, 28.15s/it, best loss: 481494.25492100697]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 96%|█████████▌| 48/50 [23:03<01:06, 33.02s/it, best loss: 481494.25492100697]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 98%|█████████▊| 49/50 [24:07<00:42, 42.54s/it, best loss: 481494.25492100697]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



100%|██████████| 50/50 [24:48<00:00, 29.77s/it, best loss: 481494.25492100697]


{'alpha': 19.99916025229205,
 'colsample_bytree': 0.65,
 'learning_rate': 0.15,
 'max_depth': 20.0,
 'n_estimators': 90.0,
 'test_size': 0.1}

### Busco los mejores features

In [None]:
features = ['antiguedad', 'habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 
            # 'lat', 'lng',
       'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas', 'centroscomercialescercanos']

features_test = ['tipo_propiedad_compartida',
                 'prop_frecuente', 'top_provincia', 'es_ciudad_centrica',
                 'promedio_metros_totales_provincia',
                 'promedio_metros_cubiertos_provincia', 'porcentaje_metros',
                 'diferencia_metros', 'delincuencia', 'turismo',
                 'promedio_precio_ciudad', 'promedio_id_zona',
                 'promedio_precio_tipo_propiedad','es_antigua',
                 'promedio_por_mes', 'promedio_precio_habitaciones',
                 'promedio_precio_habitaciones_banos_garages','cantidad_inquilinos',
                 'promedio_precio_banos_garages', 'promedio_precio_booleanos',
                 'metros_totales_normalizados', 'metros_cubiertos_normalizados','escomercial',
                 'promedio_metros_cub_tipo_propiedad', 'anio', 'mes', 'dia','trimestre']


In [None]:
hps = {'alpha': 9.499855511978337,
 'colsample_bytree': 0.8500000000000001,
 'learning_rate': 0.2226513740962932,
 'max_depth': 20.0,
 'n_estimators': 240.0,
 'test_size': 0.103950169925154826
}

alpha = hps['alpha']
colsample_bytree = hps['colsample_bytree']
max_depth = hps['max_depth']
learning_rate = hps['learning_rate']
n_estimators = hps['n_estimators']

n_estimators = int(hps['n_estimators'])
max_depth = int(hps['max_depth'])

params = {
        'objective':'reg:squarederror', 
        'colsample_bytree' : colsample_bytree , 
        'learning_rate' : learning_rate,
        'max_depth' : max_depth, 
        'alpha' : alpha,
        'n_estimators' : n_estimators,
        'eval_metric': 'mae'
}


base_train = 0
base_test = 0
base_eval = 0
for i in ['None'] + features_test:
    x_train, x_test, y_train, y_test = utils.dividir_dataset(df_train_f, 'precio', features + [i])

    xg_train = xgb.DMatrix(x_train, label=y_train)
    xg_test = xgb.DMatrix(x_test, label=y_test)
    xg_train = xgb.DMatrix(x_train, label=y_train)
    xg_test = xgb.DMatrix(x_test, label=y_test)

    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    num_round = 10
    
    xg_reg = xgb.train(params, 
                    xg_train, 
                    num_round, 
                    watchlist
                   )
    
    x_train_predict = xgb.DMatrix(x_train, label=y_train)
    x_test_predict = xgb.DMatrix(x_test, label=y_test)
    x_eval_predict = xgb.DMatrix(utils.filtrar_features(df_eval, features + [i]))
    
    y_pred_test = xg_reg.predict(x_test_predict)
    y_pred_train = xg_reg.predict(x_train_predict)
    y_pred_eval = xg_reg.predict(x_eval_predict)

    xgb_mae_train = utils.MAE(y_train, y_pred_train)
    xgb_mae = utils.MAE(y_test, y_pred_test)
    xgb_mae_eval = utils.MAE(df_eval['precio'].values, y_pred_eval)

    print(f"MAE LightGBM (train): {xgb_mae_train:.5f}")
    print(f"MAE LightGBM (test): {xgb_mae:.5f}")
    print(f"MAE LightGBM (eval): {xgb_mae_eval:.5f}")
    if i is not 'None':
        print(f"Overfitting (base_eval - base_test) - (eval - test) - {i}: {(base_eval - base_test) - (xgb_mae_eval - xgb_mae)}")
        print(f"Diff evaluation (base_eval - eval)                  - {i}: {base_eval - xgb_mae_eval}")
        print(f"Diff train (base_train - train)                     - {i}: {base_train - xgb_mae_train}")
    else:
        base_train = xgb_mae_train
        base_test = xgb_mae
        base_eval = xgb_mae_eval


## Predicción

In [8]:
features = ['antiguedad', 'habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 
            'lat', 'lng',
       'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas', 'centroscomercialescercanos']

features_test = ['prop_frecuente', 'top_provincia', 'porcentaje_metros', 'diferencia_metros', 'promedio_precio_ciudad', 
                 'promedio_por_mes', 'anio', 'promedio_id_zona', 'promedio_precio_tipo_propiedad', 
                 'promedio_precio_hbg_tipo_propiedad', 'count_id_zona', 'count_ciudad', 'puntaje', 
                 'count_tipo_propiedad', 'count_tipo_propiedad_ciudad', 
                 'promedio_precio_tipo_propiedad_ciudad_gen', 'promedio_precio_hbg_tipo_propiedad_provincia',
                 'varianza_id_zona', 'tam_ambientes', 'metros_cubiertos_normalizados', 'dias_desde_datos',
                 'meses_desde_datos', 'promedio_id_zona_log']

features += features_test

features += cols_tipodepropiedad_ohe + cols_provincia_ohe + cols_zona_ohe

# Preparación hiperparámetros

# hps = {'alpha': 12.338515720263425,
#  'colsample_bytree': 0.8500000000000001,
#  'learning_rate': 0.15,
#  'max_depth': 17.0,
#  'n_estimators': 300.0,
#  'test_size': 0.15000000000000002}

hps = {'alpha': 19.99916025229205,
 'colsample_bytree': 0.65,
 'learning_rate': 0.15,
 'max_depth': int(20.0),
 'n_estimators': int(90.0),
 'test_size': 0.1}

alpha = hps['alpha']
colsample_bytree = hps['colsample_bytree']
max_depth = hps['max_depth']
learning_rate = hps['learning_rate']
n_estimators = hps['n_estimators']
test_size = hps['test_size']

n_estimators = int(hps['n_estimators'])
max_depth = int(hps['max_depth'])


# Preparacion de los datos
x_train, x_test, y_train, y_test = utils.dividir_dataset(df_XGBoost, 'precio', features, test_size=test_size)

x_eval = df_eval.drop('precio', axis=1)
y_eval = df_eval['precio']

xg_train = xgb.DMatrix(x_train, label=y_train)
xg_test = xgb.DMatrix(x_test, label=y_test)
dfg_test = xgb.DMatrix(x_eval, label = y_eval)


params = {
    'objective':'reg:squarederror', 
    'colsample_bytree' : colsample_bytree , 
    'learning_rate' : learning_rate,
    'max_depth' : max_depth, 
    'alpha' : alpha,
    'n_estimators' : n_estimators,
    'eval_metric': 'mae'
}

watchlist = [(xg_train, 'train'), (xg_test, 'test')]

# Entrenamiento
xg_reg = xgb.train(params, 
                   xg_train, 
                   num_boost_round=50,
                   evals=watchlist)

# Prediccion
y_pred_train = xg_reg.predict(xg_train)
y_pred_test = xg_reg.predict(xg_test)


linear_mae_train = utils.MAE(y_train, y_pred_train)
linear_mae = utils.MAE(y_test, y_pred_test)

y_pred_eval = xg_reg.predict(dfg_test)
linear_mae_eval = utils.MAE(y_eval, y_pred_eval)


print(f"MAE (train): {linear_mae_train:.5f}")
print(f"MAE: {linear_mae:.5f}")
print(f"MAE (eval): {linear_mae_eval:.5f}")


  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


[0]	train-mae:2.16263e+06	test-mae:2.18108e+06
[1]	train-mae:1.84763e+06	test-mae:1.86739e+06
[2]	train-mae:1.58015e+06	test-mae:1.60335e+06
[3]	train-mae:1.35271e+06	test-mae:1.38282e+06
[4]	train-mae:1.1596e+06	test-mae:1.19971e+06
[5]	train-mae:996237	test-mae:1.05124e+06
[6]	train-mae:857518	test-mae:930787
[7]	train-mae:738939	test-mae:833015
[8]	train-mae:638519	test-mae:755706
[9]	train-mae:552710	test-mae:694137
[10]	train-mae:479926	test-mae:647447
[11]	train-mae:417837	test-mae:610690
[12]	train-mae:364434	test-mae:581549
[13]	train-mae:318952	test-mae:559763
[14]	train-mae:279914	test-mae:542763
[15]	train-mae:246921	test-mae:530034
[16]	train-mae:218572	test-mae:519891
[17]	train-mae:194438	test-mae:512738
[18]	train-mae:173315	test-mae:506899
[19]	train-mae:155467	test-mae:502224
[20]	train-mae:140111	test-mae:498859
[21]	train-mae:126857	test-mae:496404
[22]	train-mae:115677	test-mae:494486
[23]	train-mae:105637	test-mae:492864
[24]	train-mae:96929.6	test-mae:491596
[25]	

## Predicción final

In [4]:
df_train_final = pd.read_csv('./data/train.csv')
df_test_final = pd.read_csv('./data/test.csv')

df_train_final = features.llenar_nulls(df_train_final)
df_test_final = features.llenar_nulls(df_test_final)

df_train_final_f = features.features_independientes_precio(df_train_final)
df_train_final_f = features.features_dependientes_precio(df_train_final_f, df_train_final)
df_test_final_f = features.features_independientes_precio(df_test_final)
df_test_final_f = features.features_dependientes_precio(df_test_final_f, df_train_final)

df_test_final_f, cols_tipodepropiedad_ohe = features.columna_a_ohe(df_test_final_f, 'tipodepropiedad', N=100, df_aux=df_train_final, devolver_cols=True)
df_test_final_f, cols_provincia_ohe = features.columna_a_ohe(df_test_final_f, 'provincia', N=100, df_aux=df_train_final, devolver_cols=True)
df_test_final_f, cols_zona_ohe = features.columna_a_ohe(df_test_final_f, 'zona', df_aux=df_train_final_f, devolver_cols=True)

df_train_final_f = features.columna_a_ohe(df_train_final_f, 'tipodepropiedad', N=100, df_aux=df_test_final)
df_train_final_f = features.columna_a_ohe(df_train_final_f, 'provincia', N=100, df_aux=df_test_final)
df_train_final_f = features.columna_a_ohe(df_train_final_f, 'zona', df_aux=df_test_final_f)

df_train_final_f['fecha'] = pd.to_datetime(df_train_final_f['fecha']).astype(int)
df_test_final_f['fecha'] = pd.to_datetime(df_test_final_f['fecha']).astype(int)

In [8]:
features = ['antiguedad', 'habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 
            'lat', 'lng',
       'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas', 'centroscomercialescercanos']

features_test = ['prop_frecuente', 'top_provincia', 'porcentaje_metros', 'diferencia_metros', 'promedio_precio_ciudad', 
                 'promedio_por_mes', 'anio', 'promedio_id_zona', 'promedio_precio_tipo_propiedad', 
                 'promedio_precio_hbg_tipo_propiedad', 'count_id_zona', 'count_ciudad', 'puntaje', 
                 'count_tipo_propiedad', 'count_tipo_propiedad_ciudad', 
                 'promedio_precio_tipo_propiedad_ciudad_gen', 'promedio_precio_hbg_tipo_propiedad_provincia',
                 'varianza_id_zona', 'tam_ambientes', 'metros_cubiertos_normalizados', 'dias_desde_datos',
                 'meses_desde_datos', 'promedio_id_zona_log']

features += features_test

features += cols_tipodepropiedad_ohe + cols_provincia_ohe + cols_zona_ohe


hps = {'alpha': 19.99916025229205,
 'colsample_bytree': 0.65,
 'learning_rate': 0.15,
 'max_depth': int(20.0),
 'n_estimators': int(90.0),
 'test_size': 0.1}

alpha = hps['alpha']
colsample_bytree = hps['colsample_bytree']
max_depth = hps['max_depth']
learning_rate = hps['learning_rate']
n_estimators = hps['n_estimators']
test_size = hps['test_size']

n_estimators = int(hps['n_estimators'])
max_depth = int(hps['max_depth'])


# Filtro columnas
df_XGBoost_fin = utils.filtrar_features(df_train_final_f, features, 'precio')
df_eval_fin = utils.filtrar_features(df_test_final_f, features)

x_train_f, x_test_f, y_train_f, y_test_f = utils.dividir_dataset(df_XGBoost_fin, 'precio', features)


xg_train = xgb.DMatrix(x_train_f, label=y_train_f)
xg_test = xgb.DMatrix(x_test_f, label=y_test_f)

# Modelo - Ver hiperparametros
params = {
    'objective':'reg:squarederror', 
    'colsample_bytree' : colsample_bytree , 
    'learning_rate' : learning_rate,
    'max_depth' : max_depth, 
    'alpha' : alpha,
    'n_estimators' : n_estimators,
    'eval_metric': 'mae'
}

watchlist = [(xg_train, 'train'), (xg_test, 'test')]

# Entrenamiento
xg_reg = xgb.train(params, 
                   xg_train, 
                   num_boost_round=400,
                   evals=watchlist,
                   early_stopping_rounds=10
                  )


# Prediccion
y_pred_test_f = xg_reg.predict(xgb.DMatrix(x_test_f))
y_pred_train_f = xg_reg.predict(xgb.DMatrix(x_train_f))

linear_mae_train_f = utils.MAE(y_train_f, y_pred_train_f)
linear_mae_f = utils.MAE(y_test_f, y_pred_test_f)



print(f"MAE (train): {linear_mae_train_f:.5f}")
print(f"MAE: {linear_mae_f:.5f}")

prediccion_final = xg_reg.predict(xgb.DMatrix(df_eval_fin))
df_test_final_f['target'] = prediccion_final
df_test_final_f[['id', 'target']].to_csv('respuesta22.csv', index = False)

[0]	train-mae:2.16798e+06	test-mae:2.14817e+06
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 10 rounds.
[1]	train-mae:1.85233e+06	test-mae:1.83954e+06
[2]	train-mae:1.58405e+06	test-mae:1.57918e+06
[3]	train-mae:1.35618e+06	test-mae:1.36214e+06
[4]	train-mae:1.16279e+06	test-mae:1.18192e+06
[5]	train-mae:999053	test-mae:1.03559e+06
[6]	train-mae:860037	test-mae:917059
[7]	train-mae:741232	test-mae:820760
[8]	train-mae:640462	test-mae:744287
[9]	train-mae:554488	test-mae:683466
[10]	train-mae:481530	test-mae:636930
[11]	train-mae:419246	test-mae:600240
[12]	train-mae:365754	test-mae:571849
[13]	train-mae:320298	test-mae:550081
[14]	train-mae:281190	test-mae:533091
[15]	train-mae:248057	test-mae:519944
[16]	train-mae:219714	test-mae:510212
[17]	train-mae:195572	test-mae:502873
[18]	train-mae:174462	test-mae:496886
[19]	train-mae:156750	test-mae:492071
[20]	train-mae:141258	test-mae:488564
[21]	train-mae:1

[205]	train-mae:2563.16	test-mae:474213
[206]	train-mae:2530.77	test-mae:474212
[207]	train-mae:2516.53	test-mae:474213
[208]	train-mae:2506.4	test-mae:474212
[209]	train-mae:2490.79	test-mae:474212
[210]	train-mae:2455.49	test-mae:474211
[211]	train-mae:2447.47	test-mae:474210
[212]	train-mae:2423.23	test-mae:474209
[213]	train-mae:2393.27	test-mae:474210
[214]	train-mae:2389.99	test-mae:474210
[215]	train-mae:2345.39	test-mae:474209
[216]	train-mae:2332.67	test-mae:474209
[217]	train-mae:2302.1	test-mae:474209
[218]	train-mae:2274.37	test-mae:474207
[219]	train-mae:2255.51	test-mae:474206
[220]	train-mae:2244.46	test-mae:474207
[221]	train-mae:2202.9	test-mae:474206
[222]	train-mae:2197.45	test-mae:474206
[223]	train-mae:2188.91	test-mae:474206
[224]	train-mae:2183.04	test-mae:474206
[225]	train-mae:2172.98	test-mae:474205
[226]	train-mae:2160.07	test-mae:474204
[227]	train-mae:2143.07	test-mae:474204
[228]	train-mae:2134.78	test-mae:474204
[229]	train-mae:2122.44	test-mae:474204
[23

### Verificación de Nulos

In [None]:
df_test_final_f['target'].isnull().any()

In [None]:
df_test_final_f['target'].isnull().value_counts()

In [None]:
x_train_f.columns