In [93]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.model_selection import train_test_split
from ipynb.fs.full.features import features_independientes_precio, features_dependientes_precio
import ipynb.fs.full.utils as utils
from hyperopt import fmin, tpe, hp
from sklearn.metrics import mean_absolute_error

df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')

### Agrego Features

In [94]:
df_train_f = features_independientes_precio(df_train)
df_train_f = features_dependientes_precio(df_train_f, df_train)

df_test_f = features_independientes_precio(df_test)
df_test_f = features_dependientes_precio(df_test_f, df_train)

### Filtro columnas

In [97]:
features = ['antiguedad', 'habitaciones', 'garages', 'banos', 'metroscubiertos',
       'metrostotales', 'idzona', 'lat', 'lng', 'gimnasio', 'usosmultiples',
       'piscina', 'escuelascercanas', 'centroscomercialescercanos', 'precio']

features_extra = ['prop_frecuente', 'es_ciudad_centrica', 'porcentaje_metros', 'diferencia_metros',
       'delincuencia','turismo','promedio_precio_ciudad', 'promedio_precio_tipo_propiedad',
       'anio','promedio_metros_cub_tipo_propiedad', 'top_provincia', 'promedio_id_zona', 'promedio_por_mes', 
       'count_id_zona', 'count_ciudad', 'puntaje', 'count_tipo_propiedad', 'count_tipo_propiedad_ciudad',
       'promedio_id_zona_gen', 'promedio_precio_tipo_propiedad_ciudad_gen', 'promedio_precio_hbg_tipo_propiedad_provincia']

features += features_extra

df_XGBoost, df_eval = utils.dividir_df_testeo(df_train, test_size=0.15)

df_XGBoost = utils.filtrar_features(df_XGBoost, features)
df_eval = utils.filtrar_features(df_XGBoost, features)

## Busco los mejores hiperparametros

In [99]:
def xgb_eval(args):
    
    # Modelo - Ver hiperparametros
    colsample_bytree, learning_rate, max_depth, alpha, n_estimators, test_size = args
    
    # Preparacion de los datos
    x_train, x_test, y_train, y_test = utils.dividir_dataset(df_XGBoost, 'precio', features, test_size=test_size)
    
    x_eval = df_eval.drop('precio', axis=1)
    y_eval = df_eval['precio']
    
    xg_train = xgb.DMatrix(x_train, label=y_train)
    xg_test = xgb.DMatrix(x_test, label=y_test)
    dfg_test = xgb.DMatrix(x_eval, label = y_eval)

    max_depth = int(max_depth)
    n_estimators = int(n_estimators)
    params = {
        'objective':'reg:squarederror', 
        'colsample_bytree' : colsample_bytree , 
        'learning_rate' : learning_rate,
        'max_depth' : max_depth, 
        'alpha' : alpha,
        'n_estimators' : n_estimators,
        'eval_metric': 'mae'
    }

    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    num_round = 25
    
    xg_reg = xgb.train(params, 
                    xg_train, 
                    num_round, 
#                     watchlist
                   )

    y_pred_eval = xg_reg.predict(dfg_test)
    y_pred_test = xg_reg.predict(xg_test)
    return utils.MAE(y_test, y_pred_test)

space = [hp.quniform('colsample_bytree', 0.3, 1, 0.05), hp.quniform('learning_rate', 0.01, 0.15, 0.01),
         hp.quniform("max_depth", 1, 20, 1),hp.uniform("alpha", 0.01, 30),
        hp.quniform("n_estimators", 100, 600, 30), hp.quniform("test_size", 0.1, 0.4, 0.05)]

hps = fmin(xgb_eval, space=space, algo=tpe.suggest, max_evals=50)

display(hps)

  2%|▏         | 1/50 [00:05<04:07,  5.06s/it, best loss: 1087519.312549018]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



  4%|▍         | 2/50 [00:07<03:27,  4.33s/it, best loss: 949379.2341007965]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



  6%|▌         | 3/50 [00:12<03:24,  4.36s/it, best loss: 776487.3294676339]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



  8%|▊         | 4/50 [00:19<03:57,  5.16s/it, best loss: 776487.3294676339]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 10%|█         | 5/50 [00:22<03:22,  4.50s/it, best loss: 776487.3294676339]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 12%|█▏        | 6/50 [00:34<05:08,  7.00s/it, best loss: 655241.3753032989]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 14%|█▍        | 7/50 [00:49<06:34,  9.17s/it, best loss: 655241.3753032989]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 16%|█▌        | 8/50 [00:54<05:36,  8.01s/it, best loss: 655241.3753032989]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 18%|█▊        | 9/50 [00:57<04:24,  6.45s/it, best loss: 655241.3753032989]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 20%|██        | 10/50 [01:15<06:42, 10.06s/it, best loss: 645450.4960186888]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 22%|██▏       | 11/50 [01:42<09:50, 15.13s/it, best loss: 645450.4960186888]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 24%|██▍       | 12/50 [01:59<09:58, 15.75s/it, best loss: 645450.4960186888]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 26%|██▌       | 13/50 [02:13<09:17, 15.07s/it, best loss: 645450.4960186888]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 28%|██▊       | 14/50 [02:23<08:10, 13.63s/it, best loss: 645450.4960186888]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 30%|███       | 15/50 [02:30<06:47, 11.64s/it, best loss: 645450.4960186888]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 32%|███▏      | 16/50 [02:35<05:29,  9.71s/it, best loss: 645450.4960186888]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 34%|███▍      | 17/50 [02:46<05:33, 10.12s/it, best loss: 645450.4960186888]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 36%|███▌      | 18/50 [02:57<05:28, 10.28s/it, best loss: 645450.4960186888]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 38%|███▊      | 19/50 [03:12<06:04, 11.75s/it, best loss: 645450.4960186888]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 40%|████      | 20/50 [03:31<06:51, 13.73s/it, best loss: 645450.4960186888]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 42%|████▏     | 21/50 [03:59<08:42, 18.02s/it, best loss: 612268.921015625] 

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 44%|████▍     | 22/50 [04:19<08:44, 18.75s/it, best loss: 612268.921015625]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 46%|████▌     | 23/50 [04:49<09:55, 22.07s/it, best loss: 612268.921015625]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 48%|████▊     | 24/50 [05:19<10:36, 24.48s/it, best loss: 612268.921015625]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 50%|█████     | 25/50 [05:29<08:26, 20.25s/it, best loss: 612268.921015625]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 52%|█████▏    | 26/50 [06:05<09:54, 24.78s/it, best loss: 612268.921015625]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 54%|█████▍    | 27/50 [06:25<09:00, 23.49s/it, best loss: 612268.921015625]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 56%|█████▌    | 28/50 [06:40<07:36, 20.73s/it, best loss: 612268.921015625]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 58%|█████▊    | 29/50 [07:05<07:45, 22.16s/it, best loss: 612268.921015625]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 60%|██████    | 30/50 [07:30<07:38, 22.91s/it, best loss: 612268.921015625]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 62%|██████▏   | 31/50 [07:50<07:00, 22.14s/it, best loss: 611636.8553982843]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 64%|██████▍   | 32/50 [08:03<05:50, 19.47s/it, best loss: 611636.8553982843]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 66%|██████▌   | 33/50 [08:28<05:57, 21.00s/it, best loss: 611636.8553982843]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 68%|██████▊   | 34/50 [08:31<04:08, 15.52s/it, best loss: 611636.8553982843]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 70%|███████   | 35/50 [09:01<05:00, 20.01s/it, best loss: 611636.8553982843]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 72%|███████▏  | 36/50 [09:23<04:48, 20.61s/it, best loss: 611636.8553982843]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 74%|███████▍  | 37/50 [09:40<04:14, 19.60s/it, best loss: 611636.8553982843]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 76%|███████▌  | 38/50 [10:06<04:17, 21.50s/it, best loss: 611636.8553982843]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 78%|███████▊  | 39/50 [10:12<03:05, 16.87s/it, best loss: 611636.8553982843]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 80%|████████  | 40/50 [10:33<03:01, 18.13s/it, best loss: 611636.8553982843]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 82%|████████▏ | 41/50 [10:42<02:16, 15.20s/it, best loss: 611636.8553982843]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 84%|████████▍ | 42/50 [11:05<02:20, 17.52s/it, best loss: 611636.8553982843]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 86%|████████▌ | 43/50 [11:23<02:03, 17.69s/it, best loss: 611636.8553982843]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 88%|████████▊ | 44/50 [11:28<01:24, 14.04s/it, best loss: 611636.8553982843]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 90%|█████████ | 45/50 [11:32<00:55, 11.08s/it, best loss: 611636.8553982843]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 92%|█████████▏| 46/50 [11:42<00:41, 10.48s/it, best loss: 611636.8553982843]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 94%|█████████▍| 47/50 [11:50<00:30, 10.00s/it, best loss: 611636.8553982843]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 96%|█████████▌| 48/50 [12:05<00:22, 11.29s/it, best loss: 611636.8553982843]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 98%|█████████▊| 49/50 [12:22<00:12, 12.95s/it, best loss: 611636.8553982843]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



100%|██████████| 50/50 [12:34<00:00, 15.10s/it, best loss: 611636.8553982843]


{'alpha': 19.242014009068196,
 'colsample_bytree': 0.7000000000000001,
 'learning_rate': 0.15,
 'max_depth': 16.0,
 'n_estimators': 480.0,
 'test_size': 0.1}

### Busco los mejores features

In [37]:
features = ['antiguedad', 'habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 
            # 'lat', 'lng',
       'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas', 'centroscomercialescercanos']

features_test = ['tipo_propiedad_compartida',
                 'prop_frecuente', 'top_provincia', 'es_ciudad_centrica',
                 'promedio_metros_totales_provincia',
                 'promedio_metros_cubiertos_provincia', 'porcentaje_metros',
                 'diferencia_metros', 'delincuencia', 'turismo',
                 'promedio_precio_ciudad', 'promedio_id_zona',
                 'promedio_precio_tipo_propiedad','es_antigua',
                 'promedio_por_mes', 'promedio_precio_habitaciones',
                 'promedio_precio_habitaciones_banos_garages','cantidad_inquilinos',
                 'promedio_precio_banos_garages', 'promedio_precio_booleanos',
                 'metros_totales_normalizados', 'metros_cubiertos_normalizados','escomercial',
                 'promedio_metros_cub_tipo_propiedad', 'anio', 'mes', 'dia','trimestre']


In [43]:
hps = {'alpha': 9.499855511978337,
 'colsample_bytree': 0.8500000000000001,
 'learning_rate': 0.2226513740962932,
 'max_depth': 20.0,
 'n_estimators': 240.0,
 'test_size': 0.103950169925154826
}

alpha = hps['alpha']
colsample_bytree = hps['colsample_bytree']
max_depth = hps['max_depth']
learning_rate = hps['learning_rate']
n_estimators = hps['n_estimators']

n_estimators = int(hps['n_estimators'])
max_depth = int(hps['max_depth'])

params = {
        'objective':'reg:squarederror', 
        'colsample_bytree' : colsample_bytree , 
        'learning_rate' : learning_rate,
        'max_depth' : max_depth, 
        'alpha' : alpha,
        'n_estimators' : n_estimators,
        'eval_metric': 'mae'
}


base_train = 0
base_test = 0
base_eval = 0
for i in ['None'] + features_test:
    x_train, x_test, y_train, y_test = dividir_dataset(df_train_f, 'precio', features + [i])

    xg_train = xgb.DMatrix(x_train, label=y_train)
    xg_test = xgb.DMatrix(x_test, label=y_test)
    xg_train = xgb.DMatrix(x_train, label=y_train)
    xg_test = xgb.DMatrix(x_test, label=y_test)

    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    num_round = 50
    
    xg_reg = xgb.train(params, 
                    xg_train, 
                    num_round, 
                    watchlist
                   )
    
    x_train_predict = xgb.DMatrix(x_train, label=y_train)
    x_test_predict = xgb.DMatrix(x_test, label=y_test)
    x_eval_predict = xgb.DMatrix(filtrar_features(df_test_f, features + [i]))
    
    y_pred_test = xg_reg.predict(x_test_predict)
    y_pred_train = xg_reg.predict(x_train_predict)
    y_pred_eval = xg_reg.predict(x_eval_predict)

    xgb_mae_train = MAE(y_train, y_pred_train)
    xgb_mae = MAE(y_test, y_pred_test)
    xgb_mae_eval = MAE(df_test_f['precio'].values, y_pred_eval)

    print(f"MAE LightGBM (train): {xgb_mae_train:.5f}")
    print(f"MAE LightGBM (test): {xgb_mae:.5f}")
    print(f"MAE LightGBM (eval): {xgb_mae_eval:.5f}")
    if i is not 'None':
        print(f"Overfitting (base_eval - base_test) - (eval - test) - {i}: {(base_eval - base_test) - (xgb_mae_eval - xgb_mae)}")
        print(f"Diff evaluation (base_eval - eval)                  - {i}: {base_eval - xgb_mae_eval}")
        print(f"Diff train (base_train - train)                     - {i}: {base_train - xgb_mae_train}")
    else:
        base_train = xgb_mae_train
        base_test = xgb_mae
        base_eval = xgb_mae_eval


NameError: name 'features_test' is not defined

## Predicción

In [88]:
features = ['antiguedad', 'habitaciones', 'garages', 'banos', 'metroscubiertos',
       'metrostotales', 'idzona', 'lat', 'lng', 'gimnasio', 'usosmultiples',
       'piscina', 'escuelascercanas', 'centroscomercialescercanos', 'precio']

features_extra = ['prop_frecuente', 'es_ciudad_centrica', 'porcentaje_metros', 'diferencia_metros',
       'delincuencia','turismo','promedio_precio_ciudad', 'promedio_precio_tipo_propiedad',
       'anio','promedio_metros_cub_tipo_propiedad', 'top_provincia', 'promedio_id_zona', 'promedio_por_mes', 
       'count_id_zona', 'count_ciudad', 'puntaje', 'count_tipo_propiedad', 'count_tipo_propiedad_ciudad']

features += features_extra


# Preparación hiperparámetros

# hps = {'alpha': 9.499855511978337,
#  'colsample_bytree': 0.8500000000000001,
#  'learning_rate': 0.2226513740962932,
#  'max_depth': 20.0,
#  'n_estimators': 240.0,
#  'test_size': 0.103950169925154826
# }
hps = {'alpha': 12.338515720263425,
 'colsample_bytree': 0.8500000000000001,
 'learning_rate': 0.15,
 'max_depth': 17.0,
 'n_estimators': 600.0,
 'test_size': 0.15000000000000002}

alpha = hps['alpha']
colsample_bytree = hps['colsample_bytree']
max_depth = hps['max_depth']
learning_rate = hps['learning_rate']
n_estimators = hps['n_estimators']
test_size = hps['test_size']

n_estimators = int(hps['n_estimators'])
max_depth = int(hps['max_depth'])


# Preparacion de los datos
x_train, x_test, y_train, y_test = utils.dividir_dataset(df_XGBoost, 'precio', features, test_size=test_size)

x_eval = df_eval.drop('precio', axis=1)
y_eval = df_eval['precio']

xg_train = xgb.DMatrix(x_train, label=y_train)
xg_test = xgb.DMatrix(x_test, label=y_test)
dfg_test = xgb.DMatrix(x_eval, label = y_eval)


params = {
    'objective':'reg:squarederror', 
    'colsample_bytree' : colsample_bytree , 
    'learning_rate' : learning_rate,
    'max_depth' : max_depth, 
    'alpha' : alpha,
    'n_estimators' : n_estimators,
    'eval_metric': 'mae'
}

watchlist = [(xg_train, 'train'), (xg_test, 'test')]

# Entrenamiento
xg_reg = xgb.train(params, 
                   xg_train, 
                   num_boost_round=50,
                   evals=watchlist)

# Prediccion
y_pred_train = xg_reg.predict(xg_train)
y_pred_test = xg_reg.predict(xg_test)


linear_mae_train = utils.MAE(y_train, y_pred_train)
linear_mae = utils.MAE(y_test, y_pred_test)

y_pred_eval = xg_reg.predict(dfg_test)
linear_mae_eval = utils.MAE(y_eval, y_pred_eval)


print(f"MAE (train): {linear_mae_train:.5f}")
print(f"MAE: {linear_mae:.5f}")
print(f"MAE (eval): {linear_mae_eval:.5f}")


[0]	train-mae:2.16103e+06	test-mae:2.17105e+06
[1]	train-mae:1.84888e+06	test-mae:1.86095e+06
[2]	train-mae:1.58441e+06	test-mae:1.60258e+06
[3]	train-mae:1.36168e+06	test-mae:1.39102e+06
[4]	train-mae:1.17792e+06	test-mae:1.22323e+06
[5]	train-mae:1.02169e+06	test-mae:1.0857e+06
[6]	train-mae:890644	test-mae:976696
[7]	train-mae:786948	test-mae:894752
[8]	train-mae:702373	test-mae:834397
[9]	train-mae:626022	test-mae:782273
[10]	train-mae:563955	test-mae:740363
[11]	train-mae:511502	test-mae:709750
[12]	train-mae:468330	test-mae:686668
[13]	train-mae:431716	test-mae:669752
[14]	train-mae:402172	test-mae:656662
[15]	train-mae:376319	test-mae:646482
[16]	train-mae:354688	test-mae:639380
[17]	train-mae:337166	test-mae:636270
[18]	train-mae:324303	test-mae:634012
[19]	train-mae:308031	test-mae:630606
[20]	train-mae:294108	test-mae:627755
[21]	train-mae:282390	test-mae:625016
[22]	train-mae:271090	test-mae:623372
[23]	train-mae:262810	test-mae:622170
[24]	train-mae:254276	test-mae:620695
[

In [37]:
len(y_pred_train)

21206

## Predicción final

In [58]:
df_train_final = pd.read_csv('./data/train.csv')
df_test_final = pd.read_csv('./data/test.csv')

df_train_final_f = features_independientes_precio(df_train_final)
df_train_final_f = features_dependientes_precio(df_train_final_f, df_train_final)
df_test_final_f = features_independientes_precio(df_test_final)
df_test_final_f = features_dependientes_precio(df_test_final_f, df_train_final)

In [63]:
df_XGBoost_fin.columns

Index(['habitaciones', 'prop_frecuente', 'idzona', 'lng', 'porcentaje_metros',
       'promedio_precio_tipo_propiedad', 'count_tipo_propiedad_ciudad',
       'puntaje', 'banos', 'gimnasio', 'metroscubiertos', 'escuelascercanas',
       'promedio_metros_cub_tipo_propiedad', 'piscina', 'count_id_zona',
       'delincuencia', 'promedio_precio_ciudad', 'garages', 'anio',
       'es_ciudad_centrica', 'usosmultiples', 'turismo', 'diferencia_metros',
       'promedio_por_mes', 'metrostotales', 'antiguedad',
       'centroscomercialescercanos', 'count_ciudad', 'top_provincia', 'lat',
       'count_tipo_propiedad', 'promedio_id_zona', 'precio'],
      dtype='object')

In [91]:
features = ['antiguedad', 'habitaciones', 'garages', 'banos', 'metroscubiertos',
       'metrostotales', 'idzona', 'lat', 'lng', 'gimnasio', 'usosmultiples',
       'piscina', 'escuelascercanas', 'centroscomercialescercanos', 'precio']

features_extra = ['prop_frecuente', 'es_ciudad_centrica', 'porcentaje_metros', 'diferencia_metros',
       'delincuencia','turismo','promedio_precio_ciudad', 'promedio_precio_tipo_propiedad',
       'anio','promedio_metros_cub_tipo_propiedad', 'top_provincia', 'promedio_id_zona', 'promedio_por_mes', 
       'count_id_zona', 'count_ciudad', 'puntaje', 'count_tipo_propiedad', 'count_tipo_propiedad_ciudad',
       'promedio_id_zona_gen', 'promedio_precio_tipo_propiedad_ciudad_gen', 'promedio_precio_hbg_tipo_propiedad_provincia']

features += features_extra

# Filtro columnas
df_XGBoost_fin = utils.filtrar_features(df_train_final_f, features)
df_eval_fin = utils.filtrar_features(df_test_final_f, features)

x_train_f, x_test_f, y_train_f, y_test_f = utils.dividir_dataset(df_XGBoost_fin, 'precio', features)


xg_train = xgb.DMatrix(x_train_f, label=y_train_f)
xg_test = xgb.DMatrix(x_test_f, label=y_test_f)

# Modelo - Ver hiperparametros
params = {
    'objective':'reg:squarederror', 
    'colsample_bytree' : colsample_bytree , 
    'learning_rate' : learning_rate,
    'max_depth' : max_depth, 
    'alpha' : alpha,
    'n_estimators' : n_estimators,
    'eval_metric': 'mae'
}

watchlist = [(xg_train, 'train'), (xg_test, 'test')]

# Entrenamiento
xg_reg = xgb.train(params, 
                   xg_train, 
                   num_boost_round=2000,
                   evals=watchlist
                  )


# Prediccion
y_pred_test_f = xg_reg.predict(xgb.DMatrix(x_test_f))
y_pred_train_f = xg_reg.predict(xgb.DMatrix(x_train_f))

linear_mae_train_f = utils.MAE(y_train_f, y_pred_train_f)
linear_mae_f = utils.MAE(y_test_f, y_pred_test_f)



print(f"MAE (train): {linear_mae_train_f:.5f}")
print(f"MAE: {linear_mae_f:.5f}")

prediccion_final = xg_reg.predict(xgb.DMatrix(df_eval_fin))
df_test_final_f['target'] = prediccion_final
df_test_final_f[['id', 'target']].to_csv('respuesta2.csv', index = False)

[0]	train-mae:2.16519e+06	test-mae:2.14518e+06
[1]	train-mae:1.84989e+06	test-mae:1.83578e+06
[2]	train-mae:1.58307e+06	test-mae:1.57667e+06
[3]	train-mae:1.35628e+06	test-mae:1.36049e+06
[4]	train-mae:1.16334e+06	test-mae:1.18102e+06
[5]	train-mae:999444	test-mae:1.03401e+06
[6]	train-mae:860687	test-mae:914441
[7]	train-mae:743501	test-mae:818854
[8]	train-mae:644863	test-mae:742826
[9]	train-mae:562145	test-mae:684232
[10]	train-mae:492934	test-mae:637771
[11]	train-mae:434849	test-mae:602547
[12]	train-mae:386234	test-mae:575684
[13]	train-mae:345586	test-mae:554963
[14]	train-mae:311901	test-mae:539220
[15]	train-mae:284621	test-mae:527430
[16]	train-mae:260964	test-mae:518571
[17]	train-mae:241208	test-mae:511970
[18]	train-mae:224063	test-mae:507203
[19]	train-mae:209813	test-mae:503533
[20]	train-mae:197237	test-mae:500621
[21]	train-mae:186868	test-mae:498669
[22]	train-mae:177364	test-mae:497046
[23]	train-mae:169097	test-mae:495831
[24]	train-mae:161317	test-mae:494889
[25]	

[208]	train-mae:20967.1	test-mae:486745
[209]	train-mae:20866	test-mae:486740
[210]	train-mae:20851.2	test-mae:486740
[211]	train-mae:20827.5	test-mae:486739
[212]	train-mae:20741.7	test-mae:486740
[213]	train-mae:20525.8	test-mae:486723
[214]	train-mae:20452	test-mae:486722
[215]	train-mae:20423.9	test-mae:486722
[216]	train-mae:20339.4	test-mae:486713
[217]	train-mae:20309.2	test-mae:486712
[218]	train-mae:20282.9	test-mae:486711
[219]	train-mae:20218.9	test-mae:486707
[220]	train-mae:20196.8	test-mae:486707
[221]	train-mae:20057.5	test-mae:486694
[222]	train-mae:20007.1	test-mae:486691
[223]	train-mae:19640.3	test-mae:486675
[224]	train-mae:19467.9	test-mae:486669
[225]	train-mae:19266.7	test-mae:486661
[226]	train-mae:19033.7	test-mae:486662
[227]	train-mae:18870.8	test-mae:486657
[228]	train-mae:18615.4	test-mae:486654
[229]	train-mae:18522.6	test-mae:486653
[230]	train-mae:18447.7	test-mae:486644
[231]	train-mae:18354.5	test-mae:486643
[232]	train-mae:18345.6	test-mae:486642
[233

[414]	train-mae:5220.43	test-mae:486422
[415]	train-mae:5209.35	test-mae:486421
[416]	train-mae:5204.71	test-mae:486421
[417]	train-mae:5203.34	test-mae:486421
[418]	train-mae:5175.89	test-mae:486422
[419]	train-mae:5158.59	test-mae:486423
[420]	train-mae:5076.85	test-mae:486419
[421]	train-mae:5072.16	test-mae:486419
[422]	train-mae:5026.41	test-mae:486416
[423]	train-mae:4990.38	test-mae:486416
[424]	train-mae:4958.56	test-mae:486416
[425]	train-mae:4956.93	test-mae:486416
[426]	train-mae:4930.63	test-mae:486415
[427]	train-mae:4878.19	test-mae:486418
[428]	train-mae:4857.54	test-mae:486418
[429]	train-mae:4802.92	test-mae:486415
[430]	train-mae:4735.04	test-mae:486418
[431]	train-mae:4730.21	test-mae:486418
[432]	train-mae:4702.77	test-mae:486420
[433]	train-mae:4693.45	test-mae:486420
[434]	train-mae:4648.41	test-mae:486420
[435]	train-mae:4590.49	test-mae:486420
[436]	train-mae:4567.82	test-mae:486420
[437]	train-mae:4563.9	test-mae:486420
[438]	train-mae:4513.73	test-mae:486420
[

[620]	train-mae:1799.34	test-mae:486393
[621]	train-mae:1797.54	test-mae:486393
[622]	train-mae:1795.07	test-mae:486393
[623]	train-mae:1792.68	test-mae:486393
[624]	train-mae:1792.48	test-mae:486393
[625]	train-mae:1783.48	test-mae:486393
[626]	train-mae:1766.73	test-mae:486391
[627]	train-mae:1757.66	test-mae:486392
[628]	train-mae:1752.17	test-mae:486392
[629]	train-mae:1751.5	test-mae:486392
[630]	train-mae:1739.94	test-mae:486392
[631]	train-mae:1736.86	test-mae:486391
[632]	train-mae:1734.55	test-mae:486392
[633]	train-mae:1727.73	test-mae:486391
[634]	train-mae:1714.48	test-mae:486391
[635]	train-mae:1703.22	test-mae:486391
[636]	train-mae:1692.69	test-mae:486393
[637]	train-mae:1674.09	test-mae:486393
[638]	train-mae:1655.42	test-mae:486393
[639]	train-mae:1634.31	test-mae:486392
[640]	train-mae:1619.12	test-mae:486392
[641]	train-mae:1611.92	test-mae:486393
[642]	train-mae:1604.13	test-mae:486392
[643]	train-mae:1593.08	test-mae:486392
[644]	train-mae:1587.61	test-mae:486392
[

[826]	train-mae:900.338	test-mae:486389
[827]	train-mae:895.926	test-mae:486388
[828]	train-mae:892.93	test-mae:486388
[829]	train-mae:889.835	test-mae:486389
[830]	train-mae:884.551	test-mae:486388
[831]	train-mae:883.506	test-mae:486388
[832]	train-mae:882.656	test-mae:486388
[833]	train-mae:880.49	test-mae:486388
[834]	train-mae:875.796	test-mae:486388
[835]	train-mae:872.638	test-mae:486388
[836]	train-mae:870.798	test-mae:486388
[837]	train-mae:869.778	test-mae:486388
[838]	train-mae:869.047	test-mae:486388
[839]	train-mae:865.449	test-mae:486388
[840]	train-mae:865.229	test-mae:486388
[841]	train-mae:864.522	test-mae:486388
[842]	train-mae:863.061	test-mae:486388
[843]	train-mae:862.493	test-mae:486388
[844]	train-mae:862.362	test-mae:486388
[845]	train-mae:861.71	test-mae:486388
[846]	train-mae:860.743	test-mae:486388
[847]	train-mae:857.029	test-mae:486388
[848]	train-mae:856.856	test-mae:486388
[849]	train-mae:855.853	test-mae:486389
[850]	train-mae:855.628	test-mae:486389
[85

[1031]	train-mae:674.848	test-mae:486387
[1032]	train-mae:674.488	test-mae:486387
[1033]	train-mae:673.975	test-mae:486388
[1034]	train-mae:673.826	test-mae:486388
[1035]	train-mae:673.255	test-mae:486388
[1036]	train-mae:673.137	test-mae:486388
[1037]	train-mae:672.614	test-mae:486388
[1038]	train-mae:672.06	test-mae:486388
[1039]	train-mae:671.587	test-mae:486388
[1040]	train-mae:671.394	test-mae:486388
[1041]	train-mae:670.947	test-mae:486388
[1042]	train-mae:670.313	test-mae:486388
[1043]	train-mae:669.618	test-mae:486388
[1044]	train-mae:669.118	test-mae:486388
[1045]	train-mae:668.691	test-mae:486388
[1046]	train-mae:668.493	test-mae:486388
[1047]	train-mae:668.157	test-mae:486388
[1048]	train-mae:668.117	test-mae:486388
[1049]	train-mae:668.077	test-mae:486388
[1050]	train-mae:667.807	test-mae:486388
[1051]	train-mae:667.034	test-mae:486388
[1052]	train-mae:666.725	test-mae:486388
[1053]	train-mae:666.12	test-mae:486387
[1054]	train-mae:665.434	test-mae:486387
[1055]	train-mae:6

[1232]	train-mae:620.475	test-mae:486388
[1233]	train-mae:620.257	test-mae:486388
[1234]	train-mae:620.106	test-mae:486388
[1235]	train-mae:619.97	test-mae:486388
[1236]	train-mae:619.891	test-mae:486388
[1237]	train-mae:619.824	test-mae:486388
[1238]	train-mae:619.748	test-mae:486388
[1239]	train-mae:619.423	test-mae:486388
[1240]	train-mae:619.293	test-mae:486388
[1241]	train-mae:619.254	test-mae:486388
[1242]	train-mae:619.22	test-mae:486388
[1243]	train-mae:619.147	test-mae:486388
[1244]	train-mae:619.081	test-mae:486388
[1245]	train-mae:618.827	test-mae:486388
[1246]	train-mae:618.571	test-mae:486388
[1247]	train-mae:618.336	test-mae:486388
[1248]	train-mae:618.173	test-mae:486388
[1249]	train-mae:617.858	test-mae:486388
[1250]	train-mae:617.761	test-mae:486388
[1251]	train-mae:617.658	test-mae:486388
[1252]	train-mae:617.543	test-mae:486388
[1253]	train-mae:617.159	test-mae:486388
[1254]	train-mae:616.994	test-mae:486388
[1255]	train-mae:616.721	test-mae:486388
[1256]	train-mae:6

[1433]	train-mae:604.364	test-mae:486388
[1434]	train-mae:604.235	test-mae:486388
[1435]	train-mae:604.157	test-mae:486388
[1436]	train-mae:604.143	test-mae:486388
[1437]	train-mae:604.034	test-mae:486388
[1438]	train-mae:603.973	test-mae:486388
[1439]	train-mae:603.95	test-mae:486388
[1440]	train-mae:603.899	test-mae:486388
[1441]	train-mae:603.879	test-mae:486388
[1442]	train-mae:603.856	test-mae:486388
[1443]	train-mae:603.802	test-mae:486388
[1444]	train-mae:603.725	test-mae:486388
[1445]	train-mae:603.713	test-mae:486388
[1446]	train-mae:603.677	test-mae:486388
[1447]	train-mae:603.653	test-mae:486388
[1448]	train-mae:603.605	test-mae:486388
[1449]	train-mae:603.553	test-mae:486388
[1450]	train-mae:603.489	test-mae:486388
[1451]	train-mae:603.406	test-mae:486388
[1452]	train-mae:603.395	test-mae:486388
[1453]	train-mae:603.338	test-mae:486388
[1454]	train-mae:603.288	test-mae:486388
[1455]	train-mae:603.241	test-mae:486388
[1456]	train-mae:603.165	test-mae:486388
[1457]	train-mae:

[1634]	train-mae:599.366	test-mae:486388
[1635]	train-mae:599.358	test-mae:486388
[1636]	train-mae:599.349	test-mae:486388
[1637]	train-mae:599.34	test-mae:486388
[1638]	train-mae:599.334	test-mae:486388
[1639]	train-mae:599.328	test-mae:486388
[1640]	train-mae:599.314	test-mae:486388
[1641]	train-mae:599.306	test-mae:486388
[1642]	train-mae:599.29	test-mae:486388
[1643]	train-mae:599.279	test-mae:486388
[1644]	train-mae:599.263	test-mae:486388
[1645]	train-mae:599.246	test-mae:486388
[1646]	train-mae:599.229	test-mae:486388
[1647]	train-mae:599.221	test-mae:486388
[1648]	train-mae:599.209	test-mae:486388
[1649]	train-mae:599.205	test-mae:486388
[1650]	train-mae:599.201	test-mae:486388
[1651]	train-mae:599.189	test-mae:486388
[1652]	train-mae:599.178	test-mae:486388
[1653]	train-mae:599.174	test-mae:486388
[1654]	train-mae:599.171	test-mae:486388
[1655]	train-mae:599.165	test-mae:486388
[1656]	train-mae:599.149	test-mae:486388
[1657]	train-mae:599.138	test-mae:486388
[1658]	train-mae:5

[1835]	train-mae:598.384	test-mae:486388
[1836]	train-mae:598.382	test-mae:486388
[1837]	train-mae:598.379	test-mae:486388
[1838]	train-mae:598.378	test-mae:486388
[1839]	train-mae:598.378	test-mae:486388
[1840]	train-mae:598.375	test-mae:486388
[1841]	train-mae:598.373	test-mae:486388
[1842]	train-mae:598.372	test-mae:486388
[1843]	train-mae:598.37	test-mae:486388
[1844]	train-mae:598.37	test-mae:486388
[1845]	train-mae:598.369	test-mae:486388
[1846]	train-mae:598.369	test-mae:486388
[1847]	train-mae:598.367	test-mae:486388
[1848]	train-mae:598.366	test-mae:486388
[1849]	train-mae:598.364	test-mae:486388
[1850]	train-mae:598.362	test-mae:486388
[1851]	train-mae:598.359	test-mae:486388
[1852]	train-mae:598.357	test-mae:486388
[1853]	train-mae:598.355	test-mae:486388
[1854]	train-mae:598.354	test-mae:486388
[1855]	train-mae:598.353	test-mae:486388
[1856]	train-mae:598.352	test-mae:486388
[1857]	train-mae:598.35	test-mae:486388
[1858]	train-mae:598.348	test-mae:486388
[1859]	train-mae:59

### Verificación de Nulos

In [None]:
df_test_final_f['target'].isnull().any()

In [None]:
df_test_final_f['target'].isnull().value_counts()

In [75]:
x_train_f.columns

Index(['habitaciones', 'prop_frecuente', 'idzona', 'lng', 'porcentaje_metros',
       'promedio_precio_tipo_propiedad', 'count_tipo_propiedad_ciudad',
       'puntaje', 'banos', 'gimnasio', 'escuelascercanas',
       'promedio_metros_cub_tipo_propiedad', 'piscina', 'count_id_zona',
       'delincuencia', 'promedio_precio_ciudad', 'garages', 'anio',
       'es_ciudad_centrica', 'usosmultiples', 'turismo', 'diferencia_metros',
       'promedio_por_mes', 'metrostotales', 'antiguedad',
       'centroscomercialescercanos', 'count_ciudad', 'top_provincia', 'lat',
       'count_tipo_propiedad', 'promedio_id_zona', 'metroscubiertos'],
      dtype='object')