In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.model_selection import train_test_split
from ipynb.fs.full.features import features_independientes_precio, features_dependientes_precio, columna_a_ohe
import ipynb.fs.full.utils as utils
from hyperopt import fmin, tpe, hp
from sklearn.metrics import mean_absolute_error

df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')

### Agrego Features

In [2]:
df_train_f = features_independientes_precio(df_train)
df_train_f = features_dependientes_precio(df_train_f, df_train)

df_test_f = features_independientes_precio(df_test)
df_test_f = features_dependientes_precio(df_test_f, df_train)

df_test_f, cols_tipodepropiedad_ohe = columna_a_ohe(df_test_f, 'tipodepropiedad', N=100, df_aux=df_train, devolver_cols=True)
df_test_f, cols_provincia_ohe = columna_a_ohe(df_test_f, 'provincia', N=100, df_aux=df_train, devolver_cols=True)

df_test_f, cols_tipodepropiedad_ohe = columna_a_ohe(df_test_f, 'tipodepropiedad', N=100, df_aux=df_train, devolver_cols=True)
df_test_f, cols_provincia_ohe = columna_a_ohe(df_test_f, 'provincia', N=100, df_aux=df_train, devolver_cols=True)

df_train_f = columna_a_ohe(df_train_f, 'tipodepropiedad', N=100, df_aux=df_test)
df_train_f = columna_a_ohe(df_train_f, 'provincia', N=100, df_aux=df_test)

### Filtro columnas

In [3]:
features = ['habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 
#             'lat', 'lng',
       'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas', 'centroscomercialescercanos']

features_test = ['prop_frecuente', 'top_provincia', 'porcentaje_metros', 'diferencia_metros', 'promedio_precio_ciudad', 
                 'promedio_por_mes', 'anio', 'promedio_id_zona', 'promedio_precio_tipo_propiedad', 
                 'promedio_precio_hbg_tipo_propiedad', 'count_id_zona', 'count_ciudad', 'puntaje', 
                 'count_tipo_propiedad', 'count_tipo_propiedad_ciudad', 
                 'promedio_precio_tipo_propiedad_ciudad_gen', 'promedio_precio_hbg_tipo_propiedad_provincia']

features += features_test

features += cols_tipodepropiedad_ohe + cols_provincia_ohe

df_XGBoost, df_eval = utils.dividir_df_testeo(df_train_f, test_size=0.15)

df_XGBoost = utils.filtrar_features(df_XGBoost, features, 'precio')
df_eval = utils.filtrar_features(df_eval, features, 'precio')

## Busco los mejores hiperparametros

In [4]:
def xgb_eval(args):
    
    # Modelo - Ver hiperparametros
    colsample_bytree, learning_rate, max_depth, alpha, n_estimators, test_size = args
    
    # Preparacion de los datos
    x_train, x_test, y_train, y_test = utils.dividir_dataset(df_XGBoost, 'precio', features, test_size=test_size)
    
    x_eval = df_eval.drop('precio', axis=1)
    y_eval = df_eval['precio']
    
    xg_train = xgb.DMatrix(x_train, label=y_train)
    xg_test = xgb.DMatrix(x_test, label=y_test)
    dfg_test = xgb.DMatrix(x_eval, label = y_eval)

    max_depth = int(max_depth)
    n_estimators = int(n_estimators)
    params = {
        'objective':'reg:squarederror', 
        'colsample_bytree' : colsample_bytree , 
        'learning_rate' : learning_rate,
        'max_depth' : max_depth, 
        'alpha' : alpha,
        'n_estimators' : n_estimators,
        'eval_metric': 'mae'
    }

    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    num_round = 25
    
    xg_reg = xgb.train(params, 
                    xg_train, 
                    num_round, 
#                     watchlist
                   )

    y_pred_eval = xg_reg.predict(dfg_test)
    y_pred_test = xg_reg.predict(xg_test)
    return utils.MAE(y_eval, y_pred_eval)

space = [hp.quniform('colsample_bytree', 0.3, 1, 0.05), hp.quniform('learning_rate', 0.01, 0.15, 0.01),
         hp.quniform("max_depth", 1, 20, 1),hp.uniform("alpha", 0.01, 30),
        hp.quniform("n_estimators", 100, 600, 30), hp.quniform("test_size", 0.1, 0.4, 0.05)]

hps = fmin(xgb_eval, space=space, algo=tpe.suggest, max_evals=50)

display(hps)

  0%|          | 0/50 [00:00<?, ?it/s, best loss: ?]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



  2%|▏         | 1/50 [00:13<10:55, 13.37s/it, best loss: 527353.3421701388]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



  4%|▍         | 2/50 [00:17<08:26, 10.54s/it, best loss: 527353.3421701388]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



  6%|▌         | 3/50 [00:30<08:49, 11.27s/it, best loss: 526487.0738363715]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



  8%|▊         | 4/50 [00:43<09:08, 11.92s/it, best loss: 526487.0738363715]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 10%|█         | 5/50 [01:01<10:13, 13.64s/it, best loss: 526487.0738363715]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 12%|█▏        | 6/50 [01:09<08:49, 12.04s/it, best loss: 526487.0738363715]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 14%|█▍        | 7/50 [01:21<08:41, 12.12s/it, best loss: 526487.0738363715]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 16%|█▌        | 8/50 [01:37<09:09, 13.09s/it, best loss: 526487.0738363715]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 18%|█▊        | 9/50 [02:23<15:40, 22.95s/it, best loss: 526487.0738363715]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 20%|██        | 10/50 [03:43<26:40, 40.02s/it, best loss: 520398.3488046875]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 22%|██▏       | 11/50 [03:55<20:35, 31.67s/it, best loss: 520398.3488046875]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 24%|██▍       | 12/50 [04:04<15:49, 25.00s/it, best loss: 520398.3488046875]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 26%|██▌       | 13/50 [04:09<11:41, 18.96s/it, best loss: 520398.3488046875]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 28%|██▊       | 14/50 [04:19<09:40, 16.14s/it, best loss: 520398.3488046875]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 30%|███       | 15/50 [04:23<07:18, 12.52s/it, best loss: 520398.3488046875]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 32%|███▏      | 16/50 [04:38<07:29, 13.23s/it, best loss: 520398.3488046875]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 34%|███▍      | 17/50 [04:59<08:34, 15.59s/it, best loss: 520398.3488046875]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 36%|███▌      | 18/50 [05:06<07:02, 13.21s/it, best loss: 520398.3488046875]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 38%|███▊      | 19/50 [05:12<05:37, 10.88s/it, best loss: 520398.3488046875]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 40%|████      | 20/50 [05:19<04:49,  9.66s/it, best loss: 520398.3488046875]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 42%|████▏     | 21/50 [05:58<08:59, 18.61s/it, best loss: 508333.1542291667]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 44%|████▍     | 22/50 [06:38<11:40, 25.01s/it, best loss: 506570.3321588542]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 46%|████▌     | 23/50 [07:25<14:10, 31.51s/it, best loss: 506570.3321588542]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 48%|████▊     | 24/50 [08:02<14:21, 33.13s/it, best loss: 506570.3321588542]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 50%|█████     | 25/50 [08:36<13:54, 33.38s/it, best loss: 505790.8125052083]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 52%|█████▏    | 26/50 [08:59<12:05, 30.25s/it, best loss: 505790.8125052083]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 54%|█████▍    | 27/50 [09:35<12:20, 32.21s/it, best loss: 501508.92386371526]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 56%|█████▌    | 28/50 [10:04<11:27, 31.24s/it, best loss: 501508.92386371526]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 58%|█████▊    | 29/50 [10:25<09:51, 28.18s/it, best loss: 501508.92386371526]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 60%|██████    | 30/50 [10:51<09:07, 27.37s/it, best loss: 501508.92386371526]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 62%|██████▏   | 31/50 [11:10<07:55, 25.01s/it, best loss: 501508.92386371526]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 64%|██████▍   | 32/50 [11:41<08:01, 26.76s/it, best loss: 501508.92386371526]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 66%|██████▌   | 33/50 [11:59<06:50, 24.14s/it, best loss: 501508.92386371526]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 68%|██████▊   | 34/50 [12:38<07:36, 28.56s/it, best loss: 501508.92386371526]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 70%|███████   | 35/50 [13:21<08:14, 32.99s/it, best loss: 501508.92386371526]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 72%|███████▏  | 36/50 [13:33<06:11, 26.56s/it, best loss: 501508.92386371526]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 74%|███████▍  | 37/50 [13:52<05:14, 24.23s/it, best loss: 501508.92386371526]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 76%|███████▌  | 38/50 [14:05<04:12, 21.07s/it, best loss: 501508.92386371526]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 78%|███████▊  | 39/50 [14:54<05:22, 29.30s/it, best loss: 501508.92386371526]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 80%|████████  | 40/50 [15:04<03:56, 23.66s/it, best loss: 501508.92386371526]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 82%|████████▏ | 41/50 [15:40<04:04, 27.21s/it, best loss: 501508.92386371526]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 84%|████████▍ | 42/50 [16:08<03:40, 27.56s/it, best loss: 501508.92386371526]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 86%|████████▌ | 43/50 [16:45<03:31, 30.20s/it, best loss: 501508.92386371526]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 88%|████████▊ | 44/50 [17:03<02:40, 26.67s/it, best loss: 501508.92386371526]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 90%|█████████ | 45/50 [17:32<02:16, 27.33s/it, best loss: 501508.92386371526]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 92%|█████████▏| 46/50 [18:10<02:02, 30.64s/it, best loss: 501508.92386371526]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 94%|█████████▍| 47/50 [18:22<01:14, 24.83s/it, best loss: 501508.92386371526]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 96%|█████████▌| 48/50 [18:57<00:56, 28.06s/it, best loss: 501508.92386371526]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 98%|█████████▊| 49/50 [19:22<00:27, 27.14s/it, best loss: 501508.92386371526]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



100%|██████████| 50/50 [19:31<00:00, 23.44s/it, best loss: 501508.92386371526]


{'alpha': 20.91434940058063,
 'colsample_bytree': 0.65,
 'learning_rate': 0.14,
 'max_depth': 16.0,
 'n_estimators': 150.0,
 'test_size': 0.1}

### Busco los mejores features

In [None]:
features = ['antiguedad', 'habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 
            # 'lat', 'lng',
       'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas', 'centroscomercialescercanos']

features_test = ['tipo_propiedad_compartida',
                 'prop_frecuente', 'top_provincia', 'es_ciudad_centrica',
                 'promedio_metros_totales_provincia',
                 'promedio_metros_cubiertos_provincia', 'porcentaje_metros',
                 'diferencia_metros', 'delincuencia', 'turismo',
                 'promedio_precio_ciudad', 'promedio_id_zona',
                 'promedio_precio_tipo_propiedad','es_antigua',
                 'promedio_por_mes', 'promedio_precio_habitaciones',
                 'promedio_precio_habitaciones_banos_garages','cantidad_inquilinos',
                 'promedio_precio_banos_garages', 'promedio_precio_booleanos',
                 'metros_totales_normalizados', 'metros_cubiertos_normalizados','escomercial',
                 'promedio_metros_cub_tipo_propiedad', 'anio', 'mes', 'dia','trimestre']


In [None]:
hps = {'alpha': 9.499855511978337,
 'colsample_bytree': 0.8500000000000001,
 'learning_rate': 0.2226513740962932,
 'max_depth': 20.0,
 'n_estimators': 240.0,
 'test_size': 0.103950169925154826
}

alpha = hps['alpha']
colsample_bytree = hps['colsample_bytree']
max_depth = hps['max_depth']
learning_rate = hps['learning_rate']
n_estimators = hps['n_estimators']

n_estimators = int(hps['n_estimators'])
max_depth = int(hps['max_depth'])

params = {
        'objective':'reg:squarederror', 
        'colsample_bytree' : colsample_bytree , 
        'learning_rate' : learning_rate,
        'max_depth' : max_depth, 
        'alpha' : alpha,
        'n_estimators' : n_estimators,
        'eval_metric': 'mae'
}


base_train = 0
base_test = 0
base_eval = 0
for i in ['None'] + features_test:
    x_train, x_test, y_train, y_test = utils.dividir_dataset(df_train_f, 'precio', features + [i])

    xg_train = xgb.DMatrix(x_train, label=y_train)
    xg_test = xgb.DMatrix(x_test, label=y_test)
    xg_train = xgb.DMatrix(x_train, label=y_train)
    xg_test = xgb.DMatrix(x_test, label=y_test)

    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    num_round = 10
    
    xg_reg = xgb.train(params, 
                    xg_train, 
                    num_round, 
                    watchlist
                   )
    
    x_train_predict = xgb.DMatrix(x_train, label=y_train)
    x_test_predict = xgb.DMatrix(x_test, label=y_test)
    x_eval_predict = xgb.DMatrix(utils.filtrar_features(df_eval, features + [i]))
    
    y_pred_test = xg_reg.predict(x_test_predict)
    y_pred_train = xg_reg.predict(x_train_predict)
    y_pred_eval = xg_reg.predict(x_eval_predict)

    xgb_mae_train = utils.MAE(y_train, y_pred_train)
    xgb_mae = utils.MAE(y_test, y_pred_test)
    xgb_mae_eval = utils.MAE(df_eval['precio'].values, y_pred_eval)

    print(f"MAE LightGBM (train): {xgb_mae_train:.5f}")
    print(f"MAE LightGBM (test): {xgb_mae:.5f}")
    print(f"MAE LightGBM (eval): {xgb_mae_eval:.5f}")
    if i is not 'None':
        print(f"Overfitting (base_eval - base_test) - (eval - test) - {i}: {(base_eval - base_test) - (xgb_mae_eval - xgb_mae)}")
        print(f"Diff evaluation (base_eval - eval)                  - {i}: {base_eval - xgb_mae_eval}")
        print(f"Diff train (base_train - train)                     - {i}: {base_train - xgb_mae_train}")
    else:
        base_train = xgb_mae_train
        base_test = xgb_mae
        base_eval = xgb_mae_eval


## Predicción

In [5]:
features = ['habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 
#             'lat', 'lng',
       'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas', 'centroscomercialescercanos']

features_test = ['prop_frecuente', 'top_provincia', 'porcentaje_metros', 'diferencia_metros', 'promedio_precio_ciudad', 
                 'promedio_por_mes', 'anio', 'promedio_id_zona', 'promedio_precio_tipo_propiedad', 
                 'promedio_precio_hbg_tipo_propiedad', 'count_id_zona', 'count_ciudad', 'puntaje', 
                 'count_tipo_propiedad', 'count_tipo_propiedad_ciudad', 
                 'promedio_precio_tipo_propiedad_ciudad_gen', 'promedio_precio_hbg_tipo_propiedad_provincia']

features += features_test

features += cols_tipodepropiedad_ohe + cols_provincia_ohe


# Preparación hiperparámetros

# hps = {'alpha': 12.338515720263425,
#  'colsample_bytree': 0.8500000000000001,
#  'learning_rate': 0.15,
#  'max_depth': 17.0,
#  'n_estimators': 300.0,
#  'test_size': 0.15000000000000002}

hps = {'alpha': 20.91434940058063,
 'colsample_bytree': 0.65,
 'learning_rate': 0.14,
 'max_depth': int(16.0),
 'n_estimators': int(150.0),
 'test_size': 0.1}

alpha = hps['alpha']
colsample_bytree = hps['colsample_bytree']
max_depth = hps['max_depth']
learning_rate = hps['learning_rate']
n_estimators = hps['n_estimators']
test_size = hps['test_size']

n_estimators = int(hps['n_estimators'])
max_depth = int(hps['max_depth'])


# Preparacion de los datos
x_train, x_test, y_train, y_test = utils.dividir_dataset(df_XGBoost, 'precio', features, test_size=test_size)

x_eval = df_eval.drop('precio', axis=1)
y_eval = df_eval['precio']

xg_train = xgb.DMatrix(x_train, label=y_train)
xg_test = xgb.DMatrix(x_test, label=y_test)
dfg_test = xgb.DMatrix(x_eval, label = y_eval)


params = {
    'objective':'reg:squarederror', 
    'colsample_bytree' : colsample_bytree , 
    'learning_rate' : learning_rate,
    'max_depth' : max_depth, 
    'alpha' : alpha,
    'n_estimators' : n_estimators,
    'eval_metric': 'mae'
}

watchlist = [(xg_train, 'train'), (xg_test, 'test')]

# Entrenamiento
xg_reg = xgb.train(params, 
                   xg_train, 
                   num_boost_round=50,
                   evals=watchlist)

# Prediccion
y_pred_train = xg_reg.predict(xg_train)
y_pred_test = xg_reg.predict(xg_test)


linear_mae_train = utils.MAE(y_train, y_pred_train)
linear_mae = utils.MAE(y_test, y_pred_test)

y_pred_eval = xg_reg.predict(dfg_test)
linear_mae_eval = utils.MAE(y_eval, y_pred_eval)


print(f"MAE (train): {linear_mae_train:.5f}")
print(f"MAE: {linear_mae:.5f}")
print(f"MAE (eval): {linear_mae_eval:.5f}")


  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


[0]	train-mae:2.18158e+06	test-mae:2.1978e+06
[1]	train-mae:1.88329e+06	test-mae:1.90038e+06
[2]	train-mae:1.62764e+06	test-mae:1.64632e+06
[3]	train-mae:1.40866e+06	test-mae:1.43295e+06
[4]	train-mae:1.22136e+06	test-mae:1.25308e+06
[5]	train-mae:1.06140e+06	test-mae:1.10262e+06
[6]	train-mae:925465	test-mae:979498
[7]	train-mae:810357	test-mae:877989
[8]	train-mae:713868	test-mae:797339
[9]	train-mae:632277	test-mae:732024
[10]	train-mae:562950	test-mae:680198
[11]	train-mae:506820	test-mae:640341
[12]	train-mae:460596	test-mae:609310
[13]	train-mae:420124	test-mae:584957
[14]	train-mae:386418	test-mae:566562
[15]	train-mae:359766	test-mae:552164
[16]	train-mae:336512	test-mae:541408
[17]	train-mae:316903	test-mae:533376
[18]	train-mae:300420	test-mae:526763
[19]	train-mae:286051	test-mae:522179
[20]	train-mae:274933	test-mae:518592
[21]	train-mae:264194	test-mae:515521
[22]	train-mae:254612	test-mae:512816
[23]	train-mae:245736	test-mae:510966
[24]	train-mae:237429	test-mae:509551
[

## Predicción final

In [None]:
df_train_final = pd.read_csv('./data/train.csv')
df_test_final = pd.read_csv('./data/test.csv')

df_train_final_f = features_independientes_precio(df_train_final)
df_train_final_f = features_dependientes_precio(df_train_final_f, df_train_final)
df_test_final_f = features_independientes_precio(df_test_final)
df_test_final_f = features_dependientes_precio(df_test_final_f, df_train_final)

In [None]:
features = ['antiguedad', 'habitaciones', 'garages', 'banos', 'metroscubiertos',
       'metrostotales', 'idzona', 'lat', 'lng', 'gimnasio', 'usosmultiples',
       'piscina', 'escuelascercanas', 'centroscomercialescercanos', 'precio']

features_extra = ['prop_frecuente', 'es_ciudad_centrica', 'porcentaje_metros', 'diferencia_metros',
       'delincuencia','turismo','promedio_precio_ciudad', 'promedio_precio_tipo_propiedad',
       'anio','promedio_metros_cub_tipo_propiedad', 'top_provincia', 'promedio_id_zona', 'promedio_por_mes', 
       'count_id_zona', 'count_ciudad', 'puntaje', 'count_tipo_propiedad', 'count_tipo_propiedad_ciudad',
       'promedio_id_zona_gen', 'promedio_precio_tipo_propiedad_ciudad_gen', 'promedio_precio_hbg_tipo_propiedad_provincia']

features += features_extra

# Filtro columnas
df_XGBoost_fin = utils.filtrar_features(df_train_final_f, features)
df_eval_fin = utils.filtrar_features(df_test_final_f, features)

x_train_f, x_test_f, y_train_f, y_test_f = utils.dividir_dataset(df_XGBoost_fin, 'precio', features)


xg_train = xgb.DMatrix(x_train_f, label=y_train_f)
xg_test = xgb.DMatrix(x_test_f, label=y_test_f)

# Modelo - Ver hiperparametros
params = {
    'objective':'reg:squarederror', 
    'colsample_bytree' : colsample_bytree , 
    'learning_rate' : learning_rate,
    'max_depth' : max_depth, 
    'alpha' : alpha,
    'n_estimators' : n_estimators,
    'eval_metric': 'mae'
}

watchlist = [(xg_train, 'train'), (xg_test, 'test')]

# Entrenamiento
xg_reg = xgb.train(params, 
                   xg_train, 
                   num_boost_round=2000,
                   evals=watchlist
                  )


# Prediccion
y_pred_test_f = xg_reg.predict(xgb.DMatrix(x_test_f))
y_pred_train_f = xg_reg.predict(xgb.DMatrix(x_train_f))

linear_mae_train_f = utils.MAE(y_train_f, y_pred_train_f)
linear_mae_f = utils.MAE(y_test_f, y_pred_test_f)



print(f"MAE (train): {linear_mae_train_f:.5f}")
print(f"MAE: {linear_mae_f:.5f}")

prediccion_final = xg_reg.predict(xgb.DMatrix(df_eval_fin))
df_test_final_f['target'] = prediccion_final
df_test_final_f[['id', 'target']].to_csv('respuesta2.csv', index = False)

### Verificación de Nulos

In [None]:
df_test_final_f['target'].isnull().any()

In [None]:
df_test_final_f['target'].isnull().value_counts()

In [None]:
x_train_f.columns