In [8]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.model_selection import train_test_split
import ipynb.fs.full.features as features
import ipynb.fs.full.utils as utils
from hyperopt import fmin, tpe, hp
from sklearn.metrics import mean_absolute_error
import ipynb.fs.full.features_distancias as f_distancias

df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')

df_train = features.llenar_nulls(df_train)
df_test = features.llenar_nulls(df_test)

### Agrego Features

In [9]:

df_test_f = features.features_independientes_precio(df_test)
df_test_f = features.features_dependientes_precio(df_test_f, df_train)

df_train_f = features.features_independientes_precio(df_train)
df_train_f = features.features_dependientes_precio(df_train_f, df_train)

df_test_f, cols_tipodepropiedad_ohe = features.columna_a_ohe(df_test_f, 'tipodepropiedad', N=100, df_aux=df_train, devolver_cols=True)
df_test_f, cols_provincia_ohe = features.columna_a_ohe(df_test_f, 'provincia', N=100, df_aux=df_train, devolver_cols=True)
df_test_f, cols_zona_ohe = features.columna_a_ohe(df_test_f, 'zona', df_aux=df_train_f, devolver_cols=True)

df_train_f = features.columna_a_ohe(df_train_f, 'tipodepropiedad', N=100, df_aux=df_test)
df_train_f = features.columna_a_ohe(df_train_f, 'provincia', N=100, df_aux=df_test)
df_train_f = features.columna_a_ohe(df_train_f, 'zona', df_aux=df_test_f)


df_train_f['fecha'] = pd.to_datetime(df_train_f['fecha']).astype(int)
df_test_f['fecha'] = pd.to_datetime(df_test_f['fecha']).astype(int)

# df_train_f = df_train_f.sample(frac=1).reset_index(drop=True)

df_train_idf = pd.read_csv('./data/train_idf.csv')
df_test_idf = pd.read_csv('./data/test_idf.csv')

df_train_f = pd.merge(df_train_f, df_train_idf, on= 'id', how= 'left')
df_test_f = pd.merge(df_test_f, df_test_idf, on= 'id', how= 'left')

df_train_f = f_distancias.feature_distancias(df_train_f)
df_test_f = f_distancias.feature_distancias(df_test_f, df_train_f)

### Filtro columnas

In [11]:

features = ['habitaciones', 'garages','banos','antiguedad', 'metroscubiertos',  'metrostotales','lat_norm', 
           'lng_norm', 'gimnasio', 'usosmultiples', 'piscina']

features_test = ['top_provincia', 'promedio_precio_ciudad', 'anio', 'promedio_id_zona', 
                 'promedio_precio_tipo_propiedad', 'count_id_zona', 'count_ciudad', 'puntaje', 
               'count_tipo_propiedad_ciudad', 'promedio_precio_tipo_propiedad_ciudad_gen','count_id_zona',
           'dias_desde_datos','meses_desde_datos','porcentaje_metros','distancia_ciudad_centrica', 
                'distancia_centro_mexico', 'distancia_ciudad_cara', 'promedio_precio_hbg_tipo_propiedad_provincia']

features += features_test 

df_XGBoost, df_eval = utils.dividir_df_testeo(df_train_f, test_size=0.15)

df_XGBoost = utils.filtrar_features(df_XGBoost, features, 'precio')
df_eval = utils.filtrar_features(df_eval, features, 'precio')

## Busco los mejores hiperparametros

In [4]:
def xgb_eval(args):
    
    # Modelo - Ver hiperparametros
    colsample_bytree, learning_rate, max_depth, alpha, n_estimators, test_size = args
    
    # Preparacion de los datos
    x_train, x_test, y_train, y_test = utils.dividir_dataset(df_XGBoost, 'precio', features, test_size=test_size)
    
    x_eval = df_eval.drop('precio', axis=1)
    y_eval = df_eval['precio']
    
    xg_train = xgb.DMatrix(x_train, label=y_train)
    xg_test = xgb.DMatrix(x_test, label=y_test)
    dfg_test = xgb.DMatrix(x_eval, label = y_eval)

    max_depth = int(max_depth)
    n_estimators = int(n_estimators)
    params = {
        'objective':'reg:squarederror', 
        'colsample_bytree' : colsample_bytree , 
        'learning_rate' : learning_rate,
        'max_depth' : max_depth, 
        'alpha' : alpha,
        'n_estimators' : n_estimators,
        'eval_metric': 'mae'
    }

    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    num_round = 25
    
    xg_reg = xgb.train(params, 
                    xg_train, 
                    num_round, 
#                     watchlist
                   )

    y_pred_eval = xg_reg.predict(dfg_test)
    y_pred_test = xg_reg.predict(xg_test)
    return utils.MAE(y_eval, y_pred_eval)

space = [hp.quniform('colsample_bytree', 0.3, 1, 0.05), hp.quniform('learning_rate', 0.01, 0.15, 0.01),
         hp.quniform("max_depth", 1, 20, 1),hp.uniform("alpha", 0.01, 30),
        hp.quniform("n_estimators", 100, 600, 30), hp.quniform("test_size", 0.1, 0.4, 0.05)]

hps = fmin(xgb_eval, space=space, algo=tpe.suggest, max_evals=50)

display(hps)

  0%|          | 0/50 [00:00<?, ?it/s, best loss: ?]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



  2%|▏         | 1/50 [00:08<07:01,  8.60s/it, best loss: 537014.3095911458]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



  4%|▍         | 2/50 [00:12<05:47,  7.24s/it, best loss: 537014.3095911458]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



  6%|▌         | 3/50 [00:27<07:28,  9.54s/it, best loss: 518385.10228342016]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



  8%|▊         | 4/50 [00:41<08:14, 10.74s/it, best loss: 518385.10228342016]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 10%|█         | 5/50 [00:42<06:02,  8.07s/it, best loss: 518385.10228342016]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 12%|█▏        | 6/50 [00:53<06:27,  8.81s/it, best loss: 518385.10228342016]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 14%|█▍        | 7/50 [01:02<06:20,  8.86s/it, best loss: 518385.10228342016]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 16%|█▌        | 8/50 [01:09<05:47,  8.28s/it, best loss: 518385.10228342016]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 18%|█▊        | 9/50 [01:24<07:05, 10.37s/it, best loss: 507502.53913802083]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 20%|██        | 10/50 [01:41<08:16, 12.41s/it, best loss: 507502.53913802083]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 22%|██▏       | 11/50 [01:47<06:51, 10.54s/it, best loss: 507502.53913802083]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 24%|██▍       | 12/50 [01:56<06:19,  9.99s/it, best loss: 507502.53913802083]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 26%|██▌       | 13/50 [02:09<06:46, 10.98s/it, best loss: 507502.53913802083]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 28%|██▊       | 14/50 [02:11<04:49,  8.05s/it, best loss: 507502.53913802083]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 30%|███       | 15/50 [02:17<04:21,  7.48s/it, best loss: 507502.53913802083]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 32%|███▏      | 16/50 [02:44<07:37, 13.46s/it, best loss: 507502.53913802083]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 34%|███▍      | 17/50 [02:58<07:25, 13.50s/it, best loss: 503569.0695190972] 

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 36%|███▌      | 18/50 [03:00<05:19,  9.98s/it, best loss: 503569.0695190972]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 38%|███▊      | 19/50 [03:16<06:08, 11.90s/it, best loss: 503569.0695190972]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 40%|████      | 20/50 [03:38<07:23, 14.79s/it, best loss: 503569.0695190972]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 42%|████▏     | 21/50 [03:59<08:08, 16.86s/it, best loss: 503569.0695190972]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 44%|████▍     | 22/50 [04:04<06:11, 13.28s/it, best loss: 503569.0695190972]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 46%|████▌     | 23/50 [04:19<06:12, 13.78s/it, best loss: 502985.2364375]   

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 48%|████▊     | 24/50 [04:29<05:26, 12.58s/it, best loss: 502985.2364375]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 50%|█████     | 25/50 [04:34<04:17, 10.32s/it, best loss: 502985.2364375]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 52%|█████▏    | 26/50 [04:44<04:06, 10.27s/it, best loss: 502985.2364375]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 54%|█████▍    | 27/50 [05:00<04:38, 12.11s/it, best loss: 502985.2364375]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 56%|█████▌    | 28/50 [05:17<04:58, 13.58s/it, best loss: 502985.2364375]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 58%|█████▊    | 29/50 [05:29<04:33, 13.05s/it, best loss: 502985.2364375]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 60%|██████    | 30/50 [05:45<04:37, 13.87s/it, best loss: 502985.2364375]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 62%|██████▏   | 31/50 [05:54<03:54, 12.36s/it, best loss: 502985.2364375]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 64%|██████▍   | 32/50 [06:02<03:16, 10.94s/it, best loss: 502985.2364375]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 66%|██████▌   | 33/50 [06:29<04:27, 15.76s/it, best loss: 497064.2531875]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 68%|██████▊   | 34/50 [07:10<06:14, 23.41s/it, best loss: 497064.2531875]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 70%|███████   | 35/50 [07:49<07:01, 28.10s/it, best loss: 497064.2531875]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 72%|███████▏  | 36/50 [08:26<07:10, 30.78s/it, best loss: 497064.2531875]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 74%|███████▍  | 37/50 [09:01<06:55, 31.99s/it, best loss: 497064.2531875]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 76%|███████▌  | 38/50 [09:30<06:13, 31.10s/it, best loss: 497064.2531875]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 78%|███████▊  | 39/50 [09:35<04:17, 23.44s/it, best loss: 497064.2531875]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 80%|████████  | 40/50 [09:55<03:43, 22.40s/it, best loss: 497064.2531875]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 82%|████████▏ | 41/50 [10:13<03:08, 20.97s/it, best loss: 497064.2531875]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 84%|████████▍ | 42/50 [10:52<03:32, 26.52s/it, best loss: 497064.2531875]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 86%|████████▌ | 43/50 [11:32<03:34, 30.60s/it, best loss: 497064.2531875]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 88%|████████▊ | 44/50 [12:15<03:25, 34.30s/it, best loss: 497064.2531875]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 90%|█████████ | 45/50 [12:31<02:22, 28.55s/it, best loss: 497064.2531875]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 92%|█████████▏| 46/50 [13:20<02:19, 34.83s/it, best loss: 497064.2531875]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 94%|█████████▍| 47/50 [13:48<01:38, 32.84s/it, best loss: 497064.2531875]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 96%|█████████▌| 48/50 [14:24<01:07, 33.74s/it, best loss: 497064.2531875]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



 98%|█████████▊| 49/50 [14:58<00:33, 33.84s/it, best loss: 497064.2531875]

  if getattr(data, 'base', None) is not None and \

  data.base is not None and isinstance(data, np.ndarray) \



100%|██████████| 50/50 [15:17<00:00, 29.33s/it, best loss: 497064.2531875]


{'alpha': 16.874596444772834,
 'colsample_bytree': 0.8,
 'learning_rate': 0.14,
 'max_depth': 17.0,
 'n_estimators': 480.0,
 'test_size': 0.1}

### Busco los mejores features

In [6]:
from sklearn.preprocessing import MinMaxScaler
from vecstack import StackingTransformer
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout, AlphaDropout

def normalizar_df(df, features):
    min_max = StandardScaler()
    df[features] = pd.DataFrame(min_max.fit_transform(df[features]), columns=features)
    return df
    
features = ['habitaciones', 'garages','banos','antiguedad', 'metroscubiertos',  'metrostotales','lat_norm', 
           'lng_norm', 'gimnasio', 'usosmultiples', 'piscina']

features_test = ['top_provincia', 'promedio_precio_ciudad', 'anio', 'promedio_id_zona', 
                 'promedio_precio_tipo_propiedad', 'count_id_zona', 'count_ciudad', 
               'count_tipo_propiedad_ciudad', 'promedio_precio_tipo_propiedad_ciudad_gen','count_id_zona',
           'dias_desde_datos','meses_desde_datos','porcentaje_metros','distancia_ciudad_centrica', 'puntaje', 'distancia_centro_mexico']


features_cat = ['provincia', 'tipodepropiedad', 'intervalo_metros_totales', 'intervalo_metros_cubiertos',
               'zona']

features += features_test + features_cat

df_train_g = utils.filtrar_features(df_train_f, features, 'precio')

features_a_normalizar = [f for f in features if f not in features_cat + ['precio']]

df_train_n = pd.get_dummies(df_train_g, columns=features_cat)
df_train_n = normalizar_df(df_train_n, features_a_normalizar)

x_train, x_test, y_train, y_test = utils.dividir_dataset(df_train_n, 'precio', features, test_size=0.1)
df_train_n.columns

Index(['antiguedad', 'habitaciones', 'garages', 'banos', 'metroscubiertos',
       'metrostotales', 'gimnasio', 'usosmultiples', 'piscina', 'precio',
       'porcentaje_metros', 'top_provincia', 'anio', 'dias_desde_datos',
       'meses_desde_datos', 'promedio_precio_ciudad', 'count_ciudad',
       'promedio_id_zona', 'count_id_zona', 'promedio_precio_tipo_propiedad',
       'promedio_precio_tipo_propiedad_ciudad_gen',
       'count_tipo_propiedad_ciudad', 'lat_norm', 'lng_norm', 'puntaje',
       'distancia_ciudad_centrica', 'distancia_centro_mexico',
       'provincia_Aguascalientes', 'provincia_Baja California Norte',
       'provincia_Baja California Sur', 'provincia_Campeche',
       'provincia_Chiapas', 'provincia_Chihuahua', 'provincia_Coahuila',
       'provincia_Colima', 'provincia_Distrito Federal', 'provincia_Durango',
       'provincia_Edo. de México', 'provincia_Guanajuato',
       'provincia_Guerrero', 'provincia_Hidalgo', 'provincia_Jalisco',
       'provincia_Michoacán'

In [7]:
hps = {'alpha': 9.616105489494071,
 'colsample_bytree': 0.8500000000000001,
 'learning_rate': 0.14,
 'max_depth': 16.0,
 'n_estimators': 450.0,
 'test_size': 0.1}

alpha = hps['alpha']
colsample_bytree = hps['colsample_bytree']
max_depth = hps['max_depth']
learning_rate = hps['learning_rate']
n_estimators = hps['n_estimators']

n_estimators = int(hps['n_estimators'])
max_depth = int(hps['max_depth'])

params = {
        'objective':'reg:squarederror', 
        'colsample_bytree' : colsample_bytree , 
        'learning_rate' : learning_rate,
        'max_depth' : max_depth, 
        'alpha' : alpha,
        'n_estimators' : n_estimators,
        'eval_metric': 'mae'
}


base_train = 0
base_test = 0
base_eval = 0
for i in ['None'] + features_test:
    x_train, x_test, y_train, y_test = utils.dividir_dataset(df_train_n, 'precio', features + [i])

    xg_train = xgb.DMatrix(x_train, label=y_train)
    xg_test = xgb.DMatrix(x_test, label=y_test)
    xg_train = xgb.DMatrix(x_train, label=y_train)
    xg_test = xgb.DMatrix(x_test, label=y_test)

    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    num_round = 10
    
    xg_reg = xgb.train(params, 
                    xg_train, 
                    num_round, 
                    watchlist
                   )
    
    x_train_predict = xgb.DMatrix(x_train, label=y_train)
    x_test_predict = xgb.DMatrix(x_test, label=y_test)
    x_eval_predict = xgb.DMatrix(utils.filtrar_features(df_eval, features + [i]))
    
    y_pred_test = xg_reg.predict(x_test_predict)
    y_pred_train = xg_reg.predict(x_train_predict)
    y_pred_eval = xg_reg.predict(x_eval_predict)

    xgb_mae_train = utils.MAE(y_train, y_pred_train)
    xgb_mae = utils.MAE(y_test, y_pred_test)
    xgb_mae_eval = utils.MAE(df_eval['precio'].values, y_pred_eval)

    print(f"MAE LightGBM (train): {xgb_mae_train:.5f}")
    print(f"MAE LightGBM (test): {xgb_mae:.5f}")
    print(f"MAE LightGBM (eval): {xgb_mae_eval:.5f}")
    if i is not 'None':
        print(f"Overfitting (base_eval - base_test) - (eval - test) - {i}: {(base_eval - base_test) - (xgb_mae_eval - xgb_mae)}")
        print(f"Diff evaluation (base_eval - eval)                  - {i}: {base_eval - xgb_mae_eval}")
        print(f"Diff train (base_train - train)                     - {i}: {base_train - xgb_mae_train}")
    else:
        base_train = xgb_mae_train
        base_test = xgb_mae
        base_eval = xgb_mae_eval


  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


[0]	train-mae:2.18963e+06	test-mae:2.16954e+06
[1]	train-mae:1.89114e+06	test-mae:1.87593e+06
[2]	train-mae:1.63561e+06	test-mae:1.62643e+06
[3]	train-mae:1.41549e+06	test-mae:1.41515e+06
[4]	train-mae:1.2266e+06	test-mae:1.23804e+06
[5]	train-mae:1.06457e+06	test-mae:1.08964e+06
[6]	train-mae:926267	test-mae:967791
[7]	train-mae:808198	test-mae:868064
[8]	train-mae:707906	test-mae:787262
[9]	train-mae:622509	test-mae:722408


NameError: name 'df_eval' is not defined

## Predicción

In [1]:
features = ['habitaciones', 'garages','banos','antiguedad', 'metroscubiertos',  'metrostotales','lat_norm', 
           'lng_norm', 'gimnasio', 'usosmultiples', 'piscina']

features_test = ['top_provincia', 'promedio_precio_ciudad', 'anio', 'promedio_id_zona', 
                 'promedio_precio_tipo_propiedad', 'count_id_zona', 'count_ciudad', 'puntaje', 
               'count_tipo_propiedad_ciudad', 'promedio_precio_tipo_propiedad_ciudad_gen','count_id_zona',
           'dias_desde_datos','meses_desde_datos','porcentaje_metros','distancia_ciudad_centrica', 
                'distancia_centro_mexico', 'distancia_ciudad_cara', 'promedio_precio_hbg_tipo_propiedad_provincia']

features += features_test 

df_train_g = utils.filtrar_features(df_train_f, features, 'precio')

# features_a_normalizar = [f for f in features if f not in features_cat + ['precio']]

# df_train_n = pd.get_dummies(df_train_g, columns=features_cat)
# df_train_n = normalizar_df(df_train_n, features_a_normalizar)

x_train, x_test, y_train, y_test = utils.dividir_dataset(df_train_g, 'precio', features, test_size=0.1)


# Preparación hiperparámetros
hps = {'alpha': 16.874596444772834,
 'colsample_bytree': 0.8,
 'learning_rate': 0.1,
 'max_depth': 17.0,
 'n_estimators': 300.0,
 'test_size': 0.1}

alpha = hps['alpha']
colsample_bytree = hps['colsample_bytree']
max_depth = hps['max_depth']
learning_rate = hps['learning_rate']
n_estimators = hps['n_estimators']
test_size = hps['test_size']

n_estimators = int(hps['n_estimators'])
max_depth = int(hps['max_depth'])


#x_eval = df_eval.drop('precio', axis=1)
#y_eval = df_eval['precio']

xg_train = xgb.DMatrix(x_train, label=y_train)
xg_test = xgb.DMatrix(x_test, label=y_test)
#dfg_test = xgb.DMatrix(x_eval, label = y_eval)

params = {
    'objective':'reg:squarederror', 
    'colsample_bytree' : colsample_bytree , 
    'learning_rate' : learning_rate,
    'max_depth' : max_depth, 
    'alpha' : alpha,
    'n_estimators' : n_estimators,
    'eval_metric': 'mae'
}

watchlist = [(xg_train, 'train'), (xg_test, 'test')]

# Entrenamiento
xg_reg = xgb.train(params, 
                   xg_train, 
                   num_boost_round=50,
                   evals=watchlist)

# Prediccion
y_pred_train = xg_reg.predict(xg_train)
y_pred_test = xg_reg.predict(xg_test)


linear_mae_train = utils.MAE(y_train, y_pred_train)
linear_mae = utils.MAE(y_test, y_pred_test)

#y_pred_eval = xg_reg.predict(dfg_test)
#linear_mae_eval = utils.MAE(y_eval, y_pred_eval)


print(f"MAE (train): {linear_mae_train:.5f}")
print(f"MAE: {linear_mae:.5f}")
#print(f"MAE (eval): {linear_mae_eval:.5f}")


NameError: name 'utils' is not defined

## Predicción final

In [12]:
df_train = pd.read_csv('./data/train_filtrado.csv',)

# Para usarse con el submit a Kaggle
df_test = pd.read_csv('./data/test.csv')

df_test_f = features.features_independientes_precio(df_test)
df_test_f = features.features_dependientes_precio(df_test_f, df_train)

df_train_f = features.features_independientes_precio(df_train)
df_train_f = features.features_dependientes_precio(df_train_f, df_train)

df_test_f, cols_tipodepropiedad_ohe = features.columna_a_ohe(df_test_f, 'tipodepropiedad', N=250, df_aux=df_train, devolver_cols=True)
df_test_f, cols_provincia_ohe = features.columna_a_ohe(df_test_f, 'provincia', N=250, df_aux=df_train, devolver_cols=True)
df_test_f, cols_zona_ohe = features.columna_a_ohe(df_test_f, 'zona', N=250, df_aux=df_train_f, devolver_cols=True)

df_train_f = features.columna_a_ohe(df_train_f, 'tipodepropiedad', N=250, df_aux=df_test)
df_train_f = features.columna_a_ohe(df_train_f, 'provincia', N=250, df_aux=df_test)
df_train_f = features.columna_a_ohe(df_train_f, 'zona', N=250, df_aux=df_test_f)


df_train_idf = pd.read_csv('./data/train_idf.csv')
df_test_idf = pd.read_csv('./data/test_idf.csv')

df_train_f = pd.merge(df_train_f, df_train_idf, on= 'id', how= 'left')
df_test_f = pd.merge(df_test_f, df_test_idf, on= 'id', how= 'left')

df_train_f = f_distancias.feature_distancias(df_train_f)
df_test_f = f_distancias.feature_distancias(df_test_f, df_train_f)


AttributeError: 'list' object has no attribute 'features_independientes_precio'

In [16]:

features = ['habitaciones', 'garages','banos','antiguedad', 'metroscubiertos',  'metrostotales','lat_norm', 
           'lng_norm', 'gimnasio', 'usosmultiples', 'piscina']

features_test = ['top_provincia', 'promedio_precio_ciudad', 'anio', 'promedio_id_zona', 
                 'promedio_precio_tipo_propiedad', 'count_id_zona', 'count_ciudad', 'puntaje', 
               'count_tipo_propiedad_ciudad', 'promedio_precio_tipo_propiedad_ciudad_gen','count_id_zona'
           'dias_desde_datos','meses_desde_datos','porcentaje_metros','distancia_ciudad_centrica', 
                'distancia_centro_mexico', 'distancia_ciudad_cara', 'promedio_precio_hbg_tipo_propiedad_provincia']

features += features_test

hps = {'colsample_bytree': 0.9,
 'eval_metric': 'mae',
 'learning_rate': 0.1,
 'max_depth': 10,
 'n_estimators': 120,
 'n_jobs': 4,
 'objective': 'reg:squarederror',
 'scale_pos_weight': 1,
 'verbosity': 0}

colsample_bytree = hps['colsample_bytree']
max_depth = hps['max_depth']
learning_rate = hps['learning_rate']
n_estimators = hps['n_estimators']
n_estimators = int(hps['n_estimators'])
max_depth = int(hps['max_depth'])
scale_pos_weight = int(hps['scale_pos_weight'])
verbosity = int(hps['verbosity'])

# Filtro columnas
df_XGBoost_fin = utils.filtrar_features(df_train_f, features, 'precio')
df_eval_fin = utils.filtrar_features(df_test_f, features)

x_train_f, x_test_f, y_train_f, y_test_f = utils.dividir_dataset(df_XGBoost_fin, 'precio', features,0.1)


xg_train = xgb.DMatrix(x_train_f, label=y_train_f)
xg_test = xgb.DMatrix(x_test_f, label=y_test_f)

# Modelo - Ver hiperparametros
params = {
    'objective':'reg:squarederror', 
    'colsample_bytree' : colsample_bytree , 
    'learning_rate' : learning_rate,
    'max_depth' : max_depth, 
    'scale_pos_weight': scale_pos_weight,
    'n_estimators' : n_estimators,
    'eval_metric': 'mae'
}

watchlist = [(xg_train, 'train'), (xg_test, 'test')]

# Entrenamiento
xg_reg = xgb.train(params, 
                   xg_train, 
                   num_boost_round=500,
                   evals=watchlist,
                   early_stopping_rounds=10
                  )


# Prediccion
y_pred_test_f = xg_reg.predict(xgb.DMatrix(x_test_f))
y_pred_train_f = xg_reg.predict(xgb.DMatrix(x_train_f))

linear_mae_train_f = utils.MAE(y_train_f, y_pred_train_f)
linear_mae_f = utils.MAE(y_test_f, y_pred_test_f)



print(f"MAE (train): {linear_mae_train_f:.5f}")
print(f"MAE: {linear_mae_f:.5f}")

prediccion_final = xg_reg.predict(xgb.DMatrix(df_eval_fin))

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


[0]	train-mae:2.28356e+06	test-mae:2.23717e+06
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 10 rounds.
[1]	train-mae:2.05667e+06	test-mae:2.0136e+06
[2]	train-mae:1.85294e+06	test-mae:1.81278e+06
[3]	train-mae:1.67054e+06	test-mae:1.63365e+06
[4]	train-mae:1.50798e+06	test-mae:1.47437e+06
[5]	train-mae:1.36372e+06	test-mae:1.33323e+06
[6]	train-mae:1.23634e+06	test-mae:1.20969e+06
[7]	train-mae:1.12418e+06	test-mae:1.10118e+06
[8]	train-mae:1.02596e+06	test-mae:1.00637e+06
[9]	train-mae:940367	test-mae:924364
[10]	train-mae:866050	test-mae:853672
[11]	train-mae:801902	test-mae:793704
[12]	train-mae:746942	test-mae:742792
[13]	train-mae:699965	test-mae:700031
[14]	train-mae:660122	test-mae:664154
[15]	train-mae:626341	test-mae:634285
[16]	train-mae:597800	test-mae:609652
[17]	train-mae:573791	test-mae:588901
[18]	train-mae:553525	test-mae:571563
[19]	train-mae:536506	test-mae:557377
[20]	train-mae:52211

[207]	train-mae:333662	test-mae:462167
[208]	train-mae:333184	test-mae:462096
[209]	train-mae:332757	test-mae:462098
[210]	train-mae:332545	test-mae:462048
[211]	train-mae:332135	test-mae:461913
[212]	train-mae:331740	test-mae:461858
[213]	train-mae:331700	test-mae:461845
[214]	train-mae:331446	test-mae:461803
[215]	train-mae:331174	test-mae:461790
[216]	train-mae:330915	test-mae:461795
[217]	train-mae:330583	test-mae:461744
[218]	train-mae:330178	test-mae:461658
[219]	train-mae:329888	test-mae:461643
[220]	train-mae:329636	test-mae:461528
[221]	train-mae:329516	test-mae:461484
[222]	train-mae:329274	test-mae:461442
[223]	train-mae:329193	test-mae:461412
[224]	train-mae:328888	test-mae:461373
[225]	train-mae:328324	test-mae:461278
[226]	train-mae:328043	test-mae:461248
[227]	train-mae:327752	test-mae:461250
[228]	train-mae:327696	test-mae:461244
[229]	train-mae:327439	test-mae:461133
[230]	train-mae:327195	test-mae:461099
[231]	train-mae:326781	test-mae:461035
[232]	train-mae:326684	te

[418]	train-mae:279229	test-mae:455674
[419]	train-mae:279111	test-mae:455672
[420]	train-mae:278704	test-mae:455687
[421]	train-mae:278545	test-mae:455704
[422]	train-mae:278348	test-mae:455662
[423]	train-mae:278124	test-mae:455638
[424]	train-mae:277966	test-mae:455603
[425]	train-mae:277869	test-mae:455575
[426]	train-mae:277616	test-mae:455580
[427]	train-mae:277514	test-mae:455587
[428]	train-mae:277253	test-mae:455517
[429]	train-mae:276902	test-mae:455509
[430]	train-mae:276566	test-mae:455448
[431]	train-mae:276505	test-mae:455443
[432]	train-mae:276462	test-mae:455444
[433]	train-mae:276287	test-mae:455467
[434]	train-mae:276008	test-mae:455475
[435]	train-mae:275982	test-mae:455465
[436]	train-mae:275828	test-mae:455464
[437]	train-mae:275618	test-mae:455428
[438]	train-mae:275484	test-mae:455422
[439]	train-mae:275320	test-mae:455406
[440]	train-mae:275074	test-mae:455415
[441]	train-mae:274892	test-mae:455435
[442]	train-mae:274630	test-mae:455451
[443]	train-mae:274492	te

In [18]:
df_test_f['target'] = prediccion_final
df_test_f[['id', 'target']].to_csv('respuesta38e.csv', index = False)

### Verificación de Nulos

In [None]:
df_test_final_f['target'].isnull().any()

In [None]:
df_test_final_f['target'].isnull().value_counts()

In [None]:
x_train_f.columns