In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer
from hyperopt import fmin, tpe, hp
from ipynb.fs.full.features import features_independientes_precio, features_dependientes_precio

df_train = pd.read_csv('./data/train.csv')

# Para usarse con el submit a Kaggle
df_test = pd.read_csv('./data/test.csv')

## Métrica de evaluación

In [7]:
def RMSLE(actual, pred):
    return (np.mean((np.log(actual + 1) - np.log(pred + 1)) ** 2)) **.5

## Regresión lineal

In [17]:
x = df_train[['metroscubiertos']]
y = df_train['precio']

# Dividir el DF de entrenamiento
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=1)
# Para evitar SettingWithCopyWarning
x_train, x_test, y_train, y_test = x_train.copy(), x_test.copy(), y_train.copy(), y_test.copy()

# Rellenar 
imp = SimpleImputer()
x_train['metroscubiertos'] = imp.fit_transform(x_train[['metroscubiertos']])
x_test['metroscubiertos'] = imp.transform(x_test[['metroscubiertos']])

linear_model = LinearRegression().fit(x_train, y_train)
pred = linear_model.predict(x_test)

# El modelo de regresion lineal con una variable obtiene RMSLE=0.65
linear_rmsle_train = RMSLE(y_train, linear_model.predict(x_train))
linear_rmsle = RMSLE(y_test, pred)
print(f"RMSLE LinearRegression (train): {linear_rmsle_train:.5f}")
print(f"RMSLE LinearRegression: {linear_rmsle:.5f}")

RMSLE LinearRegression (train): 0.65673
RMSLE LinearRegression: 0.65657


## Poly K

In [18]:
import warnings

# Para filtrar RankWarning en numpy
warnings.filterwarnings('ignore')

x_train_arr = x_train['metroscubiertos'].values
x_test_arr = x_test['metroscubiertos'].values
y_train_arr = y_train.values
y_test_arr = y_test.values

k = fmin(lambda k: RMSLE(y_train, np.poly1d(np.polyfit(x_train_arr, y_train_arr, k))(x_train_arr)), 
                     space=hp.uniform('k', 1, 30), algo=tpe.suggest, max_evals=20)['k']

poly_k_rmsle_train = RMSLE(y_train, np.poly1d(np.polyfit(x_train_arr, y_train_arr, k))(x_train_arr))
poly_k_rmsle = RMSLE(y_test, np.poly1d(np.polyfit(x_train_arr, y_train_arr, k))(x_test_arr))
print(f"RMSLE LinearRegression (train): {poly_k_rmsle_train: .5f}")
print(f"RMSLE LinearRegression: {poly_k_rmsle:.5f}")

df_test_f = df_test.copy()

y_pred_test = np.poly1d(np.polyfit(x_train_arr, y_train_arr, k))(df_test_f['metroscubiertos'])

df_test_f['precio'] = y_pred_test

df_test_f[['id', 'precio']].to_csv('respuesta_2.csv', header=False, index=False)

warnings.filterwarnings('default')

100%|██████████| 20/20 [00:04<00:00,  4.18it/s, best loss: 0.6496065747766164]
RMSLE LinearRegression (train):  0.64961
RMSLE LinearRegression: 0.65024


## Features

In [14]:
df_train_f = features_independientes_precio(df_train)
df_train_f = features_dependientes_precio(df_train_f, df_train)

df_test_f = features_independientes_precio(df_test)
df_test_f = features_dependientes_precio(df_test_f, df_train)

## LightBM

In [5]:
import lightgbm as lgb

In [8]:
df_gbm = df_train_f.copy().drop(['titulo', 'descripcion', 'tipodepropiedad', 'direccion', 'ciudad', 'provincia', 
                                 'fecha', 'zona', 'intervalo_metros_totales', 'intervalo_metros_cubiertos', 'id'], axis=1)

df_gbm = df_gbm[['antiguedad', 'habitaciones', 'garages', 'banos', 'metroscubiertos',
       'metrostotales', 'idzona', 'lat', 'lng', 'gimnasio', 'usosmultiples',
       'piscina', 'escuelascercanas', 'centroscomercialescercanos',
       'escomercial', 'promedio_precio_ciudad', 'promedio_id_zona',
       'porcentaje_metros', 'promedio_metros_tipo_propiedad', 'precio']]

df_gbm_test, df_gbm_train = train_test_split(df_gbm, test_size=0.25, random_state=1)

y_test = df_gbm_test['precio']
y_train = df_gbm_train['precio']
x_test = df_gbm_test.drop('precio', axis=1)
x_train = df_gbm_train.drop('precio', axis=1)

lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)

def eval_lightgbm(args):
    num_leaves, learning_rate, feature_fraction, bagging_fraction, bagging_freq = args
    num_leaves = int(num_leaves)
    bagging_freq = int(bagging_freq)
    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': '', # Si se deja vacio se toma el ideal para llegar al 'objective'
        'num_leaves': num_leaves,
        'learning_rate': learning_rate,
        'feature_fraction': feature_fraction,
        'bagging_fraction': bagging_fraction,
        'bagging_freq': bagging_freq,
        'verbose': -1,
    }

    gbm = lgb.train(params,
                    lgb_train,
                    valid_sets=lgb_eval,
                    num_boost_round=1000,
                    early_stopping_rounds=15,
                    verbose_eval=-1)
    
    y_pred_train = gbm.predict(x_train, num_iteration=gbm.best_iteration)
    return RMSLE(y_train, y_pred_train)

space = [hp.quniform('num_leaves', 1, 150, 1), hp.uniform('learning_rate', 0.05, 0.35),
        hp.uniform('feature_fraction', 0.60, 1), hp.uniform('bagging_fraction', 0.60, 1),
        hp.quniform('bagging_freq', 1, 30, 1)]

hps = fmin(eval_lightgbm, space=space, algo=tpe.suggest, max_evals=50)

display(hps)

Training until validation scores don't improve for 15 rounds
Early stopping, best iteration is:                  
[57]	valid_0's l2: 9.6837e+11
Training until validation scores don't improve for 15 rounds                  
Early stopping, best iteration is:                                            
[72]	valid_0's l2: 9.43951e+11
Training until validation scores don't improve for 15 rounds                  
Early stopping, best iteration is:                                           
[121]	valid_0's l2: 9.35345e+11
Training until validation scores don't improve for 15 rounds                  
  6%|▌         | 3/50 [00:07<01:56,  2.47s/it, best loss: 0.27938684626008786]

  



Early stopping, best iteration is:                                            
[76]	valid_0's l2: 9.45423e+11
Training until validation scores don't improve for 15 rounds                  
Early stopping, best iteration is:                                           
[325]	valid_0's l2: 9.02771e+11
Training until validation scores don't improve for 15 rounds                  
Early stopping, best iteration is:                                            
[184]	valid_0's l2: 9.88207e+11
Training until validation scores don't improve for 15 rounds                  
Early stopping, best iteration is:                                            
[158]	valid_0's l2: 9.3031e+11
Training until validation scores don't improve for 15 rounds                  
Did not meet early stopping. Best iteration is:                               
[1000]	valid_0's l2: 9.6091e+11
Training until validation scores don't improve for 15 rounds                  
Early stopping, best iteration is:                   

Training until validation scores don't improve for 15 rounds                   
Early stopping, best iteration is:                                             
[227]	valid_0's l2: 9.11046e+11
Training until validation scores don't improve for 15 rounds                   
Early stopping, best iteration is:                                             
[99]	valid_0's l2: 9.49889e+11
Training until validation scores don't improve for 15 rounds                   
Early stopping, best iteration is:                                             
[153]	valid_0's l2: 9.30162e+11
Training until validation scores don't improve for 15 rounds                   
Early stopping, best iteration is:                                             
[323]	valid_0's l2: 9.28041e+11
100%|██████████| 50/50 [03:34<00:00,  4.29s/it, best loss: 0.25098757183644077]


{'bagging_fraction': 0.9239025313228117,
 'bagging_freq': 3.0,
 'feature_fraction': 0.7985685775978174,
 'learning_rate': 0.05086005195331275,
 'num_leaves': 126.0}

In [11]:
bagging_fraction = hps['bagging_fraction']
bagging_freq = int(hps['bagging_freq'])
feature_fraction = hps['feature_fraction']
learning_rate = hps['learning_rate']
num_leaves = int(hps['num_leaves'])

params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': '', # Si se deja vacio se toma el ideal para llegar al 'objective'
    'num_leaves': num_leaves,
    'learning_rate': learning_rate,
    'feature_fraction': feature_fraction,
    'bagging_fraction': bagging_fraction,
    'bagging_freq': bagging_freq,
    'verbose': 0,
}

gbm = lgb.train(params,
                lgb_train,
                valid_sets=lgb_eval,
                num_boost_round=1000,
                early_stopping_rounds=15)

y_pred_test = gbm.predict(x_test, num_iteration=gbm.best_iteration)
y_pred_train = gbm.predict(x_train, num_iteration=gbm.best_iteration)

gbm_rmsle_train = RMSLE(y_train, y_pred_train)
gbm_rmsle = RMSLE(y_test, y_pred_test)

print(f"RMSLE LightGBM: {gbm_rmsle_train:.5f}")
print(f"RMSLE LightGBM: {gbm_rmsle:.5f}")

[1]	valid_0's l2: 4.34355e+12
Training until validation scores don't improve for 15 rounds
[2]	valid_0's l2: 4.04286e+12
[3]	valid_0's l2: 3.773e+12
[4]	valid_0's l2: 3.57691e+12
[5]	valid_0's l2: 3.35036e+12
[6]	valid_0's l2: 3.14362e+12
[7]	valid_0's l2: 2.95512e+12
[8]	valid_0's l2: 2.79656e+12
[9]	valid_0's l2: 2.64309e+12
[10]	valid_0's l2: 2.50549e+12
[11]	valid_0's l2: 2.37894e+12
[12]	valid_0's l2: 2.26395e+12
[13]	valid_0's l2: 2.15758e+12
[14]	valid_0's l2: 2.05905e+12
[15]	valid_0's l2: 1.97468e+12
[16]	valid_0's l2: 1.89301e+12
[17]	valid_0's l2: 1.82331e+12
[18]	valid_0's l2: 1.75931e+12
[19]	valid_0's l2: 1.69997e+12
[20]	valid_0's l2: 1.64687e+12
[21]	valid_0's l2: 1.59342e+12
[22]	valid_0's l2: 1.54534e+12
[23]	valid_0's l2: 1.5e+12
[24]	valid_0's l2: 1.4586e+12
[25]	valid_0's l2: 1.42147e+12
[26]	valid_0's l2: 1.3888e+12
[27]	valid_0's l2: 1.35724e+12
[28]	valid_0's l2: 1.32745e+12
[29]	valid_0's l2: 1.30051e+12
[30]	valid_0's l2: 1.27857e+12
[31]	valid_0's l2: 1.2563e

[272]	valid_0's l2: 8.95952e+11
[273]	valid_0's l2: 8.96004e+11
[274]	valid_0's l2: 8.95943e+11
[275]	valid_0's l2: 8.95925e+11
[276]	valid_0's l2: 8.9582e+11
[277]	valid_0's l2: 8.95726e+11
[278]	valid_0's l2: 8.95616e+11
[279]	valid_0's l2: 8.95535e+11
[280]	valid_0's l2: 8.95323e+11
[281]	valid_0's l2: 8.95417e+11
[282]	valid_0's l2: 8.95336e+11
[283]	valid_0's l2: 8.95326e+11
[284]	valid_0's l2: 8.95341e+11
[285]	valid_0's l2: 8.95315e+11
[286]	valid_0's l2: 8.95321e+11
[287]	valid_0's l2: 8.95341e+11
[288]	valid_0's l2: 8.95332e+11
[289]	valid_0's l2: 8.95287e+11
[290]	valid_0's l2: 8.95242e+11
[291]	valid_0's l2: 8.95247e+11
[292]	valid_0's l2: 8.95115e+11
[293]	valid_0's l2: 8.95096e+11
[294]	valid_0's l2: 8.95051e+11
[295]	valid_0's l2: 8.94992e+11
[296]	valid_0's l2: 8.9494e+11
[297]	valid_0's l2: 8.94922e+11
[298]	valid_0's l2: 8.9486e+11
[299]	valid_0's l2: 8.94827e+11
[300]	valid_0's l2: 8.94829e+11
[301]	valid_0's l2: 8.94932e+11
[302]	valid_0's l2: 8.94897e+11
[303]	valid

  


In [16]:
df_test_gdb = df_test_f.copy()

df_test_gdb_ids = df_test_gdb['id']

df_test_gdb = df_test_gdb.drop(['titulo', 'descripcion', 'tipodepropiedad', 'direccion', 'ciudad', 'provincia', 
                                 'fecha', 'zona', 'intervalo_metros_totales', 'intervalo_metros_cubiertos', 'id'], axis=1)

y_pred_test = gbm.predict(df_test_gdb[['antiguedad', 'habitaciones', 'garages', 'banos', 'metroscubiertos',
       'metrostotales', 'idzona', 'lat', 'lng', 'gimnasio', 'usosmultiples',
       'piscina', 'escuelascercanas', 'centroscomercialescercanos',
       'escomercial', 'promedio_precio_ciudad', 'promedio_id_zona',
       'porcentaje_metros', 'promedio_metros_tipo_propiedad']], num_iteration=gbm.best_iteration)

df_test_gdb['target'] = y_pred_test
df_test_gdb['id'] = df_test_gdb_ids

df_test_gdb[['id', 'target']].to_csv('respuesta.csv', index=False)


In [110]:
display(x_train.columns)

df_test_f.columns

Index(['antiguedad', 'habitaciones', 'garages', 'banos', 'metroscubiertos',
       'metrostotales', 'idzona', 'lat', 'lng', 'gimnasio', 'usosmultiples',
       'piscina', 'escuelascercanas', 'centroscomercialescercanos',
       'escomercial', 'promedio_precio_ciudad', 'promedio_id_zona',
       'porcentaje_metros', 'promedio_metros_tipo_propiedad'],
      dtype='object')

Index(['antiguedad', 'habitaciones', 'garages', 'banos', 'metroscubiertos',
       'metrostotales', 'idzona', 'lat', 'lng', 'gimnasio', 'usosmultiples',
       'piscina', 'escuelascercanas', 'centroscomercialescercanos',
       'escomercial', 'promedio_precio_ciudad', 'promedio_id_zona',
       'porcentaje_metros', 'promedio_metros_tipo_propiedad', 'target', 'id'],
      dtype='object')