In [1]:
import warnings

import numpy as np
import pandas as pd
from hyperopt import fmin, tpe, hp
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

try:
    # Legacy imputer, removed in scikit-learn 0.22 in favour of SimpleImputer.
    # Guarded so the notebook still imports cleanly on current releases.
    from sklearn.preprocessing import Imputer
except ImportError:
    Imputer = None

# Load both Kaggle files in one pass: the training data first, then the
# test set, which is only needed to build the Kaggle submission.
df_train, df_test = [
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
]

## Métrica de evaluación

In [2]:
def RMSLE(actual, pred):
    """Root mean squared logarithmic error between targets and predictions.

    Computes ``sqrt(mean((log(1 + actual) - log(1 + pred)) ** 2))``.

    Parameters
    ----------
    actual : array-like
        Ground-truth values (list, tuple, NumPy array or pandas Series).
    pred : array-like
        Predicted values, same length as ``actual``.

    Returns
    -------
    float
        The RMSLE of ``pred`` with respect to ``actual``. The result is NaN
        when any input is below -1, where ``log(1 + x)`` is undefined.
    """
    # Coerce up front: plain Python sequences do not support ``seq + 1`` and
    # both inputs must be compared positionally, not by pandas index.
    actual = np.asarray(actual, dtype=float)
    pred = np.asarray(pred, dtype=float)
    # log1p(x) evaluates log(1 + x) without losing precision for tiny x.
    log_diff = np.log1p(actual) - np.log1p(pred)
    return np.mean(log_diff ** 2) ** .5

## Set de datos precio/metroscubiertos

In [3]:
# Single-feature dataset: covered area as the predictor, price as the target.
x = df_train[['metroscubiertos']]
y = df_train['precio']

# Hold out 25% of the training frame (fixed seed) and keep independent copies
# of every piece so the column assignments below do not raise
# SettingWithCopyWarning.
x_train, x_test, y_train, y_test = (
    part.copy()
    for part in train_test_split(x, y, test_size=0.25, random_state=1)
)
print(f"Train shapes: x={x_train.shape} y={y_train.shape}")
print(f"Test  shapes: x={x_test.shape}  y={y_test.shape}")

# Fill missing values. The imputer (default settings) is fitted on the
# training split only and then reused, unchanged, on the held-out split.
imp = SimpleImputer()
x_train['metroscubiertos'] = imp.fit_transform(x_train[['metroscubiertos']])
x_test['metroscubiertos'] = imp.transform(x_test[['metroscubiertos']])

Train shapes: x=(180000, 1) y=(180000,)
Test  shapes: x=(60000, 1)  y=(60000,)


## Regresión lineal

In [4]:
# Fit ordinary least squares on the single feature, then predict the
# held-out split.
linear_model = LinearRegression()
linear_model.fit(x_train, y_train)
pred = linear_model.predict(x_test)

# Score both splits. The single-variable linear regression reaches
# RMSLE=0.65.
linear_rmsle = RMSLE(y_test, pred)
linear_rmsle_train = RMSLE(y_train, linear_model.predict(x_train))
print(f"RMSLE LinearRegression (train): {linear_rmsle_train:.5f}")
print(f"RMSLE LinearRegression: {linear_rmsle:.5f}")

RMSLE LinearRegression (train): 0.65673
RMSLE LinearRegression: 0.65657


## Regresión polinómica de grado k

In [5]:
# Plain NumPy views of the single feature and the target for np.polyfit.
x_train_arr = x_train['metroscubiertos'].values
x_test_arr = x_test['metroscubiertos'].values
y_train_arr = y_train.values
y_test_arr = y_test.values


def _fit_poly(degree):
    """Return the least-squares polynomial of ``degree`` fitted on the training split."""
    # np.polyfit needs an integer degree; hyperopt hands back floats.
    return np.poly1d(np.polyfit(x_train_arr, y_train_arr, int(degree)))


# High-degree fits are poorly conditioned and make NumPy emit RankWarning.
# catch_warnings() silences them only inside this block and restores the
# previous warning filters afterwards, even if the search raises.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    # quniform with q=1 samples whole-number degrees in [1, 30], so the
    # optimiser searches the discrete space that polyfit actually uses.
    # NOTE(review): the degree is chosen by *training* RMSLE, which tends to
    # favour high degrees; consider scoring on a validation split instead.
    # NOTE(review): fmin is not seeded, so the chosen k can vary between runs.
    k = int(fmin(lambda degree: RMSLE(y_train_arr, _fit_poly(degree)(x_train_arr)),
                 space=hp.quniform('k', 1, 30, 1), algo=tpe.suggest, max_evals=200)['k'])

    # Fit the selected polynomial once and reuse it for both splits.
    poly_k = _fit_poly(k)
    poly_k_rmsle_train = RMSLE(y_train_arr, poly_k(x_train_arr))
    poly_k_rmsle = RMSLE(y_test_arr, poly_k(x_test_arr))

# These scores belong to the polynomial model, so label them as such.
print(f"RMSLE Poly K={k} (train): {poly_k_rmsle_train:.5f}")
print(f"RMSLE Poly K={k}: {poly_k_rmsle:.5f}")

100%|██████████| 200/200 [00:27<00:00,  7.36it/s, best loss: 0.6496065747766164]
RMSLE LinearRegression (train): 0.64961
RMSLE LinearRegression: 0.65024
