In [None]:
# Libraries used in this notebook:
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from sklearn import metrics
from matplotlib.ticker import ScalarFormatter
from matplotlib import gridspec
sns.set()

In [None]:
# Read the DataFrame that was prepared for the Caballito neighbourhood.
df = pd.read_csv(
    filepath_or_buffer='./data/data_limpio_gdf_caballito.csv',
)

Usamos OLS para ver cómo resultan las métricas al usar las variables latitud, longitud y superficie total.

In [None]:
# Pick the predictor columns (X) and the target column (y).
X = df.loc[:, ['lat', 'lon', 'sup_m2_total']]
y = df.loc[:, 'precio_usd']


In [None]:
# Standardize the features.
#
# The scaler statistics (mean / std) are estimated on the training rows only,
# so no information about the test rows leaks into the preprocessing (this
# matters for the regularized models, whose penalty depends on feature scale).
#
# train_test_split is called here with the same test_size and random_state as
# the split cell that follows; for a fixed seed and number of rows it selects
# the same rows, so the rows used to fit the scaler are exactly the rows that
# later end up in X_train. Keep both calls in sync if either value changes.
X_fit_rows = train_test_split(X, test_size=0.2, random_state=42)[0]

scaler = StandardScaler()
scaler.fit(X_fit_rows)

# Apply the train-fitted transformation to every row; the split happens next.
X = scaler.transform(X)


In [None]:
# Split into train and test sets (20% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Instantiate the linear regression model and train it on the training split.
lr = linear_model.LinearRegression()
lr.fit(X=X_train, y=y_train)


In [None]:
# Show the fitted coefficients and the intercept.
for _label, _value in (('Coeficientes: ', lr.coef_), ('Intercepto: ', lr.intercept_)):
    print(_label, _value)

In [None]:
# Compute R2 on the test split.
print('R2: ', r2_score(y_true=y_test, y_pred=lr.predict(X_test)))


In [None]:
# Fit the same regression with statsmodels to get the full summary table.
# add_constant appends the intercept column that sm.OLS does not add itself.
X_train_sm = sm.add_constant(X_train)
model = sm.OLS(endog=y_train, exog=X_train_sm).fit()

model.summary()

In [None]:
# Try ridge regularization; alpha is chosen by cross-validation over
# 13 log-spaced candidates between 1e-6 and 1e6.
lm_ridge = linear_model.RidgeCV(
    alphas=np.logspace(start=-6, stop=6, num=13),
)

model_ridge = lm_ridge.fit(X=X_train, y=y_train)

# Selected regularization strength.
lm_ridge.alpha_

In [None]:
# Score the ridge model on the test split.
model_ridge.score(X=X_test, y=y_test)

In [None]:
# Try lasso regularization; alpha is chosen by 5-fold cross-validation over
# 13 log-spaced candidates between 1e-6 and 1e6.
lm_lasso = linear_model.LassoCV(
    alphas=np.logspace(start=-6, stop=6, num=13),
    cv=5,
)

model_lasso = lm_lasso.fit(X=X_train, y=y_train)

# Selected regularization strength.
lm_lasso.alpha_

In [None]:
# Score the lasso model on the test split.
model_lasso.score(X=X_test, y=y_test)

#### Analizaremos el impacto de la variable ambientes en la predicción de la variable precio.

In [None]:
# Keep only the columns needed for the room-category analysis.
df_0_ambientes_cat = df.loc[:, ['ambientes_cat', 'precio_usd', 'municipio']]

In [None]:
# Build dummy variables for the categorical 'ambientes_cat' column.
# drop_first=True drops the first category level.
df_0_ambientes_cat = pd.get_dummies(
    data=df_0_ambientes_cat,
    columns=['ambientes_cat'],
    drop_first=True,
)
print(df_0_ambientes_cat.shape)
df_0_ambientes_cat.head()

In [None]:
# Attach the total-surface column to the dummy-encoded frame, aligning on the
# index and keeping only the rows present in both.
df_cat = pd.concat(
    objs=[df_0_ambientes_cat, df["sup_m2_total"]],
    axis=1,
    join='inner',
)
print(df_cat.shape)
df_cat.head(4)

In [None]:
# Pick the predictor columns (room-category dummies + total surface) and the target.
X_cat = df_cat.loc[
    :,
    ['ambientes_cat_1', 'ambientes_cat_2', 'ambientes_cat_3', 'ambientes_cat_4 o mas', 'sup_m2_total'],
]
y = df_cat.loc[:, 'precio_usd']


In [None]:
# Standardize the features.
#
# The scaler statistics (mean / std) are estimated on the training rows only,
# so no information about the test rows leaks into the preprocessing (this
# matters for the regularized models, whose penalty depends on feature scale).
#
# train_test_split is called here with the same test_size and random_state as
# the split cell that follows; for a fixed seed and number of rows it selects
# the same rows, so the rows used to fit the scaler are exactly the rows that
# later end up in X_train_cat. Keep both calls in sync if either value changes.
X_cat_fit_rows = train_test_split(X_cat, test_size=0.2, random_state=42)[0]

scaler = StandardScaler()
scaler.fit(X_cat_fit_rows)

# Apply the train-fitted transformation to every row; the split happens next.
X_cat = scaler.transform(X_cat)


In [None]:
# Split into train and test sets (20% held out, fixed seed).
X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(
    X_cat,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Instantiate the linear regression model and train it on the training split.
lr_cat = linear_model.LinearRegression()
lr_cat.fit(X=X_train_cat, y=y_train_cat)


In [None]:
# Show the fitted coefficients and the intercept.
for _label, _value in (('Coeficientes: ', lr_cat.coef_), ('Intercepto: ', lr_cat.intercept_)):
    print(_label, _value)

In [None]:
# Compute R2 on the test split.
print('R2: ', r2_score(y_true=y_test_cat, y_pred=lr_cat.predict(X_test_cat)))


In [None]:
# Fit the same regression with statsmodels to get the full summary table.
# add_constant appends the intercept column that sm.OLS does not add itself.
# Note: this rebinds X_train_sm, which an earlier cell also assigns.
X_train_sm = sm.add_constant(X_train_cat)
model_stats = sm.OLS(endog=y_train_cat, exog=X_train_sm).fit()

model_stats.summary()

In [None]:
# Try ridge regularization; alpha is chosen by cross-validation over
# 13 log-spaced candidates between 1e-6 and 1e6.
lm_ridge_2 = linear_model.RidgeCV(
    alphas=np.logspace(start=-6, stop=6, num=13),
)

model_ridge_2 = lm_ridge_2.fit(X=X_train_cat, y=y_train_cat)

# Selected regularization strength.
lm_ridge_2.alpha_

In [None]:
# Score the ridge model that was fitted on the room-category features
# (model_ridge_2). The earlier model_ridge was trained on a different feature
# set (lat, lon, sup_m2_total), so it cannot be evaluated on X_test_cat.
model_ridge_2.score(X_test_cat, y_test_cat)

In [None]:
# Try lasso regularization; alpha is chosen by 5-fold cross-validation over
# 13 log-spaced candidates between 1e-6 and 1e6.
lm_lasso_2 = linear_model.LassoCV(
    alphas=np.logspace(start=-6, stop=6, num=13),
    cv=5,
)

model_lasso_2 = lm_lasso_2.fit(X=X_train_cat, y=y_train_cat)

# Selected regularization strength.
lm_lasso_2.alpha_

In [None]:
# Score the lasso model on the test split.
model_lasso_2.score(X=X_test_cat, y=y_test_cat)

PENDIENTE- VER SI AJUSTAMOS EL MODELO REDUCIENDO LA CANTIDAD DE DATOS

In [None]:
# superficie_min=15
# superficie_max=1000

# data = data[(data.sup_m2_total <= 1000) & (data.sup_m2_total >= 15)]

# data = data[(data.precio_usd <= 4000000)]

In [None]:
# # Generamos una función que resume los coeficientes, el intercepto y el R2
# # "model" = objeto con el modelo
# # "X" = matrix de variables independientes

# def sum_mod(lr, X):
#     a = pd.DataFrame(lr.coef_ , X_simple.columns.values)
#     a = a.append(pd.DataFrame([lr.intercept_, lr.score(X_simple, y)], index=['Intecept','R2']))
#     return(a)