## Cargas iniciales

In [None]:
#Las librerías utilizadas en este documento son:
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder 
import statsmodels.api as sm
from sklearn import metrics
from matplotlib.ticker import ScalarFormatter
from matplotlib import gridspec
sns.set()

In [None]:
# Load the dataset from CSV and report its dimensions.
data = pd.read_csv("./data/data_limpio_gdf.csv")
n_rows, n_cols = data.shape
print((n_rows, n_cols))
print("El dataset está compuesto por:", n_rows, "filas y", n_cols, "columnas.")
# Display five randomly chosen rows as the cell output.
data.sample(n=5)

## Elección de datos


### Elección del subconjunto de datos

In [None]:
# Keep only the rows whose provincia is 'Capital Federal' and whose tipo is 'apartment'.
is_caba_apartment = (data['provincia'] == 'Capital Federal') & (data['tipo'] == 'apartment')
# The columns used for filtering, and their *_cat_code counterparts, are removed afterwards.
data = data[is_caba_apartment].drop(
    columns=['provincia', 'provincia_cat_code', 'tipo', 'tipo_cat_code']
)
n_rows, n_cols = data.shape
print("El dataset está compuesto por:", n_rows, "filas y", n_cols, "columnas.")
# Display five randomly chosen rows as the cell output.
data.sample(n=5)

### Limpieza de datos

In [None]:
# Row count before cleaning, followed by the number of missing values per column.
print("El dataset tiene:", len(data), "filas")
data.isnull().sum()

In [None]:
# Discard every row that has at least one missing value, then report the new row count.
data = data.dropna(axis=0, how='any')
print("El dataset tiene:", len(data), "filas")

### Exploración de datos


In [None]:
# Correlation between the variables.
# Only numeric/boolean columns are passed to corr(): since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# an error if any are present.
data_corr = data.select_dtypes(include=['number', 'bool']).corr()

# Plot the correlation matrix as an annotated heatmap.
plt.figure(figsize=(6,6))
sns.heatmap(data_corr, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Correlación entre variables")
plt.show()

# Observations recorded by the original author from the heatmap:
# - a significant correlation between sup_m2_total and precio_usd (0.39)
# - tipo_cat_code / ambientes_cat_code and precio_usd (0.30)
# - precio_usd_por_m2_cat and lon
# NOTE(review): tipo_cat_code is dropped earlier in this notebook, so the
# second observation may be stale - confirm against the current heatmap.

## Separación de datos

### Elección de features y target

In [None]:
# Renumber the rows with a fresh default index, discarding the previous one.
data = data.reset_index(drop=True)

In [None]:
# Feature matrix (X) and target vector (y) used by every model below.
feature_columns = ['sup_m2_total', 'ambientes_cat', 'lat', 'lon', 'municipio']
target_column = 'precio_usd'

X = data.loc[:, feature_columns]
y = data.loc[:, target_column]

### Train/test split

In [None]:
# Hold out 20% of the rows for testing.
# A fixed random_state makes the split, and therefore every metric computed
# from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Dimensions of each resulting split.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

## Preprocesamiento de datos

### Feature engineering (categóricas)

In [None]:
# One-hot encode the categorical features of each split independently with pandas.
# NOTE(review): encoding each split on its own produces different columns whenever
# a category is missing from one of the splits; compare the two shapes printed below.
dummy_columns = ['ambientes_cat', 'municipio']
X_train_dummies, X_test_dummies = (
    pd.get_dummies(split, columns=dummy_columns) for split in (X_train, X_test)
)

print(X_train_dummies.shape)
print(X_test_dummies.shape)


In [None]:
# Columns that are one-hot encoded.
categorical_columns = ['ambientes_cat', 'municipio']
X_categorical_columns = list(categorical_columns)

# One array of distinct values per categorical column, taken from the full
# dataset (not from a single split), in the same order as X_categorical_columns.
encoder_categories = [data[name].unique() for name in X_categorical_columns]

encoder_categories

In [None]:
# One-hot encode using a fixed category list per column so that train and test
# always produce the same output columns; drop='first' omits the first listed
# category of every column.
# The dense-output flag is deliberately not passed: its name changed between
# scikit-learn releases (sparse -> sparse_output, with sparse later removed).
# The default sparse result is converted with .toarray() instead, which works
# on every release.
encoder = OneHotEncoder(categories = encoder_categories, drop='first')

# Fit on the training split only.
encoder = encoder.fit(X_train[X_categorical_columns])

# get_feature_names() was removed from scikit-learn in favour of
# get_feature_names_out(); fall back to the old name only on old releases.
if hasattr(encoder, 'get_feature_names_out'):
    encoded_feature_names = encoder.get_feature_names_out(X_categorical_columns)
else:
    encoded_feature_names = encoder.get_feature_names(X_categorical_columns)

X_train_encoded = encoder.transform(X_train[X_categorical_columns]).toarray()
X_train_categorical = pd.DataFrame(X_train_encoded, columns = encoded_feature_names)

X_test_encoded = encoder.transform(X_test[X_categorical_columns]).toarray()
X_test_categorical = pd.DataFrame(X_test_encoded, columns = encoded_feature_names)
X_test_categorical.head()

### Feature engineering (numéricas)

In [None]:
# Every column that is not in X_categorical_columns is kept as a numerical feature.
X_train_numerical, X_test_numerical = (
    split.drop(columns=X_categorical_columns) for split in (X_train, X_test)
)

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits.
numerical_feature_names = X_train_numerical.columns
scaler = RobustScaler().fit(X_train_numerical)

X_train_scaled = scaler.transform(X_train_numerical)
X_test_scaled = scaler.transform(X_test_numerical)

# Wrap the scaled arrays back into DataFrames. No index is passed, so both
# frames receive a fresh default index.
X_train_numerical = pd.DataFrame(X_train_scaled, columns=numerical_feature_names)
X_test_numerical = pd.DataFrame(X_test_scaled, columns=numerical_feature_names)
X_test_numerical.head()


### Combinación de features

In [None]:
# Column-wise concatenation: encoded categorical columns first, scaled numerical columns after.
X_train = pd.concat(objs=[X_train_categorical, X_train_numerical], axis='columns')
X_test = pd.concat(objs=[X_test_categorical, X_test_numerical], axis='columns')

In [None]:
# Sanity check: dimensions of every split after preprocessing.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

## Regresión lineal

In [None]:
# The simple regression uses a single predictor; selecting with a list keeps the result a DataFrame.
simple_features = ['sup_m2_total']
X_train_simple = X_train[simple_features]
X_test_simple = X_test[simple_features]

print(X_train_simple.shape)
print(X_test_simple.shape)

In [None]:
# Single-feature inputs for the simple linear regression.
X_train_simple = X_train.loc[:, ['sup_m2_total']]
X_test_simple = X_test.loc[:, ['sup_m2_total']]

# Build the linear regression estimator and fit it on the training split.
lr = linear_model.LinearRegression().fit(X_train_simple, y_train)

# Predict the target for the held-out test split.
y_pred = lr.predict(X_test_simple)


In [None]:
# The intercept and the coefficients are attributes of the fitted estimator.
print ('Intercepto=', ' ', lr.intercept_)
print ('sup_m2_total=', ' ', lr.coef_)

# Goodness of fit: R2 on each split, then the test mean squared error and its square root.
r2_train = lr.score(X_train_simple, y_train)
r2_test = lr.score(X_test_simple, y_test)
mse_test = mean_squared_error(y_test, y_pred)

print ('R2_train=', ' ', r2_train)
print ('R2_test=', ' ', r2_test)
print ("EMC:", mse_test)
print ("r_EMC:", np.sqrt(mse_test))

In [None]:
# Scatter of actual vs predicted values; a point on the diagonal is a perfect prediction.
plt.scatter(y_test, y_pred,  color='black')

# Identity line (y = x) spanning the observed range of both series, instead of
# a hard-coded 0-1,000,000 segment that may not match the data.
lower = min(np.min(y_test), np.min(y_pred))
upper = max(np.max(y_test), np.max(y_pred))
plt.plot([lower, upper], [lower, upper], color='blue', linewidth=3)

plt.xlabel('Precio real')
plt.ylabel('Precio predicho')
plt.title('Precio real vs Precio predicho')
plt.show()

## Regresión lineal múltiple

## Regresión lineal con regularización