# Imports

In [None]:
from pathlib import Path
import pandas as pd
from pandas.plotting import scatter_matrix
import tarfile
import urllib.request
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler,  StandardScaler, FunctionTransformer

# Lectura de los datos

In [None]:
def load_housing_data():
    """Return the housing dataset as a DataFrame, fetching it on demand.

    The CSV is expected at ``datasets/housing/housing.csv`` (relative to the
    current working directory). When it is missing, the tarball
    ``datasets/housing.tgz`` is downloaded if it is not already present and
    then extracted into ``datasets``.

    Returns:
        pandas.DataFrame: the contents of ``housing.csv``.

    Raises:
        urllib.error.URLError: if the tarball has to be downloaded and the
            download fails.
        tarfile.TarError: if the tarball cannot be read.
    """
    datasets_dir = Path("datasets")
    tarball_path = datasets_dir / "housing.tgz"
    csv_path = datasets_dir / "housing" / "housing.csv"
    # Key the work on the CSV rather than on the tarball: a tarball left
    # behind by an interrupted extraction must still get extracted.
    if not csv_path.is_file():
        datasets_dir.mkdir(parents=True, exist_ok=True)
        if not tarball_path.is_file():
            url = "https://github.com/ageron/data/raw/main/housing.tgz"
            urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            if hasattr(tarfile, "data_filter"):
                # Extraction filters are available: use the "data" filter so
                # archive members cannot escape the destination directory.
                housing_tarball.extractall(path=datasets_dir, filter="data")
            else:
                housing_tarball.extractall(path=datasets_dir)
    return pd.read_csv(csv_path)

In [None]:
# Fetch the dataset if needed and load it into a DataFrame
housing = load_housing_data()

In [None]:
# Quick look at the structure of the data (first rows)
housing.head()

# Exploracion Inicial

Cada registro es un distrito.

In [None]:
# General information about the data
housing.info()

In [None]:
# Categorical feature: number of rows for each ocean_proximity value
housing["ocean_proximity"].value_counts()

Estos datos son por distrito y no por casa.

In [None]:
# Numeric features: summary statistics (null values are ignored)
housing.describe()

In [None]:
# Histograms of the features, 100 bins each
housing.hist(bins=100, figsize=(20,15))
plt.show()

# Crear conjuntos de prueba y entrenamiento

In [None]:
# Separacion de conjuntos sin estratificacion
# X_train, X_test = train_test_split(housing, test_size=0.2, random_state=42)

Para que la separación de conjuntos sea más representativa, hacemos una separación estratificada según una variable importante; en este caso seleccionamos `median_income`, pero primero tenemos que transformarla en una variable categórica.

In [None]:
# Inspect the distribution of median_income before binning it into categories
housing['median_income'].describe()

In [None]:
# Bucket "median_income" into five labelled categories for the stratified split
housing["income_cat"] = pd.cut(
    x=housing["median_income"],
    labels=[1, 2, 3, 4, 5],
    bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
)

In [None]:
# Check the distribution: number of districts in each income category
housing["income_cat"].value_counts().sort_index().plot.bar(rot=0, grid=True)
# The original line was a bare `plt.xlabel` (attribute access, never called),
# so it set no label; call it with an explicit label instead.
plt.xlabel("Income category")
plt.ylabel("Number of Districts")
plt.show()

In [None]:
# Stratified 80/20 split: stratify on income_cat with a fixed random_state
strat_train_set, strat_test_set = train_test_split(
    housing, test_size=0.2, random_state=42, stratify=housing["income_cat"]
)
# income_cat is only needed for the stratification, so remove it afterwards.
# Rebind to new frames instead of dropping in place on the split results.
strat_train_set = strat_train_set.drop(columns="income_cat")
strat_test_set = strat_test_set.drop(columns="income_cat")

In [None]:
# Work on a copy of the training set so the analysis does not modify strat_train_set
housing = strat_train_set.copy()

# EDA y Visualizacion de datos

## Datos Geograficos

In [None]:
# Visualize the geographic data: latitude against longitude, with
# semi-transparent points (alpha=0.2) so overlapping points remain visible
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.2, grid=True)
plt.show()

In [None]:
# Same geographic scatter plot, with marker size given by population / 100
# and marker color given by median_house_value (jet colormap, with colorbar)
housing.plot(kind="scatter", x="longitude", y="latitude", grid=True, s=housing["population"]/100, label="population",
             c="median_house_value", cmap="jet", colorbar=True, legend=True, sharex=False, figsize=(10,7))
plt.show()

## Correlaciones

In [None]:
# Exclude the non-numeric column before computing the correlation matrix
corr_matrix = housing.drop("ocean_proximity", axis=1).corr()
# Correlation of every column with median_house_value, in descending order
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
# Pairwise scatter plots for a selection of attributes
attributes = ['median_house_value', 'median_income', 'total_rooms', 'housing_median_age']
scatter_matrix(housing[attributes], figsize=(12, 8))
plt.show()

In [None]:
# Scatter plot of median_house_value against median_income
housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1, grid=True)
plt.show()

# Experimentar con combinaciones de atributos

In [None]:
# Create new attributes as ratios of existing columns
housing["rooms_per_house"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_ratio"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["people_per_house"] = housing["population"] / housing["households"]

In [None]:
# Recompute the correlations, now including the newly created ratio attributes
corr_matrix = housing.drop("ocean_proximity", axis=1).corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
# housing is rebuilt from strat_train_set here, so columns added to the
# earlier exploratory copy are not carried over.
housing = strat_train_set.drop("median_house_value", axis=1) # Training data without the target variable
housing_labels = strat_train_set["median_house_value"].copy() # Target variable to predict

# Limpiar Datos

## Features Numericas

In [None]:
# Replace null values in the numeric columns with each column's median.
housing_num = housing.select_dtypes(include=[np.number])  # numeric columns only
impute = SimpleImputer(strategy="median")  # median-replacement imputer

# Fit the imputer and transform the data in a single step; the result is a
# NumPy array, so it is wrapped back into a DataFrame that keeps the original
# column names and row index.
X = impute.fit_transform(housing_num)
housing_tr = pd.DataFrame(data=X, index=housing_num.index, columns=housing_num.columns)

## Features Categoricas

In [None]:
# Text / categorical attributes
housing_cat = housing[["ocean_proximity"]]
housing_cat.value_counts()

In [None]:
cat_encoder = OneHotEncoder(sparse_output=False) # One-hot encoder for the categorical feature; sparse_output=False requests a dense array
housing_cat_1hot = cat_encoder.fit_transform(housing_cat) # Encode the categorical feature as numeric one-hot columns
type(housing_cat_1hot) # Dense NumPy array here, not a SciPy sparse matrix, because sparse_output=False

In [None]:
cat_encoder.categories_ # Categories of the categorical feature

# Escalado de caracteristicas y Transformacion

In [None]:
# Min-max normalization of the numeric data
# NOTE(review): this scales housing_num, not the imputed housing_tr — confirm that is intended
min_max_scaler = MinMaxScaler() # Scaler used to normalize the data
housing_num_min_max_scaled = min_max_scaler.fit_transform(housing_num) # Fit on housing_num and normalize it

In [None]:
# NOTE(review): this scales housing_num, not the imputed housing_tr — confirm that is intended
std_scaler = StandardScaler() # Scaler used to standardize the data
housing_num_std_scaled = std_scaler.fit_transform(housing_num) # Fit on housing_num and standardize it

In [None]:
log_transformer = FunctionTransformer(np.log, inverse_func=np.exp) # Transformer that applies the natural logarithm; the inverse_func argument is optional
log_pop = log_transformer.transform(housing[["population"]]) # Apply the logarithm to the "population" column