## Regresión Lineal

In [None]:
# Librerias
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the California housing CSV and preview its first three rows
csv_path = "/content/california_housing.csv"
data = pd.read_csv(csv_path)
data.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY


## Regresión Lineal Simple

In [None]:
# Simple regression: a single predictor column and the target column
predictor_cols = ["median_income"]
X = data[predictor_cols]
y = data["median_house_value"]

In [None]:
# Split into train and test sets: 20% of the rows are held out,
# and the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,  # predictor: median income
    y,  # target: median house value
    test_size=0.2,
    random_state=55,
)

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Pipeline: scale the feature with StandardScaler, then fit a linear regression
steps = [
    ("scaler", StandardScaler()),
    ("regressor", LinearRegression()),
]
pipeline = Pipeline(steps=steps)

In [None]:
# Train the pipeline on the training split
pipeline.fit(X=X_train, y=y_train)

In [None]:
# Predict the target for the held-out test split
y_pred = pipeline.predict(X=X_test)

In [None]:
# Librerias de evaluación
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Evaluation of the predictions against the test targets
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# RMSE is computed as the square root of the MSE: the `squared=False` flag of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

print(f"Error Abosulto Medio: {mae}")
print(f"Error Cuadratico Medio: {mse}")
print(f"Error Cuadratico Medio de la raiz: {rmse}")
print(f"Puntaje R2: {r2}")

Error Abosulto Medio: 62967.66669406504
Error Cuadratico Medio: 7293971280.85341
Error Cuadratico Medio de la raiz: 85404.74975581517
Puntaje R2: 0.46031409383268607


## Regresión lineal simple con estratificación

In [None]:
import numpy as np

In [None]:
# Bucket 'median_income' into five labelled categories (used to stratify the split)
income_bin_edges = [0., 1.5, 3.0, 4.5, 6., np.inf]
income_bin_labels = [1, 2, 3, 4, 5]
data["income_cat"] = pd.cut(
    data["median_income"],
    bins=income_bin_edges,
    labels=income_bin_labels,
)

In [None]:
# Train/test split (20% test, fixed seed) stratified by the income category,
# so both splits keep the same proportion of each income bucket.
income_strata = data["income_cat"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=55,
    stratify=income_strata,
)

In [None]:
# Fresh pipeline for this section: scaling followed by linear regression
steps = [
    ("scaler", StandardScaler()),
    ("regressor", LinearRegression()),
]
pipeline = Pipeline(steps=steps)

In [None]:
# Train the pipeline on the stratified training split
pipeline.fit(X=X_train, y=y_train)

In [None]:
# Predict the target for the stratified test split
y_pred = pipeline.predict(X=X_test)

In [None]:
# Evaluation of the predictions against the test targets
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# RMSE is computed as the square root of the MSE: the `squared=False` flag of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

print(f"Error Abosulto Medio: {mae}")
print(f"Error Cuadratico Medio: {mse}")
print(f"Error Cuadratico Medio de la raiz: {rmse}")
print(f"Puntaje R2: {r2}")

Error Abosulto Medio: 63234.83089790239
Error Cuadratico Medio: 7151130533.148642
Error Cuadratico Medio de la raiz: 84564.35734485683
Puntaje R2: 0.46079076072602276


## Regresión lineal multivariada

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Select several predictors: seven numeric columns plus the categorical 'ocean_proximity'
features = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'population',
    'households',
    'median_income',
    'ocean_proximity',
]
X = data[features]
X.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,population,households,median_income,ocean_proximity
0,-122.23,37.88,41.0,880.0,322.0,126.0,8.3252,NEAR BAY
1,-122.22,37.86,21.0,7099.0,2401.0,1138.0,8.3014,NEAR BAY
2,-122.24,37.85,52.0,1467.0,496.0,177.0,7.2574,NEAR BAY


In [None]:
# Train/test split of the multi-feature matrix (20% test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,  # selected predictor columns
    y,  # target: median house value
    test_size=0.2,
    random_state=55,
)

In [None]:
# Pipeline with a column transformer: one-hot encode the categorical column,
# standardise the numeric columns, then fit a linear regression.
column_transformer = ColumnTransformer([
    # handle_unknown="ignore" encodes a category that was absent from the
    # training fold as all zeros instead of raising an error at predict time.
    ('ohe', OneHotEncoder(handle_unknown="ignore"), ["ocean_proximity"]),
    ('scaler', StandardScaler(), ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'population', 'households', 'median_income'])
], remainder="passthrough")

pipeline = Pipeline([
    ('transformer', column_transformer),
    ('regressor', LinearRegression())
])

In [None]:
# Train the multi-feature pipeline on the training split
pipeline.fit(X=X_train, y=y_train)

In [None]:
# Predict the target for the test split
y_pred = pipeline.predict(X=X_test)

In [None]:
# Evaluation of the predictions against the test targets
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# RMSE is computed as the square root of the MSE: the `squared=False` flag of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

print(f"Error Abosulto Medio: {mae}")
print(f"Error Cuadratico Medio: {mse}")
print(f"Error Cuadratico Medio de la raiz: {rmse}")
print(f"Puntaje R2: {r2}")

Error Abosulto Medio: 50520.99245463301
Error Cuadratico Medio: 5011183851.696949
Error Cuadratico Medio de la raiz: 70789.71571984838
Puntaje R2: 0.6292190915155824


## Regresión lineal multivariante con preprocesamiento adicional y estratificación

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Predictor columns for this section ('population' is not part of this list)
features = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'households',
    'median_income',
    'ocean_proximity',
]

In [None]:
# Instantiate the imputer; missing values are replaced by the column median
imputation_strategy = "median"
imputer = SimpleImputer(strategy=imputation_strategy)

In [None]:
# Fill missing values in 'total_bedrooms' and 'population', one column at a
# time; the imputer is re-fitted on each column and `data` is updated in place.
# NOTE(review): neither column appears in the `features` list chosen for this
# section, so the model trained below does not see the imputed values —
# confirm whether they were meant to be included.
for feature in ('total_bedrooms', "population"):
    data[feature] = imputer.fit_transform(data[[feature]])

In [None]:
# Predictor matrix restricted to the columns listed in `features`
X = data.loc[:, features]

In [None]:
# Train/test split (20% test, fixed seed).
# This section is titled as using stratification, so the split is stratified
# by the income category computed earlier, mirroring the stratified
# simple-regression section.
X_train, X_test, y_train, y_test = train_test_split(
    X,  # selected predictor columns
    y,  # target: median house value
    test_size=0.2,
    random_state=55,
    stratify=data["income_cat"],
)

In [None]:
# Pipeline with a column transformer: one-hot encode the categorical column,
# standardise the numeric columns, then fit a linear regression.
column_transformer = ColumnTransformer([
    # handle_unknown="ignore" encodes a category that was absent from the
    # training fold as all zeros instead of raising an error at predict time.
    ('ohe', OneHotEncoder(handle_unknown="ignore"), ["ocean_proximity"]),
    ('scaler', StandardScaler(), ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'households', 'median_income'])
], remainder="passthrough")

pipeline = Pipeline([
    ('transformer', column_transformer),
    ('regressor', LinearRegression())
])
# Train the model
pipeline.fit(X_train, y_train)
# Predict on the test split
y_pred = pipeline.predict(X_test)
# Evaluation
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# RMSE is computed as the square root of the MSE: the `squared=False` flag of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

print(f"Error Abosulto Medio: {mae}")
print(f"Error Cuadratico Medio: {mse}")
print(f"Error Cuadratico Medio de la raiz: {rmse}")
print(f"Puntaje R2: {r2}")

Error Abosulto Medio: 52887.44879308821
Error Cuadratico Medio: 5438807736.251751
Error Cuadratico Medio de la raiz: 73748.27276792149
Puntaje R2: 0.5975789088567136


## Regresión lineal multivariante con selección de features basada en VIF

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Missing values: this section uses the column mean as the fill value
imputer = SimpleImputer(strategy="mean")
# Fill 'total_bedrooms' and 'population' one column at a time, updating `data` in place.
# NOTE(review): the same two columns are already median-imputed earlier in the
# notebook, so in a top-to-bottom run there is presumably nothing left to fill
# here — confirm.
for feature in ('total_bedrooms', "population"):
    data[feature] = imputer.fit_transform(data[[feature]])

In [None]:
# Bucket 'median_income' into five labelled categories
# (same edges and labels as the earlier stratification section)
income_bin_edges = [0., 1.5, 3.0, 4.5, 6., np.inf]
income_bin_labels = [1, 2, 3, 4, 5]
data["income_cat"] = pd.cut(
    data["median_income"],
    bins=income_bin_edges,
    labels=income_bin_labels,
)

In [None]:
# Feature selection: candidate predictors for the VIF analysis, plus the target
features = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'population',
    'households',
    'median_income',
    'ocean_proximity',
]
X = data[features]
y = data["median_house_value"]

In [None]:
# Pre-processing for the VIF computation
X_processed = X.copy()  # work on a copy so X itself is left untouched
ohe = OneHotEncoder()
ocean_encoded = ohe.fit_transform(X_processed[["ocean_proximity"]])
X_encoded = ocean_encoded.toarray()  # convert the encoder output to a dense array

In [None]:
# Wrap the encoded array in a DataFrame whose columns are named after the categories
ocean_columns = ohe.get_feature_names_out(["ocean_proximity"])
X_encoded = pd.DataFrame(X_encoded, columns=ocean_columns)

In [None]:
# Remove the raw categorical column; its one-hot encoding is held in X_encoded
X_processed = X_processed.drop(columns=["ocean_proximity"])

In [None]:
# Put the numeric columns and the one-hot columns side by side; the index is
# reset first so both frames align row by row.
numeric_part = X_processed.reset_index(drop=True)
X_processed = pd.concat([numeric_part, X_encoded], axis=1)
X_processed

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,322.0,126.0,8.3252,0.0,0.0,0.0,1.0,0.0
1,-122.22,37.86,21.0,7099.0,2401.0,1138.0,8.3014,0.0,0.0,0.0,1.0,0.0
2,-122.24,37.85,52.0,1467.0,496.0,177.0,7.2574,0.0,0.0,0.0,1.0,0.0
3,-122.25,37.85,52.0,1274.0,558.0,219.0,5.6431,0.0,0.0,0.0,1.0,0.0
4,-122.25,37.85,52.0,1627.0,565.0,259.0,3.8462,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,845.0,330.0,1.5603,0.0,1.0,0.0,0.0,0.0
20636,-121.21,39.49,18.0,697.0,356.0,114.0,2.5568,0.0,1.0,0.0,0.0,0.0
20637,-121.22,39.43,17.0,2254.0,1007.0,433.0,1.7000,0.0,1.0,0.0,0.0,0.0
20638,-121.32,39.43,18.0,1860.0,741.0,349.0,1.8672,0.0,1.0,0.0,0.0,0.0


In [None]:
# Compute the variance inflation factor (VIF) of every column of X_processed.
# NOTE(review): no intercept column is added to the design matrix and every
# one-hot category is kept; the very large VIFs reported for some
# ocean_proximity columns may stem from that — confirm.
design_matrix = X_processed.values
column_count = len(X_processed.columns)
vif_data = pd.DataFrame({
    "feature": X_processed.columns,
    "VIF": [variance_inflation_factor(design_matrix, col_idx) for col_idx in range(column_count)],
})

In [None]:
# Display the VIF table (one row per feature)
vif_data

Unnamed: 0,feature,VIF
0,longitude,17.877996
1,latitude,19.838932
2,housing_median_age,1.319408
3,total_rooms,9.69798
4,population,6.026165
5,households,11.773368
6,median_income,1.567345
7,ocean_proximity_<1H OCEAN,14634.462892
8,ocean_proximity_INLAND,10251.574086
9,ocean_proximity_ISLAND,9.136133


In [None]:
# Keep only the features whose VIF is below the threshold; features at or
# above it are discarded as too collinear.
# The threshold applied here is 10 (the previous comment said 5, which did not
# match the code); 5 is the stricter alternative reported in the conclusions.
VIF_THRESHOLD = 10
features_to_use = vif_data.loc[vif_data["VIF"] < VIF_THRESHOLD, "feature"].tolist()
features_to_use

['housing_median_age',
 'total_rooms',
 'population',
 'median_income',
 'ocean_proximity_ISLAND']

In [None]:
# Restrict the processed matrix to the features that passed the VIF filter
X_vif_selected = X_processed.loc[:, features_to_use]
X_vif_selected

Unnamed: 0,housing_median_age,total_rooms,population,median_income,ocean_proximity_ISLAND
0,41.0,880.0,322.0,8.3252,0.0
1,21.0,7099.0,2401.0,8.3014,0.0
2,52.0,1467.0,496.0,7.2574,0.0
3,52.0,1274.0,558.0,5.6431,0.0
4,52.0,1627.0,565.0,3.8462,0.0
...,...,...,...,...,...
20635,25.0,1665.0,845.0,1.5603,0.0
20636,18.0,697.0,356.0,2.5568,0.0
20637,17.0,2254.0,1007.0,1.7000,0.0
20638,18.0,1860.0,741.0,1.8672,0.0


In [None]:
# Train/test split of the VIF-selected features (20% test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X_vif_selected,  # predictors kept by the VIF filter
    y,  # target: median house value
    test_size=0.2,
    random_state=55,
)

In [None]:
# Pipeline for the VIF-selected features: scaling followed by linear regression
steps = [
    ("scaler", StandardScaler()),
    ("regressor", LinearRegression()),
]
pipeline = Pipeline(steps=steps)

In [None]:
# Train the pipeline on the VIF-selected training split
pipeline.fit(X=X_train, y=y_train)

In [None]:
# Predict the target for the test split
y_pred = pipeline.predict(X=X_test)

In [None]:
# Evaluation of the predictions against the test targets
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# RMSE is computed as the square root of the MSE: the `squared=False` flag of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

print(f"Error Abosulto Medio: {mae}")
print(f"Error Cuadratico Medio: {mse}")
print(f"Error Cuadratico Medio de la raiz: {rmse}")
print(f"Puntaje R2: {r2}")

Error Abosulto Medio: 60486.50073549539
Error Cuadratico Medio: 6758811077.5970335
Error Cuadratico Medio de la raiz: 82211.98864884023
Puntaje R2: 0.4999109620020531


Con VIF < 5

Error Absoluto Medio: 60948.576637324826

Error Cuadratico Medio: 6898579150.116684

Error Cuadratico Medio de la raiz: 83057.68567758605

Puntaje R2: 0.4895694270594859

# Conclusiones


1. La regresión lineal multivariante con selección de features basada en VIF menor a diez (10), que incluyó las variables 'housing_median_age', 'total_rooms', 'population', 'median_income' y 'ocean_proximity_ISLAND', obtuvo las siguientes métricas:

Error Absoluto Medio: 60486.50073549539

Error Cuadratico Medio: 6758811077.5970335

Error Cuadratico Medio de la raiz: 82211.98864884023

Puntaje R2: 0.4999109620020531

2. La regresión lineal multivariante con selección de features basada en VIF menor a cinco (5), que incluyó las variables 'housing_median_age' y 'median_income', obtuvo las siguientes métricas:

Con VIF < 5

Error Absoluto Medio: 60948.576637324826

Error Cuadratico Medio: 6898579150.116684

Error Cuadratico Medio de la raiz: 83057.68567758605

Puntaje R2: 0.4895694270594859
