# Proyecto Final - Machine Learning I: Regresión

## 0. Importar librerías

In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

## 1. Obtención de datos

In [18]:
# Load the California Housing dataset through scikit-learn.
# Note: despite its name, `california_df` is the object returned by
# fetch_california_housing(); only its .data, .target and .feature_names
# attributes are read here.
california_df = fetch_california_housing()
feature_names = california_df.feature_names
X = california_df.data
y = california_df.target

# Combine the features and the target into one DataFrame to ease exploration.
df = pd.DataFrame(X, columns=feature_names).assign(target=y)

## 2. Exploración de datos

In [24]:
# Data exploration: show the first rows, then the per-column summary statistics.
display(df.head(), df.describe())

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


## 3. Preparación de datos

In [25]:
# Split into training and test sets (20% held out for testing); the fixed
# random_state keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Show the dimensions of each resulting set.
# Fix: the X_test line previously printed X_train.shape (copy-paste slip),
# so the reported test-set shape was wrong.
print(f"X_train.shape = {X_train.shape}")
print(f"X_test.shape = {X_test.shape}")
print(f"y_train.shape = {y_train.shape}")
print(f"y_test.shape = {y_test.shape}")

X_train.shape = (16512, 8)
X_test.shape = (4128, 8)
y_train.shape = (16512,)
y_test.shape = (4128,)


In [27]:
# Standardize the features. The scaler is fitted on the training split only
# and then applied, unchanged, to both the training and the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 4. Modelado

In [30]:
# Models to evaluate, keyed by the display name used in the results tables.
#
# The tree-based estimators are randomized, so they get an explicit
# random_state (the same seed value as the train/test split). Without it,
# their metrics and feature importances change on every run.
#
# NOTE(review): Lasso is left at its default regularization strength. The
# recorded R2 of about 0 for it suggests the default alpha is too strong for
# this target scale; consider tuning alpha. Confirm before changing.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42)
}

In [32]:
# One record (dict) per model, in the iteration order of `models`.
results = []

# Fit every model on the scaled training data and score it on the scaled test data.
for name, model in models.items():
    # fit() returns the estimator itself, so the prediction can be chained.
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)

    # Store the model name together with its test-set metrics.
    results.append({
        'Model': name,
        'R2 Score': r2_score(y_test, y_pred),
        'MSE': mean_squared_error(y_test, y_pred),
    })

# Turn the collected records into a DataFrame and show it.
results_df = pd.DataFrame(results)
display(results_df)

Unnamed: 0,Model,R2 Score,MSE
0,Linear Regression,0.575788,0.555892
1,Ridge Regression,0.575816,0.555855
2,Lasso Regression,-0.000219,1.310696
3,Decision Tree,0.613767,0.506123
4,Random Forest,0.805229,0.25523


## 5. Evaluación

In [33]:
# Rank the features by the importance scores of the fitted Random Forest
# (the estimator stored in `models` was trained in the evaluation loop).
rf_model = models['Random Forest']
feature_importances = rf_model.feature_importances_

unsorted_importances = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances,
})
# Most important feature first; the original row index is kept.
importance_df = unsorted_importances.sort_values('Importance', ascending=False)

# Report only the two highest-ranked features.
print("Feature importances for Random Forest:")
print(importance_df.head(2))

Feature importances for Random Forest:
    Feature  Importance
0    MedInc    0.526505
5  AveOccup    0.136435


In [36]:
# Names of the two highest-ranked features in importance_df.
best_features = importance_df.head(2)['Feature'].tolist()
top_features_label = ', '.join(best_features)

# Build the final summary table column by column from the per-model records.
# NOTE: the 'Top 2 Features' value is derived from importance_df only, so the
# same label is repeated on every row; it is not computed per model.
final_results = {
    'Model': [record['Model'] for record in results],
    'R2 Score': [record['R2 Score'] for record in results],
    'MSE': [record['MSE'] for record in results],
    'Top 2 Features': [top_features_label for _ in results],
}

final_results_df = pd.DataFrame(final_results)
display(final_results_df)

Unnamed: 0,Model,R2 Score,MSE,Top 2 Features
0,Linear Regression,0.575788,0.555892,"MedInc, AveOccup"
1,Ridge Regression,0.575816,0.555855,"MedInc, AveOccup"
2,Lasso Regression,-0.000219,1.310696,"MedInc, AveOccup"
3,Decision Tree,0.613767,0.506123,"MedInc, AveOccup"
4,Random Forest,0.805229,0.25523,"MedInc, AveOccup"
