In [1]:
import json
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

Importar datos

In [2]:
# Load the Airbnb dataset into a DataFrame.
# The path is relative instead of an absolute /Users/... location so the
# notebook is not tied to one machine.
# NOTE(review): assumes the kernel's working directory is the project's
# `notebooks/` folder (the relative `models/` output directory used later in
# this notebook suggests so) - confirm before running elsewhere.
DATA_PATH = '../data/ml/airbnb_analysis.csv'
df_airbnb_cost = pd.read_csv(DATA_PATH)

División de datos

In [3]:
# Regression target and the predictor columns used to explain it
target = "log_price"
features = [
    'amenities',
    'accommodates',
    'cleaning_fee',
    'review_scores_rating',
    'bedrooms',
    'room_Entire home/apt',
    'room_Private room',
    'room_Shared room',
    'city_Boston',
    'city_Chicago',
    'city_DC',
    'city_LA',
    'city_NYC',
    'city_SF',
    'property_type_encoded',
]

# Feature matrix and target as NumPy arrays; selecting the target with a
# one-element column list keeps it as a single-column 2-D array.
X = df_airbnb_cost[features].to_numpy()
y = df_airbnb_cost[[target]].to_numpy()

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Escalar datos

In [4]:
# Standardize the features: the scaler's statistics are fitted on the training
# split only, then the same transformation is applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

Definir modelos y validación cruzada

In [5]:
# Cross-validation scheme: 5 shuffled folds with a fixed seed, so every model
# is scored on the same splits.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Candidate models, keyed by the display name that is later used in the
# results table and in the exported file names.
models = {
    "Linear Regression": LinearRegression(),
    # Reduce to one component fewer than the smaller dimension of X_train, then
    # fit ordinary least squares. random_state is fixed because TruncatedSVD's
    # default solver is randomized; without it results change between runs.
    "Linear Regression SVD": Pipeline([
        ('svd', TruncatedSVD(n_components=min(X_train.shape) - 1, random_state=42)),
        ('lin_reg', LinearRegression())
    ]),
    # NOTE(review): SGDRegressor updates the weights one sample at a time, so
    # this entry is also stochastic gradient descent (run for up to 1000 epochs
    # with early stopping), not true batch gradient descent. The key is left
    # untouched because it is used for the output file names.
    "Batch Gradient Descent": SGDRegressor(max_iter=1000, tol=1e-3, eta0=0.01, learning_rate='constant', random_state=42),
    # Single pass over the shuffled training data (max_iter=1, no stopping
    # criterion); scikit-learn may emit a ConvergenceWarning for this setting.
    "Stochastic Gradient Descent": SGDRegressor(max_iter=1, tol=None, eta0=0.01, learning_rate='constant', random_state=42, shuffle=True),
    # Degree-2 polynomial expansion, re-standardized before the linear fit.
    "Polynomial Regression": Pipeline([
        ('poly_features', PolynomialFeatures(degree=2, include_bias=False)),
        ('scaler', StandardScaler()),
        ('lin_reg', LinearRegression())
    ]),
    "Ridge Regression": Ridge(alpha=1.0),
    # NOTE(review): the recorded run of this notebook shows R2 ~ 0 for this
    # model, which suggests alpha=1.0 shrinks every coefficient to zero on this
    # target - consider tuning alpha.
    "Lasso Regression": Lasso(alpha=1.0)
}

Evaluación de modelos según métricas

In [6]:
# Metrics computed on every cross-validation fold. scikit-learn negates the
# error scorers ("higher is better"), so their sign is flipped back below.
cv_scoring = {
    "rmse": "neg_root_mean_squared_error",
    "mae": "neg_mean_absolute_error",
    "r2": "r2",
}

# The estimators and metrics expect 1-D targets.
y_train_flat = y_train.ravel()
y_test_flat = y_test.ravel()

# One dict of metrics per model
evaluations = []

for name, model in models.items():
    # A single multi-metric run yields all three scores from the same 5 fits,
    # instead of refitting the model once per metric.
    cv_results = cross_validate(model, X_train, y_train_flat, cv=kf, scoring=cv_scoring)

    # Fit the final model on the full training split and predict the hold-out
    model.fit(X_train, y_train_flat)
    y_pred = model.predict(X_test)

    # Hold-out metrics. RMSE is computed as sqrt(MSE) because the
    # `squared=False` argument of mean_squared_error is deprecated and has been
    # removed from recent scikit-learn releases.
    rmse_test = np.sqrt(mean_squared_error(y_test_flat, y_pred))
    mae_test = mean_absolute_error(y_test_flat, y_pred)
    r2_test = r2_score(y_test_flat, y_pred)

    # Plain floats keep the records directly JSON-serializable.
    evaluations.append({
        "Model": name,
        "RMSE_CV": float(-cv_results["test_rmse"].mean()),
        "MAE_CV": float(-cv_results["test_mae"].mean()),
        "R2_CV": float(cv_results["test_r2"].mean()),
        "RMSE_Test": float(rmse_test),
        "MAE_Test": float(mae_test),
        "R2_Test": float(r2_test)
    })

# Show the results as a table, best hold-out RMSE first
evaluations_df = pd.DataFrame(evaluations)
print(evaluations_df.sort_values(by="RMSE_Test"))

# Pick the best model by cross-validated RMSE. Selecting on the test set would
# leak it into model selection and make the reported test error optimistic; the
# test RMSE of the chosen model is reported only as the final estimate.
best_model = min(evaluations, key=lambda x: x["RMSE_CV"])
print(f"Mejor modelo: {best_model['Model']} con RMSE_CV: {best_model['RMSE_CV']:.4f} (RMSE_Test: {best_model['RMSE_Test']:.4f})")



                         Model   RMSE_CV    MAE_CV     R2_CV  RMSE_Test  \
4        Polynomial Regression  0.479303  0.360298  0.553689   0.477713   
1        Linear Regression SVD  0.489320  0.368183  0.534853   0.488243   
5             Ridge Regression  0.489320  0.368183  0.534853   0.488243   
0            Linear Regression  0.489323  0.368200  0.534849   0.488297   
2       Batch Gradient Descent  0.520300  0.391022  0.473843   0.503889   
3  Stochastic Gradient Descent  0.519211  0.389761  0.476330   0.509075   
6             Lasso Regression  0.717548  0.561747 -0.000148   0.716788   

   MAE_Test   R2_Test  
4  0.360085  0.555774  
1  0.368433  0.535976  
5  0.368433  0.535976  
0  0.368429  0.535873  
2  0.381786  0.505760  
3  0.382426  0.495533  
6  0.560360 -0.000114  
Mejor modelo: Polynomial Regression con RMSE_Test: 0.4777




Exportar PKL

In [7]:
# Create the "models" output folder if it does not exist yet
os.makedirs("models", exist_ok=True)

# Save every model in the `models` dict as models/<lowercase_name_with_underscores>.pkl.
# NOTE: these files are pickles; only load them back from a trusted location.
for name, model in models.items():
    filename = f"models/{name.replace(' ', '_').lower()}.pkl"
    joblib.dump(model, filename)
    print(f"Modelo guardado: {filename}")


Modelo guardado: models/linear_regression.pkl
Modelo guardado: models/linear_regression_svd.pkl
Modelo guardado: models/batch_gradient_descent.pkl
Modelo guardado: models/stochastic_gradient_descent.pkl
Modelo guardado: models/polynomial_regression.pkl
Modelo guardado: models/ridge_regression.pkl
Modelo guardado: models/lasso_regression.pkl


In [8]:
# Write the metrics into the relative "models" folder rather than an absolute
# /Users/... path, so the notebook is not tied to one machine.
metrics_file = os.path.join("models", "metrics.json")

# Make sure the target folder exists even if this cell is run on its own
os.makedirs(os.path.dirname(metrics_file), exist_ok=True)

# Dump the list of per-model metric dicts as indented JSON
with open(metrics_file, "w", encoding="utf-8") as f:
    json.dump(evaluations, f, indent=4)

print(f"Métricas guardadas en {metrics_file}")

Métricas guardadas en /Users/melaniealvarez/Documents/Octavo semestres/Data Mining/Pset2/notebooks/models/metrics.json
