Regresión Lineal con ecuación normal

In [28]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np

In [29]:
df_airbnb_cost = pd.read_csv('/Users/melaniealvarez/Documents/Octavo semestres/Data Mining/Pset2/data/ml/airbnb_analysis.csv')
print(df_airbnb_cost.head())

         id  log_price property_type  amenities  accommodates  cleaning_fee  \
0   6901257   5.010635     Apartment          1             3             1   
1   6304928   5.129899     Apartment          1             7             1   
2   7919400   4.976734     Apartment          1             5             1   
3  13418779   6.620073         House          1             4             1   
4   3808709   4.744932     Apartment          1             2             1   

   review_scores_rating  bedrooms  room_Entire home/apt  room_Private room  \
0            100.000000       1.0                     1                  0   
1             93.000000       3.0                     1                  0   
2             92.000000       1.0                     1                  0   
3             94.067365       2.0                     1                  0   
4             40.000000       1.0                     1                  0   

   room_Shared room  city_Boston  city_Chicago  city_DC 

In [30]:
# Seleccionar las características numéricas relevantes
features = ['amenities', 'accommodates', 'cleaning_fee', 'review_scores_rating', 'bedrooms', 'room_Entire home/apt', 'room_Private room', 'room_Shared room', 'city_Boston', 'city_Chicago', 'city_DC', 'city_LA', 'city_NYC', 'city_SF', 'property_type_encoded']
target = "log_price"

# Extraer X (características) y y (objetivo)
X = df_airbnb_cost[features].values
y = df_airbnb_cost[target].values.reshape(-1, 1)

# Dividir en conjunto de entrenamiento (80%) y prueba (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Regresión lineal

In [31]:
# Normalizar las características
X_mean = np.mean(X_train, axis=0)
X_std = np.std(X_train, axis=0)
X_train = (X_train - X_mean) / X_std
X_test = (X_test - X_mean) / X_std

# Agregar una columna de unos para el término de sesgo (intercepto)
X_train_b = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
X_test_b = np.hstack([np.ones((X_test.shape[0], 1)), X_test])

# Calcular los coeficientes de regresión lineal manualmente
theta_best = np.linalg.inv(X_train_b.T @ X_train_b) @ X_train_b.T @ y_train

# Hacer predicciones en el conjunto de prueba
y_pred = X_test_b.dot(theta_best)

# Mostrar las primeras predicciones
print("Predicciones del modelo para X_test:")
print(y_pred[:10])


Predicciones del modelo para X_test:
[[4.06256958]
 [4.80742308]
 [5.25981763]
 [5.0924625 ]
 [4.19475668]
 [5.38112051]
 [5.32502222]
 [4.84735142]
 [4.28072067]
 [4.86449846]]


Regresión lineal con SVD

In [32]:
U, S, Vt = np.linalg.svd(X_train_b, full_matrices=False)
S_inv = np.diag(1 / S)
theta_best = Vt.T @ S_inv @ U.T @ y_train

# Hacer predicciones en el conjunto de prueba
y_pred = X_test_b.dot(theta_best)

# Mostrar las primeras predicciones
print("Predicciones del modelo para X_test:")
print(y_pred[:5])

# Mostrar coeficientes del modelo
print("Coeficientes del modelo de regresión lineal (SVD):")
print(theta_best)

Predicciones del modelo para X_test:
[[4.06623077]
 [4.79454803]
 [5.25954628]
 [5.09811783]
 [4.23458481]]
Coeficientes del modelo de regresión lineal (SVD):
[[ 4.78053832e+00]
 [-2.17614300e-02]
 [ 1.54520700e-01]
 [-2.93671558e-02]
 [ 4.33034586e-02]
 [ 1.53390419e-01]
 [ 1.86645508e+10]
 [ 1.85075218e+10]
 [ 6.32643723e+09]
 [-1.55553587e+08]
 [-1.61770790e+08]
 [-1.96342090e+08]
 [-3.38974706e+08]
 [-3.66018708e+08]
 [-2.06812539e+08]
 [ 7.44376837e-03]]


Regresión Polinomial

In [39]:
degree = 2  # Por ajuste manual, se determinó que el grado 2 es el mejor
poly = PolynomialFeatures(degree=degree, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Normalizar las características
scaler = StandardScaler()
X_train_poly = scaler.fit_transform(X_train_poly)
X_test_poly = scaler.transform(X_test_poly)

X_train_b = np.hstack([np.ones((X_train_poly.shape[0], 1)), X_train_poly])
X_test_b = np.hstack([np.ones((X_test_poly.shape[0], 1)), X_test_poly])

theta_best_poly = np.linalg.inv(X_train_b.T @ X_train_b) @ X_train_b.T @ y_train

# Hacer predicciones en el conjunto de prueba
y_pred = X_test_b.dot(theta_best_poly)

# Mostrar las primeras predicciones
print("Predicciones del modelo para X_test:")
print(y_pred[:5])

# Mostrar coeficientes del modelo
print("Coeficientes del modelo de regresión polinomial:")
print(theta_best_poly)


Predicciones del modelo para X_test:
[[ 4.0625    ]
 [15.9375    ]
 [-3.578125  ]
 [-2.796875  ]
 [ 7.63671875]]
Coeficientes del modelo de regresión polinomial:
[[-1.01229593e+02]
 [-1.92082194e+13]
 [ 1.28554143e-01]
 [-4.52291370e+13]
 [ 4.50063865e-02]
 [ 1.24778987e+12]
 [-1.56011104e+13]
 [-2.23500411e+13]
 [ 2.47694536e+12]
 [ 1.48630277e+12]
 [ 1.15965525e+13]
 [-1.10310324e+13]
 [-2.51950575e+13]
 [ 2.35809300e+12]
 [-2.84991072e+11]
 [-6.63416653e-03]
 [-1.92082194e+13]
 [-3.52302932e-02]
 [-1.57677962e-01]
 [-2.34544270e-03]
 [ 7.50517636e-02]
 [ 3.21343329e+13]
 [ 3.18866711e+13]
 [ 2.07347408e+13]
 [-6.02979062e+12]
 [-4.34210358e+12]
 [-6.21568185e+12]
 [-1.82791988e+13]
 [-1.64722263e+13]
 [-9.30661537e+12]
 [ 1.75836534e-02]
 [ 2.09836273e-01]
 [-2.46285504e-02]
 [ 3.24028176e-02]
 [-3.19636401e-01]
 [-1.40613425e+13]
 [-1.36545241e+13]
 [-6.03411929e+12]
 [-1.96609779e+12]
 [-2.46013440e+12]
 [-2.54943308e+12]
 [-4.58042863e+12]
 [-4.42372794e+12]
 [-2.40488677e+12]
 [

Regresión lineal con Batch Gradient Descent

In [34]:
eta = 0.1  # Tasa de aprendizaje
n_iterations = 1000  # Número de iteraciones
m = X_train_b.shape[0]  # Número de ejemplos de entrenamiento

# Inicializar los coeficientes de manera aleatoria
theta = np.random.randn(X_train_b.shape[1], 1)

# Gradient Descent Loop
for iteration in range(n_iterations):
    gradients = (2/m) * X_train_b.T @ (X_train_b @ theta - y_train)
    theta -= eta * gradients

# Hacer predicciones en el conjunto de prueba
y_pred = X_test_b @ theta

# Mostrar las primeras predicciones
print("Predicciones del modelo para X_test:")
print(y_pred[:5])

# Mostrar coeficientes del modelo
print("Coeficientes del modelo de regresión lineal (Batch Gradient Descent):")
print(theta)

Predicciones del modelo para X_test:
[[4.12132681]
 [4.72708976]
 [5.29495638]
 [5.09133335]
 [4.26978567]]
Coeficientes del modelo de regresión lineal (Batch Gradient Descent):
[[ 4.78053776e+00]
 [-2.11216011e-01]
 [ 2.12902359e-01]
 [ 4.04940805e-01]
 [ 6.27089879e-02]
 [ 2.38158035e-01]
 [ 7.23386710e-01]
 [-3.49583593e-01]
 [-2.72554549e-01]
 [-4.78648558e-01]
 [-1.13419414e+00]
 [ 5.17766002e-01]
 [ 5.82576724e-01]
 [-3.14378808e-01]
 [-7.17218349e-02]
 [-1.73565421e-02]
 [-1.91276957e-01]
 [-1.10746086e-02]
 [ 6.73377844e-03]
 [-2.83624046e-03]
 [ 3.21179373e-03]
 [ 6.70368584e-01]
 [ 6.59159320e-01]
 [ 4.25222242e-01]
 [-6.62564953e-02]
 [-4.79474284e-02]
 [-6.56103285e-02]
 [-1.96345587e-01]
 [-1.82739523e-01]
 [-1.02101681e-01]
 [ 1.30387932e-03]
 [-4.89539901e-02]
 [-2.57058325e-02]
 [ 1.72383987e-02]
 [ 1.67050174e-02]
 [-1.16134936e+00]
 [-1.09638404e+00]
 [-5.13055866e-01]
 [-1.16224543e-02]
 [-1.27320362e-02]
 [ 1.08368392e-03]
 [-2.26358430e-02]
 [-1.16257876e-02]
 [ 9.

Stochastic Gradient Descent

In [35]:
eta = 0.0001  # Tasa de aprendizaje
n_iterations = 3  # Número de iteraciones
m = X_train_b.shape[0]  # Número de ejemplos de entrenamiento

# Inicializar los coeficientes de manera aleatoria
theta = np.random.randn(X_train_b.shape[1], 1)

# Stochastic Gradient Descent Loop
for iteration in range(n_iterations):
    for i in range(m):
        random_index = np.random.randint(m)  # Seleccionar un índice aleatorio
        xi = X_train_b[random_index:random_index+1]  # Seleccionar una muestra aleatoria
        yi = y_train[random_index:random_index+1]  # Seleccionar la salida correspondiente
        gradients = 2 * xi.T @ (xi @ theta - yi)  # Calcular el gradiente con una muestra
        theta -= eta * gradients  # Actualizar parámetros

# Hacer predicciones en el conjunto de prueba
y_pred = X_test_b @ theta

# Mostrar las primeras predicciones
print("Predicciones del modelo para X_test:")
print(y_pred[:5])

# Mostrar coeficientes del modelo
print("Coeficientes del modelo de regresión lineal (Stochastic Gradient Descent):")
print(theta)


Predicciones del modelo para X_test:
[[4.15330586]
 [4.73424803]
 [5.31316121]
 [5.11348414]
 [4.25436415]]
Coeficientes del modelo de regresión lineal (Stochastic Gradient Descent):
[[ 4.77930914e+00]
 [ 3.02127435e-01]
 [ 2.12742894e-01]
 [-3.27286756e-01]
 [ 7.07623559e-02]
 [-1.41156184e-01]
 [ 1.18716229e+00]
 [ 1.73607248e+00]
 [-4.51520206e-01]
 [-1.96370515e-01]
 [ 7.81655629e-01]
 [ 5.26264406e-01]
 [ 5.70305022e-01]
 [-8.12039059e-01]
 [-5.68413793e-01]
 [-1.96070536e-02]
 [ 3.34016160e-01]
 [-2.39584609e-02]
 [ 2.08652304e-02]
 [ 2.23959961e-03]
 [-3.12592340e-02]
 [ 8.55059901e-02]
 [ 7.96976919e-02]
 [ 5.66903796e-02]
 [ 2.00397531e-01]
 [ 1.38935447e-01]
 [ 2.24929477e-01]
 [ 6.48801809e-01]
 [ 5.84824792e-01]
 [ 3.20479333e-01]
 [ 5.16134848e-04]
 [-2.06449113e-02]
 [-1.75562766e-02]
 [ 3.18187935e-02]
 [-4.43276656e-02]
 [ 1.18473828e+00]
 [ 1.19615719e+00]
 [ 4.83622533e-01]
 [ 4.60264395e-01]
 [ 5.60788036e-01]
 [ 5.92948583e-01]
 [ 1.05008172e+00]
 [ 1.01711717e+00]


LASSO Regression

In [36]:
eta = 0.01  # Tasa de aprendizaje
alpha = 0.1  # Parámetro de regularización L1
n_iterations = 1000
m, n = X_train_b.shape  # Número de muestras y características

# Inicializar los coeficientes con valores pequeños
theta = np.random.randn(n, 1) * 0.01

# Función de umbralización suave (Soft Thresholding)
def soft_thresholding(x, alpha):
    return np.sign(x) * np.maximum(np.abs(x) - alpha, 0)

# Descenso de gradiente con penalización L1 para Lasso
for iteration in range(n_iterations):
    gradients = (2/m) * X_train_b.T @ (X_train_b @ theta - y_train)  # Gradiente normal
    theta -= eta * gradients  # Actualización de parámetros
    theta[1:] = soft_thresholding(theta[1:], eta * alpha)  # Aplicar umbralización suave a los coeficientes (excepto intercepto)

# Hacer predicciones en el conjunto de prueba
y_pred = X_test_b @ theta

# Mostrar las primeras predicciones
print("Predicciones del modelo para X_test:")
print(y_pred[:5])

# Mostrar coeficientes del modelo
print("Coeficientes del modelo de Lasso Regression (Manual - ISTA):")
print(theta)

Predicciones del modelo para X_test:
[[4.28418545]
 [4.84416195]
 [5.23420625]
 [5.12080695]
 [4.31956714]]
Coeficientes del modelo de Lasso Regression (Manual - ISTA):
[[ 4.78053775]
 [-0.        ]
 [ 0.12532754]
 [-0.        ]
 [ 0.        ]
 [ 0.13473194]
 [ 0.11757756]
 [-0.        ]
 [-0.        ]
 [ 0.        ]
 [-0.        ]
 [ 0.        ]
 [-0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [-0.        ]
 [ 0.        ]
 [ 0.        ]
 [-0.        ]
 [-0.        ]
 [ 0.        ]
 [-0.        ]
 [ 0.        ]
 [ 0.        ]
 [-0.        ]
 [-0.        ]
 [ 0.        ]
 [ 0.        ]
 [-0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [-0.        ]
 [-0.        ]
 [-0.        ]
 [-0.        ]
 [ 0.        ]
 [ 0.        ]
 [-0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [-0.        ]
 [ 0.        ]
 [-0.        ]
 [ 0.        ]
 [ 0.        ]
 [-0.        ]
 [-0.        ]
 [-0.        ]
 [ 0.        ]
 [ 0. 

Ridge Regression

In [37]:
eta = 0.01  # Tasa de aprendizaje
alpha = 0.1  # Parámetro de regularización L2
n_iterations = 1000
m, n = X_train_b.shape  # Número de muestras y características

# Inicializar los coeficientes con valores pequeños
theta = np.random.randn(n, 1) * 0.01

# Descenso de gradiente con penalización L2 para Ridge
for iteration in range(n_iterations):
    gradients = (2/m) * X_train_b.T @ (X_train_b @ theta - y_train) + (2 * alpha * theta)  # Gradiente con penalización L2
    theta -= eta * gradients  # Actualización de parámetros

# Hacer predicciones en el conjunto de prueba
y_pred = X_test_b @ theta

# Mostrar las primeras predicciones
print("Predicciones del modelo para X_test:")
print(y_pred[:5])

# Mostrar coeficientes del modelo
print("Coeficientes del modelo de Ridge Regression (Manual - Gradiente Descendente):")
print(theta)

Predicciones del modelo para X_test:
[[3.70640566]
 [4.30494839]
 [4.83359655]
 [4.66222346]
 [3.84329807]]
Coeficientes del modelo de Ridge Regression (Manual - Gradiente Descendente):
[[ 4.34594341e+00]
 [-9.96245656e-03]
 [ 1.60585756e-01]
 [-1.48725967e-02]
 [ 4.82368742e-02]
 [ 1.13193592e-01]
 [ 7.73349952e-02]
 [-6.33776006e-02]
 [-3.45628020e-02]
 [ 1.47527231e-03]
 [-1.11427198e-02]
 [ 3.54332007e-03]
 [-1.46273518e-02]
 [ 6.06405702e-03]
 [ 2.05162222e-02]
 [-6.82032721e-03]
 [ 8.00170758e-03]
 [-7.78564810e-03]
 [ 6.39664851e-03]
 [-2.47110977e-03]
 [ 2.01315024e-03]
 [ 4.59715562e-03]
 [-5.11260519e-04]
 [-3.25064283e-03]
 [-1.35235902e-03]
 [-1.39496764e-03]
 [ 9.70655102e-04]
 [-4.80151781e-04]
 [-5.70145071e-03]
 [-2.10917506e-03]
 [ 1.18000443e-03]
 [-1.74513671e-02]
 [-1.69411901e-02]
 [ 1.17986805e-02]
 [-1.23297243e-02]
 [-4.91875666e-03]
 [ 1.21381710e-02]
 [-1.51705080e-02]
 [-3.53077672e-03]
 [-2.39369833e-03]
 [ 7.67305498e-03]
 [-2.98503179e-03]
 [ 2.81546438e-0