In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso, Ridge

In [2]:
df_airbnb_cost_scikit = pd.read_csv('/Users/melaniealvarez/Documents/Octavo semestres/Data Mining/Pset2/data/ml/airbnb_analysis.csv')
print(df_airbnb_cost_scikit.head())

         id  log_price property_type  amenities  accommodates  cleaning_fee  \
0   6901257   5.010635     Apartment          1             3             1   
1   6304928   5.129899     Apartment          1             7             1   
2   7919400   4.976734     Apartment          1             5             1   
3  13418779   6.620073         House          1             4             1   
4   3808709   4.744932     Apartment          1             2             1   

   review_scores_rating  bedrooms  room_Entire home/apt  room_Private room  \
0            100.000000       1.0                     1                  0   
1             93.000000       3.0                     1                  0   
2             92.000000       1.0                     1                  0   
3             94.067365       2.0                     1                  0   
4             40.000000       1.0                     1                  0   

   room_Shared room  city_Boston  city_Chicago  city_DC 

In [3]:
# Seleccionar las características numéricas relevantes
features = ['amenities', 'accommodates', 'cleaning_fee', 'review_scores_rating', 'bedrooms', 'room_Entire home/apt', 'room_Private room', 'room_Shared room', 'city_Boston', 'city_Chicago', 'city_DC', 'city_LA', 'city_NYC', 'city_SF', 'property_type_encoded']
target = "log_price"

# Extraer X (características) y y (objetivo)
X = df_airbnb_cost_scikit[features].values
y = df_airbnb_cost_scikit[target].values.reshape(-1, 1)

# Dividir en conjunto de entrenamiento (80%) y prueba (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Regresión lineal

In [4]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Crear y entrenar el modelo de regresión lineal
model = LinearRegression()
model.fit(X_train, y_train)

# Hacer predicciones en el conjunto de prueba
y_pred = model.predict(X_test)

# Mostrar las primeras predicciones
print("Predicciones del modelo para X_test:")
print(y_pred[:5])

# Mostrar coeficientes del modelo
print("Coeficientes del modelo:")
print(model.coef_)
print("Intercepto:", model.intercept_)

Predicciones del modelo para X_test:
[[4.05638325]
 [4.80144157]
 [5.26397838]
 [5.10209026]
 [4.22979176]]
Coeficientes del modelo:
[[-2.15443900e-02  1.53803933e-01 -2.92305984e-02  4.34131481e-02
   1.53420109e-01 -3.23459023e+10 -3.20737690e+10 -1.09637956e+10
  -1.17043320e+10 -1.21721334e+10 -1.47733847e+10 -2.55055030e+10
  -2.75403771e+10 -1.55612137e+10  7.45864436e-03]]
Intercepto: [4.78068929]


Regresión Polinomial

In [5]:
degree = 2 
pipeline = Pipeline([
    ('poly_features', PolynomialFeatures(degree=degree, include_bias=False)),
    ('scaler', StandardScaler()),
    ('lin_reg', LinearRegression())
])

# Entrenar el modelo
pipeline.fit(X_train, y_train)

# Hacer predicciones en el conjunto de prueba
y_pred = pipeline.predict(X_test)

# Mostrar las primeras predicciones
print("Predicciones del modelo para X_test:")
print(y_pred[:5])

# Obtener coeficientes del modelo
print("Coeficientes del modelo de regresión polinomial:")
print(pipeline.named_steps['lin_reg'].coef_)
print("Intercepto:", pipeline.named_steps['lin_reg'].intercept_)

Predicciones del modelo para X_test:
[[4.11370621]
 [4.73187348]
 [5.29038522]
 [5.08583161]
 [4.27209244]]
Coeficientes del modelo de regresión polinomial:
[[ 1.11246834e+09  2.13628531e-01 -2.50587786e+09  6.52477741e-02
   5.33054654e+10 -5.71824857e+10 -9.54223097e+09  1.75126126e+11
  -8.76542413e+10 -4.44993368e+10  4.95856787e+10  6.12555972e+11
  -4.67848438e+10  3.40137796e+10 -1.56229176e-02  1.11246834e+09
  -1.30716534e-02  6.01447513e-03 -2.36751395e-03  3.53242282e-03
   3.72877786e+10  3.70003988e+10  2.40600117e+10 -3.16620050e+10
  -2.28000795e+10 -3.26381068e+10 -9.59827827e+10 -8.64944976e+10
  -4.88683806e+10  6.60676509e-04 -4.86231202e-02 -2.49229889e-02
   1.43890843e-02  1.52740479e-02 -4.13707041e+10 -4.01737796e+10
  -1.77533378e+10  9.71869304e+08  1.21607843e+09  1.26022000e+09
   2.26416916e+09  2.18670984e+09  1.18876876e+09 -4.80651855e-04
  -2.50587786e+09 -1.52587891e-05  1.25427246e-02  5.79635492e+10
   5.82479359e+10  2.40007062e+10  1.64665466e+10  

Regresión lineal con SVD

In [6]:
svd = TruncatedSVD(n_components=min(X_train.shape) - 1)
X_train_svd = svd.fit_transform(X_train)
X_test_svd = svd.transform(X_test)

# Crear y entrenar el modelo de regresión lineal
model = LinearRegression()
model.fit(X_train_svd, y_train)

# Hacer predicciones en el conjunto de prueba
y_pred = model.predict(X_test_svd)

# Mostrar las primeras predicciones
print("Predicciones del modelo para X_test:")
print(y_pred[:5])

# Mostrar coeficientes del modelo
print("Coeficientes del modelo de regresión lineal (SVD):")
print(model.coef_)
print("Intercepto:", model.intercept_)

Predicciones del modelo para X_test:
[[4.05982771]
 [4.80113172]
 [5.26166367]
 [5.10020241]
 [4.23211577]]
Coeficientes del modelo de regresión lineal (SVD):
[[ 2.77443332e-01  9.13235947e-02  9.51801175e-02 -3.11203131e-02
   2.08718740e-02  1.01025477e-01 -1.61346983e-02  6.02140122e-02
  -8.23846734e-02 -2.24398837e-03 -7.02731442e-02  2.82470125e-03
   1.39821259e-03 -3.22554976e-18]]
Intercepto: [4.78053776]


Regresión lineal con Batch Gradient Descent

In [7]:
model = SGDRegressor(max_iter=1000, tol=1e-3, eta0=0.01, learning_rate='constant', random_state=42)
model.fit(X_train, y_train)

# Hacer predicciones en el conjunto de prueba
y_pred = model.predict(X_test)

# Mostrar las primeras predicciones
print("Predicciones del modelo para X_test:")
print(y_pred[:5])

# Mostrar coeficientes del modelo
print("Coeficientes del modelo de regresión lineal (Batch Gradient Descent - SGDRegressor):")
print(model.coef_)
print("Intercepto:", model.intercept_)

Predicciones del modelo para X_test:
[4.40823185 4.86901416 5.13296564 4.90854316 4.33059187]
Coeficientes del modelo de regresión lineal (Batch Gradient Descent - SGDRegressor):
[-0.08260537  0.10059389 -0.00571732  0.08870124  0.15205502  0.1564792
 -0.12010325 -0.11029911 -0.01049814 -0.00185429  0.0629313  -0.05735112
 -0.01970832  0.0784823   0.01029135]
Intercepto: [4.78664183]


  y = column_or_1d(y, warn=True)


Regresión lineal con Stochastic Gradient Descent

In [8]:
model = SGDRegressor(max_iter=1000, tol=1e-3, eta0=0.01, learning_rate='constant', random_state=42)
model.fit(X_train, y_train)

# Hacer predicciones en el conjunto de prueba
y_pred = model.predict(X_test)

# Mostrar las primeras predicciones
print("Predicciones del modelo para X_test:")
print(y_pred[:5])

# Mostrar coeficientes del modelo
print("Coeficientes del modelo de regresión lineal (Stochastic Gradient Descent - SGDRegressor):")
print(model.coef_)
print("Intercepto:", model.intercept_)

Predicciones del modelo para X_test:
[4.40823185 4.86901416 5.13296564 4.90854316 4.33059187]
Coeficientes del modelo de regresión lineal (Stochastic Gradient Descent - SGDRegressor):
[-0.08260537  0.10059389 -0.00571732  0.08870124  0.15205502  0.1564792
 -0.12010325 -0.11029911 -0.01049814 -0.00185429  0.0629313  -0.05735112
 -0.01970832  0.0784823   0.01029135]
Intercepto: [4.78664183]


  y = column_or_1d(y, warn=True)


LASSO Regression

In [9]:
alpha = 0.1  # Parámetro de regularización
model = Lasso(alpha=alpha)
model.fit(X_train, y_train)

# Hacer predicciones en el conjunto de prueba
y_pred = model.predict(X_test)

# Mostrar las primeras predicciones
print("Predicciones del modelo para X_test:")
print(y_pred[:5])

# Mostrar coeficientes del modelo
print("Coeficientes del modelo de Lasso Regression:")
print(model.coef_)
print("Intercepto:", model.intercept_)

Predicciones del modelo para X_test:
[4.40112651 4.87816274 5.16809628 5.07544697 4.40112651]
Coeficientes del modelo de Lasso Regression:
[-0.          0.10621259 -0.          0.          0.11147041  0.23700753
 -0.         -0.          0.         -0.          0.         -0.
  0.          0.01103678  0.        ]
Intercepto: [4.78053776]


Ridge Regression

In [10]:
alpha = 0.1  # Parámetro de regularización
model = Ridge(alpha=alpha)
model.fit(X_train, y_train)

# Hacer predicciones en el conjunto de prueba
y_pred = model.predict(X_test)

# Mostrar las primeras predicciones
print("Predicciones del modelo para X_test:")
print(y_pred[:5])

# Mostrar coeficientes del modelo
print("Coeficientes del modelo de Ridge Regression:")
print(model.coef_)
print("Intercepto:", model.intercept_)


Predicciones del modelo para X_test:
[[4.05982855]
 [4.80113182]
 [5.26166338]
 [5.10020227]
 [4.23211622]]
Coeficientes del modelo de Ridge Regression:
[[-0.02174653  0.15446751 -0.02942306  0.04329897  0.1533988   0.16328499
  -0.12565636 -0.11413265  0.00841774 -0.05821332  0.01786913 -0.05151589
   0.00668627  0.09484255  0.00744334]]
Intercepto: [4.78053776]
