# Linear Regression, Regularisation and Polynomial Regression

### Evaluation Function

In [51]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd

def evaluate_model(y_true, y_pred, X_test):
    n = X_test.shape[0]  # number of samples
    p = X_test.shape[1]  # number of predictors

    # Since we logged the resale prices during data processing, we exponetiate it back to view the metrics in dollars and not log units
    y_true = np.exp(y_true)
    y_pred = np.exp(y_pred)

    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    adj_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))

    metrics = pd.Series({
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse,
        'R^2': r2,
        'Adjusted R²': adj_r2
    })

    return metrics

### Linear Regression Function

In [62]:
from sklearn.linear_model import LinearRegression
import numpy as np

def linear_regression_eval(X_train, X_test, y_train, y_test):
    # Initialize model
    model = LinearRegression()
    
    # Fit on training data
    model.fit(X_train, y_train)
    
    # Predict on test data
    y_pred = model.predict(X_test)
    
    # Evaluate model performance
    metrics = evaluate_model(y_test, y_pred, X_test)
    print("Linear Regression Performance:")
    print(metrics)

### Lasso Regression Function

In [59]:
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import numpy as np

def lasso_regression_eval(X_train, X_test, y_train, y_test, cv=10, random_state=42):
    # Create pipeline with scaling and LassoCV
    model = make_pipeline(
        StandardScaler(),
        LassoCV(cv=cv, random_state=random_state)
    )
    
    # Fit on training data
    model.fit(X_train, y_train)
    
    # Predict on test data
    y_pred = model.predict(X_test)
    
    # Get best alpha from the LassoCV step
    best_alpha = model.named_steps['lassocv'].alpha_
    
    # Evaluate model performance (assuming evaluate_model is defined)
    metrics = evaluate_model(y_test, y_pred, X_test)
    
    # Print best alpha and evaluation metrics
    print(f"Best alpha: {best_alpha}")
    print("Lasso Regression Performance:")
    print(metrics)

### Ridge Regression Function

In [63]:
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import numpy as np

def ridge_regression_eval(X_train, X_test, y_train, y_test, alphas=None, cv=10):
    # Default alphas if not provided
    if alphas is None:
        alphas = np.logspace(-6, 6, 13)
    
    # Create pipeline with scaling and RidgeCV
    model = make_pipeline(
        StandardScaler(),
        RidgeCV(alphas=alphas, cv=cv)
    )
    
    # Fit on training data
    model.fit(X_train, y_train)
    
    # Predict on test data
    y_pred = model.predict(X_test)
    
    # Get best alpha from RidgeCV step
    best_alpha = model.named_steps['ridgecv'].alpha_
    
    # Evaluate model performance (assuming evaluate_model is defined)
    metrics = evaluate_model(y_test, y_pred, X_test)
    
    # Print best alpha and evaluation metrics
    print(f"Best alpha: {best_alpha}")
    print("Ridge Regression Performance:")
    print(metrics)

### Polynomial Regression Function

In [64]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

def polynomial_regression_eval(X_train, X_test, y_train, y_test, degree=2):
    # Create pipeline with polynomial features and linear regression
    model = make_pipeline(
        PolynomialFeatures(degree=degree, include_bias=False),
        LinearRegression()
    )
    
    # Fit on training data
    model.fit(X_train, y_train)
    
    # Predict on test data
    y_pred = model.predict(X_test)
    
    # Evaluate model performance (assuming evaluate_model is defined)
    metrics = evaluate_model(y_test, y_pred, X_test)
    
    # Print evaluation metrics
    print(f"Polynomial Regression Performance (degree={degree}):")
    print(metrics)

## df_base

In [56]:
import pandas as pd

X_base_train = pd.read_pickle("X_base_train.pkl")
X_base_test = pd.read_pickle("X_base_test.pkl")
y_base_train = pd.read_pickle("y_base_train.pkl")
y_base_test = pd.read_pickle("y_base_test.pkl")

### Linear Regression

In [57]:
linear_regression_eval(X_base_train, X_base_test, y_base_train, y_base_test)

MAE            1.278118e+05
MSE            2.192958e+10
RMSE           1.480864e+05
R^2            3.834172e-01
Adjusted R²    3.828996e-01
dtype: float64

### Lasso (L1 Regularisation)

LassoCV automatically generates 100 default alpha values (the hyperparameter controlling the regularization strength) based on the data. It then applies cross-validation to select the alpha that minimizes the training mean squared error (MSE).

In [65]:
lasso_regression_eval(X_base_train, X_base_test, y_base_train, y_base_test, cv=10, random_state=42)

Best alpha: 0.00021601217412189918
Lasso Regression Performance:
MAE            1.281465e+05
MSE            2.206658e+10
RMSE           1.485482e+05
R^2            3.795652e-01
Adjusted R²    3.790444e-01
dtype: float64


### Ridge (L2 Regularisation)
In contrast, RidgeCV does not automatically determine alpha values from the data. Instead, it defaults to a fixed set of alphas such as [0.1, 1.0, 10.0]. To achieve better model tuning, we specify a custom range of alpha values. Like LassoCV, RidgeCV uses cross-validation to select the alpha that results in the lowest training MSE.

In [66]:
ridge_regression_eval(X_base_train, X_base_test, y_base_train, y_base_test, alphas=None, cv=10)

Best alpha: 1000.0
Ridge Regression Performance:
MAE            1.281840e+05
MSE            2.208519e+10
RMSE           1.486109e+05
R^2            3.790420e-01
Adjusted R²    3.785207e-01
dtype: float64


### Polynomial Regression Degree=2

In [67]:
polynomial_regression_eval(X_base_train, X_base_test, y_base_train, y_base_test, degree=2)

Polynomial Regression Performance (degree=2):
MAE            4.549957e+04
MSE            4.329693e+09
RMSE           6.580040e+04
R^2            8.782643e-01
Adjusted R²    8.781621e-01
dtype: float64


## df_post_covid

In [68]:
X_post_train = pd.read_pickle("X_post_train.pkl")
X_post_test = pd.read_pickle("X_post_test.pkl")
y_post_train = pd.read_pickle("y_post_train.pkl")
y_post_test = pd.read_pickle("y_post_test.pkl")

### Linear Regression

In [69]:
linear_regression_eval(X_post_train, X_post_test, y_post_train, y_post_test)

Linear Regression Performance:
MAE            4.716768e+04
MSE            3.897595e+09
RMSE           6.243072e+04
R^2            8.993742e-01
Adjusted R²    8.991935e-01
dtype: float64


### Lasso (L1 Regularisation)

LassoCV automatically generates 100 default alpha values (the hyperparameter controlling the regularization strength) based on the data. It then applies cross-validation to select the alpha that minimizes the training mean squared error (MSE).

In [70]:
lasso_regression_eval(X_post_train, X_post_test, y_post_train, y_post_test, cv=10, random_state=42)

Best alpha: 0.00021873592047883046
Lasso Regression Performance:
MAE            4.704637e+04
MSE            3.885107e+09
RMSE           6.233063e+04
R^2            8.996966e-01
Adjusted R²    8.995164e-01
dtype: float64


### Ridge (L2 Regularisation)
In contrast, RidgeCV does not automatically determine alpha values from the data. Instead, it defaults to a fixed set of alphas such as [0.1, 1.0, 10.0]. To achieve better model tuning, we specify a custom range of alpha values. Like LassoCV, RidgeCV uses cross-validation to select the alpha that results in the lowest training MSE.

In [71]:
ridge_regression_eval(X_post_train, X_post_test, y_post_train, y_post_test, alphas=None, cv=10)

Best alpha: 100.0
Ridge Regression Performance:
MAE            4.710623e+04
MSE            3.890184e+09
RMSE           6.237134e+04
R^2            8.995655e-01
Adjusted R²    8.993851e-01
dtype: float64


### Polynomial Regresison

In [72]:
polynomial_regression_eval(X_post_train, X_post_test, y_post_train, y_post_test, degree=2)

Polynomial Regression Performance (degree=2):
MAE            4.390866e+04
MSE            3.914648e+09
RMSE           6.256715e+04
R^2            8.989339e-01
Adjusted R²    8.987524e-01
dtype: float64
