In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV

In [2]:
# Read the datast from the (.csv) file
df = pd.read_csv('/kaggle/input/california-houses/California_Houses.csv')

# Get a data frame with selected columns for Features (x)
X = df[['Median_Income', 'Median_Age', 'Tot_Rooms',
       'Tot_Bedrooms', 'Population', 'Households', 'Latitude', 'Longitude',
       'Distance_to_coast', 'Distance_to_LA', 'Distance_to_SanDiego',
       'Distance_to_SanJose', 'Distance_to_SanFrancisco']]
# Get a data frame with selected column for Target (y)
y = df['Median_House_Value'] 

# Split data into training (70%) and remaining 30% (for validation and test)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3)

# Split the remaining 30% into validation (50%) and testing (50%) sets
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5)

# Now, you can check the shapes of your splits
print("Training set shape:", X_train.shape)
print("Validation set shape:", X_val.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (14448, 13)
Validation set shape: (3096, 13)
Testing set shape: (3096, 13)


In [3]:
print(">> Linear Regression <<")

# Creating a Linear Regression Model
model = LinearRegression() 

# Train the model using the Training Dataset
model.fit(X_train, y_train)

# Predict the outputs of Test set using the fitted Linear Regression model
y_test_pred = model.predict(X_test)

# Calculate MSE and MAE of the model across the Test Dataset
test_mse = mean_squared_error(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
print("Mean Absolute Error on Testing Set:", test_mae)
print("The Mean Squared Error on Testing Set:", test_mse)
print("-------------------------------------------------------")

# Predict the outputs of Validation set using the fitted Linear Regression model
y_val_pred = model.predict(X_val)

# Calculate MSE and MAE of the model across the Test Dataset
val_mse = mean_squared_error(y_val, y_val_pred)
val_mae = mean_absolute_error(y_val, y_val_pred)
print("Mean Absolute Error on Validation Set:", val_mae)
print("The Mean Squared Error on Validation Set:", val_mse)


>> Linear Regression <<
Mean Absolute Error on Testing Set: 50697.46493298002
The Mean Squared Error on Testing Set: 5066777677.918302
-------------------------------------------------------
Mean Absolute Error on Validation Set: 50254.56554611642
The Mean Squared Error on Validation Set: 4804130437.78863


In [4]:
print(">> Lasso Regression <<")

# Creating a Lasso Regression Model
lasso_model = Lasso(alpha=3, max_iter=5000)

# Train the model using the Training Dataset
lasso_model.fit(X_train, y_train)

# Predict the outputs of Test dataset using the fitted Lasso Regression model
y_test_pred_lasso = lasso_model.predict(X_test)

# Calculate MSE and MAE of the model across the Test Dataset
test_mse_lasso = mean_squared_error(y_test, y_test_pred_lasso)
test_mae_lasso = mean_absolute_error(y_test, y_test_pred_lasso)
print("Mean Absolute Error on Testing Set:", test_mae_lasso)
print("The Mean Squared Error on Testing Set:", test_mse_lasso)
print("-------------------------------------------------------")

# Predict the outputs of Validation set using the fitted Lasso Regression model
y_val_pred_lasso = lasso_model.predict(X_val)

# Calculate MSE and MAE of the model across the Test Dataset
val_mse_lasso = mean_squared_error(y_val, y_val_pred_lasso)
val_mae_lasso = mean_absolute_error(y_val, y_val_pred_lasso)
print("Mean Absolute Error on Validation Set:", val_mae_lasso)
print("The Mean Squared Error on Validation Set:", val_mse_lasso)

>> Lasso Regression <<
Mean Absolute Error on Testing Set: 50698.40893616802
The Mean Squared Error on Testing Set: 5066775203.882901
-------------------------------------------------------
Mean Absolute Error on Validation Set: 50254.70386752037
The Mean Squared Error on Validation Set: 4804138583.034417


In [5]:
print(">> Ridge Regression <<")

# Define a range of alpha values to test
alpha_values = [0.01, 0.1, 1, 5, 10, 100]

# Initialize RidgeCV with the alpha values
ridge_cv = RidgeCV(alphas=alpha_values, store_cv_values=True)  # store_cv_values=True saves cross-validation values
ridge_cv.fit(X_train, y_train)

# Get the best alpha value  --> hyperparameter tuning with built-in cross-validation instead of
best_alpha = ridge_cv.alpha_            # manually tuning using validation set
print(f"Best alpha: {best_alpha}")

y_test_pred_ridge = ridge_cv.predict(X_test)

# Calculate the Mean Absolute Error (MSA) and the Mean Squared Error (MSE) on the test set
test_mse_ridge = mean_squared_error(y_test, y_test_pred_ridge)
test_mae_ridge = mean_absolute_error(y_test, y_test_pred_ridge)
print("Mean Absolute Error on Testing Set:", test_mae_ridge)
print("The Mean Squared Error on Testing Set:", test_mse_ridge)
print("-------------------------------------------------------")

# Predict the outputs of the Validation set using the Ridge Regression model
y_val_pred_ridge = ridge_cv.predict(X_val)

# Calculate MSE and MAE of the model across the Validation Dataset
val_mse_ridge = mean_squared_error(y_val, y_val_pred_ridge)
val_mae_ridge= mean_absolute_error(y_val, y_val_pred_ridge)
print("Mean Absolute Error on Validation Set:", val_mae_ridge)
print("The Mean Squared Error on Validation Set:", val_mse_ridge)

>> Ridge Regression <<
Best alpha: 5.0
Mean Absolute Error on Testing Set: 50752.36453271187
The Mean Squared Error on Testing Set: 5132429367.847173
-------------------------------------------------------
Mean Absolute Error on Validation Set: 50602.91857372461
The Mean Squared Error on Validation Set: 4888481690.370817


> The difference between linear, lasso, and ridge models :

*linear Model :*
* The model can sometimes cause overfit if the model is not trained well.
* Can be overly sensitive to noise and outliers.
*  The linear model can't perform well if the number of features is more than the number
of samples, however, lasso and ridge can.

*Ridge Model :*
* The model won't overfit on the data and will result in better results than linear Regression, 
because adding (alpha) hyperparameter scales the overfitted equation to be slightly different.
It may seem that it caused a relatively bigger cost function in the training set,
However, in the test and validation set the classification will be better and have less error (loss). 
*  when the number of features is more than the number of samples, the ridge model can calculate 
a valid equation for all the features using l2 regularization.
> Doesn't explicitly select features but reduces their impact.

*Lasso Model :*
* It is the same as the ridge model, however differing the eqn beside the scale
factor alpha, it minimizes w (scale/coefficient) of each feature until it reaches zero.
This can be useful in adding a lot of redundant features and not related or useful
for the training but if all the features are related and useful,
Ridge regression will give better results when using l1 regularization.

*In Summary :*
* Linear Regression: Straightforward but can be sensitive.
* Ridge Regression: Smoother, more robust to noise.
* Lasso Regression: Simpler, focuses on important features.


> This implies to the previous error outputs of our Model, Ridge Regression has the greatest errors,then comes the Lasso Regression, and then the Linear Regression.
> Despite Ridge using L2 Regularization, it's very sensitive to the noise in the dataset,While the Lasso removes highly correlated features which lead to less accurate predictions.