# Regression Models

# Import Libraries

In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from matplotlib import pyplot as plt
from sklearn import datasets
from sklearn import linear_model
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge


# Data Loading & Inspecting the Dataset

In [24]:
# Load the California housing data from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="California_Houses.csv")

In [26]:
# Report the dataset dimensions as (rows, columns).
print((len(df.index), len(df.columns)))

(20640, 14)


In [28]:
# Preview the first five rows to check the column names and value ranges.
print(df.head(n=5))

   Median_House_Value  Median_Income  Median_Age  Tot_Rooms  Tot_Bedrooms  \
0            452600.0         8.3252          41        880           129   
1            358500.0         8.3014          21       7099          1106   
2            352100.0         7.2574          52       1467           190   
3            341300.0         5.6431          52       1274           235   
4            342200.0         3.8462          52       1627           280   

   Population  Households  Latitude  Longitude  Distance_to_coast  \
0         322         126     37.88    -122.23        9263.040773   
1        2401        1138     37.86    -122.22       10225.733072   
2         496         177     37.85    -122.24        8259.085109   
3         558         219     37.85    -122.25        7768.086571   
4         565         259     37.85    -122.25        7768.086571   

   Distance_to_LA  Distance_to_SanDiego  Distance_to_SanJose  \
0   556529.158342         735501.806984         67432.5170

In [30]:
# Show column dtypes and non-null counts. DataFrame.info() writes its report
# to stdout itself and returns None, so wrapping it in print() would append a
# stray "None" line after the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Median_House_Value        20640 non-null  float64
 1   Median_Income             20640 non-null  float64
 2   Median_Age                20640 non-null  int64  
 3   Tot_Rooms                 20640 non-null  int64  
 4   Tot_Bedrooms              20640 non-null  int64  
 5   Population                20640 non-null  int64  
 6   Households                20640 non-null  int64  
 7   Latitude                  20640 non-null  float64
 8   Longitude                 20640 non-null  float64
 9   Distance_to_coast         20640 non-null  float64
 10  Distance_to_LA            20640 non-null  float64
 11  Distance_to_SanDiego      20640 non-null  float64
 12  Distance_to_SanJose       20640 non-null  float64
 13  Distance_to_SanFrancisco  20640 non-null  float64
dtypes: flo

In [32]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
summary_stats = df.describe()
print(summary_stats)

       Median_House_Value  Median_Income    Median_Age     Tot_Rooms  \
count        20640.000000   20640.000000  20640.000000  20640.000000   
mean        206855.816909       3.870671     28.639486   2635.763081   
std         115395.615874       1.899822     12.585558   2181.615252   
min          14999.000000       0.499900      1.000000      2.000000   
25%         119600.000000       2.563400     18.000000   1447.750000   
50%         179700.000000       3.534800     29.000000   2127.000000   
75%         264725.000000       4.743250     37.000000   3148.000000   
max         500001.000000      15.000100     52.000000  39320.000000   

       Tot_Bedrooms    Population    Households      Latitude     Longitude  \
count  20640.000000  20640.000000  20640.000000  20640.000000  20640.000000   
mean     537.898014   1425.476744    499.539680     35.631861   -119.569704   
std      421.247906   1132.462122    382.329753      2.135952      2.003532   
min        1.000000      3.000000  

In [34]:
# Count missing values per column, then fill any gaps with that column's mean.
missing_per_column = df.isnull().sum()
print(missing_per_column)
df.fillna(value=df.mean(), inplace=True)

Median_House_Value          0
Median_Income               0
Median_Age                  0
Tot_Rooms                   0
Tot_Bedrooms                0
Population                  0
Households                  0
Latitude                    0
Longitude                   0
Distance_to_coast           0
Distance_to_LA              0
Distance_to_SanDiego        0
Distance_to_SanJose         0
Distance_to_SanFrancisco    0
dtype: int64


In [36]:
# Re-check the per-column missing-value counts after the mean imputation
# performed in the previous cell.
print(df.isnull().sum())
# The imputation has already been applied above, so verify its outcome here
# instead of repeating the same in-place mutation a second time.
assert not df.isnull().values.any(), "unexpected missing values remain after imputation"

Median_House_Value          0
Median_Income               0
Median_Age                  0
Tot_Rooms                   0
Tot_Bedrooms                0
Population                  0
Households                  0
Latitude                    0
Longitude                   0
Distance_to_coast           0
Distance_to_LA              0
Distance_to_SanDiego        0
Distance_to_SanJose         0
Distance_to_SanFrancisco    0
dtype: int64


## Feature Scaling

In [98]:
# NOTE(review): this cell reads X_train / X_val / X_test, which are created in
# the "Data Splitting" cell further down; on a fresh kernel that cell has to
# be executed before this one.

# Standardize every split with the mean and standard deviation of the
# training split only, so no validation/test statistics enter preprocessing.
X_mean = np.mean(X_train, axis=0)
X_std = np.std(X_train, axis=0)
# A constant column has zero standard deviation; divide by 1 there so the
# scaled column becomes all zeros instead of NaN/inf.
X_std = np.where(X_std == 0, 1.0, X_std)
X_train_scaled = (X_train - X_mean) / X_std
X_val_scaled = (X_val - X_mean) / X_std
X_test_scaled = (X_test - X_mean) / X_std

# Data Splitting (Training, Validation, Test)

In [100]:
# The target is the median house value; every other column is a feature.
y = df["Median_House_Value"].to_numpy()
X = df.drop(columns="Median_House_Value").to_numpy()

# First hold out 30% of the rows, then cut that hold-out in half so the final
# proportions are 70% train / 15% validation / 15% test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

print("Train size:", len(X_train))
print("Validation size:", len(X_val))
print("Test size:", len(X_test))

Train size: 14448
Validation size: 3096
Test size: 3096


In [102]:
# Display the (rows, features) shape of the train, validation and test splits.
tuple(split.shape for split in (X_train, X_val, X_test))

((14448, 13), (3096, 13), (3096, 13))

# Part 1: Regression Models From Scratch

### Implementation using the Normal Equation

In [236]:
# Prepend a column of ones to each split so that the first weight acts as the
# intercept (bias) term.
X_train_b = np.c_[np.ones((X_train.shape[0], 1)), X_train]
X_val_b = np.c_[np.ones((X_val.shape[0], 1)), X_val]
X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]

# Calculate the weights (w) from the normal equation (X^T X) w = X^T y.
# The linear system is solved directly rather than through an explicit matrix
# inverse: this is cheaper and numerically more accurate, which matters here
# because the unscaled features make X^T X likely to be poorly conditioned.
w = np.linalg.solve(X_train_b.T.dot(X_train_b), X_train_b.T.dot(y_train))

# Predict on the validation split.
y_val_pred = X_val_b.dot(w)

print(y_val_pred)

[117112.59499985 332653.97680117 406311.5956708  ... 254464.49377846
 142097.9688237  205083.4967788 ]


### Mean Squared Error (MSE)

In [239]:
def mean_squared_error_manual(y_true, y_pred):
    """Return the mean squared error between targets and predictions.

    Both inputs are flattened to 1-D before they are subtracted. Without
    that, a target vector of shape (n,) and a prediction column of shape
    (n, 1) would silently broadcast to an (n, n) matrix and produce a
    wrong score.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values; must hold the same number of elements as
        ``y_true`` (or a single value that is broadcast).

    Returns
    -------
    float
        Mean of the squared element-wise differences.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    errors = y_true - y_pred
    squared_errors = errors ** 2
    mse = np.mean(squared_errors)
    return mse

### Mean Absolute Error (MAE)

In [241]:
def mean_absolute_error_manual(y_true, y_pred):
    """Return the mean absolute error between targets and predictions.

    Both inputs are flattened to 1-D before they are subtracted. Without
    that, a target vector of shape (n,) and a prediction column of shape
    (n, 1) would silently broadcast to an (n, n) matrix and produce a
    wrong score.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values; must hold the same number of elements as
        ``y_true`` (or a single value that is broadcast).

    Returns
    -------
    float
        Mean of the absolute element-wise differences.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    errors = np.abs(y_true - y_pred)
    mae = np.mean(errors)
    return mae

### Implementation using Gradient Descent

In [287]:
def gradient_descent(X, y, learning_rate=0.0001, iterations=10000, random_state=None):
    """Fit linear-regression weights with batch gradient descent on the MSE.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix WITHOUT a bias column; a column of ones is prepended
        internally.
    y : array-like with n_samples elements
        Target values.
    learning_rate : float, default=0.0001
        Step size applied to the gradient at every iteration.
    iterations : int, default=10000
        Number of full-batch updates to perform.
    random_state : int or None, default=None
        Seed for the random weight initialisation. ``None`` keeps the
        previous behaviour of drawing from NumPy's global random state;
        passing an integer makes the returned weights reproducible.

    Returns
    -------
    numpy.ndarray of shape (n_features + 1, 1)
        Learned weights as a column vector; the first entry is the intercept.
    """
    n = len(y)
    X_b = np.c_[np.ones((n, 1)), X]
    if random_state is None:
        w = np.random.randn(X_b.shape[1], 1)
    else:
        w = np.random.RandomState(random_state).randn(X_b.shape[1], 1)
    # Column vector so that (X_b @ w - y) stays (n, 1) and does not broadcast.
    y = np.array(y).reshape(-1, 1)

    for _ in range(iterations):
        # Gradient of the mean squared error with respect to w.
        gradients = 2 / n * X_b.T.dot(X_b.dot(w) - y)
        w -= learning_rate * gradients
    return w

# Fit with the default hyper-parameters on the standardized training data,
# then predict the validation split (bias column prepended first).
w_gd = gradient_descent(X_train_scaled, y_train)
X_val_design = np.hstack([np.ones((X_val.shape[0], 1)), X_val_scaled])
y_val_pred_gd = X_val_design @ w_gd

print(y_val_pred_gd)

[[112580.39770045]
 [308457.5409349 ]
 [348111.14897513]
 ...
 [195161.87111607]
 [129398.98640994]
 [189377.31397506]]


In [288]:
# Try several learning rates and keep the one with the lowest validation MSE.
learning_rates = [0.00001, 0.0001, 0.001, 0.01, 0.1]
results = []
fitted_by_lr = {}  # learning rate -> (weights, validation predictions)

# The validation design matrix does not depend on the learning rate, so build
# it once outside the loop.
X_val_design = np.c_[np.ones((X_val.shape[0], 1)), X_val_scaled]

for lr in learning_rates:
    w_candidate = gradient_descent(X_train_scaled, y_train, learning_rate=lr, iterations=10000)
    # gradient_descent returns a column vector, so the product is (n, 1);
    # flatten it to match the 1-D y_val and avoid accidental broadcasting in
    # element-wise metric code later on.
    val_pred_candidate = X_val_design.dot(w_candidate).ravel()

    mse = mean_squared_error(y_val, val_pred_candidate)
    mae = mean_absolute_error(y_val, val_pred_candidate)

    results.append((lr, mse, mae))
    fitted_by_lr[lr] = (w_candidate, val_pred_candidate)
    print(f"learning_rate={lr} | MSE={mse:.2f} | MAE={mae:.2f}")

best_result = min(results, key=lambda x: x[1])  # lowest validation MSE
best_lr = best_result[0]

# Expose the model of the SELECTED learning rate (not merely the last one
# tried) to the cells that use w_gd / y_val_pred_gd afterwards.
w_gd, y_val_pred_gd = fitted_by_lr[best_lr]

print("\nBest learning rate:", best_lr)


learning_rate=1e-05 | MSE=38641620276.27 | MAE=169420.60
learning_rate=0.0001 | MSE=6307127296.88 | MAE=54680.95
learning_rate=0.001 | MSE=5010448852.15 | MAE=51725.78
learning_rate=0.01 | MSE=4921892948.00 | MAE=50954.17
learning_rate=0.1 | MSE=4907210127.73 | MAE=50790.86

Best learning rate: 0.1


In [244]:
# Compare mean / standard deviation of the training targets with those of the
# gradient-descent validation predictions.
target_mean, target_std = y_train.mean(), y_train.std()
print(target_mean, target_std)
gd_pred_mean, gd_pred_std = y_val_pred_gd.mean(), y_val_pred_gd.std()
print(gd_pred_mean, gd_pred_std)

206923.9608942414 115745.23650984169
205385.56523705626 91581.69582285437


### L2 Regularization (Ridge Regression)

In [267]:
def ridge_regression(X, y, lampda=0.0001):
    """Closed-form ridge (L2-regularised) linear regression.

    Solves ``(X_b^T X_b + lampda * n * I') w = X_b^T y`` where ``X_b`` is
    ``X`` with a leading column of ones and ``I'`` is the identity matrix with
    its first diagonal entry set to zero, so the intercept is not penalised.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix WITHOUT a bias column.
    y : array-like with n_samples elements
        Target values.
    lampda : float, default=0.0001
        Regularisation strength; it is multiplied by the number of samples
        before being added to the diagonal.

    Returns
    -------
    numpy.ndarray
        Weight vector of length n_features + 1 (intercept first) when ``y``
        is 1-D.
    """
    n = len(y)
    X_b = np.c_[np.ones((n, 1)), X]
    penalty = np.eye(X_b.shape[1])
    penalty[0, 0] = 0  # Do not penalize the bias term.
    regularised_gram = X_b.T.dot(X_b) + lampda * n * penalty
    # Solve the linear system directly instead of forming an explicit inverse:
    # same solution, but cheaper and numerically more accurate.
    w_l2 = np.linalg.solve(regularised_gram, X_b.T.dot(y))
    return w_l2
    
# Fit ridge with its default regularisation strength on the standardized
# training data, then predict the validation split (bias column prepended).
w_l2 = ridge_regression(X_train_scaled, y_train)
X_val_design = np.hstack([np.ones((X_val.shape[0], 1)), X_val_scaled])
y_val_pred_l2 = X_val_design @ w_l2

print(y_val_pred_l2)

[117450.31047956 332654.04816986 406181.08710155 ... 254067.58576683
 142376.43674426 205277.35666689]


In [273]:
# Try several regularisation strengths and keep the one with the lowest
# validation MSE.
lampdas = [0.001, 0.005, 0.01, 0.1, 1, 10]
results = []

# The validation design matrix does not depend on lambda; build it once.
X_val_design = np.c_[np.ones((X_val.shape[0], 1)), X_val_scaled]

for lam in lampdas:
    w_l2 = ridge_regression(X_train_scaled, y_train, lampda=lam)
    y_val_pred_l2 = X_val_design.dot(w_l2)

    mse = mean_squared_error(y_val, y_val_pred_l2)
    mae = mean_absolute_error(y_val, y_val_pred_l2)

    results.append((lam, mse, mae))
    print(f"lambda={lam} | MSE={mse:.2f} | MAE={mae:.2f}")

best_result = min(results, key=lambda x: x[1])  # lowest validation MSE
best_lambda = best_result[0]

# After the loop w_l2 / y_val_pred_l2 belong to the LAST lambda tried, not the
# selected one. Refit with the winning value so that the cells using these
# names afterwards evaluate the chosen model.
w_l2 = ridge_regression(X_train_scaled, y_train, lampda=best_lambda)
y_val_pred_l2 = X_val_design.dot(w_l2)

print("\nBest lambda:", best_lambda)

lambda=0.001 | MSE=4911260495.43 | MAE=50854.31
lambda=0.005 | MSE=4931430768.05 | MAE=51089.99
lambda=0.01 | MSE=4950294464.53 | MAE=51277.86
lambda=0.1 | MSE=5180553755.99 | MAE=52923.68
lambda=1 | MSE=6882561834.15 | MAE=63299.40
lambda=10 | MSE=11372943901.09 | MAE=83985.20

Best lambda: 0.001


In [None]:
# Compare mean / standard deviation of the training targets with those of the
# ridge validation predictions.
target_mean, target_std = y_train.mean(), y_train.std()
print(target_mean, target_std)
ridge_pred_mean, ridge_pred_std = y_val_pred_l2.mean(), y_val_pred_l2.std()
print(ridge_pred_mean, ridge_pred_std)

In [None]:
# Score the three from-scratch models on the validation split with the
# manual metric functions.
#
# y_val is 1-D, while the gradient-descent predictions come out as an (n, 1)
# column (the weights are a column vector). Subtracting the two directly would
# broadcast to an (n, n) matrix and give meaningless metrics, so every
# prediction array is flattened to 1-D before it is scored.
y_val_flat = np.ravel(y_val)

# Normal equation
mse_value = mean_squared_error_manual(y_val_flat, np.ravel(y_val_pred))
mae_value = mean_absolute_error_manual(y_val_flat, np.ravel(y_val_pred))

# Gradient descent
mse_value_gd = mean_squared_error_manual(y_val_flat, np.ravel(y_val_pred_gd))
mae_value_gd = mean_absolute_error_manual(y_val_flat, np.ravel(y_val_pred_gd))

# Ridge (L2)
mse_value_l2 = mean_squared_error_manual(y_val_flat, np.ravel(y_val_pred_l2))
mae_value_l2 = mean_absolute_error_manual(y_val_flat, np.ravel(y_val_pred_l2))

print(f"MSE: {mse_value}")
print(f"MAE: {mae_value}")
print("\n")

print(f"MSE_gd: {mse_value_gd}")
print(f"MAE_gd: {mae_value_gd}")
print("\n")

print(f"MSE_l2: {mse_value_l2}")
print(f"MAE_l2: {mae_value_l2}")

# Part 2: Regression Models using Scikit-Learn

## Linear Regression

In [230]:
# Linear Regression (scikit-learn)

# Fit ordinary least squares on the standardized training features.
regressor = LinearRegression().fit(X_train_scaled, y_train)

# Validation split: predictions and the three evaluation metrics.
y_pred = regressor.predict(X_val_scaled)
mse, mae, r2 = (
    mean_squared_error(y_val, y_pred),
    mean_absolute_error(y_val, y_pred),
    r2_score(y_val, y_pred),
)

# Final evaluation on the held-out test split.
y_pred_test = regressor.predict(X_test_scaled)
mse_test, mae_test, r2_test = (
    mean_squared_error(y_test, y_pred_test),
    mean_absolute_error(y_test, y_pred_test),
    r2_score(y_test, y_pred_test),
)

print("Final Test Results:")
print("MSE:", mse_test)
print("MAE:", mae_test)
print("R²:", r2_test)

Final Test Results:
MSE: 4400953150.613741
MAE: 48782.03108085671
R²: 0.6671770047345611


## Model 2: Lasso Regression (L1)

In [143]:
# L1 Regularization (Lasso Regression)

# Candidate regularisation strengths; each one is fitted on the training split
# and scored on the validation split.
alphas = [0.001, 0.005, 0.01, 0.1, 1, 10]
results = []

for a in alphas:
    model = Lasso(alpha=a, max_iter=200000, tol=0.01).fit(X_train_scaled, y_train)
    y_pred = model.predict(X_val_scaled)
    mse, mae, r2 = (
        mean_squared_error(y_val, y_pred),
        mean_absolute_error(y_val, y_pred),
        r2_score(y_val, y_pred),
    )
    results.append((a, mse, mae, r2))

# Keep the alpha with the lowest validation MSE (index 1 of each result tuple).
best_result = min(results, key=lambda row: row[1])
best_alpha = best_result[0]
print("\nBest alpha:", best_alpha)

# Refit on training + validation data with the selected alpha.
X_train_val = np.vstack((X_train_scaled, X_val_scaled))
y_train_val = np.concatenate((y_train, y_val))
final_model = Lasso(alpha=best_alpha, max_iter=200000, tol=0.01)
final_model.fit(X_train_val, y_train_val)

# Report the metrics of the refitted model on the test split.
y_pred_test = final_model.predict(X_test_scaled)
print("\nFinal Test Results:")
print("MSE:", mean_squared_error(y_test, y_pred_test))
print("MAE:", mean_absolute_error(y_test, y_pred_test))
print("R²:", r2_score(y_test, y_pred_test))



Best alpha: 0.001

Final Test Results:
MSE: 4400899527.616999
MAE: 48842.22539295722
R²: 0.6671810599848171


## Model 3: Ridge Regression (L2)

In [148]:
# L2 Regularization (Ridge Regression)

# Candidate regularisation strengths; each one is fitted on the training split
# and scored on the validation split.
alphas = [0.001, 0.005, 0.01, 0.1, 1, 10]
results = []

for a in alphas:
    model = Ridge(alpha=a, max_iter=200000, tol=0.01)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_val_scaled)

    mse = mean_squared_error(y_val, y_pred)
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)

    results.append((a, mse, mae, r2))

# Select the alpha whose validation MSE (second tuple entry) is smallest.
best_result = min(results, key=lambda entry: entry[1])
best_alpha = best_result[0]
print("\nBest alpha:", best_alpha)

# Refit on training + validation data with the selected alpha.
combined_features = np.concatenate((X_train_scaled, X_val_scaled), axis=0)
combined_targets = np.concatenate((y_train, y_val), axis=0)
final_model = Ridge(alpha=best_alpha, max_iter=200000, tol=0.01)
final_model.fit(combined_features, combined_targets)

# Report the metrics of the refitted model on the test split.
y_pred_test = final_model.predict(X_test_scaled)
print("\nFinal Test Results:")
print("MSE:", mean_squared_error(y_test, y_pred_test))
print("MAE:", mean_absolute_error(y_test, y_pred_test))
print("R²:", r2_score(y_test, y_pred_test))


Best alpha: 0.001

Final Test Results:
MSE: 4400899288.400519
MAE: 48842.226434694414
R²: 0.6671810780756131
