<a href="https://colab.research.google.com/github/MukiiriKoome/ML-Algorithms/blob/main/ridge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error

# Generate sample data
X, y = make_regression(n_samples=100, n_features=5, noise=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define possible lambda (alpha) values
alphas = [0.01, 0.1, 1, 10, 100, 1000]

# RidgeCV performs cross-validation internally
ridge_cv = RidgeCV(alphas=alphas, cv=5)
ridge_cv.fit(X_train, y_train)

# Best lambda value
print("Best lambda (alpha):", ridge_cv.alpha_)

# Evaluate on test data
y_pred = ridge_cv.predict(X_test)
print("Test MSE:", mean_squared_error(y_test, y_pred))


Best lambda (alpha): 0.1
Test MSE: 454.7043306990102


In [10]:
from google.colab import files
data = files.upload()
print("file uploaded successfully!!")


Saving housing_data.csv to housing_data (2).csv
file uploaded successfully!!


In [11]:
import pandas as pd
df = pd.read_csv("housing_data.csv")
df.head()

Unnamed: 0,House,Square_Feet,Bedrooms,Age,Garage_Size,Price
0,1,1500,3,10,1,250
1,2,1800,3,8,2,290
2,3,2400,4,5,2,380
3,4,2000,3,12,1,270
4,5,1600,2,15,1,220


In [12]:
df.corr()

Unnamed: 0,House,Square_Feet,Bedrooms,Age,Garage_Size,Price
House,1.0,0.168654,0.089541,-0.028909,0.194698,0.129576
Square_Feet,0.168654,1.0,0.90043,-0.888876,0.779663,0.975876
Bedrooms,0.089541,0.90043,1.0,-0.924437,0.730729,0.931275
Age,-0.028909,-0.888876,-0.924437,1.0,-0.793952,-0.933712
Garage_Size,0.194698,0.779663,0.730729,-0.793952,1.0,0.819727
Price,0.129576,0.975876,0.931275,-0.933712,0.819727,1.0


Features with a correleation of above |0.7| are highly related
Since this problem is meant to learn ridge regression, we're going to use these features that are highly correleated.

In [14]:
# Extract features (X) and target (y)
X = df[['Square_Feet', 'Bedrooms', 'Age', 'Garage_Size']].values
y = df['Price'].values

In [17]:
# Standardize the features using z-score normalization
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
features_scaled = scaler.fit_transform(X)
features_scaled

array([[-1.32486384, -0.20652852,  0.09260847, -1.22474487],
       [-0.47920607, -0.20652852, -0.27782542,  0.81649658],
       [ 1.21210947,  1.17032826, -0.83347626,  0.81649658],
       [ 0.08456578, -0.20652852,  0.46304237, -1.22474487],
       [-1.04297792, -1.5833853 ,  1.01869321, -1.22474487],
       [ 0.64833763,  1.17032826, -0.64825931,  0.81649658],
       [-1.60674977, -1.5833853 ,  1.94477794, -1.22474487],
       [ 1.77588132,  1.17032826, -1.20391015,  0.81649658],
       [-0.19732015, -0.20652852, -0.09260847,  0.81649658],
       [ 0.3664517 , -0.20652852, -0.46304237,  0.81649658],
       [-0.761092  , -0.20652852,  0.27782542, -1.22474487],
       [ 0.93022355,  1.17032826, -1.01869321,  0.81649658],
       [-1.18392088, -1.5833853 ,  1.57434405, -1.22474487],
       [ 1.4939954 ,  1.17032826, -1.3891271 ,  0.81649658],
       [-0.33826311, -0.20652852,  0.09260847, -1.22474487],
       [ 0.22550874, -0.20652852, -0.27782542,  0.81649658],
       [-1.46580681, -1.

In [18]:
#  OLS Regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
model = LinearRegression()
model.fit(X_train, y_train)



In [23]:
# Let's seee the coefficients of this model and it's equation
print(f"The function of this equation/linear model is: \n price = {model.coef_[0]} square feet + {model.coef_[1]} bedrooms +{model.coef_[2]} age +{model.coef_[3]} garage size + {model.intercept_}")

The function of this equation/linear model is: 
 price = 0.15308647713734286 square feet + 10.942743665706836 bedrooms +-0.5585088815878017 age +18.0991126974536 garage size + -59.36432613200293


In [24]:
# Predictions

y_pred = model.predict(X_test)
print("Actual values: ", y_test)
print("Predicted values: ", y_pred)


Actual values:  [250 340 320 290]
Predicted values:  [215.60764445 346.38974648 319.02133734 280.74971805]


In [25]:
# Evaluate performance of the model
from sklearn.metrics import r2_score, mean_squared_error

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("R² Score:", r2)
print("MSE:", mse)


R² Score: 0.7151764180730623
MSE: 327.54711921597834


In [27]:
# Ridge Regression
from sklearn.linear_model import Ridge
# Use a static guessed alpha parameter to see how it behaves to the predictions
ridge = Ridge(alpha=10, fit_intercept=True)
ridge.fit(X_train, y_train)

In [30]:
y_predicted = ridge.predict(X_test)
print("Actual y values: ", y_test)
print("Predicted y values: ", y_predicted)

Actual y values:  [250 340 320 290]
Predicted y values:  [214.65600839 337.86416308 316.37120688 272.44783614]


In [32]:
# Evaluate performance
r2_ = r2_score(y_test, y_predicted)
mse_ = mean_squared_error(y_test, y_predicted)
print("R² Score:", r2_)
print("MSE:", mse_)

R² Score: 0.6576073613125891
MSE: 393.7515344905225


In [35]:
# Ridge regression with hyperparameter tuning
# Let's manually try different values for the penalty
lambd = [0.1, 1, 10, 100, 1000]
for value in lambd:
  ridge1 = Ridge(alpha=value)
  ridge1.fit(X_train, y_train)
  y_predicted1 = ridge1.predict(X_test)
  r2__ = r2_score(y_test, y_predicted1)
  mse__ = mean_squared_error(y_test, y_predicted1)
  print("R² Score:", r2__)
  print("MSE:", mse__)

R² Score: 0.7133128683932171
MSE: 329.69020134780044
R² Score: 0.7027379118347463
MSE: 341.85140139004164
R² Score: 0.6576073613125891
MSE: 393.7515344905225
R² Score: 0.49366552740180747
MSE: 582.2846434879214
R² Score: 0.37095745214302966
MSE: 723.398930035516


In [40]:
# Ridge regression with cross validation
from sklearn.linear_model import RidgeCV

ridgeCv = RidgeCV(alphas=(0.01, 0.1, 1, 5, 10, 50, 100), cv=5).fit(X_train, y_train)
# print(ridgeCv.score(X_train, y_train))

In [38]:
ypred = ridgeCv.predict(X_test)
print(r2_score(y_test, ypred))

0.7133128683932171


In [41]:
ridgeCv.alpha_

np.float64(0.1)