# Ridge Regression with Gradient Descent

- uses Gradient Descent to optimize Ridge Regression parameters instead of the default Scikit-learn solver.

In [1]:
from sklearn.datasets import load_diabetes
from sklearn.metrics import r2_score
import numpy as np

In [2]:
# Load the diabetes regression dataset directly as a (features, target) pair.
X, y = load_diabetes(return_X_y=True)

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)

In [5]:
# Inspect the training feature matrix (a bare last expression is displayed by the notebook).
X_train

array([[ 0.0090156 , -0.04464164,  0.05522933, ...,  0.02323852,
         0.05568623,  0.10661708],
       [ 0.03081083,  0.05068012, -0.03422907, ...,  0.05755657,
         0.0354587 ,  0.08590655],
       [ 0.01628068, -0.04464164, -0.06332999, ..., -0.03949338,
        -0.05947118, -0.06735141],
       ...,
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04688253,  0.01549073],
       [ 0.01628068,  0.05068012,  0.00996123, ..., -0.03949338,
         0.01703607,  0.00720652],
       [ 0.04897352,  0.05068012,  0.08109682, ...,  0.07120998,
         0.03243232,  0.04862759]])

In [6]:
# Inspect the training target values.
y_train

array([173., 120.,  65., 332., 107.,  84.,  48.,  87.,  53., 158.,  71.,
       121., 244., 131., 152.,  72., 118., 210., 140., 170.,  49., 262.,
        61.,  53.,  52.,  63., 181., 103., 277., 235.,  97.,  88., 197.,
       152., 187.,  92., 150.,  75., 131., 104., 160., 127., 243.,  40.,
        61., 179.,  53., 109.,  49., 281., 257., 306., 115.,  88., 252.,
       283.,  79., 200., 275.,  96., 259.,  77., 103.,  42.,  60., 246.,
       110., 310., 111., 147., 190., 196.,  91.,  95., 265., 274., 116.,
       116.,  83., 122., 270., 230., 275., 310.,  67., 104., 280., 166.,
        94., 243., 164.,  90.,  85., 195., 177., 221., 102., 233., 249.,
        90.,  72., 214., 144.,  60., 270., 220.,  68., 123.,  45., 129.,
       182., 174., 127., 124., 257.,  47.,  88., 189.,  71., 242., 178.,
       272.,  68., 190., 219.,  93., 233., 230., 297., 185., 138., 181.,
       321., 248., 264., 101., 281., 107., 308., 225., 102.,  71.,  53.,
       263., 168.,  83., 281.,  75., 150., 273.,  5

In [7]:
from sklearn.linear_model import SGDRegressor  # Stochastic Gradient Descent

# Ridge regression fitted with SGD: the default squared-error loss plus an L2 penalty.
# SGD reshuffles the training rows every epoch, so without a fixed random_state the
# reported R2 score and coefficients differ on every "Restart & Run All".
reg = SGDRegressor(
    penalty='l2',              # one of {'l1', 'l2', 'elasticnet'}; 'l2' is the ridge (L2-norm) penalty
    alpha=0.001,               # regularization strength, i.e. the lambda multiplying the penalty term
    learning_rate='constant',  # step size stays at eta0 -- it does not shrink near the solution
    eta0=0.1,                  # the constant step size
    max_iter=500,              # upper bound on epochs; training may stop earlier via the tol criterion
    random_state=4,            # fixed seed (same value as the train/test split) for reproducibility
)

- penalty='l2' → Applies L2 regularization (Ridge Regression).
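- max_iter=500 → Allows at most 500 passes (epochs) over the training data; training can stop earlier once the tolerance-based stopping criterion (`tol`) is met.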
- eta0=0.1 → Sets learning rate to 0.1.
- learning_rate='constant' → Learning rate remains fixed (doesn’t decay).
- alpha=0.001 → Defines the regularization strength (λ value).

In [8]:
# Display the configured estimator and its parameters (it is fitted in the next cell).
reg

In [9]:
# fit() returns the estimator itself, so training and held-out prediction chain.
y_pred = reg.fit(X_train, y_train).predict(X_test)

# Held-out R2 first, then the learned weights and the intercept on their own lines.
print("R2 score", r2_score(y_test, y_pred))
print(reg.coef_, reg.intercept_, sep="\n")

R2 score 0.4396843769832307
[  53.31729151 -142.41049694  353.89163141  265.87958786    1.19342549
  -53.07859081 -165.82279247  137.69506402  321.45997748  106.76891487]
[165.80184278]


## Alternative: Using Ridge regression with different solvers.

In [10]:
from sklearn.linear_model import Ridge

# Ridge regression through scikit-learn's dedicated estimator, here with the
# conjugate-gradient solver.
# NOTE: rebinding `reg` replaces the SGDRegressor created in the earlier cell.
reg = Ridge(
    solver='sparse_cg',
    alpha=0.001,
    max_iter=500,
)

# solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs'}
# svd, cholesky   : direct (closed-form) solves --> small datasets, exact solution
# lsqr, sparse_cg : iterative least-squares / conjugate-gradient routines from SciPy
# sag, saga       : stochastic average gradient descent --> large datasets, iterative optimization
# lbfgs           : only usable together with positive=True
# auto            : lets scikit-learn choose the solver based on the type of data

In [11]:
# Train the Ridge model and predict the held-out split in one chained call
# (fit() hands back the fitted estimator).
y_pred1 = reg.fit(X_train, y_train).predict(X_test)

# Same report layout as for the SGD model: R2, then coefficients, then intercept.
print("R2 score", r2_score(y_test, y_pred1))
print(reg.coef_, reg.intercept_, sep="\n")

R2 score 0.46250101619914563
[  34.52192544 -290.84084076  482.40181344  368.0678662  -852.44873179
  501.59160336  180.11115788  270.76333979  759.73534372   37.4913546 ]
151.10198517439466


1️⃣ R2 Score Calculation:

The model is trained and evaluated using R2 score to check its performance.

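The score is printed for both fits — the SGDRegressor and the Ridge model with the `sparse_cg` solver — so the two optimizers can be compared. Note that the two estimators scale `alpha` differently (SGDRegressor averages the loss over the samples, Ridge sums it), so `alpha=0.001` does not apply the same amount of regularization in both.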


2️⃣ Coefficient & Intercept Analysis:

Displays learned feature weights (coefficients) and the intercept.

Helps analyze how Ridge regression adjusts weights to reduce overfitting.


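## NOTE:
✔ Lower α (close to 0) → High variance, risk of overfitting.

✔ Moderate α (0.01 - 0.1) → Often a good balance in the bias-variance tradeoff.

✔ High α (1 - 10) → Reduces overfitting, but can slightly underfit.

✔ Very High α (100+) → Too much shrinkage, poor model performance.

These ranges are rules of thumb: the best α depends on the dataset and on how the estimator scales its penalty, so tune it with cross-validation (e.g. `RidgeCV`).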
