# Overfitting & Regularization

This notebook demonstrates overfitting with polynomial regression and how regularization (Ridge/Lasso) can help. We'll keep examples small and visual so beginners can see the effect.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
# Seed NumPy's global random generator so the noisy dataset is identical on every run.
SEED = 0
np.random.seed(SEED)

In [None]:
# Synthetic dataset: 60 evenly spaced inputs on [-3, 3] stored as a single-column
# matrix, with targets sin(x) plus Gaussian noise (standard deviation 0.2).
X = np.linspace(-3, 3, num=60)[:, np.newaxis]
y = np.sin(X[:, 0]) + np.random.normal(loc=0.0, scale=0.2, size=len(X))

def fit_and_predict(degree, model, X_data=None, y_data=None):
    """Fit `model` on polynomial features of the data and score it on that same data.

    Parameters
    ----------
    degree : int
        Polynomial degree passed to ``PolynomialFeatures`` (no bias column).
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. Its ``fit`` method is
        called here, so the object passed in is trained as a side effect.
    X_data : array-like, optional
        Input matrix to expand and fit on. Defaults to the notebook-level ``X``.
    y_data : array-like, optional
        Targets to fit on. Defaults to the notebook-level ``y``.

    Returns
    -------
    tuple
        ``(yp, mse)``: the model's predictions for the data it was fitted on,
        and the mean squared error between the targets and those predictions.

    Notes
    -----
    The MSE is a *training* error: it is measured on the same points used for
    fitting, so by itself it cannot reveal overfitting.
    """
    # Fall back to the notebook's dataset so existing two-argument calls keep working.
    if X_data is None:
        X_data = X
    if y_data is None:
        y_data = y

    poly = PolynomialFeatures(degree=degree, include_bias=False)
    Xp = poly.fit_transform(X_data)
    model.fit(Xp, y_data)
    yp = model.predict(Xp)
    mse = mean_squared_error(y_data, yp)
    return yp, mse

# Three plain least-squares fits of increasing flexibility; the plot title
# labels them underfit (degree 1), good (degree 3) and overfit (degree 9).
(yp1, mse1), (yp3, mse3), (yp9, mse9) = (
    fit_and_predict(deg, LinearRegression()) for deg in (1, 3, 9)
)

# Draw the raw points and the three fitted curves on one set of axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X, y, label='data', color='black', s=20)
ax.plot(X, yp1, label=f'Degree 1 (MSE={mse1:.3f})')
ax.plot(X, yp3, label=f'Degree 3 (MSE={mse3:.3f})')
ax.plot(X, yp9, label=f'Degree 9 (MSE={mse9:.3f})')
ax.legend()
ax.set_title('Polynomial fits (underfit -> good -> overfit)')
plt.show()

## Regularization example (Ridge & Lasso)
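We'll fit a flexible model (degree=9) and compare LinearRegression vs Ridge/Lasso to see how regularization reduces overfitting. Keep in mind that an MSE measured on the training points is, by construction, never higher for plain least squares than for Ridge or Lasso on the same features, so a slightly larger training MSE for the regularized models is expected; the sign of reduced overfitting is a smoother curve that follows the underlying sine more closely.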

In [None]:
# Expand the inputs into degree-9 polynomial features (no bias column).
degree = 9
poly = PolynomialFeatures(degree=degree, include_bias=False)
Xp = poly.fit_transform(X)

# Dense, noise-free reference grid over the same x-range as the data.
# The targets in this notebook are sin(x) plus noise, so the error against
# sin(x) on this grid measures how well each model recovered the underlying
# function. Training MSE alone cannot show that: least squares minimises it
# by construction, so it is never higher for 'Linear' than for Ridge/Lasso.
X_ref = np.linspace(X.min(), X.max(), 300).reshape(-1, 1)
Xp_ref = poly.transform(X_ref)
y_ref = np.sin(X_ref).ravel()

# NOTE(review): the polynomial features are not standardized (x**9 reaches
# ~2e4 on this range while x stays within [-3, 3]), so the Ridge/Lasso penalty
# weighs the coefficients very unevenly -- consider adding a scaling step.
models = {
    'Linear': LinearRegression(),
    'Ridge (alpha=1)': Ridge(alpha=1.0),
    'Lasso (alpha=0.01)': Lasso(alpha=0.01, max_iter=10000)
}

results = {}   # name -> (predictions on the training inputs, training MSE)
ref_mse = {}   # name -> MSE against the noise-free sine on the dense grid
for name, m in models.items():
    m.fit(Xp, y)
    yp = m.predict(Xp)
    results[name] = (yp, mean_squared_error(y, yp))
    ref_mse[name] = mean_squared_error(y_ref, m.predict(Xp_ref))

plt.figure(figsize=(10,6))
plt.scatter(X, y, color='black', s=20, label='data')
for name, (yp, mse) in results.items():
    plt.plot(
        X, yp,
        label=f"{name} (train MSE={mse:.3f}, MSE vs true sin={ref_mse[name]:.3f})",
    )
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.title('Regularization reduces overfitting (degree=9)')
plt.show()