# Visualizing Bias and Variance

We will fit polynomials of increasing degree to noisy samples of a sine wave to see how model complexity leads to underfitting (high bias) or overfitting (high variance).

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# 1. Build the dataset: noisy samples of the true function sin(2*pi*X)
np.random.seed(0)  # fixed seed so every run produces the same figure
n_samples = 30

# Draw the inputs, then sort them in place so the line plot below is drawn
# from left to right instead of zig-zagging between unordered points.
X = np.random.rand(n_samples)
X.sort()

# Noise-free targets, and the observed targets with Gaussian noise scaled by 0.1.
y_true = np.sin(2 * np.pi * X)
y_noise = y_true + 0.1 * np.random.randn(n_samples)

# Turn the 1-D inputs into a single-column 2-D array for sklearn.
X = X.reshape(-1, 1)

# Preview the noisy samples against the underlying function.
plt.scatter(X, y_noise, color='black', label='Noisy Data')
plt.plot(X, y_true, color='green', label='True Function')
plt.legend()
plt.show()

## 2. Fitting Models of Varying Complexity
We will try three polynomial degrees: 1 (a straight line, too simple), 4 (about right), and 15 (too flexible).

In [None]:
# Polynomial degrees to compare; one subplot is drawn per entry.
degrees = [1, 4, 15]

# Dense, evenly spaced grid on [0, 1] used only to draw smooth curves. It does
# not depend on the degree, so it is built once instead of on every iteration.
X_test = np.linspace(0, 1, 100)[:, np.newaxis]

plt.figure(figsize=(14, 4))
for i, degree in enumerate(degrees):
    # The column count follows len(degrees) so that editing the list above
    # does not also require editing the subplot grid.
    ax = plt.subplot(1, len(degrees), i + 1)

    # Pipeline: expand X into polynomial features, then fit ordinary least
    # squares on them. include_bias=False skips the constant column.
    polynomial_features = PolynomialFeatures(degree=degree, include_bias=False)
    linear_regression = LinearRegression()
    pipeline = Pipeline([
        ("polynomial_features", polynomial_features),
        ("linear_regression", linear_regression),
    ])

    # Train on the noisy samples.
    pipeline.fit(X, y_noise)

    # MSE on the same samples the model was fitted to, i.e. the *training*
    # error. It says nothing about how the model behaves on unseen data.
    scores = mean_squared_error(y_noise, pipeline.predict(X))

    # Draw the fitted curve, the true function and the training samples on
    # this iteration's axes.
    ax.plot(X_test, pipeline.predict(X_test), label="Model")
    ax.plot(X_test, np.sin(2 * np.pi * X_test), label="True")
    ax.scatter(X, y_noise, edgecolor='b', s=20, label="Samples")

    ax.set_title(f"Degree {degree}\nMSE = {scores:.2e}")
    ax.legend(loc="best")

plt.show()

### Observation
*   **Degree 1 (Underfitting)**: The line cannot capture the curve. High Bias.
*   **Degree 4 (Balanced)**: Fits the curve well without chasing noise.
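*   **Degree 15 (Overfitting)**: Wiggles wildly as it chases the noise in the training samples. High Variance. It has the lowest training error (the MSE shown in the plot titles is computed on the training data), but would perform poorly on new data.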