# Spline Regression: Compare Knot Counts vs Linear Baseline
This notebook compares **spline-based regression**, built with scikit-learn’s `SplineTransformer`
at several knot counts, against a plain linear baseline on a noisy nonlinear dataset that contains a few outliers.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import SplineTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Notebook-wide matplotlib defaults: moderately sized, high-DPI figures
# with a faint background grid.
plt.rcParams.update({
    'figure.figsize': (8, 5),
    'axes.grid': True,
    'grid.alpha': 0.3,
    'figure.dpi': 140,
    'font.size': 12,
})

In [None]:
# Seeded generator so every run of the notebook draws the same data.
rng = np.random.default_rng(seed=42)

# Sample size, and inputs drawn uniformly from [-3, 3).
n = 120
x = rng.uniform(low=-3.0, high=3.0, size=n)

def true_fn(t):
    """Return the noise-free cubic signal 0.6*t^3 - 1.0*t^2 + 2.2*t + 1.0.

    Works element-wise, so ``t`` may be a scalar or a NumPy array.
    """
    cubic = 0.6 * t**3
    quadratic = 1.0 * t**2
    linear = 2.2 * t
    return cubic - quadratic + linear + 1.0

# Observations: the cubic signal plus Gaussian noise (standard deviation 4.5).
noise = rng.normal(0, 4.5, size=n)
y = true_fn(x) + noise

# Shift a handful of distinct points (at least 4) by an extra, wider
# Gaussian draw (standard deviation 10) so they act as outliers.
n_outliers = max(4, n // 25)
idx_out = rng.choice(np.arange(n), size=n_outliers, replace=False)
outlier_shift = rng.normal(0, 10.0, size=n_outliers)
y[idx_out] = y[idx_out] + outlier_shift

# The estimators are fed a 2-D feature matrix, so x becomes a single column.
X = x[:, np.newaxis]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=7,
)

print(f"Samples: {n}   Train: {len(X_train)}   Test: {len(X_test)}   Outliers: {n_outliers}")

In [None]:
# Quick look at the training split before fitting anything.
fig, ax = plt.subplots()
ax.scatter(X_train[:, 0], y_train, s=16, alpha=0.8, label='train points')
ax.set_title('Training Data')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
lin_model = LinearRegression()
lin_model.fit(X_train, y_train)
yhat_tr_lin = lin_model.predict(X_train)
yhat_te_lin = lin_model.predict(X_test)

lin_metrics = {
    'config': 'Linear (degree 1)',
    'train_RMSE': mean_squared_error(y_train, yhat_tr_lin),
    'test_RMSE': mean_squared_error(y_test, yhat_te_lin),
    'train_R2': r2_score(y_train, yhat_tr_lin),
    'test_R2': r2_score(y_test, yhat_te_lin)
}
lin_metrics

In [None]:
def fit_spline(n_knots, degree, X_train, y_train, X_test, y_test):
    pipe = Pipeline([
        ('spline', SplineTransformer(n_knots=n_knots, degree=degree, include_bias=False, extrapolation='continue')),
        ('lin', LinearRegression())
    ])
    pipe.fit(X_train, y_train)
    yhat_tr = pipe.predict(X_train)
    yhat_te = pipe.predict(X_test)
    return {
        'config': f'Spline k={n_knots}, deg={degree}',
        'n_knots': n_knots,
        'degree': degree,
        'train_RMSE': mean_squared_error(y_train, yhat_tr),
        'test_RMSE': mean_squared_error(y_test, yhat_te),
        'train_R2': r2_score(y_train, yhat_tr),
        'test_R2': r2_score(y_test, yhat_te),
        'model': pipe
    }

# (n_knots, degree) pairs to try; the linear baseline is the first row and
# the first entry in the model registry.
configs = [(2,3), (5,3), (8,3), (15,3)]
results = [dict(lin_metrics)]
models = {'Linear (degree 1)': lin_model}

for n_knots, degree in configs:
    fitted = fit_spline(n_knots, degree, X_train, y_train, X_test, y_test)
    # Keep the fitted pipeline for plotting, and only the scalar fields for
    # the results table.
    models[fitted['config']] = fitted['model']
    row = dict(fitted)
    del row['model']
    results.append(row)

df = pd.DataFrame(results)
metric_columns = ['config', 'train_RMSE', 'test_RMSE', 'train_R2', 'test_R2']
df_display = df[metric_columns]
df_display

In [None]:
# Dense, evenly spaced grid over the observed x-range so curves look smooth.
x_grid = np.linspace(X.min(), X.max(), 400).reshape(-1, 1)

plt.figure()
plt.scatter(X_train[:,0], y_train, s=12, alpha=0.6, label='train points')

# Build each spline's name from its own (n_knots, degree) pair instead of a
# hard-coded degree of 3, so the names keep matching the keys in `models`
# and the 'config' column even if `configs` uses another degree.
order = ['Linear (degree 1)'] + [f'Spline k={k}, deg={d}' for k, d in configs]
for name in order:
    y_grid = models[name].predict(x_grid)
    # Test-set R² for this model, shown in the legend entry.
    r2 = df_display.loc[df_display['config']==name, 'test_R2'].iloc[0]
    plt.plot(x_grid[:,0], y_grid, linewidth=2, label=f'{name} (test R²={r2:.3f})')

plt.title('Spline vs Linear: Fitted Curves')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# One figure per model: held-out residuals (observed minus predicted)
# plotted against x, with a dashed zero line for reference.
for name, model in models.items():
    yhat_te = model.predict(X_test)
    residuals = y_test - yhat_te

    fig, ax = plt.subplots()
    ax.scatter(X_test[:,0], residuals, s=16, alpha=0.8, label=f'{name} residuals')
    ax.axhline(0, linestyle='--', linewidth=1)
    ax.set_title(f'Residuals vs x — {name}')
    ax.set_xlabel('x')
    ax.set_ylabel('test residual')
    ax.legend()
    fig.tight_layout()
    plt.show()