In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.preprocessing import quantile_transform, PolynomialFeatures

from sklearn.linear_model import LinearRegression

## Error

In [None]:
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(1, 1, 1)

# Five points `a` and the straight line p = 2 + 0.5 * x evaluated at the same x positions.
x = [1, 2, 3, 4, 5]
a = [3, 3.5, 3, 4, 5]
p = [2 + 0.5 * xi for xi in x]

ax.plot(x, p, label="model", color="grey")
ax.scatter(x, a, label="data", color="blue")
# One vertical segment per point, spanning from the line value to the point value.
ax.vlines(x, ymin=p, ymax=a, label="error", color="red")

# The labels above are only rendered once a legend is requested.
ax.legend()

# Trailing None so the cell does not echo the repr of the last expression.
None

## Quantile Transform

In [None]:
# Draw n lognormal(mean=0, sigma=0.6) values and scale them by 5.
n = 5000
x = 5 * np.random.lognormal(mean=0, sigma=0.6, size=n)

# The sample is passed as a single-column 2-D array, so x_t has shape (n, 1).
x_t = quantile_transform(x[:, np.newaxis], n_quantiles=100)

In [None]:
# Two stacked histograms: the raw sample on top, the transformed sample below.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(8, 6))
titles = ["Original Data", "Quantile Transformed Data"]
colors = ["red", "orange", "green"]

for panel, title, series in zip(axes, titles, (x, x_t)):
    panel.hist(series, bins=100, edgecolor="white")
    panel.set_title(title)
    # Mark the 25%, 50% and 75% quantiles of this series, one color each.
    for color, quantile in zip(colors, np.quantile(series, [0.25, 0.5, 0.75])):
        panel.axvline(x=quantile, color=color)

plt.tight_layout()

## Winsorizing

In [None]:
# Lognormal sample, capped from above at its own 95% quantile.
n = 200
x = pd.Series(np.random.lognormal(1.5, 1, n))
upper = x.quantile(0.95)
xw = x.clip(upper=upper)

fig = plt.figure(figsize=(6, 6))

# Bin edges shared by both histograms, in steps of bin_width starting at 0.
# The top edge is the maximum rounded UP to a multiple of bin_width, so the
# largest observation always lands inside the last bin. The previous
# int(x.max() + 5) bound stopped one bin short whenever the maximum was less
# than 1 above a multiple of 5 (e.g. 25.3 -> edges 0..25), dropping it.
bin_width = 5
top_edge = int(np.ceil(x.max() / bin_width)) * bin_width
bins = list(range(0, top_edge + 1, bin_width))

# Plot original data
ax1 = fig.add_subplot(2, 1, 1)
ax1.hist(x, bins=bins, edgecolor="white")
ax1.axvline(x=upper, color="red")
ax1.set_title("Raw Data")

# Plot winsorized data on the same bins, with the same cap marker
ax2 = fig.add_subplot(2, 1, 2)
ax2.hist(xw, bins=bins, edgecolor="white")
ax2.axvline(x=upper, color="red")
ax2.set_title("Winsorized Data")

fig.tight_layout()

## Fit vs Overfit

In [None]:
def model(x):
    """Evaluate the noise-free generative line y = 3 + 0.5 * x.

    Works for anything that supports scalar multiplication and addition
    (plain numbers, NumPy arrays); the result has the same form as `x`.
    """
    intercept = 3
    slope = 0.5
    return intercept + slope * x

In [None]:
# Create population data
pop_size = 5000
sample_size = 15
xlim = 1, 10

x = np.random.uniform(xlim[0], xlim[1], pop_size)
y = model(x) + np.random.normal(0, 0.5, pop_size)

In [None]:
# Take a sample
idx = np.random.randint(0, len(x), sample_size)
xs = x[idx]
ys = y[idx]

# Create high order polynomial features
poly = PolynomialFeatures(degree=5)
Xs = poly.fit_transform(xs.reshape(-1, 1))

# Fit model
linear = LinearRegression()
linear.fit(Xs, ys)

In [None]:
# Create the figure
fig = plt.figure(figsize=(4, 8))
ax1, ax2 = fig.subplots(2, 1, sharex=True, sharey=True)

# Plot population scatter
ax1.scatter(x, y, alpha=0.05)

# Overlay generative model
xt = np.linspace(xlim[0], xlim[1], 2)
ax1.plot(xt, model(xt), color="red", label="y = 3 + 0.5 * x")
ax1.legend()
ax1.set_title("Populatie")

# Create linear model data
xm = np.linspace(xlim[0], xlim[1], 100).reshape(-1, 1)
Xm = poly.transform(xm)
ym = linear.predict(Xm)

# Plot sample scatter
ax2.scatter(xs, ys)

# Plot polynomial model
ax2.plot(xm, ym, color="red")
ax2.set_ylim((3, 8))
ax2.set_title("Steekproef")

None