In [3]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures

from sklearn.linear_model import LinearRegression

## Error

In [None]:
# One square panel that illustrates the error of a model at each point.
fig, ax = plt.subplots(figsize=(5, 5))

# Observed values `a` and the line 2 + 0.5 * x evaluated at the same x.
x = [1, 2, 3, 4, 5]
a = [3, 3.5, 3, 4, 5]
p = [2 + 0.5 * xi for xi in x]

# Grey model line, blue observations, and one red vertical segment per
# point running from the predicted value to the observed value.
ax.plot(x, p, label="model", color="grey")
ax.scatter(x, a, color="blue")
ax.vlines(x, ymin=p, ymax=a, color="red")

# End on None so the cell does not echo the return value of the last call.
None

## Fitting process

In [22]:
def mean_absolute_deviation(true, predicted):
    """Compute the mean absolute deviation between two sequences.

    Parameters
    ----------
    true : iterable of numbers
        Observed values.
    predicted : iterable of numbers
        Predicted values, paired element-wise with ``true``.

    Returns
    -------
    number
        The average of ``abs(t - p)`` over all pairs.

    Raises
    ------
    ValueError
        If ``true`` and ``predicted`` do not have the same length.
    ZeroDivisionError
        If both inputs are empty.
    """
    # Materialise the inputs so their lengths can be compared; a bare zip()
    # would silently drop the unmatched tail and report a misleading score.
    true = list(true)
    predicted = list(predicted)
    if len(true) != len(predicted):
        raise ValueError(
            "true and predicted must have the same length "
            f"(got {len(true)} and {len(predicted)})"
        )

    deviation = [abs(t - p) for t, p in zip(true, predicted)]
    return sum(deviation) / len(deviation)

In [None]:
x = [1, 2, 3, 4, 5]
a = [2, 3, 5, 6, 7]

# Try three candidate slopes for the model y = beta * x; each one gets its
# own figure and its mean absolute deviation is printed.
for beta in (0.5, 1.0, 1.5):
    fig, ax = plt.subplots(figsize=(5, 5))

    # Predictions of the current candidate at every x.
    p = [beta * xi for xi in x]

    # Grey model line, blue observations, red segments for the errors.
    ax.plot(x, p, label="model", color="grey")
    ax.scatter(x, a, color="blue")
    ax.vlines(x, ymin=p, ymax=a, color="red")

    print("MAD: ", mean_absolute_deviation(a, p))

In [None]:
# Hard-coded (beta, MAD) pairs to draw on the error curve.
beta = [0.5, 1.0]
mad  = [3.1, 1.6]

# Square panel with fixed 0..4 limits on both axes.
fig, ax = plt.subplots(figsize=(5, 5))
ax.set(xlabel="Beta", ylabel="MAD", xlim=(0, 4), ylim=(0, 4))

ax.plot(beta, mad, color="grey", marker=".")

## Fit vs Overfit

In [3]:
def model(x):
    """Return the noise-free response 3 + 0.5 * x of the generating process."""
    intercept = 3
    slope = 0.5
    return intercept + slope * x

In [4]:
# Create population data.
# Seed NumPy's global generator before the first random draw so that
# "Restart Kernel -> Run All" reproduces the same population every time.
np.random.seed(42)

pop_size = 5000
sample_size = 15
xlim = 1, 10

# Inputs are uniform over xlim; responses follow the generative model plus
# Gaussian noise with standard deviation 0.5.
x = np.random.uniform(xlim[0], xlim[1], pop_size)
y = model(x) + np.random.normal(0, 0.5, pop_size)

In [None]:
# Draw `sample_size` random positions from the population and pick the
# matching x / y values.
idx = np.random.randint(low=0, high=len(x), size=sample_size)
xs, ys = x[idx], y[idx]

# Expand the single input column into polynomial terms up to degree 5.
poly = PolynomialFeatures(degree=5)
Xs = poly.fit_transform(xs[:, np.newaxis])

# Ordinary least squares on the expanded features.
linear = LinearRegression()
linear.fit(Xs, ys)

In [None]:
# Two stacked panels that share both axes: population on top, sample below.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(4, 8), sharex=True, sharey=True)

# --- Top panel: the whole population with the generative line on top. ---
ax1.scatter(x, y, alpha=0.05)

# Two end points are enough to draw a straight line across xlim.
xt = np.linspace(*xlim, 2)
ax1.plot(xt, model(xt), color="red", label="y = 3 + 0.5 * x")
ax1.legend()
ax1.set_title("Populatie")

# --- Bottom panel: the sample with the fitted polynomial. ---
# Evaluate the fitted model on a grid of 100 points, shaped as one column
# so it can go through the same feature transform used for fitting.
xm = np.linspace(*xlim, 100)[:, np.newaxis]
Xm = poly.transform(xm)
ym = linear.predict(Xm)

ax2.scatter(xs, ys)
ax2.plot(xm, ym, color="red")
ax2.set_ylim(3, 8)
ax2.set_title("Steekproef")

# End on None so the cell does not echo the return value of the last call.
None

## Regression Interpretation

In [7]:
def model(x1, x2):
    """Return the noise-free response 3 + 1.5 * x1 + 1.5 * x2."""
    intercept = 3
    coef_x1 = 1.5
    coef_x2 = 1.5
    return intercept + coef_x1 * x1 + coef_x2 * x2

In [8]:
# Build the two-feature data set: both inputs are uniform over `lim` and
# the response is the generative model plus unit-variance Gaussian noise.
lim = 0, 10
n = 50
x1 = np.random.uniform(low=lim[0], high=lim[1], size=n)
x2 = np.random.uniform(low=lim[0], high=lim[1], size=n)
y = model(x1, x2) + np.random.normal(loc=0, scale=1, size=n)

In [None]:
# Make the figure.
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(1, 1, 1, projection="3d")

# Plot the data.
ax.scatter(x1, x2, y, alpha=0.5, color="black")

# Plot the surface of the generative model over the full range of `lim`.
# The grid includes both end points: range(lim[1]) stopped one unit short
# of the upper limit and always started at 0 regardless of lim[0].
X1, X2 = np.meshgrid(
    np.arange(lim[0], lim[1] + 1),
    np.arange(lim[0], lim[1] + 1),
)
Y = model(X1, X2) # 3 + 1.5 * X1 + 1.5 * X2
ax.plot_surface(X1, X2, Y, alpha=0.5)
ax.set(xlabel="X1", ylabel="X2", zlabel="Y")

# Remove grey background.
ax.xaxis.pane.fill = False
ax.yaxis.pane.fill = False
ax.zaxis.pane.fill = False

fig.tight_layout()

### Different distributions

In [None]:
# Two side-by-side panels that share both axes so the histograms are
# directly comparable.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 2), sharex=True, sharey=True)

# Dummy data: same spread, but the test set is centred at 0.5 instead of 0.
n = 500
train = np.random.normal(loc=0, scale=2, size=n)
test = np.random.normal(loc=0.5, scale=2, size=n)

# Overwrite the first five test values with an outlier at 11.
test[:5] = 11

# Histograms of both sets.
ax1.hist(train, bins=25, edgecolor="white")
ax2.hist(test, bins=25, edgecolor="white")

# Red vertical line at each set's mean.
ax1.axvline(train.mean(), color="red")
ax2.axvline(test.mean(), color="red")

ax1.set_title("Train Dataset")
ax2.set_title("Test Dataset")

# End on None so the cell does not echo the return value of the last call.
None