<a href="https://colab.research.google.com/github/HSE-LAMBDA/MLatFIAN2020/blob/master/seminar07/MLatFIAN2020_seminar07_BiasVariance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bias-Variance decomposition

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
def true_dep(x):
  """Noise-free target curve used as the ground truth in this notebook.

  The curve is the sum of a cosine term, cos((x - 0.2)^2), and a bump
  0.2 / (1 + 50 (x - 0.3)^2) that peaks at 0.2 for x = 0.3.

  Args:
    x: scalar or NumPy array of input locations.

  Returns:
    The curve evaluated element-wise at ``x``.
  """
  cosine_term = np.cos((x - 0.2)**2)
  bump_term = 0.2 / (1 + 50 * (x - 0.3)**2)
  return cosine_term + bump_term

# Evenly spaced evaluation grid on [0, 1] and the noise-free curve on that grid.
x_true = np.linspace(start=0, stop=1, num=100)
y_true = true_dep(x_true)

def generate_n_datasets(num_datasets, dataset_length, noise_power=0.02, seed=None):
  """Draw several independent noisy samples of ``true_dep``.

  Inputs are sampled uniformly from [0, 1); targets are ``true_dep(x)`` plus
  zero-mean Gaussian noise.

  Args:
    num_datasets: number of datasets to generate.
    dataset_length: number of points in each dataset.
    noise_power: standard deviation of the Gaussian noise (it is passed as
      ``scale`` to the normal sampler, so it is not a variance).
    seed: optional seed (or ``numpy.random.Generator``) for reproducible
      draws. With the default ``None`` the global ``np.random`` state is
      used, exactly as before this parameter existed.

  Returns:
    A tuple ``(x, y)`` of arrays, each of shape
    ``(num_datasets, dataset_length, 1)``.
  """
  # The module-level API and a Generator both expose ``uniform``/``normal``
  # with the keyword arguments used below, so one code path serves both.
  rng = np.random if seed is None else np.random.default_rng(seed)

  shape = (num_datasets, dataset_length, 1)
  x = rng.uniform(size=shape)
  y = true_dep(x) + rng.normal(scale=noise_power, size=shape)
  return x, y

In [None]:
# One noisy dataset of 30 points, drawn on top of the noise-free curve.
x, y = generate_n_datasets(num_datasets=1, dataset_length=30)
plt.scatter(np.squeeze(x), np.squeeze(y), s=20, c='orange')
plt.plot(x_true, y_true, c='c', linewidth=1.5);

In [None]:
from copy import deepcopy
from tqdm import tqdm, trange

In [None]:
def calc_bias2_variance(model, datasets_X, datasets_y):
  """Estimate point-wise squared bias and variance of ``model``.

  A separate deep copy of ``model`` is fitted on every dataset and evaluated
  on the module-level grid ``x_true``; the statistics are then taken across
  datasets and compared against the module-level ``y_true``.

  Args:
    model: estimator exposing ``fit(X, y)`` and ``predict(X)``. It is deep
      copied before each fit, so the passed object itself is not fitted.
    datasets_X: sequence of training inputs, one entry per dataset.
    datasets_y: sequence of training targets, aligned with ``datasets_X``.

  Returns:
    A tuple ``(bias2, variance, preds)``:
      bias2: squared difference between ``y_true`` and the mean prediction.
      variance: mean squared deviation of predictions from their mean.
      preds: array with the squeezed predictions of every fitted copy,
        stacked along the first axis.
  """
  def predict_on_grid(X_train, y_train):
    # Fit a fresh copy so that fits on different datasets stay independent.
    fitted = deepcopy(model)
    fitted.fit(X_train, y_train)
    return fitted.predict(x_true[:, np.newaxis]).squeeze()

  # zip() has no length, hence the explicit total for the progress bar.
  paired = tqdm(zip(datasets_X, datasets_y), total=len(datasets_X))
  preds = np.array([predict_on_grid(X_train, y_train)
                    for X_train, y_train in paired])

  mean_pred = np.mean(preds, axis=0)
  deviations = preds - mean_pred[np.newaxis, ...]

  bias2 = (y_true - mean_pred)**2
  variance = np.mean(deviations**2, axis=0)
  return bias2, variance, preds

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Polynomial degrees to compare.
powers = np.arange(1, 9)

# Size of the Monte Carlo experiment run for every degree.
NUM_DATASETS = 1000
DATASET_LENGTH = 20

bias2, variance, preds = [], [], []
for degree in powers:
  model = Pipeline([
      ('poly', PolynomialFeatures(degree=degree)),
      ('linear', LinearRegression())
  ])

  # Each degree is evaluated on its own freshly generated datasets.
  # The unpacked predictions get their own name so that the loop variable
  # holding the degree is not overwritten inside the loop body.
  b2, v, degree_preds = calc_bias2_variance(
      model, *generate_n_datasets(NUM_DATASETS, DATASET_LENGTH))
  bias2.append(b2)
  variance.append(v)
  preds.append(degree_preds)

# Stack the per-degree results: one row per entry of `powers`.
bias2 = np.array(bias2)
variance = np.array(variance)

In [None]:
# One panel per polynomial degree, laid out on a grid with `ncols` columns.
ncols = 4
nrows = int(np.ceil(len(powers) / ncols))

plt.figure(figsize=(18, 3.5 * nrows))

# Common y-limits: the range of the true curve padded by half its span.
yrange = y_true.max() - y_true.min()

# zip() has no length, so tqdm needs the total to show a proper progress bar.
panels = tqdm(zip(preds, powers), total=min(len(preds), len(powers)))
for i, (pred, power) in enumerate(panels, 1):
  plt.subplot(nrows, ncols, i)
  # Show at most 200 randomly chosen fits; capping at len(pred) keeps
  # sampling without replacement valid when fewer fits are available.
  num_shown = min(200, len(pred))
  shown = np.random.choice(len(pred), size=num_shown, replace=False)
  # A 2-D y is drawn as one line per column, so all sampled curves are
  # plotted in a single call instead of a Python-level loop.
  plt.plot(x_true, pred[shown].T, linewidth=0.05, c='b')
  plt.plot(x_true, y_true, linewidth=3, label='Truth', c='r')
  plt.ylim(y_true.min() - 0.5 * yrange, y_true.max() + 0.5 * yrange)
  plt.title('power = {}'.format(power))
  plt.legend();

In [None]:
# Bias^2 and variance, each averaged over its second axis, against the degree.
for curve_label, per_point in (('bias^2', bias2), ('variance', variance)):
  plt.plot(powers, per_point.mean(axis=1), label=curve_label)
plt.legend()
plt.yscale('log')
plt.xlabel('power');