Logs
- [2024/02/11]    
  A copy of chapter 8 from (Grus, 2019)

In [17]:
import numpy as np
import matplotlib.pyplot as plt

from scratch.linear_algebra import LinearAlgebra as la
from scratch.linear_algebra import Vector
from typing import Callable, TypeVar, List, Iterator

In [None]:
%load_ext autoreload
%autoreload 2 

## The Idea Behing Gradient Descent

We have to define a function that has a vector input and turns into a scalar.

[Plot function 3d using plotly]

## Estimating the Gradient

$$
  f'(x) = \lim_{h \rightarrow 0} \frac{f(x + h) - f(x)}{h}
$$

In [None]:
def difference_quotient(f: Callable[[float], float], x: float, h:float) -> float:
  return (f(x + h) - f(x)) / h

In [5]:
def partial_difference_quotient(f: Callable[[Vector], float],
                                v: Vector,
                                i: int,
                                h: float) -> float:
  """Returns the i-th partial difference quotient of f at v"""
  w = [v_j + (h if j == i else 0)     # add h to just the ith element of v
       for j, v_j in enumerate(v)]

  return (f(w) - f(v)) / h

In [6]:
def estimate_gradient(f: Callable[[Vector], float],
                      v: Vector,
                      h: float = 0.0001) -> Vector:
  return [partial_difference_quotient(f, v, i, h) for i in range(len(v))]

Estimating gradient using difference quotient is very slow.  
It is better to provide an explicit formula for the derivative instead calculating  
from the difference quotient

## Using the Gradient

In [7]:
def gradient_step(v: Vector, gradient: Vector, step_size: float) -> Vector:
  """Moves `step_size` in the `gradient` direction from `v"""
  assert len(v) == len(gradient)
  step = la.scalar_multiply(step_size, gradient)

  return la.add(v, step)

def sum_of_squares_gradient(v: Vector) -> Vector:
  return [2 * v_i for v_i in v]

In [10]:
# pick a random starting point
seed = 24_02_11
rng = np.random.default_rng(seed)
v = np.random.uniform(-10, 10, size=3) 

for epoch in range(1_000):
  grad = sum_of_squares_gradient(v)       # compute the gradient at v
  v = gradient_step(v, grad, -0.01)
  print(epoch, v)

assert la.distance(v, [0, 0, 0]) < 0.001


0 [7.900097855036686, 4.168918412932111, 9.287024305369352]
1 [7.742095897935952, 4.085540044673469, 9.101283819261965]
2 [7.587253979977233, 4.00382924378, 8.919258142876727]
3 [7.435508900377688, 3.9237526589044003, 8.740872980019192]
4 [7.2867987223701345, 3.845277605726312, 8.566055520418809]
5 [7.1410627479227315, 3.768372053611786, 8.394734410010432]
6 [6.998241492964277, 3.6930046125395504, 8.226839721810224]
7 [6.858276663104991, 3.6191445202887595, 8.06230292737402]
8 [6.721111129842892, 3.5467616298829845, 7.90105686882654]
9 [6.5866889072460335, 3.4758263972853247, 7.743035731450009]
10 [6.454955129101113, 3.4063098693396183, 7.588175016821009]
11 [6.325856026519091, 3.338183671952826, 7.436411516484589]
12 [6.199338905988709, 3.2714199985137697, 7.2876832861548975]
13 [6.075352127868935, 3.2059915985434944, 7.1419296204317995]
14 [5.953845085311556, 3.1418717665726246, 6.999091028023163]
15 [5.834768183605325, 3.079034331241172, 6.8591092074627]
16 [5.7180728199332185, 3.01

## Choosing the Right Step Size

It is not a science but rather an art to choose the right step size.
The following are the popular options:
- Using a fixe step size
- Gradually shrinking the step size over time
- At each step, choosing the step size that minimizes the value of the   
  objective function

## Using Gradient to Fit Models

_loss function_ is a function to measure how well the model fits our data.

In [11]:
# x ranges from -50 to 49, y is always 20 * x  + 5
inputs = [(x, 20 * x + 5) for x in range(-50, 50)]
inputs

[(-50, -995),
 (-49, -975),
 (-48, -955),
 (-47, -935),
 (-46, -915),
 (-45, -895),
 (-44, -875),
 (-43, -855),
 (-42, -835),
 (-41, -815),
 (-40, -795),
 (-39, -775),
 (-38, -755),
 (-37, -735),
 (-36, -715),
 (-35, -695),
 (-34, -675),
 (-33, -655),
 (-32, -635),
 (-31, -615),
 (-30, -595),
 (-29, -575),
 (-28, -555),
 (-27, -535),
 (-26, -515),
 (-25, -495),
 (-24, -475),
 (-23, -455),
 (-22, -435),
 (-21, -415),
 (-20, -395),
 (-19, -375),
 (-18, -355),
 (-17, -335),
 (-16, -315),
 (-15, -295),
 (-14, -275),
 (-13, -255),
 (-12, -235),
 (-11, -215),
 (-10, -195),
 (-9, -175),
 (-8, -155),
 (-7, -135),
 (-6, -115),
 (-5, -95),
 (-4, -75),
 (-3, -55),
 (-2, -35),
 (-1, -15),
 (0, 5),
 (1, 25),
 (2, 45),
 (3, 65),
 (4, 85),
 (5, 105),
 (6, 125),
 (7, 145),
 (8, 165),
 (9, 185),
 (10, 205),
 (11, 225),
 (12, 245),
 (13, 265),
 (14, 285),
 (15, 305),
 (16, 325),
 (17, 345),
 (18, 365),
 (19, 385),
 (20, 405),
 (21, 425),
 (22, 445),
 (23, 465),
 (24, 485),
 (25, 505),
 (26, 525),
 (27, 

$$
\begin{align*}
  \textrm{error} = \textrm{predicted} - y = \textrm{slope} \cdot x + \textrm{intercept}
\end{align*}
$$

$$
\begin{align*}
  \textrm{squared error} &= \left( \textrm{predicted} - y \right)^2 \\
  \nabla\left( \textrm{squared error} \right)
    &= \left[\frac{\partial(\textrm{squared error})}{\partial (\textrm{slope})}, 
             \frac{\partial(\textrm{squared error})}{\partial (\textrm{intercept})}\right] \\
    &= \left[- 2\, (\textrm{predicted} - y) \cdot x, -2 \,(\textrm{predicted} - y) \right] \\
    &= \left[- 2\,\textrm{error} \cdot x, -2 \textrm{error} \right]
\end{align*}
$$

In [14]:
def linear_gradient(x: float, y: float, theta: Vector) -> Vector:
  slope, intercept = theta
  predicted = slope * x + intercept   # the prediction of the model
  error = predicted - y               # error is (predicted - actual)
  squared_error = error**2             # we'll minimize squared erro
  grad = [-2 * error * x, -2 * error]   # using its gradient
  return grad

In [16]:
# Start with random values for slope and intercept
seed = 24_02_11
rng = np.random.default_rng(seed)
theta = [rng.uniform(-1, 1), rng.uniform(-1, 1)]

learning_rate = 0.001

for epoch in range(5_000):
  # Compute the mean of the gradients
  grad = la.vector_mean([linear_gradient(x, y, theta) for x, y in inputs])

  # Take a step in that direction
  theta = gradient_step(theta, grad, learning_rate)
  print(epoch, theta)


slope, intercept = theta
assert 19.9 < slope < 20.1, "slope should be about 20"
assert 4.9 < intercept < 5.1, "intercept should be about 5"

0 [33.234627342714056, 0.6928780680840432]
1 [11.168196440477796, 0.7147269392905891]
2 [25.886527701140597, 0.7144656818524856]
3 [16.069400489021074, 0.7289232781899212]
4 [22.617438797101137, 0.7335348321225625]
5 [18.249901857165664, 0.7446852012554185]
6 [21.163060146471757, 0.7514457327100733]
7 [19.219990328036047, 0.7611059013911249]
8 [20.516027557101346, 0.7688036799163787]
9 [19.65157842309332, 0.7777821001136472]
10 [20.22817497389687, 0.7858781143365132]
11 [19.843593170525125, 0.794534533081737]
12 [20.100117889792823, 0.8027890571860986]
13 [19.929024156565372, 0.8112835969615192]
14 [20.043152171167858, 0.8195900539241615]
15 [19.96703709188496, 0.8279940259874811]
16 [20.017814253738717, 0.836305075027391]
17 [19.983954197831302, 0.8446502791310749]
18 [20.006547200325652, 0.8529449327706441]
19 [19.99148596231556, 0.8612455901054284]
20 [20.001540108725624, 0.8695145848875331]
21 [19.994842262064896, 0.8777770958264837]
22 [19.99931798829854, 0.8860163838968956]
23 [1

## Minibatch and Stochastic Gradient Descent

Minibatch gradiend descent

In [18]:
T = TypeVar('T')      # this allows us to type "generic" functions

def minibatches(dataset: List[T],
                batch_size: int,
                shuffle: bool = True) -> Iterator[List[T]]:
  """Generates `batch_size`-sized minibatches from the dataset"""
  # start indexes 0, batch_size, 2 * batch_size, ...
  batch_starts = [start for start in range(0, len(dataset), batch_size)]

  if shuffle: 
    rng = np.random.default_rng()
    rng.shuffle(batch_starts)       # shuffle the batches

  for start in batch_starts:
    end = start + batch_size
    yield dataset[start:end]

In [20]:
seed = 24_02_11
rng = np.random.default_rng(seed)
theta = [rng.uniform(-1, 1), rng.uniform(-1, 1)]

for epoch in range(1_000):
  for batch in minibatches(inputs, batch_size=20):
    grad = la.vector_mean([linear_gradient(x, y, theta) for x, y in batch])
    theta = gradient_step(theta, grad, learning_rate)
  print(epoch, theta)

slope, intercept = theta
assert 19.9 < slope < 20.1, "slope should be about 20"
assert 4.9 < intercept < 5.1, "intercept should be about 5"

0 [18.446246140636635, 0.048951193679595106]
1 [19.95650359378635, 0.1747656345010629]
2 [19.525219198740032, 0.2198798591327645]
3 [20.928402854985094, 0.28698818102436546]
4 [20.075095367576676, 0.5288776045945229]
5 [20.288236515606695, 0.5628682942420712]
6 [19.970986196341094, 0.592633772067263]
7 [18.997538391185035, 0.6433022578781783]
8 [19.92419750328032, 0.7944609605976578]
9 [20.006088212180238, 0.8211359374687802]
10 [19.836450370584796, 1.0025215100215081]
11 [19.5189775930602, 1.039372843648976]
12 [19.829621755075834, 1.0249730502311987]
13 [19.81887890001573, 1.2052306427301203]
14 [19.990309649509637, 1.369124060157827]
15 [20.04150434211827, 1.3914199882163485]
16 [19.43348768224138, 1.4463197199263962]
17 [19.992790444355613, 1.470398245595506]
18 [20.123147684811215, 1.5614226370774777]
19 [20.237910104098848, 1.585893699091342]
20 [20.16294865980484, 1.6098180923249303]
21 [19.81351359983358, 1.6215116723372143]
22 [19.856310640737913, 1.6202669798639988]
23 [20.08

Stochastic gradient descent

In [21]:
seed = 24_02_11
rng = np.random.default_rng(seed)
theta = [rng.uniform(-1, 1), rng.uniform(-1, 1)]

for epoch in range(100):
  for x, y in inputs:
    grad = linear_gradient(x, y, theta)
    theta = gradient_step(theta, grad, learning_rate)
  print(epoch, theta)

slope, intercept = theta
assert 19.9 < slope < 20.1, "slope should be about 20"
assert 4.9 < intercept < 5.1, "intercept should be about 5"

0 [20.090137168028185, 0.5136884254133652]
1 [20.086269301565377, 0.7061680865386093]
2 [20.082568032321127, 0.8903896334492326]
3 [20.079025541350113, 1.0667073826922595]
4 [20.07563504690379, 1.2354604367585962]
5 [20.072390012481073, 1.3969733496323076]
6 [20.069284214542737, 1.5515567508047672]
7 [20.06631166070162, 1.699507942471441]
8 [20.063466639148768, 1.8411114712488088]
9 [20.060743680435444, 1.9766396757416647]
10 [20.05813754738781, 2.1063532102474514]
11 [20.05564322758552, 2.2305015460071123]
12 [20.053255933946325, 2.3493234512216645]
13 [20.050971042208346, 2.463047449523784]
14 [20.048784204900613, 2.571892260384638]
15 [20.04669116463509, 2.6760672192280794]
16 [20.044687943975973, 2.7757726801802254]
17 [20.042770640396274, 2.871200401332841]
18 [20.040935641179335, 2.9625339142030485]
19 [20.03917935081003, 3.0499488761893363]
20 [20.037498402889124, 3.1336134072849817]
21 [20.035889573550346, 3.2136884149152753]
22 [20.03434979994502, 3.2903279037809274]
23 [20.03