Part 1 Supervised learning - Chapter 1 Linear regression

Mix.install(
  [
    {:nx, "~> 0.2"},
    {:nimble_csv, "~> 1.2"},
    {:kino, "~> 0.8"},
    {:kino_vega_lite, "~> 0.1"},
    {:exla, "~> 0.2"}
  ],
  config: [
    nx: [default_backend: EXLA.Backend]
  ]
)

A helper module for parsing CSV

defmodule Dataset do
  # Fetches a CSV over HTTP and parses it. When `headers` is a list of
  # column names, returns a list of maps keyed by those columns;
  # otherwise returns the raw rows.
  def read_csv(url, headers \\ false) do
    {:ok, _} = Application.ensure_all_started(:inets)
    {:ok, _} = Application.ensure_all_started(:ssl)

    [hs | data] =
      url
      |> :httpc.request()
      |> then(fn
        {:ok, {_, _, body}} ->
          IO.iodata_to_binary(body)
      end)
      |> NimbleCSV.RFC4180.parse_string(skip_headers: false)

    if headers do
      data
      |> Enum.map(fn row ->
        Enum.zip(hs, row)
        |> Map.new()
        |> Map.take(headers)
      end)
    else
      data
    end
  end
end

We load the Ames housing dataset, keeping only the sales from 2010

url = "https://www.openintro.org/stat/data/ames.csv"

dataset =
  Dataset.read_csv(url, ["Gr.Liv.Area", "SalePrice", "Yr.Sold", "MS.SubClass", "Bedroom.AbvGr"])
  |> Enum.filter(fn d -> d["Yr.Sold"] == "2010" end)

Let's plot the data

alias VegaLite, as: Vl

Vl.new(width: 500, height: 400)
|> Vl.mark(:point)
|> Vl.data_from_values(dataset)
|> Vl.encode_field(:y, "SalePrice", type: :quantitative)
|> Vl.encode_field(:x, "['Gr.Liv.Area']", type: :quantitative, scale: [zero: false])

Notation

features

$$ x^{(i)} $$

target

$$ y^{(i)} $$

training example

$$ (x^{(i)},y^{(i)}) $$

training set

$$ \{(x^{(i)},y^{(i)});\,i=1,\dots,n\} $$

hypothesis

graph TD;
    A(Training set)-->B(Learning algorithm)
    subgraph hypothesis
        direction LR
        D(x)-->h
        h-->E(predicted y)
    end
    B-->hypothesis

When the target variable that we're trying to predict is continuous, we call the learning problem a regression problem.

When y can take on only a small number of discrete values, we call it a classification problem.

parameters (also called weights)

$$ \theta_i $$

approximate y as a linear function of x (with the convention that $x_0 = 1$, so that $\theta_0$ is the intercept term)

$$ h(x) = \sum_{i=0}^{d}\theta_ix_i = \theta^Tx $$

cost function

$$ J(\theta) = \dfrac{1}{2}\sum_{i=1}^{n}(h_\theta(x^{(i)})-y^{(i)})^2 $$

If you've seen linear regression before, you may recognize this as the familiar least-squares cost function that gives rise to the ordinary least squares regression model.
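
As a quick one-off sketch in Nx, for the scalar model $h(x)=mx+b$ used later in this notebook (the training loop further below uses the mean instead of the 1/2-sum, which only rescales the gradient by a constant):

# Least-squares cost J for h(x) = m*x + b; m and b are scalars,
# x and y are tensors of inputs and targets.
cost = fn m, b, x, y ->
  x
  |> Nx.multiply(m)
  |> Nx.add(b)
  |> Nx.subtract(y)
  |> Nx.pow(2)
  |> Nx.sum()
  |> Nx.multiply(0.5)
end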

gradient descent

$$ \theta_j:=\theta_j-\alpha\dfrac{\partial}{\partial\theta_j}J(\theta) $$

Here, $\alpha$ is called the learning rate.
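
Working the partial derivative out for a single training example $(x^{(i)},y^{(i)})$ gives the LMS (Widrow-Hoff) update rule:

$$ \theta_j:=\theta_j+\alpha(y^{(i)}-h_\theta(x^{(i)}))x_j^{(i)} $$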

batch gradient descent

Looks at every example in the entire training set on every step.

stochastic gradient descent (also incremental gradient descent)

Each time we encounter a training example, we update the parameters according to the gradient of the error with respect to that single training example only.
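
A plain-Elixir sketch of one stochastic pass for the scalar model $h(x)=mx+b$; here `examples` (a list of `{x, y}` number pairs) and the starting values are illustrative, not defined elsewhere in this notebook:

alpha = 1.0e-9

{m, b} =
  Enum.reduce(examples, {0.0, 0.0}, fn {x, y}, {m, b} ->
    err = m * x + b - y
    # Gradient of (1/2)(h(x) - y)^2: err * x with respect to m, err with respect to b.
    {m - alpha * err * x, b - alpha * err}
  end)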

Matrix derivatives

For a function $f$ : $\mathbb{R}^{n\times d}\mapsto\mathbb{R}$ mapping from n-by-d matrices to the real numbers, we define the derivative of $f$ with respect to $A$ to be:

$$ \nabla_Af(A) = \begin{bmatrix} \dfrac{\partial f}{\partial A_{11}} & \dots & \dfrac{\partial f}{\partial A_{1d}}\\ \vdots & \ddots & \vdots\\ \dfrac{\partial f}{\partial A_{n1}} & \dots & \dfrac{\partial f}{\partial A_{nd}} \end{bmatrix} $$

Thus, the gradient $\nabla_Af(A)$ is itself an n-by-d matrix, whose $(i, j)$-element is $\dfrac{\partial f}{\partial A_{ij}}$.
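
For example, if $f:\mathbb{R}^{2\times2}\mapsto\mathbb{R}$ is given by $f(A)=\frac{3}{2}A_{11}+5A_{12}^{2}+A_{21}A_{22}$, then

$$ \nabla_Af(A) = \begin{bmatrix} \frac{3}{2} & 10A_{12}\\ A_{22} & A_{21} \end{bmatrix} $$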

Least squares revisited

Given a training set, define the design matrix $X$ to be the n-by-d matrix that contains the training examples' input values in its rows:

$$ X = \begin{bmatrix} \textit{---}&(x^{(1)})^T&\textit{---}\\ \textit{---}&(x^{(2)})^T&\textit{---}\\ &\vdots\\ \textit{---}&(x^{(n)})^T&\textit{---}\\ \end{bmatrix} $$

Also, let $\vec{y}$ be the n-dimensional vector containing all the target values from the training set:

$$ \vec{y} = \begin{bmatrix} y^{(1)}\\ y^{(2)}\\ \vdots\\ y^{(n)}\\ \end{bmatrix} $$

The value of $\theta$ that minimizes $J(\theta)$ is given in closed form by the equation

$$ \theta=(X^TX)^{-1}X^T\vec{y} $$
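
We can check this closed form directly with Nx. A minimal sketch on the `dataset` parsed above, fitting SalePrice against Gr.Liv.Area with an intercept column of ones:

{xs, ys} =
  dataset
  |> Enum.map(fn d ->
    {String.to_integer(d["Gr.Liv.Area"]), String.to_integer(d["SalePrice"])}
  end)
  |> Enum.unzip()

# Design matrix X: each row is [1, area], so theta is {intercept, slope}.
x = Nx.tensor(Enum.map(xs, &[1.0, &1]), type: :f64)
y = Nx.tensor(ys, type: :f64)
xt = Nx.transpose(x)

# Solve (X^T X) theta = X^T y rather than inverting X^T X explicitly.
theta = Nx.LinAlg.solve(Nx.dot(xt, x), Nx.dot(xt, y))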

Probabilistic interpretation

Given $X$ and $\theta$, what is the distribution of the $y^{(i)}$'s? The probability of the data is given by $p(\vec{y}|X;\theta)$. This quantity is typically viewed as a function of $\vec{y}$ (and perhaps $X$), for a fixed value of $\theta$. When we wish to explicitly view this as a function of $\theta$, we will instead call it the likelihood function:

$$ L(\theta)=L(\theta;X,\vec{y})=p(\vec{y}|X;\theta) $$

maximum likelihood

Choose $\theta$ so as to make the data as probable as possible.

log likelihood

$$ \ell(\theta) = \log L(\theta) $$
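
If we assume $y^{(i)}=\theta^Tx^{(i)}+\epsilon^{(i)}$ with the $\epsilon^{(i)}$ i.i.d. $\sim\mathcal{N}(0,\sigma^{2})$, the log likelihood works out to

$$ \ell(\theta)=\log\prod_{i=1}^{n}\dfrac{1}{\sqrt{2\pi}\sigma}\exp\left(-\dfrac{(y^{(i)}-\theta^Tx^{(i)})^{2}}{2\sigma^{2}}\right)=n\log\dfrac{1}{\sqrt{2\pi}\sigma}-\dfrac{1}{\sigma^{2}}\cdot\dfrac{1}{2}\sum_{i=1}^{n}(y^{(i)}-\theta^Tx^{(i)})^{2} $$

Hence maximizing $\ell(\theta)$ gives the same $\theta$ as minimizing the least-squares cost $J(\theta)$: under these Gaussian noise assumptions, least squares is maximum likelihood.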

Locally weighted linear regression

In locally weighted linear regression we fit $\theta$ to minimize $\sum_i w^{(i)}(y^{(i)}-\theta^Tx^{(i)})^2$, where the $w^{(i)}$'s are non-negative valued weights.

A fairly standard choice for the weights is

$$ w^{(i)}=\exp\left(-\dfrac{(x^{(i)}-x)^{2}}{2\tau^{2}}\right) $$

The parameter $\tau$ controls how quickly the weight of a training example falls off with the distance of its $x^{(i)}$ from the query point $x$; $\tau$ is called the bandwidth parameter.
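
A quick Nx sketch of these weights for a single query point over the living areas parsed above; `tau` and `x_query` are illustrative values:

tau = 250.0
x_query = 1500.0

areas = Nx.tensor(Enum.map(dataset, &String.to_integer(&1["Gr.Liv.Area"])), type: :f64)

# w_i = exp(-(x_i - x)^2 / (2 * tau^2)); largest near the query point.
weights =
  areas
  |> Nx.subtract(x_query)
  |> Nx.pow(2)
  |> Nx.divide(2 * tau * tau)
  |> Nx.negate()
  |> Nx.exp()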

Locally weighted linear regression is a non-parametric algorithm.

Practice: batch gradient descent

defmodule LinReg do
  import Nx.Defn

  # Learning rate; tiny because the raw feature values are large.
  @alpha 1.0e-9

  # The hypothesis h(x) = m*x + b.
  defn predict({m, b}, x) do
    m * x + b
  end

  # Mean squared error. (Nx.power was renamed to Nx.pow in Nx 0.5,
  # which Nx.Random below requires anyway.)
  defn loss(params, x, y) do
    y_pred = predict(params, x)
    Nx.mean(Nx.pow(y - y_pred, 2))
  end

  # One gradient descent step on a batch.
  defn update({m, b} = params, inp, tar) do
    {grad_m, grad_b} = grad(params, &loss(&1, inp, tar))

    {
      m - grad_m * @alpha,
      b - grad_b * @alpha
    }
  end

  # Draw small random initial values for m and b.
  defn init_random_params(key) do
    {m, key} = Nx.Random.normal(key, 0.0, 0.1)
    {b, key} = Nx.Random.normal(key, 0.0, 0.1)
    {{m, b}, key}
  end

  # Lazily emits the parameters after each epoch, so callers can
  # observe training as it progresses.
  def train(epochs, data, key \\ Nx.Random.key(42)) do
    {init_params, _key} = init_random_params(key)

    Stream.resource(
      fn -> {1, init_params} end,
      fn {epoch, cur_params} ->
        if epoch > epochs do
          {:halt, {epoch, cur_params}}
        else
          params =
            data
            # Process at most 200 batches per epoch.
            |> Enum.take(200)
            |> Enum.reduce(
              cur_params,
              fn batch, cur_params ->
                {inp, tar} = Enum.unzip(batch)
                x = Nx.tensor(inp)
                y = Nx.tensor(tar)

                # Uncomment to watch the loss fall during training:
                # IO.puts("epoch: #{epoch}, loss: #{Nx.to_number(loss(cur_params, x, y))}")

                update(cur_params, x, y)
              end
            )

          {[params], {epoch + 1, params}}
        end
      end,
      fn _ -> :ok end
    )
  end
end

data =
  dataset
  |> Enum.map(fn d ->
    {d["Gr.Liv.Area"] |> String.to_integer(), d["SalePrice"] |> String.to_integer()}
  end)

IO.inspect(length(data))
alias VegaLite, as: Vl

# Scatter the examples, plus an empty "outputs" layer we stream the
# current fitted line into during training.
widget =
  Vl.new(width: 500, height: 400)
  |> Vl.datasets_from_values(examples: dataset, outputs: [])
  |> Vl.layers([
    Vl.new()
    |> Vl.mark(:point)
    |> Vl.data(name: "examples")
    |> Vl.encode_field(:y, "SalePrice", type: :quantitative)
    |> Vl.encode_field(:x, "['Gr.Liv.Area']", type: :quantitative, scale: [zero: false]),
    Vl.new()
    |> Vl.data(name: "outputs")
    |> Vl.mark(:line)
    |> Vl.encode_field(:y, "SalePrice", type: :quantitative)
    |> Vl.encode_field(:x, "['Gr.Liv.Area']", type: :quantitative, scale: [zero: false])
  ])
  |> Kino.VegaLite.new()
  |> Kino.render()

# Train for 1000 epochs; redraw the fitted line every 10th epoch,
# throttled to one frame per 100 ms.
LinReg.train(1000, [data])
|> Stream.take_every(10)
|> Stream.zip(Stream.interval(100))
|> Enum.each(fn {{m, b}, _} ->
  widget
  |> Kino.VegaLite.clear(dataset: "outputs")

  widget
  |> Kino.VegaLite.push_many(
    Enum.map(data, fn {x, _} ->
      y = Nx.multiply(m, x) |> Nx.add(b) |> Nx.to_number()
      %{"SalePrice" => y, "Gr.Liv.Area" => x}
    end),
    dataset: "outputs"
  )
end)

:ok