# Mathematics for Machine Learning

## Linear Algebra

### Gaussian distribution

\begin{equation}
f(x) = \frac{1}{\sigma \sqrt{2 \pi}} e^{-\frac{(x - \mu)^2}{2 \sigma^2}}
\end{equation}

\begin{equation}
\mathbf{p} = 
  \begin{bmatrix}
    \mu \\
    \sigma
  \end{bmatrix}
\end{equation}

### Inverse Matrix

In [3]:
import numpy as np

# Coefficient matrix and right-hand side of the linear system A r = s.
A = [[4, 6, 2], [3, 4, 1], [2, 8, 13]]
s = [9, 7, 2]

# Explicit inverse of A, printed for illustration.
Ainv = np.linalg.inv(A)
print(Ainv)

# Solve A r = s directly, without going through the inverse.
r = np.linalg.solve(A, s)
print(r)

[[-3.14285714  4.42857143  0.14285714]
 [ 2.64285714 -3.42857143 -0.14285714]
 [-1.14285714  1.42857143  0.14285714]]
[ 3.  -0.5  0. ]


### Eigenvalues & Eigenvectors

In [2]:
import numpy as np

# Matrix whose eigenvalues and eigenvectors we want.
M = np.array([[4.0, -5.0, 6.0],
              [7.0, -8.0, 6.0],
              [1.5, -0.5, -2.0]])

# np.linalg.eig gives the eigenvalues together with a matrix whose
# columns (not rows) are the matching eigenvectors.
vals, vecs = np.linalg.eig(M)

print('Eigenvalues:\n', vals)
print('Eigenvectors:\n', vecs)

Eigenvalues:
 [ 1. -4. -3.]
Eigenvectors:
 [[-0.6882472  -0.66666667  0.40824829]
 [-0.6882472  -0.66666667 -0.40824829]
 [-0.22941573  0.33333333 -0.81649658]]


## Multivariate Calculus

### Artificial Neural Networks

In [8]:
import numpy as np

# Network state: activation function, weight and bias of the single neuron.
σ = np.tanh
w1, b1 = -5, 5

def a1(a0):
    """Return the neuron's activation σ(w1 * a0 + b1) for the input a0."""
    z = w1 * a0 + b1  # weighted input
    return σ(z)

# Try the network out: change the argument below to 0 or 1.
print(a1(0))

0.9999092042625951


In [7]:
import numpy as np

# Network parameters: tanh activation, 2x3 weight matrix and 2-element bias.
sigma = np.tanh
W = np.array([
    [-2, 4, -1],
    [6, 0, -3],
])
b = np.array([0.1, -2.5])

# Input vector fed into the layer.
x = np.array([0.3, 0.4, 0.1])

# Weighted input of each output neuron, followed by its activation.
z = W.dot(x) + b
a1 = np.array(sigma(z))
print(a1)

[ 0.76159416 -0.76159416]


In [6]:
import numpy as np

# Network state: tanh activation with a single weight and bias.
σ = np.tanh
w1, b1 = 1.3, -0.1

def a1(a0):
    """Return the neuron's activation σ(w1 * a0 + b1) for the input a0."""
    return σ(w1 * a0 + b1)

# Experiment with different values of x below.
x = 0
print(a1(x))

-0.09966799462495582


In [4]:
import numpy as np

# Activation function of the neuron.
sigma = np.tanh

def a1(w1, b1, a0):
    """Feed-forward equation: activation sigma(w1 * a0 + b1) of one neuron."""
    return sigma(w1 * a0 + b1)

def C(w1, b1, x, y):
    """Cost of a single training pair: squared difference between the
    network output for x and the training output y."""
    residual = a1(w1, b1, x) - y
    return residual**2

def _dCdz(w1, b1, x, y):
    """Chain-rule factor shared by both gradients: dC/da * da/dz."""
    z = w1 * x + b1
    dCda = 2 * (a1(w1, b1, x) - y)  # derivative of cost w.r.t. activation
    dadz = 1/np.cosh(z)**2          # derivative of tanh: 1 / cosh(z)^2
    return dCda * dadz

def dCdw(w1, b1, x, y):
    """Derivative of the cost with respect to the weight w1."""
    dzdw = x  # z = w1 * x + b1, so dz/dw1 = x
    return _dCdz(w1, b1, x, y) * dzdw

def dCdb(w1, b1, x, y):
    """Derivative of the cost with respect to the bias b1."""
    dzdb = 1  # z = w1 * x + b1, so dz/db1 = 1
    return _dCdz(w1, b1, x, y) * dzdb

# Quick check: start from an unfit weight and bias ...
w1 = 2.3
b1 = -1.2
# ... and a single training pair (x, y).
x = 0
y = 1
# How the cost changes in proportion to a small change in the bias.
print(dCdb(w1, b1, x, y))

-1.1186026425530913


In [5]:
import numpy as np

# Activation function of the layer.
sigma = np.tanh

# Randomly chosen initial parameters: 2x3 weight matrix and 2-element bias.
W = np.array([
    [-0.94529712, -0.2667356, -0.91219181],
    [2.05529992, 1.21797092, 0.22914497],
])
b = np.array([0.61273249, 1.6422662])

def a1(a0):
    """Feed-forward step: activation of the output layer for input a0.

    Same form as the single-neuron case, except that the weights act through
    a matrix product (`@`) instead of scalar multiplication (`*`).
    """
    return sigma(W @ a0 + b)

# One training example: input x with desired output y.
x = np.array([0.7, 0.6, 0.2])
y = np.array([0.9, 0.6])

# Cost: squared length of the difference between observed and expected
# activation.
d = a1(x) - y
C = d @ d
print(C)

1.7788340952508737


### Newton-Raphson

In [9]:
import numpy as np
import pandas as pd

def f(x):
    """Polynomial whose root the iteration below searches for."""
    return x**6/6 - 3*x**4 - 2*x**3/3 + 27*x**2/2 + 18*x - 30

def d_f(x):
    """Derivative of f with respect to x."""
    return x**5 - 12*x**3 - 2*x**2 + 27*x + 18

# Starting guess for the iteration.
x = 1.99

# Record the starting point, then 20 Newton-Raphson updates
# x <- x - f(x) / f'(x), keeping every iterate and its function value.
d = {"x": [x], "f(x)": [f(x)]}
for step in range(20):
    x -= f(x) / d_f(x)
    d["x"].append(x)
    d["f(x)"].append(f(x))

pd.DataFrame(d, columns=['x', 'f(x)'])

Unnamed: 0,x,f(x)
0,1.99,17.33108
1,-36.474613,387197500.0
2,-30.422744,129602200.0
3,-25.384916,43370120.0
4,-21.193182,14508480.0
5,-17.707798,4851113.0
6,-14.812568,1620886.0
7,-12.410978,541017.3
8,-10.423021,180299.3
9,-8.7826,59942.79


In [11]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.optimize` has been loaded on every SciPy version.
from scipy import optimize

def f(x):
    """Polynomial whose root is sought."""
    return x**6/6 - 3*x**4 - 2*x**3/3 + 27*x**2/2 + 18*x - 30

# Initial guess. No derivative is supplied, so optimize.newton uses the
# secant method rather than Newton-Raphson proper.
x0 = 3.1
root = optimize.newton(f, x0)
root

1.063070629709697

### Lagrange Multipliers

In [12]:
import warnings

import numpy as np
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.optimize` has been loaded on every SciPy version.
from scipy import optimize

# First we define the functions: f is the objective and g the constraint
# (the constraint is satisfied where g(x, y) = 0).
def f(x, y):
    """Objective function exp(-(2x^2 + y^2 - xy) / 2)."""
    return np.exp(-(2*x*x + y*y - x*y) / 2)

def g(x, y):
    """Constraint function x^2 + 3(y + 1)^2 - 1."""
    return x*x + 3*(y+1)**2 - 1

# Next their derivatives,
def dfdx(x, y):
    """Partial derivative of f with respect to x."""
    return 1/2 * (-4*x + y) * f(x, y)

def dfdy(x, y):
    """Partial derivative of f with respect to y."""
    return 1/2 * (x - 2*y) * f(x, y)

def dgdx(x, y):
    """Partial derivative of g with respect to x."""
    return 2 * x

def dgdy(x, y):
    """Partial derivative of g with respect to y."""
    return 6 * (y + 1)

def DL(xyλ):
    """Gradient of the Lagrangian L = f - λ g at the packed point [x, y, λ].

    Returns the array [dL/dx, dL/dy, dL/dλ]; a root of this function is a
    stationary point of f on the constraint g = 0.
    """
    [x, y, λ] = xyλ
    return np.array([
        dfdx(x, y) - λ * dgdx(x, y),
        dfdy(x, y) - λ * dgdy(x, y),
        - g(x, y)
    ])

# Initial guess for (x, y, λ).
(x0, y0, λ0) = (1, -1, 0)
solution = optimize.root(DL, [x0, y0, λ0])
# Do not silently report a point the solver failed to converge to.
if not solution.success:
    warnings.warn("optimize.root did not converge: %s" % solution.message)
x, y, λ = solution.x
print("x = %g" % x)
print("y = %g" % y)
print("λ = %g" % λ)
print("f(x, y) = %g" % f(x, y))

x = 0.930942
y = -1.21083
λ = -0.152319
f(x, y) = 0.114944


In [13]:
# Import libraries
import warnings

import numpy as np
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.optimize` has been loaded on every SciPy version.
from scipy import optimize

# First we define the functions: f is the objective and g the constraint
# (the constraint is satisfied where g(x, y) = 0).
def f(x, y):
    """Objective function -exp(x - y^2 + xy)."""
    return -np.exp(x - y**2 + x * y)

def g(x, y):
    """Constraint function cosh(y) + x - 2."""
    return np.cosh(y) + x - 2

# Next their derivatives.
def dfdx(x, y):
    """Partial derivative of f with respect to x."""
    return (1 + y) * f(x, y)

def dfdy(x, y):
    """Partial derivative of f with respect to y."""
    return (x - 2 * y) * f(x, y)

def dgdx(x, y):
    """Partial derivative of g with respect to x."""
    return 1

def dgdy(x, y):
    """Partial derivative of g with respect to y."""
    return np.sinh(y)

def DL(xyλ):
    """Gradient of the Lagrangian L = f - λ g at the packed point [x, y, λ].

    Returns the array [dL/dx, dL/dy, dL/dλ]; a root of this function is a
    stationary point of f on the constraint g = 0.
    """
    [x, y, λ] = xyλ
    return np.array([
        dfdx(x, y) - λ * dgdx(x, y),
        dfdy(x, y) - λ * dgdy(x, y),
        - g(x, y)
    ])

# Solve the Lagrange multiplier problem with optimize.root, setting
# x, y and λ to the values of the solution.
(x0, y0, λ0) = (0, 0, 0)
solution = optimize.root(DL, [x0, y0, λ0])
# Do not silently report a point the solver failed to converge to.
if not solution.success:
    warnings.warn("optimize.root did not converge: %s" % solution.message)
x, y, λ = solution.x

print("x = %g" % x)
print("y = %g" % y)
print("λ = %g" % λ)
print("f(x, y) = %g" % f(x, y))

x = 0.957782
y = 0.289565
λ = -4.07789
f(x, y) = -3.16222


### Linear Regression

In [None]:
import numpy as np

def linfit(xdat, ydat):
    """Least-squares straight-line fit y = m * x + c.

    Returns the gradient and intercept as the list [m, c].
    """
    # Means of the two data sets.
    xbar = np.sum(xdat) / len(xdat)
    ybar = np.sum(ydat) / len(ydat)

    # Gradient from the centred x values; the intercept follows because the
    # fitted line passes through (xbar, ybar).
    x_centred = xdat - xbar
    m = np.sum(x_centred * ydat) / np.sum(x_centred**2)
    c = ybar - m * xbar
    return [m, c]

In [None]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.stats` has been loaded on every SciPy version.
from scipy import stats

# Use the stats.linregress() method to evaluate regression
# NOTE(review): xdat and ydat are not defined in this cell; they must already
# exist in the kernel when it runs - confirm where they are meant to come from.
regression = stats.linregress(x=xdat, y=ydat)