### Multi-Layer Perceptron using Iris DataSet

<img src="Images/iris-machinelearning.png" width="600" height="300" />


In [3]:
using CSV, Random

iris = CSV.read("iris_data.csv")

# Mixing up the data
shuff = copy(iris)
iris = shuff[shuffle(1:end), :]

Unnamed: 0_level_0,SepalLength,SepalWidth,PetalLength,PetalWidth,Species
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,String
1,5.5,2.4,3.7,1.0,versicolor
2,6.5,3.0,5.8,2.2,virginica
3,6.2,2.8,4.8,1.8,virginica
4,6.7,3.3,5.7,2.5,virginica
5,6.5,3.0,5.2,2.0,virginica
6,5.8,4.0,1.2,0.2,setosa
7,6.8,2.8,4.8,1.4,versicolor
8,5.6,2.5,3.9,1.1,versicolor
9,5.8,2.6,4.0,1.2,versicolor
10,6.9,3.1,5.4,2.1,virginica


In [4]:
# Making empty matrix X to append variable values (width/length, pedal/sepal)
X = zeros(4, 150)

# Making empty matrix Y to hold the value of which species it is
Y = zeros(3, 150)

# Filling in the matrices
for i = 1:150
    for j = 1:4
        X[j, i] = iris[i, j]
        # Determing species ([1,0,0]=setosa, [0,1,0]=versicolor, [0,0,1]=virginica)
        if iris[i, 5] == "setosa"
            Y[1, i] = 1.0
        elseif iris[i, 5] == "versicolor"
            Y[2, i] = 1.0
        else 
            Y[3, i] = 1.0
        end
    end
end

In [5]:
X,Y

([5.5 6.5 … 4.8 5.0; 2.4 3.0 … 3.0 3.2; 3.7 5.8 … 1.4 1.2; 1.0 2.2 … 0.1 0.2], [0.0 0.0 … 1.0 1.0; 1.0 0.0 … 0.0 0.0; 0.0 1.0 … 0.0 0.0])

<img src="Images/irismlp.png" width="500" height="300" />


##### Sigmoid Function 

<img src="Images/Sigmoid.png" width="200" height="200" />


In [6]:
function σ(s)
    1/(1+exp(-s))
end

σ (generic function with 1 method)

##### Derivative of Sigmoid Function

$$\frac{\delta \sigma(x)}{\delta(x)}=\sigma(x) *(1-\sigma(x))$$

In [7]:
function dσ(s)
    σ(s)*(1 - σ(s))
end

dσ (generic function with 1 method)

##### Softmax Function

$$f_{i}(\vec{a})=\frac{e^{a_{i}}}{\sum_{k} e^{a_{k}}}$$

In [8]:
function softmax(a, i)
    exp(a[i]) / (sum(exp(a[j]) for j = 1:length(a)))
end

softmax (generic function with 1 method)

#### Cross Entropy Function 

$$C E=-\sum_{i}^{C} t_{i} \log \left(f(s)_{i}\right)$$

In [9]:
function L(O, y)  
    -sum(y[i]*log(O[i]) for i = 1:length(y))
end

L (generic function with 1 method)

In [10]:
function hadamard(x, y) 
    [x[i]*y[i] for i = 1:length(x)]
end

hadamard (generic function with 1 method)

In [11]:
function forward_propagation(x, y, W, b)
    a1 = copy(x)
    z2 = W[1]*a1 + b[1]
    a2 = σ.(z2)
    
    z3 = W[2]*a2 + b[2]
    a3 = σ.(z3)
    
    z4 = W[3]*a3 + b[3]
    a4 = σ.(z4)
    
    a = [a1, a2, a3, a4]
    z = [[0,0], z2, z3, z4]
    O = [softmax(a4, i) for i = 1:length(a4)]
    loss = L(O, y)
    return a, z, O, loss
end

forward_propagation (generic function with 1 method)

In [12]:
function back_propagation(x, y, W, b)
    a, z, O, loss = forward_propagation(x, y, W, b)
    δ4 = a[4] - y
    δ3 = hadamard(W[3]'*δ4, dσ.(z[3]))
    δ2 = hadamard(W[2]'*δ3, dσ.(z[2]))
    δ = [[0,0], δ2, δ3, δ4]
    return a, δ
end

function ∇L(x, y, W, b)
    
    a, δ = back_propagation(x, y, W, b)
    
    db1 = copy(δ[2])
    db2 = copy(δ[3])
    db3 = copy(δ[4])
    
    dW1 = δ[2]*a[1]'
    dW2 = δ[3]*a[2]'
    dW3 = δ[4]*a[3]'
    return [db1, db2, db3], [dW1, dW2, dW3]
end

function gradient_descent!(x, y, W, b, α)
    db, dW = ∇L(x, y, W, b)
    for i = 1:length(W)
        W[i] -= α*dW[i]
        b[i] -= α*b[i]
    end
end

gradient_descent! (generic function with 1 method)

In [13]:

W1 = rand(5, 4)
W2 = rand(5, 5)
W3 = rand(3, 5)
W = [W1, W2, W3]

3-element Array{Array{Float64,2},1}:
 [0.24672666673585586 0.29185243743599854 0.14735410087481382 0.19440782786300947; 0.03492777714149464 0.9040513700416737 0.8809892864035829 0.33818397032950176; … ; 0.630687986390595 0.48386930601100797 0.6402513197303188 0.5006855418420693; 0.104256976325517 0.9166229895847076 0.49311587757229147 0.2825078120355997]          
 [0.2273318652716596 0.06250367422658099 … 0.8505655293730838 0.25984296440306176; 0.361940889912135 0.22540732163350996 … 0.9664304766107483 0.7897073912607533; … ; 0.7724847512429258 0.025649311659785567 … 0.4057466756232422 0.8539282430985815; 0.3287232431239566 0.40697912795347846 … 0.007787203536470777 0.47902979772001664]
 [0.42849790840037105 0.11780434297214537 … 0.11635181155353447 0.10822145985337372; 0.01959323130623236 0.07832427014401389 … 0.05569854468899793 0.18910865107416974; 0.1446635519771715 0.182815668677756 … 0.7034390406142057 0.8430255438164489]                                                          

In [14]:
b1 = -1*ones(5)
b2 = -1*ones(5)
b3 = -1*ones(3)
b = [b1, b2, b3]

3-element Array{Array{Float64,1},1}:
 [-1.0, -1.0, -1.0, -1.0, -1.0]
 [-1.0, -1.0, -1.0, -1.0, -1.0]
 [-1.0, -1.0, -1.0]            

In [15]:
for _ in 1:1000000
    j = rand(1:50)
    gradient_descent!(X[:,j], Y[:,j], W, b, 0.37) # The last number is alpha which is our step length
end

In [16]:
forward_propagation(X[:,110], Y[:,110], W, b)[3:4]


([0.30484726340794155, 0.34819248656282126, 0.34696025002923736], 1.0549998298164636)

In [17]:
train_data = zeros(4, 100)
train_label = zeros(3, 100)

for i in 1:100
    j = rand(1:3)
    if j == 1
        k = rand(1:50)
        train_data[:, i] = copy(X[:, k])
        train_label[:, i] = copy(Y[:,k])
    elseif j == 2
        k = rand(50:100)
        train_data[:, i] = copy(X[:, k])
        train_label[:, i] = copy(Y[:,k])
    else
        k = rand(100:150)
        train_data[:, i] = copy(X[:, k])
        train_label[:, i] = copy(Y[:,k])
    end
end

In [18]:
function mini_batch_∇L(train_data, train_label, W, b, m)
    
    i = rand(1:100)
    a, δ = back_propagation(train_data[:, i], train_label[:,i], W, b)
    
    db1 = (δ[2])
    db2 = (δ[3])
    db3 = (δ[4])
    
    dW1 = δ[2]*a[1]'
    dW2 = δ[3]*a[2]'
    dW3 = δ[4]*a[3]'
    
    for _ in 1:m
        j = rand(1:100)
        a, δ = back_propagation(train_data[:, j], train_label[:,j], W, b)
        
        db1 += copy(δ[2])
        db2 += copy(δ[3])
        db3 += copy(δ[4])
    
        dW1 += δ[2]*a[1]'
        dW2 += δ[3]*a[2]'
        dW3 += δ[4]*a[3]'
    end
    
    return [db1/m, db2/m, db3/m], [dW1/m, dW2/m, dW3/m]
end

mini_batch_∇L (generic function with 1 method)

In [19]:
function stochastic_gradient_descent!(train_data, train_label, W, b, α, m)
    db, dW = mini_batch_∇L(train_data, train_label, W, b, m)

    for i = 1:length(W)
        W[i] -= α*dW[i]
        b[i] -= α*b[i]
    end
end

stochastic_gradient_descent! (generic function with 1 method)

In [21]:
for _ in 1:10000
    stochastic_gradient_descent!(train_data, train_label, W, b, 0.38, 23)
end

3×100 Array{Float64,2}:
 0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  1.0  0.0  0.0  0.0  1.0  0.0
 0.0  0.0  0.0  1.0  0.0  1.0  0.0  0.0     1.0  0.0  0.0  1.0  0.0  0.0  0.0
 1.0  0.0  1.0  0.0  1.0  0.0  1.0  1.0     0.0  0.0  1.0  0.0  1.0  0.0  1.0