## Neural Network
Forward propagation <br>
layer_1 = Dense(units=3, activation='sigmoid') <br>
layer_2 = Dense(units=1, activation='sigmoid') <br>
x = np.array([200, 17])  
\begin{align*} 
w_{1}^{[1]} &= \text{np.array([1,2])} & w_{1}^{[2]} &= \text{np.array([-3,4])} & w_{1}^{[3]} &= \text{np.array([5,-6])} \newline 
b_{1}^{[1]} &= \text{np.array([-1])} & b_{1}^{[2]} &= \text{np.array([1])} & b_{1}^{[3]} &= \text{np.array([2])} \newline 
z_{1}^{[1]} &= {np.dot(w_{1}^{[1]}, x) + b_{1}^{[1]}}  & z_{1}^{[2]} &= {np.dot(w_{1}^{[2]}, x) + b_{1}^{[2]}} & z_{1}^{[3]} &= {np.dot(w_{1}^{[3]}, x) + b_{1}^{[3]}} \newline 
a_{1}^{[1]} &= {sigmoid(z_{1}^{[1]})} & a_{1}^{[2]} &= {sigmoid(z_{1}^{[2]})} & a_{1}^{[3]} &= {sigmoid(z_{1}^{[3]})} \newline 
\end{align*}


\begin{align*}
& a_{1} = np.array([a_{1}^{[1]}, a_{1}^{[2]}, a_{1}^{[3]}]) 
\end{align*}

In [1]:
import numpy as np
# One training example with two input features, stored as a 1x2 matrix
# (a single row holding both feature values).
example_features = [200.0, 17.0]
X = np.array([example_features])
X

array([[200.,  17.]])

In [2]:
import tensorflow as tf # or PyTorch
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

In [3]:
# Hidden layer with 3 sigmoid neurons; calling the layer on X computes its activations.
layer_1 = Dense(units=3, activation='sigmoid')
a1_tensor = layer_1(X)
# The layer returns a tf.Tensor, TensorFlow's data type for representing a matrix,
# e.g. tf.Tensor([[0.2 0.7 0.3]], shape=(1, 3), dtype=float32).
print(a1_tensor)
# .numpy() converts the tensor back into a NumPy array.
a1 = a1_tensor.numpy()
print(a1)

tf.Tensor([[1. 1. 0.]], shape=(1, 3), dtype=float32)
[[1. 1. 0.]]


## Forward Prop in NumPy
W = np.array([[1, -3, 5], <br>
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;[2, 4, -6]])   2X3 <br>
b = np.array([-1, 1, 2])  <br>
a_in = np.array([-2, 4])  <br>

def dense(a_in, W, b, g): <br>
&emsp; &emsp;units = W.shape[1] <br>
&emsp; &emsp;a_out = np.zeros(units) <br>
&emsp; &emsp;for j in range(units): 0,1,2 <br>
&emsp; &emsp;&emsp;w = W[:,j] <br>
&emsp; &emsp;&emsp;z = np.dot(w, a_in) + b[j] <br>
&emsp; &emsp;&emsp;a_out[j] = g(z) <br>
&emsp; &emsp;return a_out
   
def sequential(x): <br>
   &emsp; &emsp; a1 = dense(x, W1, b1, g) <br>
   &emsp; &emsp; a2 = dense(a1, W2, b2, g) <br>
   &emsp; &emsp; a3 = dense(a2, W3, b3, g) <br>
   &emsp; &emsp; a4 = dense(a3, W4, b4, g) <br>
   &emsp; &emsp; f_x = a4 <br>
   &emsp; &emsp; return f_x
   
## Alternate approach
def dense(A_in, W, B):<br>
&emsp;Z = np.matmul(A_in, W) + B <br>
&emsp;A_out = g(Z) <br>
&emsp;return A_out <br>

a = [1  <br>
&emsp;&emsp;2] <br>
a_T = [1 2] <br>
W = [3 5    <br>
&emsp;&emsp; 4 6] <br>

Z = a_T.W <br>
Z = [11 17]

Matrix multiplication: a 3 x 2 matrix times a 2 x 5 matrix gives a 3 x 5 matrix (the inner dimensions must match)

## Training 
<img align="left" src="Training Steps - Andrew NG.png"     style=" width:1000px; padding: 10px; " >

In [4]:
import tensorflow as tf # or PyTorch
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

In [5]:
# Three-layer network: two sigmoid hidden layers (25 and 15 units)
# followed by a single sigmoid output unit.
model = Sequential()
model.add(Dense(units=25, activation='sigmoid'))
model.add(Dense(units=15, activation='sigmoid'))
model.add(Dense(units=1, activation='sigmoid'))

In [6]:
from tensorflow.keras.losses import BinaryCrossentropy, MeanSquaredError, SparseCategoricalCrossentropy
# Binary classification problem: compile with the binary cross-entropy (logistic) loss.
binary_loss = BinaryCrossentropy()
model.compile(loss=binary_loss)
# Training call, left commented out (no training labels `y` appear in this part of the notebook).
# `epochs` is the number of passes gradient descent makes over the training data;
# the gradients are computed with back propagation.
#model.fit(X, y, epochs=100)

In [7]:
# Regression problem: compile with the mean squared error loss instead.
regression_loss = MeanSquaredError()
model.compile(loss=regression_loss)
# Inference on new inputs would then be:
#model.predict(x_new)

# Activation Functions
Sigmoid: 0 &lt; g(z) &lt; 1; use it when y is binary (0 or 1) <br>
ReLU (Rectified Linear Unit): g(z) = max(0, z), i.e. g(z) = 0 if z &lt; 0, otherwise g(z) = z; use it when y takes only non-negative values <br>
Linear activation function: g(z) = z; use it when y can be negative or positive (e.g. -1.2, 1.5, -0.4); don't use it in hidden layers

In [8]:
# Binary Classification
# ReLU in the hidden layers, sigmoid on the single output unit.
binary_layers = [
    Dense(units=25, activation='relu'),
    Dense(units=15, activation='relu'),
    Dense(units=1, activation='sigmoid'),
]
model = Sequential(binary_layers)

## Multi Class 

- Softmax regression (4 possible outputs) y = 1, 2, 3, 4 <br>
&emsp; z1 = W1 . X + b1 <br>
&emsp; a1 = e^z1 / (e^z1 + e^z2 + e^z3 + e^z4) <br>
&emsp;&emsp;&ensp;= P(y=1|x) <br>

&emsp; z2 = W2 . X + b2 <br>
&emsp; a2 = e^z2 / (e^z1 + e^z2 + e^z3 + e^z4) <br>
&emsp;&emsp;&ensp;= P(y=2|x) <br>

&emsp; z3 = W3 . X + b3 <br>
&emsp; a3 = e^z3 / (e^z1 + e^z2 + e^z3 + e^z4) <br>
&emsp;&emsp;&ensp;= P(y=3|x) <br>

&emsp; z4 = W4 . X + b4 <br>
&emsp; a4 = e^z4 / (e^z1 + e^z2 + e^z3 + e^z4) <br>
&emsp;&emsp;&ensp;= P(y=4|x) <br>

Softmax Regression (N possible outputs) <br>
z_j = W_j . X + b_j  j=1,2,...,N <br>
a_j = e^z_j / Sum(e^z_k) = P(y=j|X)

In [9]:
# Compute 2/10000 with a single division, then print it with 18 decimal places
# so that any floating-point round-off would be visible.
x1 = 2.0 / 10_000
print(f"{x1:.18f}")
x1

0.000200000000000000


0.0002

## Numerical Roundoff Errors

### Numerically accurate implementation of logistic loss:

#### Logistic regression:

$g(z) = \frac{1}{1+e^{-z}}\tag{1}$
$a = {g(z)}\tag{2}$

#### Original loss
$loss = {-ylog(a) - (1-y)log(1-a)}\tag{3}$

#### More accurate loss
$loss = {-ylog(\frac{1}{1+e^{-z}}) - (1-y)log(1-\frac{1}{1+e^{-z}})}\tag{4}$

In [10]:
# Binary classification (logistic loss): a single sigmoid output unit produces
# a = g(z), matching the logistic-regression formulas in this section.
model = Sequential([
    Dense(units=25, activation='relu'),
    Dense(units=15, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])

# Original loss: the output layer computes a first, and the loss then evaluates
# -y*log(a) - (1 - y)*log(1 - a) from that intermediate value.
model.compile(loss=BinaryCrossentropy())

In [11]:
# More accurate loss: from_logits=True tells the loss that the model outputs the raw
# logit z, so the sigmoid is folded into the loss computation. The output layer must
# therefore be linear; compiling a model whose output layer is still sigmoid with
# from_logits=True would hand the loss probabilities where it expects logits.
model = Sequential([
    Dense(units=25, activation='relu'),
    Dense(units=15, activation='relu'),
    Dense(units=1, activation='linear'),
])
model.compile(loss=BinaryCrossentropy(from_logits=True))
# The model now predicts z rather than a probability; apply a sigmoid to its output
# (e.g. tf.nn.sigmoid) to recover P(y = 1 | x).

## Numerical Roundoff Errors

### Numerically accurate implementation of softmax:

#### Softmax regression:

$(a1,...,a10) = {g(z1,...,z10)}\tag{1}$


\begin{equation}
  Loss = L(\mathbf{a},y)=\begin{cases}
    -log(a_1) & \text{if $y=1$}\\
        &\vdots\\
     -log(a_N) & \text{if $y=N$}
  \end{cases} \tag{2}
\end{equation}



#### More accurate loss
\begin{equation}
  Loss = L(\mathbf{a},y)=\begin{cases}
    -log(\frac{e^{z_1}}{e^{z_1}+e^{z_2}+ ... + e^{z_N}}) & \text{if $y=1$}\\
        &\vdots\\
     -log(\frac{e^{z_N}}{e^{z_1}+e^{z_2}+ ... + e^{z_N}}) & \text{if $y=N$}
  \end{cases} \tag{3}
\end{equation}

In [12]:
# Multiclass Classification
# Original loss: the softmax output layer produces the 10 class probabilities,
# and the loss is computed from those probabilities.
softmax_layers = [
    Dense(units=25, activation='relu'),
    Dense(units=15, activation='relu'),
    Dense(units=10, activation='softmax'),
]
model = Sequential(softmax_layers)
model.compile(loss=SparseCategoricalCrossentropy())

# More accurate: the output layer is linear, so the model emits the raw values z,
# and from_logits=True lets the loss apply the softmax itself.
logit_layers = [
    Dense(units=25, activation='relu'),
    Dense(units=15, activation='relu'),
    Dense(units=10, activation='linear'),
]
model = Sequential(logit_layers)
model.compile(loss=SparseCategoricalCrossentropy(from_logits=True))

In [13]:
# Multi-label Classification (Multiple outputs)
# A self driving car with multiple outputs

In [14]:
# When the learning rate is too small in gradient descent, there is another algorithm that takes faster steps
# Adaptive Moment Estimation: the Adam algorithm can adjust alpha to larger or smaller learning rates automatically
# it can assign a separate learning rate for each w and b
# if w and b keeps moving in same direction, increase alpha

Adam: Adaptive Moment estimation (not just one alpha)
$$\begin{align*} \; \newline\;
& w_1 = w_1 -  \alpha_1 \frac{\partial J(\mathbf{w},b)}{\partial w_1}   \; \newline
& w_2 = w_2 -  \alpha_2 \frac{\partial J(\mathbf{w},b)}{\partial w_2}   \; \newline
& w_3 = w_3 -  \alpha_3 \frac{\partial J(\mathbf{w},b)}{\partial w_3}   \; \newline
&b\ \ = b -  \alpha_4 \frac{\partial J(\mathbf{w},b)}{\partial b}  \newline 
\end{align*}$$