In [1]:
import numpy as np

## Basics of DL and NN

In [2]:
# Two input values fed to the single-hidden-layer network.
input_data = np.array([3, 5])

# One weight vector per node, keyed by node name.
weights = dict(
    node_0=np.array([2, 4]),
    node_1=np.array([4, -5]),
    output=np.array([2, 7]),
)

In [3]:
def relu(num):
    """Rectified linear unit: return `num`, or 0 when 0 is strictly greater than `num`."""
    # Same selection rule as max(num, 0): 0 wins only when 0 > num.
    return 0 if 0 > num else num

### Neural Network
![title](nn.png)

In [4]:
# Hidden layer: each node takes the weighted sum of the inputs, then applies ReLU.
h1, h2 = ((input_data * weights[node]).sum() for node in ('node_0', 'node_1'))
h1_out, h2_out = relu(h1), relu(h2)

# Collect the activations in node order for the output node to consume.
hidden_layer_out = np.array([h1_out, h2_out])

In [5]:
# Output node: weighted sum of the hidden-layer activations, followed by ReLU.
output = np.sum(hidden_layer_out * weights['output'])
out = relu(output)

# NOTE(review): the first label says "Weights", but the values printed are the
# hidden nodes' ReLU outputs; the second line prints the pre-activation
# `output`, while the activated `out` is never used.
print(
    "Weights of Hidden Layer: {}, {}".format(h1_out, h2_out),
    "Output: {}".format(output),
    sep="\n",
)

Weights of Hidden Layer: 26, 0
Output: 52


### Multi-Layer Neural Network
![title](mlnn.png)

In [6]:
# Inputs for the two-hidden-layer network.
input_data = np.array([3, 5])

# Weight vectors keyed as node_<layer>_<index>, plus the output node.
weights = dict(
    node_0_0=np.array([2, 4]),
    node_0_1=np.array([4, -5]),
    node_1_0=np.array([-1, 1]),
    node_1_1=np.array([2, 2]),
    output=np.array([2, 7]),
)

In [7]:
def predict_with_network(input_data):
    """Forward-propagate `input_data` through two hidden layers and return the output value.

    Reads the module-level `weights` dict (keys 'node_0_0', 'node_0_1',
    'node_1_0', 'node_1_1', 'output') and the module-level `relu` function.
    The output node is a plain weighted sum; no activation is applied to it.
    """
    def activate(node_name, incoming):
        # Weighted sum of the incoming values for one node, passed through ReLU.
        return relu((weights[node_name] * incoming).sum())

    # First hidden layer is fed by the raw inputs.
    hidden_0_outputs = np.array([
        activate('node_0_0', input_data),
        activate('node_0_1', input_data),
    ])

    # Second hidden layer is fed by the first layer's activations.
    hidden_1_outputs = np.array([
        activate('node_1_0', hidden_0_outputs),
        activate('node_1_1', hidden_0_outputs),
    ])

    return (weights['output'] * hidden_1_outputs).sum()

# NOTE(review): `relu` is already defined in an earlier cell; this later
# definition shadows it.
def relu(num):
    """Rectified linear unit: return `num` when it is strictly positive, otherwise 0."""
    # Same selection rule as max(0, num): num wins only when num > 0.
    return num if num > 0 else 0

In [8]:
# Run the multi-layer forward pass on the current inputs and show the result.
output = predict_with_network(input_data=input_data)
print(output)

364


## Optimizing a Neural Network with Backward Propagation
 
### Loss Function

- Aggregates errors in predictions from many data points into single number
- Measure of model's predictive performance
- Ex: Mean squared error
- MSE = mean((prediction - actual)^2), i.e. the average of the squared errors over all data points

- Lower loss function value means a better model
- Goal: Find the weights that give the lowest value for the loss function
- Gradient Descent to minimize loss

### Gradient Descent

- If the slope is positive:
    - Going opposite the slope means moving to lower numbers
    - Subtract the slope from the current value
    - Too big a step might lead us astray
- Solution: Learning rate
    - Update each weight by subtracting learning rate * slope

### Gradient Descent Steps

- Start at random point
- Until you are somewhere flat:
    - Find the slope
    - Take a step downhill

In [9]:
# One gradient-descent step for a single linear node with squared-error loss.
weights = np.array([1, 2])
input_data = np.array([3, 4])
target = 6
lr = 0.01

# Forward pass and signed error before the update.
preds = np.sum(weights * input_data)
error = preds - target
print("Error: ", error)

# Slope of the squared error with respect to each weight: 2 * input * error.
gradient = 2 * error * input_data
print("Gradient: ", gradient)

# Step against the slope, scaled by the learning rate, then re-evaluate the error.
weights_updated = weights - gradient * lr
preds_updated = np.sum(weights_updated * input_data)
error_updated = preds_updated - target
print("Updated Error: ", error_updated)

Error:  5
Gradient:  [30 40]
Updated Error:  2.5


### Backpropagation

![](bp.PNG)

- Allows gradient descent to update all weights in the NN (by computing the gradient for every weight)

### Backpropagation Process

- Go back one layer at a time
- The gradient for a weight is the product of:
    1. Node value feeding into that weight
    2. Slope of loss function w.r.t node it feeds into
    3. Slope of the activation function at the node it feeds into

- Trying to estimate the slope of the loss function w.r.t each weight
- First do forward propagation to calculate predictions and errors

## Building Deep Learning Models with Keras

### Model building steps

- Specify Architecture
- Compile
- Fit
- Predict

In [10]:
import keras
from keras.models import Sequential
from keras.layers import Dense
import pandas as pd

In [11]:
# Load the wage data and split it into the regression target and the predictors.
df = pd.read_csv('hourly_wages.csv')
target = np.array(df['wage_per_hour'])
data = df.drop(columns=['wage_per_hour'])

# Number of predictor columns.
cols = data.shape[1]
print(cols)
print(target.shape)

9
(534,)


In [12]:
# Preview the first five predictor rows (head() shows five rows by default).
data.head()

Unnamed: 0,union,education_yrs,experience_yrs,age,female,marr,south,manufacturing,construction
0,0,8,21,35,1,1,0,1,0
1,0,9,42,57,1,1,0,1,0
2,0,12,1,19,0,0,0,1,0
3,0,12,4,22,0,0,0,0,0
4,0,12,17,35,0,1,0,0,0


In [13]:
# Specifying the model: two ReLU hidden layers feeding a single output unit.
model = Sequential([
    # First hidden layer (50 units); input_shape declares `cols` input features
    Dense(50, activation='relu', input_shape=(cols, )),
    # Second hidden layer (32 units)
    Dense(32, activation='relu'),
    # Output layer with one unit and no activation argument
    Dense(1),
])

### Compiling and fitting the model

#### Why compile the model

- Specify the optimizer
    - Many options and mathematically complex
    - 'Adam' is usually a good choice
- Loss function
    - 'mean_squared_error' common for regression
    
#### Fitting a model

- Applying backpropagation and gradient descent with data to update weights
- Scaling data before fitting can ease optimization

In [14]:
# Configure training: Adam optimizer with mean squared error as the loss.
model.compile(loss='mean_squared_error', optimizer='adam')
print("Loss function: ", model.loss)

# Fit with no extra arguments; the return value of fit() is the cell's displayed result.
model.fit(x=data, y=target)

Loss function:  mean_squared_error


<keras.callbacks.History at 0x17da6699a50>

In [15]:
# Print the per-layer output shapes and parameter counts of the regression model.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 50)                500       
                                                                 
 dense_1 (Dense)             (None, 32)                1632      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 2,165
Trainable params: 2,165
Non-trainable params: 0
_________________________________________________________________


### Classification models

- In classification, the loss function becomes ```categorical_crossentropy```
- Similar to log loss: Lower score is better
- Add ```metrics = ['accuracy']``` to compile step for easy-to-understand diagnostics. Model prints accuracy at the end of every epoch
- The activation function at the output layer becomes ```softmax``` because we want a separate probability for each outcome; the outputs sum to 1.

In [16]:
# Load the Titanic dataset from titanic_all_numeric.csv (rebinds `df`),
# then display summary statistics for its columns.

df = pd.read_csv('titanic_all_numeric.csv')
df.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,male,embarked_from_cherbourg,embarked_from_queenstown,embarked_from_southampton
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208,0.647587,0.188552,0.08642,0.722783
std,0.486592,0.836071,13.002015,1.102743,0.806057,49.693429,0.47799,0.391372,0.281141,0.447876
min,0.0,1.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,22.0,0.0,0.0,7.9104,0.0,0.0,0.0,0.0
50%,0.0,3.0,29.699118,0.0,0.0,14.4542,1.0,0.0,0.0,1.0
75%,1.0,3.0,35.0,1.0,0.0,31.0,1.0,0.0,0.0,1.0
max,1.0,3.0,80.0,8.0,6.0,512.3292,1.0,1.0,1.0,1.0


In [17]:
# Preview the first three rows of the Titanic frame.
df.head(3)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,male,age_was_missing,embarked_from_cherbourg,embarked_from_queenstown,embarked_from_southampton
0,0,3,22.0,1,0,7.25,1,False,0,0,1
1,1,1,38.0,1,0,71.2833,0,False,1,0,0
2,1,3,26.0,0,0,7.925,0,False,0,0,1


In [44]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model

In [23]:
# Build the classification target from the `survived` column via to_categorical.
target = to_categorical(df.survived)

# Everything except `survived` is a predictor, cast to int32.
# NOTE(review): the int32 cast truncates fractional `age`/`fare` values
# (e.g. fare 7.25 becomes 7) — confirm that this loss of precision is intended.
predictors = df.drop(['survived'], axis=1).astype(np.int32)

# Derive the input width from the data instead of hard-coding 10, so the
# model's input_shape stays correct if the predictor columns ever change.
n_cols = predictors.shape[1]
predictors.head(3)

Unnamed: 0,pclass,age,sibsp,parch,fare,male,age_was_missing,embarked_from_cherbourg,embarked_from_queenstown,embarked_from_southampton
0,3,22,1,0,7,1,0,0,0,1
1,1,38,1,0,71,0,0,1,0,0
2,3,26,0,0,7,0,0,0,0,1


In [24]:
# Setting up the model: one ReLU hidden layer and a 2-unit softmax output.
model = Sequential([
    Dense(32, activation='relu', input_shape=(n_cols, )),
    Dense(2, activation='softmax'),
])

# SGD optimizer on categorical cross-entropy, with accuracy as a reported metric.
model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
model.fit(x=predictors, y=target)



<keras.callbacks.History at 0x17da7c13b50>

### Using models

- Save
- Reload
- Make Predictions

In [45]:
# Write the trained model to disk, then read it back under a new name.
model.save(filepath='titanic_model.h5')
titanic = load_model(filepath='titanic_model.h5')

# Predict for a single row of ten predictor values using the reloaded model.
x = titanic.predict(x=[[1, 38, 1, 0, 71, 0, 0, 1, 0, 0]])
print(x)

[[8.2282051e-05 9.9991775e-01]]


In [60]:
# Keep only column 1 of the prediction output: the predicted probability of survival.
y = model.predict(predictors)[:, 1]
print(y[:10])  # first 10 samples

[0.45297593 0.99991775 0.4050524  0.9993616  0.29610923 0.36008003
 0.9964192  0.96686125 0.63213295 0.9778661 ]


## Fine-tuning keras models

### Understanding model optimization

#### Why optimization is hard
- Simultaneously optimizing 1000s of parameters with complex relationships
- Updates may not improve model meaningfully
- Updates may be too small (if the learning rate is low) or too large (if the learning rate is high)