# Model Building steps using keras

1. Specify architecture
2. Compile
3. Fit
4. Prediction

# Regression problem
## Model specification

In [None]:
import numpy as np
from keras.layers import Dense
from keras.models import Sequential

In [None]:
# Read the comma-separated predictor matrix from disk into a NumPy array.
predictors = np.loadtxt("data.csv", delimiter=",")

In [None]:
# Number of input neurons: one per column of the predictor matrix.
n_cols = predictors.shape[1]

# Dense = every node of the previous layer is connected to every node of
# this layer. Two 100-unit ReLU hidden layers feed one output node that has
# no activation.
model = Sequential(
    [
        Dense(100, activation="relu", input_shape=(n_cols,)),
        Dense(100, activation="relu"),
        Dense(1),
    ]
)

# Compile
1. Specify the optimizer
    
    1.1. Controls the learning rate
    
    1.2. Many options, and mathematically complex
    
    1.3. 'Adam' is usually a good choice
    

2. Loss Function
 
     2.1. "mean_squared_error" is the common choice for regression

In [None]:
# Configure training: Adam optimizer, mean-squared-error loss.
model.compile(loss="mean_squared_error", optimizer="adam")

# Fitting

1. Applies backpropagation and gradient descent to the data to update the weights

2. Scaling the data before fitting can ease optimization


In [None]:
# Train the compiled model on the predictors.
# NOTE(review): `target` is not assigned in any cell visible before this one;
# confirm where the regression target is loaded.
model.fit(x=predictors, y=target)

# Classification

loss = 'categorical_crossentropy'

Similar to log loss: lower is better

Add metrics=['accuracy'] to the compile step for easy-to-understand diagnostics

The output layer has a separate node for each possible outcome and uses 'softmax' activation

In [None]:
from keras.utils import to_categorical

In [None]:
# pandas is needed for read_csv below; imported here because no other
# visible cell imports it.
import pandas as pd

# Load the labelled dataset; the 'labels' column is used as the target.
data = pd.read_csv('data.csv')

# Every column except 'labels' is a feature. DataFrame.as_matrix() was
# removed in pandas 1.0, so convert to a NumPy array with to_numpy().
predictors = data.drop(['labels'], axis=1).to_numpy()

# One-hot encode the labels: one column per class.
target = to_categorical(data['labels'])

# Size the network from THIS dataset rather than reusing the n_cols value
# left over from the regression cells.
n_cols = predictors.shape[1]
n_classes = target.shape[1]

# Three 100-unit ReLU hidden layers followed by a softmax output layer with
# one node per class.
model = Sequential()
model.add(Dense(100, activation='relu', input_shape=(n_cols,)))
model.add(Dense(100, activation='relu'))
model.add(Dense(100, activation='relu'))
model.add(Dense(n_classes, activation='softmax'))

# Categorical cross-entropy is the classification loss; accuracy is reported
# as an additional, easier-to-read metric.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(predictors, target)

# Saving , reloading and using model

In [None]:
from keras.models import load_model

# A single path shared by save and load, so the model that is reloaded is
# the one that was just written.
model_path = 'model_file.h5'
model.save(model_path)
my_model = load_model(model_path)  # two-class classification model

# NOTE(review): `data_to_predict_with` is not defined in any visible cell;
# it must have the same feature columns the model was trained on.
predictions = my_model.predict(data_to_predict_with)

# The softmax output has one column per class; column 1 is taken as the
# probability of the "true" class.
probability_true = predictions[:, 1]

## Verify model structure

In [None]:
# Print the structure of the reloaded model so it can be checked by eye.
my_model.summary()

# Understanding Model optimization

## why optimization is hard?
1. Simultaneously optimizing 1000s of parameters with complex relationships
2. updates may not improve model meaningfully
3. Updates are too small if the learning rate is low and too large if the learning rate is high

In [None]:
from keras.optimizers import SGD
# Shape of one input sample: one feature per column of `predictors`.
# It has to exist before the `def` below because default argument values are
# evaluated when the function is defined, not when it is called.
input_shape = (predictors.shape[1],)


def get_new_model(input_shape=input_shape):
    """Build a fresh, uncompiled two-class softmax classifier.

    Args:
        input_shape: Shape tuple of a single input sample, passed to the
            first Dense layer.

    Returns:
        A new Sequential model with two 100-unit ReLU hidden layers and a
        2-node softmax output layer.
    """
    model = Sequential()
    model.add(Dense(100, activation='relu', input_shape=input_shape))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(2, activation='softmax'))
    return model
# Learning rates to compare, spanning several orders of magnitude.
lr_to_test = [.0000001, .001, .01, .1, 1]
for lr in lr_to_test:
    print('\n\nTesting model with learning rate: %g\n' % lr)

    # Start from a new, untrained model so the runs do not affect each other.
    model = get_new_model()

    # The learning rate is the optimizer's first positional argument.
    my_optimizer = SGD(lr)
    model.compile(optimizer=my_optimizer, loss='categorical_crossentropy')

    # Train the model so the effect of this learning rate on the loss can be
    # observed; without fitting, the loop only builds and discards models.
    model.fit(predictors, target)

## The dying neuron problem

once a node starts always getting negative inputs

It may continue only getting negative inputs

Contributes nothing to the model

'dead' neuron

instead use 'tanh function'

## Vanishing gradient

Occurs when many layers have very small slopes (e.g. due to being on flat part of tanh curve)

In deep networks, the updates computed by backpropagation can be close to 0

# Model validation

1. Commonly use validation split rather than cross-validation

2. Deep learning widely used on large datasets

3. Single validation score is based on large amount of data, and is reliable

4. Repeating the training for each cross-validation fold would take a long time

In [None]:
# Train while holding out 30% of the data for validation.
model.fit(x=predictors, y=target, validation_split=0.3)

# Early Stopping

In [None]:
from keras.callbacks import EarlyStopping

# patience=2: allow two epochs without improvement before training stops.
early_stopping_monitor = EarlyStopping(patience=2)

# Fit on `predictors`, the feature array used by the other fit calls in this
# notebook. Up to 20 epochs are requested, but the callback may stop earlier.
model.fit(
    predictors,
    target,
    validation_split=0.3,
    epochs=20,
    callbacks=[early_stopping_monitor],
)

In [None]:
# Define early_stopping_monitor
early_stopping_monitor = EarlyStopping(patience=2)

# Shape of one input sample: one feature per column of `predictors`.
input_shape = (predictors.shape[1],)

# Create the baseline model: model_1, with two hidden layers of 10 nodes each
model_1 = Sequential()
model_1.add(Dense(10, activation='relu', input_shape=input_shape))
model_1.add(Dense(10, activation='relu'))
model_1.add(Dense(2, activation='softmax'))

# Compile model_1
model_1.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Create the new model: model_2, with two hidden layers of 100 nodes each
model_2 = Sequential()

# Add the first and second layers; only the first layer needs input_shape
model_2.add(Dense(100, activation='relu', input_shape=input_shape))
model_2.add(Dense(100, activation='relu'))
model_2.add(Dense(2, activation='softmax'))

# Compile model_2
model_2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Fit model_1
model_1_training = model_1.fit(
    predictors,
    target,
    epochs=15,
    validation_split=0.2,
    callbacks=[early_stopping_monitor],
    verbose=False,
)

# Fit model_2
model_2_training = model_2.fit(
    predictors,
    target,
    epochs=15,
    validation_split=0.2,
    callbacks=[early_stopping_monitor],
    verbose=False,
)

# Create the plot: validation loss per epoch, model_1 in red, model_2 in blue
# NOTE(review): `plt` is not imported in any visible cell; presumably
# matplotlib.pyplot -- confirm it is imported before this cell runs.
plt.plot(model_1_training.history['val_loss'], 'r', model_2_training.history['val_loss'], 'b')
plt.xlabel('Epochs')
plt.ylabel('Validation score')
plt.show()

# Working flow of optimizing model capacity

1. start with a small network

2. Get the validation score

3. Keep increasing capacity until validation score is no longer improving (calculate MSE)

4. Increase the number of layers and nodes until the score is no longer improving