# Validation protocols

In [None]:
''' 2. Validation '''

# One. Hold-out validation

num_validation_samples = 10000

# Shuffle in place so the split below is not biased by the original ordering.
np.random.shuffle(data)

# Carve the first `num_validation_samples` items off as the validation set;
# everything after them stays in `data` and becomes the training set.
validation_data, data = (data[:num_validation_samples],
                         data[num_validation_samples:])
training_data = data[:]

# Fit on the training portion and score on the held-out portion.
model = get_model()
model.train(training_data)
validation_score = model.evaluate(validation_data)

# At this point - tune model, retrain, evaluate, tune again...

# Once the hyperparameters are settled, train a fresh model on all non-test
# data (training + validation) and evaluate it on the test set.
model = get_model()
model.train(np.concatenate((training_data, validation_data)))
test_score = model.evaluate(test_data)

In [None]:
# Two. K-fold validation

k = 4
num_validation_samples = len(data) // k

# Shuffle in place once so every fold is a random sample of the data.
np.random.shuffle(data)

validation_scores = []

# For iterated k-fold validation with shuffling, wrap the fold loop below in
# one more loop that reshuffles the data before every pass:
#
# n_iterations = 5
# for i in range(n_iterations):
#     np.random.shuffle(data)
for fold in range(k):
    # Boundaries of the slice held out for validation in this fold.
    fold_start = num_validation_samples * fold
    fold_end = num_validation_samples * (fold + 1)

    validation_data = data[fold_start:fold_end]
    # Everything outside the held-out slice is used for training.
    # np.concatenate joins the two remaining pieces; `+` would add NumPy
    # arrays element-wise instead of joining them.
    training_data = np.concatenate([data[:fold_start], data[fold_end:]])

    # A fresh, untrained model for every fold.
    model = get_model()
    model.train(training_data)
    validation_score = model.evaluate(validation_data)
    validation_scores.append(validation_score)

# The validation score is the average of the per-fold scores.
validation_score = np.average(validation_scores)

# Train the final model on all non-test data and evaluate it on the test set.
model = get_model()
model.train(data)
test_score = model.evaluate(test_data)

 ###  How to choose evaluation protocol
 
 1) *Data representativeness* - both the training set and the test set must be **representative** of the data at hand. 
 
 2) *The arrow of time* - if you are trying to predict the future given the past, do not shuffle the data before splitting it. The data in the test and validation sets must be **posterior** to the data in the training set.
 
 3) *Redundancy in data* - if some data points appear twice, shuffling the data and splitting it into training and validation sets can put copies of the same point in both sets. Make sure that the sets are disjoint.

# Data preprocessing, feature engineering, and feature learning

## *Data preprocessing*

In [1]:
''' Vectorization '''

# Turn data into tensors of float32 data.

In [2]:
''' Value normalization '''

# 1) Make the values lie in the 0-1 range for images.
# 2) Take small values.
# 3) Be homogeneous across all features (roughly the same range).
# 4) Normalize each feature independently to have a mean of 0 and a standard deviation of 1.

' Value normalization '

In [3]:
''' Handling missing values '''

# With neural networks it is safe to encode missing values as 0, as long as 0 isn't already a meaningful value.
# If missing values are expected at prediction time, make sure the training data has missing values too.

' Handling missing values '

## *Feature engineering*

In [4]:
''' Clock arrows example '''

# If you know what the data mean, feed your NN features that express
#  that meaning rather than the raw data.

' Clock arrows example '

## *Overfitting and underfitting*

In [5]:
# If the loss on both the training and the validation set is decreasing
#  from batch to batch, just keep training.

# The best solution to overfitting is to get more training data.

In [7]:
''' Reducing the network's size '''

# The more parameters a model has, the more memorization capacity it has.

from keras import models
from keras import layers

## Original model: two hidden layers of 16 units

model = models.Sequential([
    layers.Dense(16, activation='relu', input_shape=(10000,)),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

# Model with lower capacity: two hidden layers of 4 units

model = models.Sequential([
    layers.Dense(4, activation='relu', input_shape=(10000,)),
    layers.Dense(4, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

# Model with higher capacity: two hidden layers of 512 units

model = models.Sequential([
    layers.Dense(512, activation='relu', input_shape=(10000,)),
    layers.Dense(512, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

In [8]:
''' Adding weight regularization '''

# At training time, l2(0.001) makes every coefficient in the layer's weight
# matrix add 0.001 * weight_coefficient_value ** 2 to the total loss.

from keras import regularizers

model = models.Sequential()
model.add(layers.Dense(16, kernel_regularizer=regularizers.l2(0.001),
                       activation='relu', input_shape=(10000,)))
model.add(layers.Dense(16, kernel_regularizer=regularizers.l2(0.001),
                       activation='relu'))
# Single-unit sigmoid output, matching the other binary classifiers in this
# file; the output layer itself is left unregularized.
model.add(layers.Dense(1, activation='sigmoid'))

# Different regularizers

# L1 penalty: proportional to the absolute value of each weight coefficient.
regularizers.l1(0.001)
# Combined L1 and L2 penalties.
regularizers.l1_l2(l1=0.001, l2=0.001)

<keras.regularizers.L1L2 at 0x1c13d7db38>

In [None]:
''' Adding dropout '''

# Training time: multiply the output by a random 0/1 mask of the same shape,
# which zeroes each unit with probability 0.5.
drop_mask = np.random.randint(0, 2, size=layer_output.shape)
layer_output *= drop_mask

# Test time: nothing is dropped, so scale the output down by the dropout
# rate to keep it in the same range as during training.
layer_output *= 0.5

# Alternative: do both steps at training time (drop, then scale up) and
# leave the output unchanged at test time.
drop_mask = np.random.randint(0, 2, size=layer_output.shape)
layer_output *= drop_mask
layer_output /= 0.5



In [9]:
# IMDB network with dropout

# A Dropout(0.5) layer follows each hidden Dense layer.
model = models.Sequential([
    layers.Dense(16, activation='relu', input_shape=(10000,)),
    layers.Dropout(0.5),
    layers.Dense(16, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid'),
])

# The universal workflow of machine learning

## *Defining the problem and assembling a dataset*

* What will input data be? 

* What type of problem will it be? Binary classification? Multiclass classification? Scalar regression? Vector regression? Multiclass multilabel classification?

Always be aware of nonstationary problems, e.g. a clothing recommendation engine that was trained on August data but is used to make recommendations in winter.

## *Choosing a measure of success*

Accuracy? ROC AUC? MAE? RMSE?

## *Deciding on an evaluation protocol*

* Maintaining a hold-out validation set - when you have plenty of data.

* Doing K-fold cross-validation - too few samples for hold-out validation.

* Doing iterated K-fold validation - when little data is available.

## *Preparing your data*

* Format data as tensors.

* Scale data to small values - [-1, 1] or [0, 1].

* If different features take values in different ranges, normalize data.

* Do feature engineering, especially for small-data problems.

## *Developing a model that does better than a baseline*

* You hypothesize that your outputs can be predicted given your inputs.

* You hypothesize that the available data is sufficiently informative to learn the relationship between inputs and outputs.

Three key choices to build first network:

1) Last-layer activation - sigmoid, softmax or no activation.

2) Loss function - binary_crossentropy, categorical_crossentropy, mse etc.

3) Optimization configuration - the optimizer and its learning rate (rmsprop with its default learning rate is usually a safe choice).

## *Scaling up: developing a model that overfits*

1) Add layers

2) Make the layers bigger

3) Train for more epochs

## *Regularizing model and tuning hyperparameters*

* Add dropout.

* Try different architectures: add or remove layers.

* Add L1 and/or L2 regularization.

* Try different hyperparameters (number of units per layer or learning rate).

* Iterate on feature engineering: add new features or remove features that don't seem to be informative.