<a href="https://colab.research.google.com/github/Lopeznil/Excursion/blob/main/Neural_Networks_Practice_Examples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#IMAGE CLASSIFIER
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import tensorflow as tf

BATCH_SIZE = 16

print("\nLoading training data...")

# Training images are rescaled by 1/255 and randomly augmented
# (zoom, rotation, small horizontal/vertical shifts).
training_data_generator = ImageDataGenerator(
    height_shift_range=0.05,
    width_shift_range=0.05,
    rotation_range=15,
    zoom_range=0.2,
    rescale=1.0 / 255,
)

# Read the training images from 'data/train' as grayscale batches with
# one-hot ("categorical") labels.
training_iterator = training_data_generator.flow_from_directory(
    directory='data/train',
    batch_size=BATCH_SIZE,
    color_mode='grayscale',
    class_mode='categorical',
)

print("\nLoading validation data...")

# Validation images get the same pixel rescaling but no augmentation.
validation_data_generator = ImageDataGenerator(rescale=1.0 / 255)

# Read the validation images from 'data/test' with the same settings as
# the training iterator.
validation_iterator = validation_data_generator.flow_from_directory(
    directory='data/test',
    batch_size=BATCH_SIZE,
    color_mode='grayscale',
    class_mode='categorical',
)


print("\nBuilding model...")

# Small CNN for 256x256 single-channel images: two convolution + max-pooling
# stages followed by a flattened two-unit softmax output.
# Conv2D is left on its default padding ('valid' per the Keras docs), so it
# is not spelled out explicitly.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(256, 256, 1)),
    tf.keras.layers.Conv2D(2, 5, strides=3, activation="relu"),
    tf.keras.layers.MaxPooling2D(pool_size=(5, 5), strides=(5, 5)),
    tf.keras.layers.Conv2D(4, 3, strides=1, activation="relu"),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(2, activation="softmax"),
])

model.summary()


print("\nCompiling model...")

# Adam optimizer, categorical cross-entropy loss, and two tracked metrics:
# categorical accuracy and AUC.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.005),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=[tf.keras.metrics.CategoricalAccuracy(), tf.keras.metrics.AUC()],
)

print("\nTraining model...")

# Train and validate for 5 epochs.
# Step counts must be whole numbers: `samples / BATCH_SIZE` is a float
# whenever the sample count is not a multiple of the batch size. Each
# iterator already knows how many batches make up one full pass, so use
# its length instead.
model.fit(
    training_iterator,
    steps_per_epoch=len(training_iterator),
    epochs=5,
    validation_data=validation_iterator,
    validation_steps=len(validation_iterator),
)


In [None]:
#REGRESSOR

import pandas as pd
import tensorflow
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

tensorflow.random.set_seed(35)  # fixed seed so repeated runs give reproducible results


def design_model(features):
    """Build and compile a regression network with one hidden layer.

    The input width is read from ``features.shape[1]`` instead of being
    hard-coded. The network is: input -> Dense(128, relu) -> Dense(1).
    It is compiled with MSE loss, an MAE metric, and an Adam optimizer
    using a learning rate of 0.1.

    Args:
        features: 2-D feature array/frame; only ``features.shape[1]`` is read.

    Returns:
        The compiled ``Sequential`` model.
    """
    network = Sequential(name="my_first_model")
    # Input layer sized to the number of feature columns.
    network.add(InputLayer(input_shape=(features.shape[1],)))
    # Single hidden layer with 128 ReLU units.
    network.add(Dense(128, activation='relu'))
    # One linear output unit for the regression target.
    network.add(Dense(1))
    network.compile(
        optimizer=Adam(learning_rate=0.1),
        loss='mse',
        metrics=['mae'],
    )
    return network

dataset = pd.read_csv('insurance.csv')  # load the dataset
features = dataset.iloc[:, 0:6]  # the first six columns are the features
labels = dataset.iloc[:, -1]  # the final column is the prediction target

# One-hot encode the categorical feature columns.
features = pd.get_dummies(features)

# Hold out 33% of the rows for testing; the fixed random_state keeps the
# split repeatable.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.33, random_state=42)

# Standardize the numeric columns and pass the remaining columns through
# unchanged. The scaler is fitted on the training split only and then
# re-used on the test split.
ct = ColumnTransformer(
    [('standardize', StandardScaler(), ['age', 'bmi', 'children'])],
    remainder='passthrough')
features_train = ct.fit_transform(features_train)
features_test = ct.transform(features_test)

# Build the model from the transformed training features.
model = design_model(features_train)
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit an extra "None" line).
model.summary()

# Fit the model using 40 epochs and batch size 1.
model.fit(features_train, labels_train, epochs=40, batch_size=1, verbose=1)

# Evaluate on the held-out test data; evaluate() returns the loss (MSE)
# followed by the compiled metric (MAE).
val_mse, val_mae = model.evaluate(features_test, labels_test, verbose=0)

print("MAE: ", val_mae)


++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

In this lesson, you learned how to both manually and automatically choose hyperparameters of the neural network training procedure in order to select a model with the best predictive performance on a validation set. The hyperparameters we covered in this lesson are

    learning rate
    batch size
    number of epochs
    model size (number of hidden layers/neurons and number of parameters)
    regularization (dropout)

We discussed the concepts of underfitting (having a too simple model to capture data patterns) and overfitting (having a model with too many parameters that learns the training data too well and is unable to generalize). We discussed methods to combat overfitting such as regularization. To avoid underfitting we increased the complexity of our model.

Besides data preprocessing, hyperparameter tuning is probably the most costly and intensive process of neural network training. We covered how to set up grid search and randomized search in Keras in order to automate the process of hyperparameter tuning.

We also showed you how to check the performance of your model against a simple baseline. Baselines give you an idea of whether your model has reasonable performance.

When the batch contains all the training examples, the process is called batch gradient descent. If the batch has one sample, it is called stochastic gradient descent. And finally, when 1 < batch size < number of training points, it is called mini-batch gradient descent. An advantage of using batches is that GPU computation can parallelize the neural network computations within a batch.

How do we choose the batch size for our model? On one hand, a larger batch size will provide our model with better gradient estimates and a solution close to the optimum, but this comes at the cost of computational efficiency and of good generalization performance. On the other hand, a smaller batch size gives a poorer estimate of the gradient, but the learning is performed faster. Finding the “sweet spot” depends on the dataset and the problem, and can be determined through hyperparameter tuning.

In [None]:
# TUNING WITH HYPERPARAMETERS

import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer
from tensorflow.keras.callbacks import EarlyStopping


tf.random.set_seed(42)  # fixed seed so repeated runs are reproducible

dataset = pd.read_csv("insurance.csv")  # read the dataset


def fit_model(model, f_train, l_train, learning_rate, num_epochs):
    """Train ``model`` with early stopping and return the training history.

    Training uses a batch size of 16 and holds out 20% of the supplied data
    for validation. It stops early when the validation loss has not improved
    for 20 consecutive epochs.

    Args:
        model: Keras model to train (expected to be compiled already).
        f_train: training features passed to ``model.fit``.
        l_train: training labels passed to ``model.fit``.
        learning_rate: not used by this function; kept so existing callers
            keep working.
        num_epochs: maximum number of training epochs.

    Returns:
        The history object returned by ``model.fit``.
    """
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=20)
    # Train on the data handed in by the caller rather than on module-level
    # globals, so the function works for any dataset passed to it.
    history = model.fit(
        f_train,
        l_train,
        epochs=num_epochs,
        batch_size=16,
        verbose=0,
        validation_split=0.2,
        callbacks=[es],
    )
    return history

# The final column is the prediction target; the first six columns are the
# features, with categorical ones expanded by one-hot encoding.
labels = dataset.iloc[:, -1]
features = pd.get_dummies(dataset.iloc[:, 0:6])

# Repeatable 67/33 train/test split.
features_train, features_test, labels_train, labels_test = train_test_split(
    features,
    labels,
    test_size=0.33,
    random_state=42,
)

# Standardize the numeric columns and pass every other column through
# untouched. The transformer is fitted on the training split only.
ct = ColumnTransformer(
    transformers=[('standardize', StandardScaler(), ['age', 'bmi', 'children'])],
    remainder='passthrough',
)
features_train = ct.fit_transform(features_train)  # result is an array, not a DataFrame
features_test = ct.transform(features_test)  # same transformation, re-using the fitted scaler

#______________________________________________________________________________________________________________________________________________________________________


import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from plotting import plot
import matplotlib.pyplot as plt

def design_model_dropout(X, learning_rate):
    """Build and compile the regression network with dropout regularization.

    Architecture: input -> Dense(128, relu) -> Dropout(0.1)
    -> Dense(64, relu) -> Dropout(0.2) -> Dense(24, relu) -> Dropout(0.3)
    -> Dense(1). Compiled with MSE loss, an MAE metric, and Adam.

    Args:
        X: 2-D feature array; only ``X.shape[1]`` is read to size the input.
        learning_rate: learning rate for the Adam optimizer.

    Returns:
        The compiled ``Sequential`` model.
    """
    network = Sequential(name="my_first_model")
    network.add(tf.keras.Input(shape=(X.shape[1],)))

    # Each hidden Dense layer is followed by a dropout layer; the dropout
    # rate grows as the layers get narrower. The last entry is the single
    # linear output unit.
    stack = [
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.1),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(24, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(1),
    ]
    for layer in stack:
        network.add(layer)

    network.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='mse',
        metrics=['mae'],
    )
    return network

def design_model_no_dropout(X, learning_rate):
    """Build and compile the regression network without dropout.

    Architecture: input -> Dense(128, relu) -> Dense(64, relu)
    -> Dense(24, relu) -> Dense(1). Compiled with MSE loss, an MAE metric,
    and Adam.

    Args:
        X: 2-D feature array; only ``X.shape[1]`` is read to size the input.
        learning_rate: learning rate for the Adam optimizer.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential(name="my_first_model")
    # Declare the input the same way design_model_dropout does. This avoids
    # InputLayer's `input_shape` keyword (deprecated in newer Keras
    # releases) and no longer shadows the builtin `input`.
    model.add(tf.keras.Input(shape=(X.shape[1],)))
    model.add(layers.Dense(128, activation='relu'))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(24, activation='relu'))
    # Single linear output unit for the regression target.
    model.add(layers.Dense(1))
    opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss='mse', metrics=['mae'], optimizer=opt)
    return model

# Early stopping is applied inside fit_model, so num_epochs is only an
# upper bound on the number of epochs actually run.
learning_rate, num_epochs = 0.001, 200

# First: build and train the network without dropout.
history1 = fit_model(
    design_model_no_dropout(features_train, learning_rate),
    features_train,
    labels_train,
    learning_rate,
    num_epochs,
)

# Second: build and train the network with dropout.
history2 = fit_model(
    design_model_dropout(features_train, learning_rate),
    features_train,
    labels_train,
    learning_rate,
    num_epochs,
)

# Save one learning-curve figure per run.
plot(history1, 'static/images/no_dropout.png')
plot(history2, 'static/images/with_dropout.png')

# Imported only for its side effect; per the original lesson note, this is
# what shows the plots in the browser.
import app



Why do we need a baseline? For example, suppose we have data consisting of 90% dog images and 10% cat images. An algorithm that predicts the majority class for each data point will have 90% accuracy on this dataset! That might sound good, but predicting the majority class is hardly a useful classifier. We need to perform better.

A baseline result is the simplest possible prediction. For some problems, this may be a random result, and for others, it may be the most common class prediction. Since we are focused on a regression task in this lesson, we can use the average or the median of the label distribution (known as central tendency measures) as the result for all predictions.

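Scikit-learn provides DummyRegressor, which serves as a baseline regression algorithm. The lesson chooses the mean (average) as the central tendency measure; note that the code cell below uses strategy="median" instead, which is another valid choice.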

In [None]:
import matplotlib.pyplot as plt
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_absolute_error


# Constant-prediction baseline: always predict the median of the training
# labels, then score it with mean absolute error on the test split.
dummy_regr = DummyRegressor(strategy="median").fit(features_train, labels_train)
y_pred = dummy_regr.predict(features_test)
MAE_baseline = mean_absolute_error(y_true=labels_test, y_pred=y_pred)
print(MAE_baseline)

So far we’ve been manually setting and adjusting hyperparameters to train and evaluate our model. If we didn’t like the result, we changed the hyperparameters to some other values. However, this is rather cumbersome; it would be nice if we could make these changes in a systematic and automated way. Fortunately, there are some strategies for automated hyperparameter tuning, including the following two.

Grid search, or exhaustive search, tries every combination of desired hyperparameter values. If, for example, we want to try learning rates of 0.01 and 0.001 and batch sizes of 10, 30, and 50, grid search will try six combinations of parameters (0.01 and 10, 0.01 and 30, 0.01 and 50, 0.001 and 10, and so on). This obviously gets very computationally demanding when we increase the number of values per hyperparameter or the number of hyperparameters we want to tune.

On the other hand, Random Search goes through random combinations of hyperparameters and doesn’t try them all.

Grid search in Keras

To use GridSearchCV from scikit-learn for regression, we first need to wrap our neural network model in a KerasRegressor:

    model = KerasRegressor(build_fn=design_model)

Then we need to setup the desired hyperparameters grid (we don’t use many values for the sake of speed):

    batch_size = [10, 40]
    epochs = [10, 50]
    param_grid = dict(batch_size=batch_size, epochs=epochs)



Finally, we initialize a GridSearchCV object and fit our model to the data:

    grid = GridSearchCV(estimator = model, param_grid=param_grid, scoring = make_scorer(mean_squared_error, greater_is_better=False))
    
    grid_result = grid.fit(features_train, labels_train, verbose = 0)



Notice that we initialized the scoring parameter with scikit-learn’s make_scorer() function. We evaluate our hyperparameter combinations with the mean squared error, and we set greater_is_better to False because we are searching for the set of hyperparameters that yields the smallest error.

Randomized search in Keras

We first change our hyperparameter grid specification for the randomized search in order to have more options:

    param_grid = {'batch_size': sp_randint(2, 16), 'epochs': sp_randint(10, 100)}



Randomized search will sample integer values for batch_size and epochs from discrete uniform distributions over the half-open intervals [2, 16) and [10, 100), respectively (the upper bound of sp_randint is exclusive), for a fixed number of iterations. In our case, 12 iterations:

    grid = RandomizedSearchCV(estimator = model, param_distributions=param_grid, scoring = make_scorer(mean_squared_error, greater_is_better=False), n_iter = 12)



We cover only simpler cases here, but you can set up GridSearchCV and RandomizedSearchCV to tune over any hyperparameters you can think of: optimizers, number of hidden layers, number of neurons per layer, and so on.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from model import design_model, features_train, labels_train

#------------- GRID SEARCH --------------
# def do_grid_search():
#   batch_size = [6, 64]
#   epochs = [10, 50]
#   model = KerasRegressor(build_fn=design_model)
#   param_grid = dict(batch_size=batch_size, epochs=epochs)
#   grid = GridSearchCV(estimator = model, param_grid=param_grid, scoring = make_scorer(mean_squared_error, greater_is_better=False),return_train_score = True)
#   grid_result = grid.fit(features_train, labels_train, verbose = 0)
#   print(grid_result)
#   print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

#   means = grid_result.cv_results_['mean_test_score']
#   stds = grid_result.cv_results_['std_test_score']
#   params = grid_result.cv_results_['params']
#   for mean, stdev, param in zip(means, stds, params):
#       print("%f (%f) with: %r" % (mean, stdev, param))

#   print("Traininig")
#   means = grid_result.cv_results_['mean_train_score']
#   stds = grid_result.cv_results_['std_train_score']
#   for mean, stdev, param in zip(means, stds, params):
#       print("%f (%f) with: %r" % (mean, stdev, param))

#------------- RANDOMIZED SEARCH --------------
def do_randomized_search():
    """Run a randomized hyperparameter search and print the results.

    Samples 12 random (batch_size, epochs) combinations and scores each
    with negated mean squared error, so that a larger score means a
    smaller error. Prints the fitted search object, the best score and
    parameters, and the mean/std test score of every sampled combination.
    Returns nothing.
    """
    # scipy's randint excludes the upper bound: batch sizes are drawn from
    # 2..15 and epoch counts from 10..99.
    # The key must be 'epochs' (the fit() argument name). The legacy
    # 'nb_epoch' key is tolerated by the Keras scikit-learn wrapper but is
    # not forwarded to fit(), so it would never change the training length.
    param_grid = {'batch_size': sp_randint(2, 16), 'epochs': sp_randint(10, 100)}
    model = KerasRegressor(build_fn=design_model)
    scorer = make_scorer(mean_squared_error, greater_is_better=False)
    grid = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_grid,
        scoring=scorer,
        n_iter=12,
    )
    grid_result = grid.fit(features_train, labels_train, verbose=0)
    print(grid_result)
    print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")

    # Per-combination cross-validation results.
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print(f"{mean:f} ({stdev:f}) with: {param!r}")


# print("-------------- GRID SEARCH --------------------")
# do_grid_search()
print("-------------- RANDOMIZED SEARCH --------------------")
do_randomized_search()
