In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras

In [2]:
import tensorflow_datasets as tfds

In [3]:
mnist_dataset, mnist_info = tfds.load(name='mnist', with_info=True, as_supervised=True)

#once we have loaded the dataset, we can easily extract the training and testing dataset with the built references
mnist_train, mnist_test = mnist_dataset['train'], mnist_dataset['test']

# by default, TF has training and testing datasets, but no validation sets
# thus we must split it on our own
# we start by defining the number of validation samples as a % of the train samples
# this is also where we make use of mnist_info (we don't have to count the observations)

num_validation_samples = 0.1 * mnist_info.splits['train'].num_examples

# let's cast this number to an integer, as a float may cause an error along the way
# tf.cast(x,dtype)-casts (converts) a variable into a given data type

num_validation_samples = tf.cast(num_validation_samples, tf.int64)

# let's also store the number of test samples in a dedicated variable (instead of using the mnist_info one)
num_test_samples = mnist_info.splits['test'].num_examples

# once more, we'd prefer an integer (rather than the default float)
num_test_samples = tf.cast(num_test_samples, tf.int64)

# normally, we would like to scale our data in some way to make the result more numerically stable
# in this case we will simply prefer to have inputs between 0 and 1
# let's define a function called: scale, that will take an MNIST image and its label

def scale(image, label):
# we make sure the value is a float
    image = tf.cast(image, tf.float32)
# since the possible values for the inputs are 0 to 255 (256 different shades of grey)
# if we divide each element by 255, we would get the desired result -> all elements will be between 0 and 1
    image /= 255. # the dot at the end means we want the result to be a float
    return image, label

# dataset.map(*function*) appplies a custom transformation to a given dataset. it takes as input a fn 
# which determines the transformation

# we have already decided that we will get the validation data from mnist_train

scaled_train_and_validation_data = mnist_train.map(scale) 
# this will scale the whole train dataset and store it in our new variable

# finally, we scale and batch the test data\n",
# we scale it so it has the same magnitude as the train and validation
# there is no need to shuffle it, because we won't be training on the test data
# there would be a single batch, equal to the size of the test data

test_data = mnist_test.map(scale)

# VID 379 preprocess the data-shuffle and Batch
# SHUFFLING: Keeping the same infor but in a different order

BUFFER_SIZE = 10000
# this BUFFER_SIZE parameter is here for cases when we're dealing with enormous datasets
# then we can't shuffle the whole dataset in one go because we can't fit it all in memory of the computer
# so instead TF only stores BUFFER_SIZE samples in memory at a time and shuffles them
# if BUFFER_SIZE=1 => no shuffling will actually happen
# if BUFFER_SIZE >= num samples => shuffling will happen at once but uniformly
# if 1< BUFFER_SIZE< num_samples, we will be optimizing the computational power of our computer 

shuffled_train_and_validation_data = scaled_train_and_validation_data.shuffle(BUFFER_SIZE)

# once we have scaled and shuffled the data, we can proceed to actually extracting the train and validation
# our validation data would be equal to 10% of the training set, which we've already calculated
# we use the .take() method to take that many samples

# finally, we create a batch with a batch size equal to the total number of validation samples

validation_data = shuffled_train_and_validation_data.take(num_validation_samples)

# similarly, the train_data is everything else, so we skip as many samples as there are in the validation dataset

train_data = shuffled_train_and_validation_data.skip(num_validation_samples)

# determine the batch size
batch_size = 1000

# we can also take advantage of the occasion to batch the train data
# this would be very helpful when we train, as we would be able to iterate over the different batches

# batch size = 1 = SGD
# batch size = number of samples = (single batch) GD
# dataset.batch(batch_size) a method that combines the consecutive elemrnts of a dataset into batches

train_data = train_data.batch(batch_size) # this indicates to our model hw many samples it should take in each batch

# Since we wont be back propagating in the validation data but only forward propagating we dont really need to batch
# Recall that batching was useful in updating wt only once per batch which is like 100 samples rather than in every sample
# hence reducing noise in the training update. so whenever we validate or test we simply forward propagate once
# when batching we simply find the average loss and average accuracy. During validation and testing we want the exact values.
# therefore we should take all the data at once. moreover when forward propagating we dont use that much computational power so
# it is not expensive to calculate the exact values, however the model expects the validation in batch form too

validation_data = validation_data.batch(num_validation_samples)

# batch the test data
test_data = test_data.batch(num_test_samples) # takes next batch (it is the only batch)

# because as_supervized=True, we've got a 2-tuple structure(the mnist data is iterable and in 2-turple format)
# so we must extract and convert the validation inputs and targets appropriately
# our validation data must have the same shape and object properties as the train and test data 

validation_inputs, validation_targets = next(iter(validation_data))

# iter()-creates an object which can be iterated one element at a time (eg in a for or while loop). by default it will
# make the dataset iterable but will not load any data
# next()-loads the next (batch) elements of an iterable object. nd since there is only one batch it will load
# the inputs and targets

In [4]:
input_size = 784
output_size = 10
hidden_layer_size = 200

model = tf.keras.Sequential([                          
                            tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
                            tf.keras.layers.Dense(hidden_layer_size, activation='relu'), # 1st hidden layer
                            tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
                            tf.keras.layers.Dense(hidden_layer_size, activation='tanh'), # 2nd hidden layer
# the final layer is no different, we just make sure to activate it with softmax
                            tf.keras.layers.Dense(output_size, activation='softmax') # output layer
                            ])

In [5]:
# we define the optimizer we'd like to use
# the loss function
# and the metrics we are interested in obtaining at each iteration

# model.compile(optimizer,loss)-configures the model for training
# the string for optimizer are not case sensitive, so we can use small or capital letters

# In tensorflow 2 there are 3 built-in variations of cross-entropy(CE) loss; 
# BINARY CE-used when there is binary encoding
# CATEGORICAL CE-expects that you have one-hot encoded the targets
# SPARSE CATEGORICAL CE- applies one-hot encoding

   
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [6]:
# determine the maximum number of epochs

NUM_EPOCHS = 5

# we fit the model, specifying the
# training data
# the total number of epochs
# and the validation data we just created ourselves in the format: (inputs,targets)

model.fit(train_data, epochs=NUM_EPOCHS, validation_data=validation_data, verbose =2)


Epoch 1/5
54/54 - 30s - loss: 0.6084 - accuracy: 0.8365 - val_loss: 0.2332 - val_accuracy: 0.9325
Epoch 2/5
54/54 - 13s - loss: 0.1943 - accuracy: 0.9421 - val_loss: 0.1565 - val_accuracy: 0.9525
Epoch 3/5
54/54 - 12s - loss: 0.1356 - accuracy: 0.9591 - val_loss: 0.1232 - val_accuracy: 0.9643
Epoch 4/5
54/54 - 12s - loss: 0.1022 - accuracy: 0.9689 - val_loss: 0.0886 - val_accuracy: 0.9725
Epoch 5/5
54/54 - 11s - loss: 0.0797 - accuracy: 0.9759 - val_loss: 0.0790 - val_accuracy: 0.9757


<tensorflow.python.keras.callbacks.History at 0x29fefc683c8>

## setting an early stopping mechanism with "patience" and repeating the TRAINING

In [26]:
# That's where we train the model we have built
# set the batch size
batch_size = 1000

# set a maximum number of training epochs
NUM_EPOCHS = 20

# set an early stopping mechanism
# let's set patience=2, to be a bit tolerant against random validation loss increases

# early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)
early_stopping = tf.keras.callbacks.EarlyStopping()

# we fit the model, specifying the
# training data
# the total number of epochs
# and the validation data we just created ourselves in the format: (inputs,targets)

model.fit(train_data, epochs=NUM_EPOCHS, callbacks=[early_stopping], validation_data = validation_data, verbose =2)


Epoch 1/20
54/54 - 11s - loss: 5.3426e-04 - accuracy: 1.0000 - val_loss: 6.7930e-04 - val_accuracy: 1.0000
Epoch 2/20
54/54 - 12s - loss: 4.6758e-04 - accuracy: 1.0000 - val_loss: 7.1923e-04 - val_accuracy: 1.0000


<tensorflow.python.keras.callbacks.History at 0x29fefa78828>

In [27]:
# we can access the test accuracy using the method evaluate. 
# model.evaluate()- returns the loss value and metrics values fot the model in "test mode"

test_loss, test_accuracy = model.evaluate(test_data)

      1/Unknown - 2s 2s/step - loss: 0.0796 - accuracy: 0.9809

In [28]:
# We can apply some nice formatting if we want to

print('Test loss: {0:.2f}. Test accuracy: {1:.2f}%'.format(test_loss, test_accuracy*100.))

# After we test the model conceptually we are not alloed to change it. if we start changing the model after this point, the test
# data will no longer be a dataset the model has never seen

Test loss: 0.08. Test accuracy: 98.09%


### FEEDLE WITH THE LEARNING RATE

In [29]:
# First, we have to define a custom optimizer (as we did in the TensorFlow intro)
    
# custom_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)

#Then we change the respective argument in model.compile to reflect this
# model.compile(optimizer=custom_optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# While Adam adapts to the problem, if the orders of magnitude are too different, it may not have time to adjust accordingly. 
# We start overfitting before we can reach a neat solution.
# Therefore, for this problem, even 0.02 is a **HIGH** starting learning rate. What if you try a learning rate of = 1?
# It's a good practice to try 0.001, 0.0001, and 0.00001. If it makes no difference, pick whatever, 
# otherwise it makes sense to fiddle with the learning rate.

# Adjust the learning rate. Try a value of 0.0001.
# First, we have to define a custom optimizer (as we did in the TensorFlow intro)
# custom_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)

# Then we change the respective argument in model.compile to reflect this
# model.compile(optimizer=custom_optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Since the learning rate is lower than normal, we may need to adjust the max_epochs (to, say, 50).
# The result is basically the same, but we reach it much slower.
# While Adam adapts to the problem, if the orders of magnitude are too different, it may not have enough time to adjust accordingly

### BATCH SIZE ADJUSTMENT

In [30]:
# Adjust the batch size. Try a batch size of 10000. How does the required time change? What about the accuracy?

# Change batch_size from 100 to 10000.

# A bigger batch size results in slower training. That's what we expected from the theory. We are taking advantage of batching because of the amazing speed increase.

# Notice that the validation accuracy starts from a low number and with 5 epochs actually **finishes** at a lower number. That's because there are **fewer** updates in a single epoch
# Try a batch size of 30,000 or 50,000. That's very close to single batch GD for this problem. What do you think about the speed?You will need to change the max epochs to 100 (for instance), 
# as 5 epochs won't be enough to train the model. What do you think about the speed of optimization?

### ADJUSTING THE ACTIVATION FN

In [31]:
# Fiddle with the activation functions. Try applying a ReLu to the first hidden layer and tanh to the second one. The tanh activation is given by the string: 'tanh'
# Analogically to the previous lecture, we can change the activation functions. This time though, we will use different activators for the different layers.
# The result should not be significantly different. However, with different width and depth, that may change.
# Additional exercise: Try to find a better combination of activation functions

### Feedle with the Activation fn

In [32]:
# Fiddle with the activation functions. Try applying sigmoid transformation to both layers. The sigmoid activation is given by the method: tf.nn.sigmoid()
# Adjust the activations from 'relu' to 'sigmoid'
# Generally, we should reach an inferior solution. That is because relu 'cleans' the noise in the data 
# (think about it - if a value is negative, relu filters it out, while if it is positive, it takes it into account). For the MNIST dataset, 
# we care only about the intensely black and white parts in the images of the digits, so such filtering proves beneficial.\n",
# The sigmoid does not filter the signals as well as relu, but still reaches a respectable result (around 95%)
# Try using softmax activations for all layers. How does the result change? Can you explain why that happens?

### ADJUSTING THE WIDTH

In [33]:
# The *width* (the hidden layer size) of the algorithm. Try a hidden layer size of 200. How does the validation accuracy 
# of the model change? What about the time it took the algorithm to train?
# The validation accuracy is significantly higher (as the algorithm with 50 hidden units was too simple of a model).
#Naturally, it takes the algorithm much longer to train (unless early stopping is triggered too soon).
# A hidden layer size of 500 (and not only) works even better."

### ADJUSTING THE DEPTH

In [34]:
# The *depth* of the algorithm. Add another hidden layer to the algorithm. This is an extremely important exercise!
# How does the validation accuracy change? What about the time it took the algorithm to train?\n",
# Hint: Be careful with the shapes of the weights and the biases.
#Adding another hidden layer to the algorithm is done in the same way as in the lecture.
# tf.keras.layers.Dense(hidden_layer_size, activation='relu')
# We can see that the accuracy of the model does not necessarily improve. This is an important lesson for us. Fiddling with a single hyperparameter may not be enough. 
# Sometimes, a deeper net needs to also be wider in order to have higher accuracy. Maybe you need more epochs?
# ADDITIONAL TASK: Try this new model, but with a wider one (200-500 hidden units). Basically, combine this and the previous exercises**\n",
# In any case, it takes longer for the algorithm to train

### 98% model accuracy (ie using the test dataset)

In [35]:
# Achieving 98.5% accuracy with the methodology we've seen so far is extremely hard. A more realistic exercise would be to achieve 98%+ accuracy.
# However, being pushed to the limit (trying to achieve 98.5%), you have probably learned a whole lot about the machine learning process.
# Here is a link where you can check the results that some leading academics got on the MNIST (using different methodologies)
# https://rodrigob.github.io/are_we_there_yet/build/classification_datasets_results
# After some fine tuning, I decided to brute-force the algorithm and created 10 hidden layers with 5000 hidden units each.
# hidden_layer_size = 5000
# batch_size = 150
# NUM_EPOCHS = 10
# All activation functions are ReLu.
# There are better solutions using this methodology, this one is just superior to the one in the lessons. 
# Due to the width and the depth of the algorithm, it took my computer 3 hours and 50 mins to train it."

### ADJUSTING THE width and depth

In [36]:
# The *width and depth* of the algorithm. Add as many additional layers as you need to reach 5 hidden layers. Moreover, adjust the width of the algorithm as you find suitable. 
# How does the validation accuracy change? What about the time it took the algorithm to train?

#This exercise is pretty much the same as the previous one. However, it will get us to a much deeper net.
# As we noted in the previous exercise, you a deeeper net may need to be wider to produce better results.
# We tried with 1000 hidden units in each layer and 5 hidden layers.
# The result (as you can see below) is that our model's training was going very well, until it overfit. It did so by quite a lot.\n",
# It took my personal computer around 5-6 minutes to train the model.
# What if you have more epochs?

# BUSINESS CASE PREPROCESSING EXERCISE

In [37]:
import numpy as np

raw_csv_data = np.loadtxt('Audiobooks_data.csv', delimiter=',')

# Now you can work with the loaded data using NumPy's array operations
print(raw_csv_data)  # Print the loaded data
print(raw_csv_data.shape)  # Get the dimensions of the array

# Breakdown:

# np.loadtxt(): This function is specifically designed to load data from text files into a NumPy array.
# 'Audiobooks_data.csv': This is the path to the CSV file you want to load. 
# Make sure the file is in the same directory as your Python script or provide the full path.
# delimiter=',': This argument specifies that the values in the CSV file are separated by commas.

# What this code does:

# Reads the "Audiobooks_data.csv" file.
# Interprets the data based on the comma delimiter.
# Stores the loaded data into a NumPy array named raw_csv_data.

[[9.9400e+02 1.6200e+03 1.6200e+03 ... 5.0000e+00 9.2000e+01 0.0000e+00]
 [1.1430e+03 2.1600e+03 2.1600e+03 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [2.0590e+03 2.1600e+03 2.1600e+03 ... 0.0000e+00 3.8800e+02 0.0000e+00]
 ...
 [3.1134e+04 2.1600e+03 2.1600e+03 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [3.2832e+04 1.6200e+03 1.6200e+03 ... 0.0000e+00 9.0000e+01 0.0000e+00]
 [2.5100e+02 1.6740e+03 3.3480e+03 ... 0.0000e+00 0.0000e+00 1.0000e+00]]
(14084, 12)


## note that u can use this same code to preprocess any dataset that has two classes

In [38]:
# np.load:

# Purpose: Primarily used to load NumPy arrays that have been saved in binary format (typically using np.save or np.savez).
# Data Type: Specifically designed for loading NumPy arrays, which are efficient for numerical computations.   


# Use np.load when:

# You have data saved as NumPy arrays in binary format.
# You primarily need to work with numerical data and require efficient computations.

# Use pd.read_csv when:

# You are working with CSV files.
# You need to handle heterogeneous data types.
# You require data cleaning, manipulation, and analysis capabilities beyond those offered by NumPy.

# unscaled_inputs_all = raw_csv_data[:,1:-1]

# Explanation:

# raw_csv_data: This is assumed to be a 2D NumPy array containing the data loaded from a CSV file.
# [:,1:-1]: This is the core of the slicing operation:
# :: This colon selects all rows of the raw_csv_data array.
# 1: This indicates that the slicing starts from the second column (index 1, as Python uses zero-based indexing).
# -1: This indicates the ending index for column selection. In Python, negative indices count from the end of the array or DataFrame. 
# -1 represents the last column. Therefore, 1:-1 selects all columns from the second column to the second-to-last column.

In [39]:
# The inputs are all columns in the csv, except for the first one [:,0] which is just the arbitrary customer IDs that bear no useful information
# and the last one [:,-1] (which is our targets)

unscaled_inputs_all = raw_csv_data[:,1:-1]

# The targets are in the last column. That's how datasets are conventionally organized.

targets_all = raw_csv_data[:,-1]

## Balance the dataset

In [40]:
# Count how many targets are 1 (meaning that the customer did convert)
num_one_targets = int(np.sum(targets_all))

# Set a counter for targets that are 0 (meaning that the customer did not convert)
zero_targets_counter = 0
# We want to create a balanced dataset, so we will have to remove some input/target pairs.
# Declare a variable that will do that:

indices_to_remove = []
# Count the number of targets that are 0.
# Once there are as many 0s as 1s, mark entries where the target is 0.
for i in range(targets_all.shape[0]):
    if targets_all[i] == 0:
        zero_targets_counter += 1
        if zero_targets_counter > num_one_targets:
            indices_to_remove.append(i)
            
# Create two new variables, one that will contain the inputs, and one that will contain the targets.
# We delete all indices that we marked \"to remove\" in the loop above.

unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis=0)

targets_equal_priors = np.delete(targets_all, indices_to_remove, axis=0)

## Standardize the inputs

In [41]:
# The result will be interesting as stdizing them will grtly improve the algo
from sklearn import preprocessing


scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors)

## Shuffle the data

In [42]:
# When the data was collected it was actually arranged by date
# Shuffle the indices of the data, so the data is not arranged in any way when we feed it.
# Since we will be batching, we want the data to be as randomly spread out as possible
shuffled_indices = np.arange(scaled_inputs.shape[0])
np.random.shuffle(shuffled_indices)

# Use the shuffled indices to shuffle the inputs and targets.

shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_equal_priors[shuffled_indices]

## Split the dataset into train, validation, and test

In [43]:
# Count the total number of samples

samples_count = shuffled_inputs.shape[0]
# Count the samples in each subset, assuming we want 80-10-10 distribution of training, validation, and test.
# Naturally, the numbers are integers.

train_samples_count = int(0.8 * samples_count)
validation_samples_count = int(0.1 * samples_count)

# The 'test' dataset contains all remaining data.\n",
test_samples_count = samples_count - train_samples_count - validation_samples_count

# Create variables that record the inputs and targets for training
# In our shuffled dataset, they are the first train_samples_count observations

train_inputs = shuffled_inputs[:train_samples_count]
train_targets = shuffled_targets[:train_samples_count]

# Create variables that record the inputs and targets for validation.
# They are the next validation_samples_count observations, folllowing the train_samples_count we already assigned
validation_inputs = shuffled_inputs[train_samples_count:train_samples_count+validation_samples_count]
validation_targets = shuffled_targets[train_samples_count:train_samples_count+validation_samples_count]

# Create variables that record the inputs and targets for test.
# They are everything that is remaining.

test_inputs = shuffled_inputs[train_samples_count+validation_samples_count:]
test_targets = shuffled_targets[train_samples_count+validation_samples_count:]
    
# We balanced our dataset to be 50-50 (for targets 0 and 1), but the training, validation, and test were
# taken from a shuffled dataset. Check if they are balanced, too. Note that each time you rerun this code
# you will get different values, as each time they are shuffled randomly
# Normally you preprocess ONCE, so you need not rerun this code once it is done
# If you rerun this whole sheet, the npzs will be overwritten with your newly preprocessed data.
# Print the number of targets that are 1s, the total number of samples, and the proportion for training, validation, and test

print(np.sum(train_targets), train_samples_count, np.sum(train_targets) / train_samples_count)
print(np.sum(validation_targets), validation_samples_count, np.sum(validation_targets) / validation_samples_count)
print(np.sum(test_targets), test_samples_count, np.sum(test_targets) / test_samples_count)

# THE PRIORS LOOK OK AS THEY ARE almost ALL EQUAL. NOTE THAT 52% OR 55% FOR TWO CLASSES ARE ALSO FINE HWWEVER WE WANT TO BE
# CLOSE TO 50% AS POSSIBLE

1796.0 3579 0.5018161497625034
221.0 447 0.49440715883668906
220.0 448 0.49107142857142855


## Save the three datasets in .npz.

In [44]:
# Save the three datasets in .npz.
    
np.savez('Audiobooks_data_train', inputs=train_inputs, targets=train_targets)
np.savez('Audiobooks_data_validation', inputs=validation_inputs, targets=validation_targets)
np.savez('Audiobooks_data_test', inputs=test_inputs, targets=test_targets)