# Business Case for AudioBooks Preprocessing

### Pre-processing

#### 1. Extract the data from the csv

In [1]:
import numpy as np

# use sklearn's preprocessing module to standardize the inputs
# we almost always standardize all inputs, as doing so significantly improves the quality of the algorithm

from sklearn import preprocessing

# read the whole comma-separated file into a single 2-D numeric array
raw_csv_data = np.loadtxt('Audiobooks-data.csv', delimiter=',')

# the last column holds the targets
targets_all = raw_csv_data[:, -1]

# column 0 is an arbitrarily chosen ID, so it is dropped;
# every column between the ID and the targets is a model input
unscaled_inputs_all = raw_csv_data[:, 1:-1]

#### 2. Balance the dataset

In [2]:
# balance the dataset by undersampling the majority class:
# keep every row whose target is 1 and only as many 0-target rows as there are 1s

# assuming the targets are only 0s and 1s, their sum is the number of 1-targets
num_one_targets = int(np.sum(targets_all))

# row positions of all the 0-targets, in their original order
# (vectorized replacement for looping over every index and counting the zeros by hand)
zero_target_indices = np.flatnonzero(targets_all == 0)
zero_targets_counter = int(zero_target_indices.size)

# the first num_one_targets zeros are kept, every later zero is marked for removal
# if the zeros do not outnumber the ones, the slice is empty and nothing is removed
indices_to_remove = zero_target_indices[num_one_targets:].tolist()

# np.delete(array, obj, axis) returns a copy of the array without the entries selected by obj along the axis
# the same indices are used for the inputs and the targets, so the rows stay aligned
unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis=0)
targets_equal_prios = np.delete(targets_all, indices_to_remove, axis=0)

#### 3. Standardize the inputs

In [3]:
# `preprocessing` is the module imported from sklearn in the first cell
# with its default arguments, preprocessing.scale standardizes each input variable (column) separately,
# so every input ends up centered at zero mean and scaled to unit variance
# NOTE(review): the scaling is computed on the whole balanced dataset before the train/validation/test split,
# so the validation and test rows contribute to the scaling statistics - confirm this is acceptable

scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors)

#### 4. Shuffle the data

In [4]:
# shuffle the inputs and the targets: the information stays the same, only the row order becomes random
# the original dataset may have been collected in some order (e.g. by date of purchase)
# since the data will be fed to the model in batches, it should be spread as randomly as possible
# with ordered data, each batch would be homogeneous inside (roughly one day of purchases),
# while different batches would be very heterogeneous (promotions, day-of-the-week effects and so on)
# that would confuse the stochastic gradient descent, which averages the loss within each batch

# build the row positions 0 .. number_of_rows - 1
# np.arange(stop) returns evenly spaced integers in the interval [0, stop)
shuffled_indices = np.arange(len(scaled_inputs))

# np.random.shuffle(x) reorders the given sequence in place
np.random.shuffle(shuffled_indices)

# index the inputs and the targets with the same permutation, so each row keeps its own target
shuffled_inputs, shuffled_targets = (
    scaled_inputs[shuffled_indices],
    targets_equal_prios[shuffled_indices],
)

#### 5. Split the dataset into train, validation and test

In [5]:
samples_count = shuffled_inputs.shape[0]

# sizes of the three datasets: an 80-10-10 split for train, validation and test
# the test set takes whatever is left, so the three sizes always add up to samples_count
train_samples_count = int(0.8 * samples_count)
validation_samples_count = int(0.1 * samples_count)
test_samples_count = samples_count - train_samples_count - validation_samples_count

# row positions at which the validation set and the test set begin
validation_start = train_samples_count
test_start = train_samples_count + validation_samples_count

# np.split cuts along axis 0 at the given positions:
# [0, validation_start) is train, [validation_start, test_start) is validation, the rest is test
# the inputs and the targets are cut at the same positions, so they stay aligned
train_inputs, validation_inputs, test_inputs = np.split(
    shuffled_inputs, [validation_start, test_start]
)
train_targets, validation_targets, test_targets = np.split(
    shuffled_targets, [validation_start, test_start]
)

# check that each subset is balanced, not just the dataset as a whole
# for every subset print: the number of ones, the number of samples, and the proportion of ones
# in the recorded run the subset sizes were 3579 + 447 + 448 = 4474 observations
for subset_targets, subset_size in (
    (train_targets, train_samples_count),
    (validation_targets, validation_samples_count),
    (test_targets, test_samples_count),
):
    ones_in_subset = np.sum(subset_targets)
    print(ones_in_subset, subset_size, ones_in_subset / subset_size)

1791.0 3579 0.5004191114836547
221.0 447 0.49440715883668906
225.0 448 0.5022321428571429


#### 6. Save the three datasets in .npz

In [6]:
# save each subset to its own .npz archive, with the arrays stored under the keys 'inputs' and 'targets'
# the indices are shuffled randomly, so every run of the notebook puts different samples
# (and slightly different class proportions) into the train, validation and test files
for archive_name, subset_inputs, subset_targets in (
    ('Audiobooks_data_train', train_inputs, train_targets),
    ('Audiobooks_data_validation', validation_inputs, validation_targets),
    ('Audiobooks_data_test', test_inputs, test_targets),
):
    np.savez(archive_name, inputs=subset_inputs, targets=subset_targets)