In [1]:
import numpy as np
from sklearn import preprocessing
import tensorflow as tf

### Load the data

In [2]:
# Read the whole CSV file into one 2-D array of floats.
raw_csv_data = np.loadtxt('Audiobooks_data.csv', delimiter=',')

# The last column holds the targets; column 0 is skipped and every column in between is an input.
targets_all = raw_csv_data[:, -1]
unscaled_inputs_all = raw_csv_data[:, 1:-1]

In [3]:
# Show (rows, columns) of the loaded CSV array.
np.shape(raw_csv_data)

(14084, 12)

### Balance the dataset
The counts of 1s and 0s in the targets must be balanced, i.e. each class should make up about 50% of the samples.
From the Excel file, we see that there are many more zeros than ones, so we keep as many zeros as there are ones and delete the rest.

In [4]:
# Number of rows whose target is 1; this is also how many zero-target rows are kept.
num_one_targets = int(np.sum(targets_all))

# Row positions of every zero target, in their original order.
zero_target_positions = np.flatnonzero(targets_all == 0)
zero_targets_counter = int(zero_target_positions.size)

# Keep the first `num_one_targets` zero rows and mark every later zero row for removal.
indices_to_remove = zero_target_positions[num_one_targets:].tolist()

# Drop the surplus zero rows from inputs and targets alike so the two stay aligned.
unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis=0)
targets_equal_priors = np.delete(targets_all, indices_to_remove, axis=0)

### Standardize the inputs

In [5]:
# Standardize the balanced inputs with sklearn's preprocessing.scale.
# NOTE(review): this runs before the train/validation/test split further down, so the scaling
# statistics presumably include validation and test rows — confirm this is acceptable.
scaled_inputs= preprocessing.scale(unscaled_inputs_equal_priors)

### Shuffle the data
We will be training in batches, so we shuffle the samples first to make sure each batch is a random mix of the data.

In [6]:
# Fixed seed so the shuffle (and therefore the train/validation/test split) is identical on
# every Restart & Run All instead of changing with each run.
SHUFFLE_SEED = 42
rng = np.random.default_rng(SHUFFLE_SEED)

# rng.permutation(n) returns the integers 0..n-1 in a random order.
shuffled_indices = rng.permutation(scaled_inputs.shape[0])

# Index inputs and targets with the same permutation so each row keeps its own target.
shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_equal_priors[shuffled_indices]

### Split the dataset into train, validation, and test

In [7]:
samples_count = shuffled_inputs.shape[0]

# 80% / 10% of the rows (truncated to whole rows); the test split takes whatever remains.
train_samples_count = int(0.8 * samples_count)
validation_samples_count = int(0.1 * samples_count)
test_samples_count = samples_count - (train_samples_count + validation_samples_count)

# Row offsets at which the validation and test portions begin.
validation_start = train_samples_count
test_start = train_samples_count + validation_samples_count

train_inputs = shuffled_inputs[:validation_start]
train_targets = shuffled_targets[:validation_start]

validation_inputs = shuffled_inputs[validation_start:test_start]
validation_targets = shuffled_targets[validation_start:test_start]

test_inputs = shuffled_inputs[test_start:]
test_targets = shuffled_targets[test_start:]

# Balance check: for each split print the number of 1-targets, the split size, and their ratio
# (the ratio should be close to 0.5).
for split_targets, split_count in (
    (train_targets, train_samples_count),
    (validation_targets, validation_samples_count),
    (test_targets, test_samples_count),
):
    ones_in_split = np.sum(split_targets)
    print(ones_in_split, split_count, ones_in_split / split_count)

1806.0 3579 0.5046102263202011
231.0 447 0.5167785234899329
200.0 448 0.44642857142857145


### Save/Load the datasets in *.npz

In [8]:
# Write each split to its own .npz archive, storing the arrays under the keys 'inputs' and 'targets'.
split_archives = (
    ('Audiobooks_data_train', train_inputs, train_targets),
    ('Audiobooks_data_validation', validation_inputs, validation_targets),
    ('Audiobooks_data_test', test_inputs, test_targets),
)
for archive_name, split_inputs, split_targets in split_archives:
    np.savez(archive_name, inputs=split_inputs, targets=split_targets)

We have 10 inputs in the input layer and 2 outputs (0, 1), and we define 2 hidden layers with 50 units each. 50 units provide a lot of complexity, so the model is more sophisticated than linear or logistic regression. We can change 50 to a higher value, but that increases the runtime.

In [9]:
# np.load on an .npz file returns an archive object that keeps the file open until it is closed.
# Each `with` block closes its archive as soon as the arrays have been read and converted;
# astype returns new arrays, so the data stays usable after the archive is closed.

# Training split: inputs as floats, targets as integer class labels.
with np.load('Audiobooks_data_train.npz') as npz:
    train_inputs = npz['inputs'].astype(float)
    train_targets = npz['targets'].astype(int)

# Validation split.
with np.load('Audiobooks_data_validation.npz') as npzv:
    validation_inputs = npzv['inputs'].astype(float)
    validation_targets = npzv['targets'].astype(int)

# Test split.
with np.load('Audiobooks_data_test.npz') as npzt:
    test_inputs = npzt['inputs'].astype(float)
    test_targets = npzt['targets'].astype(int)

In [10]:
input_size = 10          # number of input features; not passed to any layer in this cell
output_size = 2          # two classes: 0 and 1
hidden_layer_size = 50

# Two hidden ReLU layers of `hidden_layer_size` units, then a softmax over the output classes.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
    tf.keras.layers.Dense(output_size, activation='softmax'),
])

# The targets are integer class labels (cast with astype(int) when loaded), which is the label
# format the sparse categorical cross-entropy loss works with.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

batch_size = 100
max_epochs = 100

# Overfitting shows up in the log when the training loss (first loss column) keeps decreasing
# while the validation loss starts increasing, so we set up early stopping.
#
# By default the EarlyStopping callback monitors the validation loss. `patience` configures how
# many consecutive epochs without improvement are tolerated: with patience=2, training stops
# once the validation loss has failed to improve for two epochs in a row, rather than at the
# very first increase.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)

model.fit(train_inputs,
          train_targets,
          batch_size=batch_size,
          epochs=max_epochs,
          callbacks=[early_stopping],  # Keras documents `callbacks` as a list of callback instances
          validation_data=(validation_inputs, validation_targets),
          verbose=2)

Epoch 1/100
36/36 - 0s - loss: 0.5840 - accuracy: 0.6882 - val_loss: 0.5146 - val_accuracy: 0.7360
Epoch 2/100
36/36 - 0s - loss: 0.4650 - accuracy: 0.7558 - val_loss: 0.4481 - val_accuracy: 0.7629
Epoch 3/100
36/36 - 0s - loss: 0.4162 - accuracy: 0.7832 - val_loss: 0.4121 - val_accuracy: 0.7897
Epoch 4/100
36/36 - 0s - loss: 0.3934 - accuracy: 0.7949 - val_loss: 0.4019 - val_accuracy: 0.7696
Epoch 5/100
36/36 - 0s - loss: 0.3754 - accuracy: 0.8078 - val_loss: 0.3839 - val_accuracy: 0.8076
Epoch 6/100
36/36 - 0s - loss: 0.3652 - accuracy: 0.8066 - val_loss: 0.3764 - val_accuracy: 0.8031
Epoch 7/100
36/36 - 0s - loss: 0.3564 - accuracy: 0.8195 - val_loss: 0.3724 - val_accuracy: 0.7830
Epoch 8/100
36/36 - 0s - loss: 0.3514 - accuracy: 0.8142 - val_loss: 0.3707 - val_accuracy: 0.7919
Epoch 9/100
36/36 - 0s - loss: 0.3464 - accuracy: 0.8156 - val_loss: 0.3666 - val_accuracy: 0.7964
Epoch 10/100
36/36 - 0s - loss: 0.3435 - accuracy: 0.8148 - val_loss: 0.3523 - val_accuracy: 0.8076
Epoch 11/

<keras.callbacks.History at 0x20d17b922e0>

The final validation accuracy of the model is around 83%. The prior is 50%, so the algorithm definitely learned a lot.

Around 83% of future customer behavior is classified correctly.

### Test the model

In [11]:
# Evaluate the trained model on the held-out test split; the result unpacks into the loss
# and the accuracy metric requested in model.compile.
test_metrics = model.evaluate(test_inputs, test_targets)
test_loss, test_accuracy = test_metrics



In [12]:
# Report the test loss and the test accuracy (as a percentage), both rounded to two decimals.
report_template = 'Test loss: {0:,.2f}.  Test accuracy: {1:,.2f}%'
test_accuracy_percent = test_accuracy * 100.
print(report_template.format(test_loss, test_accuracy_percent))

Test loss: 0.35.  Test accuracy: 81.03%
