### Data preprocessing

In [18]:
import numpy as np
from sklearn import preprocessing

In [14]:
# Parse the comma-separated file into a 2-D numeric array (one row per sample).
raw_csv_data = np.loadtxt('data/Audiobooks-data.csv', delimiter=',')

# The last column holds the targets; column 0 is skipped (presumably an
# identifier - confirm against the CSV), and everything in between is an input.
targets_all = raw_csv_data[:, -1]
unscaled_inputs_all = raw_csv_data[:, 1:-1]

# Sanity check: both arrays must have the same number of rows.
(unscaled_inputs_all.shape, targets_all.shape)

((14084, 10), (14084,))

#### Balance the dataset

In [12]:
# Summing the targets counts the rows labelled 1 (this assumes 0/1 labels).
num_one_targets = int(np.sum(targets_all))

# Row positions of every zero-target sample, in their original order.
zero_target_rows = [row for row, target in enumerate(targets_all) if target == 0]
zero_targets_counter = len(zero_target_rows)

# Keep the first `num_one_targets` zero rows and drop the rest, so that both
# classes end up with the same number of samples.
indices_to_remove = zero_target_rows[num_one_targets:]

# axis=0 removes whole rows.
targets_equal_priors = np.delete(targets_all, indices_to_remove, axis=0)
unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis=0)

(unscaled_inputs_equal_priors.shape, targets_equal_priors.shape)

((4474, 10), (4474,))

#### Standardize the inputs

In [15]:
# Standardize every input column (axis=0, the default, works column-wise).
# NOTE(review): the scaling statistics are computed on the full balanced set,
# i.e. before the train/validation/test split further down - confirm that this
# leakage into the held-out splits is acceptable.
scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors, axis=0)

#### Shuffle the data
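* Shuffling keeps each batch from being homogeneous (e.g. made up of consecutive rows that share the same target), which matters because the data is later fed to the model in batches
* `np.arange([start,] stop)` returns evenly spaced values within the given interval; called with a single argument it yields the indices `0 … stop-1`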

In [17]:
# Fixed seed so that Restart & Run All reproduces the same shuffle, and
# therefore the same train/validation/test split and downstream metrics.
SHUFFLE_SEED = 42
shuffle_rng = np.random.RandomState(SHUFFLE_SEED)  # local generator; global NumPy state is untouched

# Inputs and targets are reordered with the same index array, so they must
# describe the same rows.
assert scaled_inputs.shape[0] == targets_equal_priors.shape[0], \
    "inputs and targets must have the same number of rows"

# One shared permutation of the row indices keeps inputs and targets aligned.
shuffled_indices = np.arange(scaled_inputs.shape[0])
shuffle_rng.shuffle(shuffled_indices)

shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_equal_priors[shuffled_indices]

#### Split the dataset into train, validation, and test

In [19]:
samples_count = shuffled_inputs.shape[0]

# 80% train / 10% validation; the test split takes whatever remains so that
# rounding never drops a sample.
train_samples_count = int(0.8 * samples_count)
validation_samples_count = int(0.1 * samples_count)
test_samples_count = samples_count - train_samples_count - validation_samples_count

# Row index at which the validation slice ends and the test slice begins.
validation_end = train_samples_count + validation_samples_count

# Inputs and targets are cut at identical positions to stay aligned.
train_inputs, validation_inputs, test_inputs = (
    shuffled_inputs[:train_samples_count],
    shuffled_inputs[train_samples_count:validation_end],
    shuffled_inputs[validation_end:],
)
train_targets, validation_targets, test_targets = (
    shuffled_targets[:train_samples_count],
    shuffled_targets[train_samples_count:validation_end],
    shuffled_targets[validation_end:],
)

# Per split: number of 1-targets, split size, and share of 1-targets
# (should be close to 0.5 after balancing).
for split_targets, split_count in (
    (train_targets, train_samples_count),
    (validation_targets, validation_samples_count),
    (test_targets, test_samples_count),
):
    print(np.sum(split_targets), split_count, np.sum(split_targets) / split_count)

1774.0 3579 0.4956691813355686
219.0 447 0.4899328859060403
244.0 448 0.5446428571428571


#### Save the three datasets in *.npz

In [20]:
# (file stem, inputs, targets) for each split; np.savez appends the '.npz' extension.
# NOTE(review): 'Audioboos_data_test' is misspelled, but the loading cell reads
# 'Audioboos_data_test.npz', so the two names must be renamed together.
splits_to_save = (
    ('Audiobooks_data_train', train_inputs, train_targets),
    ('Audiobooks_data_validation', validation_inputs, validation_targets),
    ('Audioboos_data_test', test_inputs, test_targets),
)

for file_stem, split_inputs, split_targets in splits_to_save:
    np.savez(file_stem, inputs=split_inputs, targets=split_targets)

### Create the machine learning algorithm

In [21]:
import tensorflow as tf

#### Data

In [24]:
def _load_npz_split(path):
    """Load one saved split and return it as ``(inputs, targets)``.

    Parameters
    ----------
    path : str
        Path of an ``.npz`` archive holding the arrays ``'inputs'`` and ``'targets'``.

    Returns
    -------
    tuple of numpy.ndarray
        ``inputs`` cast to float and ``targets`` cast to int.

    Notes
    -----
    The builtin ``float`` / ``int`` are used for the casts: the ``np.float`` and
    ``np.int`` aliases were deprecated in NumPy 1.20 and removed in NumPy 1.24,
    where they raise ``AttributeError``. They were plain aliases of the
    builtins, so the resulting dtypes are unchanged.
    """
    # The context manager closes the archive once both arrays have been read;
    # astype() returns independent copies, so nothing refers to the closed file.
    with np.load(path) as npz:
        return npz['inputs'].astype(float), npz['targets'].astype(int)


# Train data
train_inputs, train_targets = _load_npz_split('Audiobooks_data_train.npz')

# Validation data
validation_inputs, validation_targets = _load_npz_split('Audiobooks_data_validation.npz')

# Test data
# NOTE(review): 'Audioboos' is misspelled, but it matches the file name written
# by the saving cell - rename both together.
test_inputs, test_targets = _load_npz_split('Audioboos_data_test.npz')


#### Create the model
Outline, optimizers, loss, early stopping and training

* `tf.keras.callbacks.EarlyStopping(patience)` configures the early-stopping mechanism of the algorithm -> `patience` sets how many consecutive epochs without an improvement in the monitored quantity (the validation loss by default) we tolerate before training is stopped

In [27]:
# Network dimensions.
# NOTE(review): input_size is declared but never passed to the model; no input
# layer is defined here, so the input width is not fixed by this cell.
input_size = 10
output_size = 2
hidden_layer_size = 50

# Training hyper-parameters.
batch_size = 100
max_epochs = 100

# Two ReLU hidden layers of equal width followed by a softmax output layer.
hidden_layers = [
    tf.keras.layers.Dense(hidden_layer_size, activation='relu')
    for _ in range(2)
]
output_layer = tf.keras.layers.Dense(output_size, activation='softmax')
model = tf.keras.Sequential(hidden_layers + [output_layer])

# The sparse loss variant takes integer class labels rather than one-hot vectors.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Guard against overfitting: stop once 2 epochs pass without improvement.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)

# Left as the last expression so the cell still displays the returned History.
model.fit(
    x=train_inputs,
    y=train_targets,
    batch_size=batch_size,
    epochs=max_epochs,
    callbacks=[early_stopping],
    validation_data=(validation_inputs, validation_targets),
    verbose=2,
)

Train on 3579 samples, validate on 447 samples
Epoch 1/100
3579/3579 - 0s - loss: 0.6174 - accuracy: 0.6597 - val_loss: 0.5299 - val_accuracy: 0.7472
Epoch 2/100
3579/3579 - 0s - loss: 0.4866 - accuracy: 0.7555 - val_loss: 0.4393 - val_accuracy: 0.8009
Epoch 3/100
3579/3579 - 0s - loss: 0.4257 - accuracy: 0.7776 - val_loss: 0.3928 - val_accuracy: 0.8143
Epoch 4/100
3579/3579 - 0s - loss: 0.3971 - accuracy: 0.7896 - val_loss: 0.3647 - val_accuracy: 0.8143
Epoch 5/100
3579/3579 - 0s - loss: 0.3791 - accuracy: 0.8025 - val_loss: 0.3555 - val_accuracy: 0.8277
Epoch 6/100
3579/3579 - 0s - loss: 0.3683 - accuracy: 0.8025 - val_loss: 0.3341 - val_accuracy: 0.8412
Epoch 7/100
3579/3579 - 0s - loss: 0.3609 - accuracy: 0.8058 - val_loss: 0.3253 - val_accuracy: 0.8300
Epoch 8/100
3579/3579 - 0s - loss: 0.3545 - accuracy: 0.8120 - val_loss: 0.3221 - val_accuracy: 0.8322
Epoch 9/100
3579/3579 - 0s - loss: 0.3489 - accuracy: 0.8153 - val_loss: 0.3170 - val_accuracy: 0.8345
Epoch 10/100
3579/3579 - 0

<tensorflow.python.keras.callbacks.History at 0x2693687f6c8>

#### Test the model

In [31]:
# Score the trained model on the held-out test split; the result unpacks into
# the loss followed by the accuracy.
evaluation = model.evaluate(x=test_inputs, y=test_targets, verbose=0)
test_loss, test_accuracy = evaluation

In [32]:
# Accuracy is a fraction, so scale it to a percentage for the report.
report_template = '\n Test loss: {0:.2f}. Test accuracy: {1:.2f}%'
accuracy_percent = test_accuracy*100.
print(report_template.format(test_loss, accuracy_percent))


 Test loss: 0.34. Test accuracy: 82.59%
