# Audiobooks business case

In [1]:
import numpy as np
from sklearn import preprocessing
import tensorflow as tf

## Extract the data

In [2]:
raw_data = np.loadtxt('Audiobooks_data.csv',delimiter = ',')
print(raw_data.shape)

(14084, 12)


In [3]:
unscaled_inputs = raw_data[:,1:-1]
all_targets = raw_data[:,-1]

## Balance the dataset

In [4]:
#Lets see howmany 0s and 1s do we have in the targets
ones_targets = [lambda x: x for x in all_targets if x==1]
zeros_targets = [lambda x: x for x in all_targets if x==0]

In [5]:
print("The number of targets is: ",len(all_targets))
print("The number of ones is: ",len(ones_targets))
print("The number of zeros is: ",len(zeros_targets))
print("The percentage of ones/zeros: ",round(len(ones_targets)/len(zeros_targets)*100,2))

The number of targets is:  14084
The number of ones is:  2237
The number of zeros is:  11847
The percentage of ones/zeros:  18.88


In [6]:
remove_list = []
counter = 0

for item  in range(len(all_targets)):
    if all_targets[item] == 0:
        counter +=1
        if counter > len(ones_targets):
            remove_list.append(item)
    
    
balanced_data =np.delete(unscaled_inputs,remove_list, axis = 0)
targets_equal_priors = np.delete(all_targets, remove_list, axis=0)

## Standardizing

In [7]:
scaled_inputs = preprocessing.scale(balanced_data)

## Shuffle the data

In [8]:
shuffled_ind = np.arange(scaled_inputs.shape[0])
np.random.shuffle(shuffled_ind)

In [9]:
shuffled_inputs = scaled_inputs[shuffled_ind]
shuffled_targets = targets_equal_priors[shuffled_ind]

## Split the dataset

In [10]:
num_train_sample =int( 0.8 * len(shuffled_inputs))
num_validation_sample =int( 0.1 * len(shuffled_inputs))
num_test_sample = int( 0.1 * len(shuffled_inputs))
print(num_train_sample)
print(num_validation_sample)
print(num_test_sample)

3579
447
447


In [11]:
train_input = shuffled_inputs[:num_train_sample]
train_output =shuffled_targets[:num_train_sample]
#validation samples starts from the end of a =training inputs a ends in a + number of validation samples 
validation_input = shuffled_inputs[num_train_sample:num_train_sample+num_validation_sample ]
validation_output = shuffled_targets[num_train_sample:num_train_sample+num_validation_sample ]

test_input = shuffled_inputs[num_train_sample+num_validation_sample:]
test_output = shuffled_targets[num_train_sample+num_validation_sample:]

In [12]:
print(len(train_input), len(train_output))
print(len(validation_input), len(validation_output))
print(len(test_input), len(test_output))

3579 3579
447 447
448 448


In [13]:
#Lets see how many 0s and 1s do we have in each dataset
print(np.sum(train_output)/len(train_input))
print(np.sum(validation_output)/len(validation_output))
print(np.sum(test_output)/ len(test_output))

0.49399273540095
0.5100671140939598
0.5379464285714286


#### The result shows that we have got three balanced samples to train validate and test the dataset

In [14]:
np.savez('Audiobooks_train',inputs = train_input, targets = train_output )
np.savez('Audiobooks_validation',inputs = validation_input, targets = validation_output )
np.savez('Audiobooks_test',inputs = test_input, targets = test_output )

## Loading Data

In [15]:
npz_train = np.load('Audiobooks_train.npz')
train_inputs = npz_train['inputs'].astype(np.float)
train_targets = npz_train['targets'].astype(np.int)

npz_validation = np.load('Audiobooks_validation.npz')
validation_inputs = npz_validation['inputs'].astype(np.float)
validation_targets = npz_validation['targets'].astype(np.int)

npz_test = np.load('Audiobooks_test.npz')
test_inputs = npz_test['inputs'].astype(np.float)
test_targets = npz_test['targets'].astype(np.int)


## Neural Networks 

In [16]:
input_size = 10
output_size = 2  # 0,1
hidden_layer_size = 100

model = tf.keras.Sequential([
    
    tf.keras.layers.Dense(hidden_layer_size, activation = 'relu'), #the first hidden layer
    tf.keras.layers.Dense(hidden_layer_size, activation = 'relu'), #the second hidden layer
    tf.keras.layers.Dense(output_size, activation = 'softmax') # the output is a calssifier
    
])

model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy',metrics = ['accuracy'])

In [17]:
batch_size = 100
max_epochs = 100
early_stop = tf.keras.callbacks.EarlyStopping(patience =2)

model.fit(
    train_inputs,
    train_targets,
    batch_size = batch_size ,
    callbacks = [early_stop],
    epochs = max_epochs,
    validation_data = (validation_inputs, validation_targets),
    verbose = 2
        )


Epoch 1/100
36/36 - 2s - loss: 0.5342 - accuracy: 0.7332 - val_loss: 0.4393 - val_accuracy: 0.7494
Epoch 2/100
36/36 - 0s - loss: 0.4171 - accuracy: 0.7835 - val_loss: 0.3823 - val_accuracy: 0.7987
Epoch 3/100
36/36 - 0s - loss: 0.3843 - accuracy: 0.7946 - val_loss: 0.3532 - val_accuracy: 0.7987
Epoch 4/100
36/36 - 0s - loss: 0.3644 - accuracy: 0.8027 - val_loss: 0.3434 - val_accuracy: 0.8121
Epoch 5/100
36/36 - 0s - loss: 0.3591 - accuracy: 0.8030 - val_loss: 0.3366 - val_accuracy: 0.8054
Epoch 6/100
36/36 - 0s - loss: 0.3497 - accuracy: 0.8114 - val_loss: 0.3305 - val_accuracy: 0.8233
Epoch 7/100
36/36 - 0s - loss: 0.3475 - accuracy: 0.8019 - val_loss: 0.3324 - val_accuracy: 0.8098
Epoch 8/100
36/36 - 0s - loss: 0.3437 - accuracy: 0.8125 - val_loss: 0.3288 - val_accuracy: 0.8076
Epoch 9/100
36/36 - 0s - loss: 0.3374 - accuracy: 0.8192 - val_loss: 0.3185 - val_accuracy: 0.8255
Epoch 10/100
36/36 - 0s - loss: 0.3334 - accuracy: 0.8215 - val_loss: 0.3158 - val_accuracy: 0.8345
Epoch 11/

<tensorflow.python.keras.callbacks.History at 0x22c87dd1af0>

## Testing

In [18]:
test_loss, test_accuracy = model.evaluate(test_inputs, test_targets)



In [19]:
print('\nTest loss: {0:.2f}. Test accuracy: {1:.2f}%'.format(test_loss, test_accuracy*100.))


Test loss: 0.36. Test accuracy: 81.47%
