<a href="https://colab.research.google.com/github/Ian3000Dias/AudioBook-Platfrom-Analysis/blob/main/AudioBook_DataScience_BusinessCase.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1> AUDIO BOOKS EXAMPLE

<h2> PREPROCESSING

<h3> Extract Data from CSV

In [1]:
import numpy as np
from sklearn import preprocessing

raw_csv_data = np.loadtxt('https://raw.githubusercontent.com/Ian3000Dias/AudioBook-Platfrom-Analysis/refs/heads/main/Audiobooks_data.csv', delimiter = ',')

unscaled_inputs_all = raw_csv_data[:,1:-1]
targets_all = raw_csv_data[:,-1]

<h3> Shuffle the data

In [2]:
shuffled_indices =np.arange(unscaled_inputs_all.shape[0])
np.random.shuffle(shuffled_indices)

shuffled_inputs = unscaled_inputs_all[shuffled_indices]
shuffled_targets = targets_all[shuffled_indices]


<h3> Balance the dataset

In [3]:
num_one_targets = int(np.sum(shuffled_targets))
zero_targets_counter = 0
indices_to_remove = []

for i in range(shuffled_targets.shape[0]):
    if shuffled_targets[i] == 0:
        zero_targets_counter += 1
        if zero_targets_counter > num_one_targets:
            indices_to_remove.append

In [4]:
unscaled_inputs_equal_priors = np.delete(shuffled_inputs, indices_to_remove, axis = 0)
targets_equal_priors = np.delete(shuffled_targets, indices_to_remove, axis = 0)

<h3> Standardize the inputs

In [5]:
scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors)

<h3> Split the dataset into train, validation, and test

In [6]:
sample_count = shuffled_inputs.shape[0]

train_sample_count = int(0.8 * sample_count)
validation_sample_count = int(0.1 * sample_count)
test_sample_count = sample_count - train_sample_count - validation_sample_count

train_inputs = scaled_inputs[:train_sample_count]
train_targets = targets_equal_priors[:train_sample_count]

validation_inputs = scaled_inputs[train_sample_count:train_sample_count + validation_sample_count]
validation_targets = targets_equal_priors[train_sample_count:train_sample_count + validation_sample_count]

test_inputs = scaled_inputs[train_sample_count + validation_sample_count:]
test_targets = targets_equal_priors[train_sample_count + validation_sample_count:]

print(np.sum(train_targets), train_sample_count, np.sum(train_targets) / train_sample_count)
print(np.sum(validation_targets), validation_sample_count, np.sum(validation_targets) / validation_sample_count)
print(np.sum(test_targets), test_sample_count, np.sum(test_targets) / test_sample_count)

1817.0 11267 0.16126741812372414
217.0 1408 0.15411931818181818
203.0 1409 0.1440738112136267


<h3> Save datasets as .npz

In [7]:
np.savez('Audiobooks_data_train', inputs = train_inputs, targets = train_targets)
np.savez('Audiobooks_data_validation', inputs = validation_inputs, targets = validation_targets)
np.savez('Audiobooks_data_test', inputs = test_inputs, targets = test_targets)

<H2> Creating Machine Learning Algorithm

In [8]:
import tensorflow as tf

<h3> Data

In [12]:
npz = np.load('Audiobooks_data_train.npz')

train_inputs = npz['inputs'].astype(np.float64)
train_targets = npz['targets'].astype(np.int64)

npz = np.load('Audiobooks_data_validation.npz')
validation_inputs, validation_targets = npz['inputs'].astype(np.float64), npz['targets'].astype(np.int64)

npz = np.load('Audiobooks_data_test.npz')
test_inputs, test_targets = npz['inputs'].astype(np.float64), npz['targets'].astype(np.int64)


<h3> Model

In [16]:
input_size = 10
output_size = 2
hidden_layer_size = 50

model = tf.keras.Sequential([
    tf.keras.layers.Dense(hidden_layer_size, activation = 'relu'),
    tf.keras.layers.Dense(hidden_layer_size, activation = 'relu'),
    tf.keras.layers.Dense(output_size, activation = 'softmax')
])

model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

batch_size = 100

max_epochs = 100

early_stopping = tf.keras.callbacks.EarlyStopping(patience = 2
                                                  )

model.fit(train_inputs,
          train_targets,
          batch_size = batch_size,
          epochs = max_epochs,
          callbacks = [early_stopping],
          validation_data = (validation_inputs, validation_targets),
          verbose = 2)

Epoch 1/100
113/113 - 2s - 17ms/step - accuracy: 0.8765 - loss: 0.3607 - val_accuracy: 0.9041 - val_loss: 0.2737
Epoch 2/100
113/113 - 0s - 4ms/step - accuracy: 0.9020 - loss: 0.2712 - val_accuracy: 0.9041 - val_loss: 0.2537
Epoch 3/100
113/113 - 0s - 3ms/step - accuracy: 0.9033 - loss: 0.2580 - val_accuracy: 0.9077 - val_loss: 0.2448
Epoch 4/100
113/113 - 1s - 6ms/step - accuracy: 0.9068 - loss: 0.2492 - val_accuracy: 0.9084 - val_loss: 0.2378
Epoch 5/100
113/113 - 1s - 6ms/step - accuracy: 0.9079 - loss: 0.2433 - val_accuracy: 0.9077 - val_loss: 0.2353
Epoch 6/100
113/113 - 0s - 3ms/step - accuracy: 0.9085 - loss: 0.2415 - val_accuracy: 0.9070 - val_loss: 0.2380
Epoch 7/100
113/113 - 1s - 6ms/step - accuracy: 0.9075 - loss: 0.2374 - val_accuracy: 0.9098 - val_loss: 0.2315
Epoch 8/100
113/113 - 1s - 5ms/step - accuracy: 0.9088 - loss: 0.2352 - val_accuracy: 0.9084 - val_loss: 0.2351
Epoch 9/100
113/113 - 1s - 7ms/step - accuracy: 0.9095 - loss: 0.2368 - val_accuracy: 0.9098 - val_loss

<keras.src.callbacks.history.History at 0x7b7a0f750850>

<h3> Test the model

In [20]:
test_loss, test_accuracy = model.evaluate(test_inputs, test_targets)

[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9101 - loss: 0.2103


In [21]:
print('\nTest loss: {0:.2f}. Test accuracy: {1:.2f}%'.format(test_loss, test_accuracy * 100.))


Test loss: 0.20. Test accuracy: 92.05%
