In [1]:
# %% [markdown]
# # Audiobooks business case

# %% [markdown]
# ## Preprocess the data. Balance the dataset. Create 3 datasets: training, validation, and test. Save the newly created sets in a tensor friendly format (e.g. *.npz)
# 
# Since we are dealing with real life data, we will need to preprocess it a bit. This is the relevant code, which is not that hard, but is crucial to creating a good model.
# 
# If you want to know how to do that, go through the code. In any case, this should do the trick for most datasets organized in the way: many inputs, and then 1 cell containing the targets (supervised learning datasets). Keep in mind that a specific problem may require additional preprocessing.
# 
# Note that we have removed the header row, which contains the names of the categories. We simply want the data.

# %% [markdown]
# ### Extract the data from the csv

# %%
import numpy as np
from sklearn import preprocessing

# Read the comma-separated dataset from disk into a single NumPy array
# (np.loadtxt parses every field as a float by default).
CSV_PATH = '001 Audiobooks-data.csv'
raw_csv_data = np.loadtxt(CSV_PATH, delimiter=',')

# Column layout used below: column 0 is skipped, the last column is the label,
# and everything strictly in between is a model input.
FIRST_FEATURE_COLUMN, TARGET_COLUMN = 1, -1

# Input features: all columns except the first and the last.
unscaled_inputs_all = raw_csv_data[:, FIRST_FEATURE_COLUMN:TARGET_COLUMN]

# Targets (labels): the last column only.
targets_all = raw_csv_data[:, TARGET_COLUMN]

# %% [markdown]
# ### Balance the dataset

# %%
# Balancing strategy: keep every row whose target is 1 and only as many
# target-0 rows as there are target-1 rows; the surplus 0-rows are dropped.

# Summing the targets gives the number of rows whose target is 1.
num_one_targets = int(np.sum(targets_all))

# Positions of all rows whose target is 0, in their original order.
zero_target_positions = np.flatnonzero(targets_all == 0)

# Total number of 0-targets encountered.
zero_targets_counter = int(zero_target_positions.size)

# The first 'num_one_targets' zeros are kept; every later zero is surplus.
indices_to_remove = zero_target_positions[num_one_targets:]

# Drop the surplus rows from inputs and targets using the same index set,
# so the two arrays stay aligned row for row.
unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis=0)
targets_equal_priors = np.delete(targets_all, indices_to_remove, axis=0)

# %% [markdown]
# ### Standardize the inputs

# %%
# Standardize every feature column to zero mean and unit variance so that all
# inputs live on a comparable scale during training.
# NOTE(review): the statistics are computed over the whole balanced dataset, i.e.
# before the train/validation/test split performed later in this notebook, so
# validation and test rows influence the scaling.
scaled_inputs = preprocessing.scale(
    unscaled_inputs_equal_priors,
    axis=0,
    with_mean=True,
    with_std=True,
)

# %% [markdown]
# ### Shuffle the data

# %%
# Use a dedicated, seeded random generator so the shuffle -- and therefore the
# membership of the train/validation/test splits and the class counts printed
# after the split -- is identical on every Restart & Run All.
SHUFFLE_SEED = 42
shuffle_rng = np.random.default_rng(SHUFFLE_SEED)

# Inputs and targets must describe the same rows; otherwise applying one shared
# permutation to both would silently misalign samples and labels.
assert scaled_inputs.shape[0] == targets_equal_priors.shape[0]

# One random permutation of the row indices 0..n-1.
shuffled_indices = shuffle_rng.permutation(scaled_inputs.shape[0])

# Apply the same permutation to inputs and targets so each sample keeps its label.
shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_equal_priors[shuffled_indices]

# %% [markdown]
# ### Split the data into training, validation, and test 

# %%
# --- Sizes -----------------------------------------------------------------
# Total number of samples available after balancing and shuffling.
samples_count = shuffled_indices.shape[0]

# 80% of the samples go to training and 10% to validation (both truncated to
# whole samples); the test set takes whatever remains so no sample is lost.
train_samples_count = int(0.8 * samples_count)
validation_samples_count = int(0.1 * samples_count)
test_samples_count = samples_count - train_samples_count - validation_samples_count

# --- Slicing ---------------------------------------------------------------
# Cut points along the sample axis: [0, train_end) is training,
# [train_end, validation_end) is validation, [validation_end, end) is test.
train_end = train_samples_count
validation_end = train_end + validation_samples_count
cut_points = [train_end, validation_end]

train_input, validation_input, test_input = np.split(shuffled_inputs, cut_points)
train_targets, validation_targets, test_targets = np.split(shuffled_targets, cut_points)

# --- Balance check -----------------------------------------------------------
# For each split print: number of 1-targets, split size, share of 1-targets.
# A share near 0.5 means the split is still roughly balanced.
for subset_targets, subset_size in (
    (train_targets, train_samples_count),
    (validation_targets, validation_samples_count),
    (test_targets, test_samples_count),
):
    positives = np.sum(subset_targets)
    print(positives, subset_size, positives / subset_size)

# The data is now divided into training, validation, and test sets in roughly
# 80% / 10% / 10% proportions, used for fitting, tuning/monitoring, and final evaluation.


# %%
# Training Data (sample output from one run; the exact counts change whenever the shuffle changes): 1798.0 3579 0.502374965074043

# 1798.0: The sum of the target values for the training set. Since the target values are binary (0 or 1), this means there are 1798 samples with the target value of 1.
# 3579: The total number of samples in the training set.
# 0.502374965074043: The proportion of samples with the target value of 1 in the training set. 
# This is calculated as 1798 / 3579, which is approximately 50.24%. 
# This indicates that the training set is roughly balanced with respect to the target classes (1s and 0s).

# Validation Data: 210.0 447 0.4697986577181208

# 210.0: The sum of the target values for the validation set. There are 210 samples with the target value of 1.
# 447: The total number of samples in the validation set.
# 0.4697986577181208: The proportion of samples with the target value of 1 in the validation set. 
# This is calculated as 210 / 447, which is approximately 46.98%. 
# This shows that the validation set is slightly imbalanced but still reasonably close to 50%.

# Test Data: 229.0 448 0.5111607142857143

# 229.0: The sum of the target values for the test set. There are 229 samples with the target value of 1.
# 448: The total number of samples in the test set.
# 0.5111607142857143: The proportion of samples with the target value of 1 in the test set. 
# This is calculated as 229 / 448, which is approximately 51.12%. 
# This indicates that the test set is also roughly balanced.

# %% [markdown]
# ### Save the three datasets in *.npz

# %%
# Write each split to its own .npz archive. Every archive stores two arrays
# under the keys 'inputs' and 'targets'.
splits_to_save = {
    'Audiobooks_data_train.npz': (train_input, train_targets),
    'Audiobooks_data_validation.npz': (validation_input, validation_targets),
    'Audiobooks_data_test.npz': (test_input, test_targets),
}
for npz_path, (split_inputs, split_targets) in splits_to_save.items():
    np.savez(npz_path, inputs=split_inputs, targets=split_targets)

# %% [markdown]
# ## Problem
# 
# You are given data from an Audiobook app. Logically, it relates only to the audio versions of books. Each customer in the database has made a purchase at least once, that's why he/she is in the database. We want to create a machine learning algorithm based on our available data that can predict if a customer will buy again from the Audiobook company.
# 
# The main idea is that if a customer has a low probability of coming back, there is no reason to spend any money on advertising to him/her. If we can focus our efforts ONLY on customers that are likely to convert again, we can make great savings. Moreover, this model can identify the most important metrics for a customer to come back again. Identifying new customers creates value and growth opportunities.
# 
# You have a .csv summarizing the data. There are several variables: Customer ID, Book length in mins_avg (average of all purchases), Book length in minutes_sum (sum of all purchases), Price Paid_avg (average of all purchases), Price paid_sum (sum of all purchases), Review (a Boolean variable), Review (out of 10), Total minutes listened, Completion (from 0 to 1), Support requests (number), and Last visited minus purchase date (in days).
# 
# So these are the inputs (excluding customer ID, as it is completely arbitrary. It's more like a name, than a number).
# 
# The targets are a Boolean variable (so 0, or 1). We are taking a period of 2 years in our inputs, and the next 6 months as targets. So, in fact, we are predicting if: based on the last 2 years of activity and engagement, a customer will convert in the next 6 months. 6 months sounds like a reasonable time. If they don't convert after 6 months, chances are they've gone to a competitor or didn't like the Audiobook way of digesting information. 
# 
# The task : create a machine learning algorithm, which is able to predict if a customer will buy again. 
# 
# This is a classification problem with two classes: won't buy and will buy, represented by 0s and 1s. 

# %% [markdown]
# ### Importing the relevant libraries

# %%
# import numpy as np  #Already imported
import tensorflow as tf

# %% [markdown]
# ### Data

# %%
def _load_split(npz_path):
    """Load one saved dataset split and return ``(inputs, targets)``.

    Parameters
    ----------
    npz_path : str
        Path of an .npz archive containing the arrays 'inputs' and 'targets'.

    Returns
    -------
    tuple of numpy.ndarray
        The inputs cast to float64 and the targets cast to int32.
    """
    # np.load on an .npz archive returns an NpzFile that keeps the underlying
    # file open and reads arrays lazily; the context manager closes it once
    # both arrays have been materialized by astype().
    with np.load(npz_path) as npz:
        inputs = npz['inputs'].astype(np.float64)
        targets = npz['targets'].astype(np.int32)
    return inputs, targets


# Note: these assignments rebind train_targets / validation_targets / test_targets
# (created as float arrays during preprocessing) to their int32 versions from disk.
train_inputs, train_targets = _load_split('Audiobooks_data_train.npz')
validation_inputs, validation_targets = _load_split('Audiobooks_data_validation.npz')
test_inputs, test_targets = _load_split('Audiobooks_data_test.npz')

# %% [markdown]
# ### Model

# %% [markdown]
# Outline, Optimizers, Loss, Early Stopping and Training

# %%
input_size = 10  # Number of input features per sample
output_size = 2  # Number of output classes (0 and 1)
hidden_layer_size = 50  # Number of units in each hidden layer

# Define the model with an explicit Input layer so the expected feature count is
# declared up front: the model is built immediately and a feature-count mismatch
# in the training data is reported as a shape error instead of going unnoticed.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(input_size,)),  # Declares the shape of one input sample.
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),  # First hidden layer with ReLU activation function.
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),  # Second hidden layer with ReLU activation function.
    tf.keras.layers.Dense(output_size, activation='softmax')  # Output layer with softmax activation function for probability distribution.
])

# Integer class labels + softmax output -> sparse categorical cross-entropy.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

batch_size = 100  # Number of samples per gradient update
max_epochs = 100  # Number of epochs to train for

# Baseline run: no callbacks are passed, so training always runs for all
# 'max_epochs' epochs; validation metrics are reported after every epoch.
model.fit(train_inputs, train_targets, batch_size=batch_size, epochs=max_epochs, validation_data=(validation_inputs, validation_targets), verbose=2)

# %%
# Here’s a summary of what to look for in your results:

# Training vs. Validation Accuracy: If training accuracy continues to improve while validation accuracy plateaus or decreases, this is a sign of overfitting.
# Training vs. Validation Loss: If training loss continues to decrease while validation loss plateaus or increases, this is another sign of overfitting.

# Based on your results:

# Training Accuracy: Increases steadily, reaching around 83-84% in the final epochs.
# Validation Accuracy: Fluctuates but generally remains stable around 83-85%, with some variation.
# Training Loss: Decreases consistently over epochs.
# Validation Loss: Decreases initially but starts to fluctuate around epoch 40, without significant improvement or degradation.

# While there is no drastic increase in validation loss or significant divergence between training and validation accuracy, 
# the fluctuation and plateauing of validation accuracy and loss suggest the model may be starting to overfit, particularly after epoch 40. 
# The model learns well initially, but its generalization ability does not improve as much beyond a certain point.

# %%
input_size = 10  # Number of input features
output_size = 2  # Number of output classes
hidden_layer_size = 50  # Number of neurons in the hidden layers

# Define the model with an explicit Input layer so the expected feature count is
# declared up front and checked against the data passed to fit().
model = tf.keras.Sequential([
    tf.keras.Input(shape=(input_size,)),  # Declares the shape of one input sample.
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),  # First hidden layer with ReLU activation function.
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),  # Second hidden layer with ReLU activation function.
    tf.keras.layers.Dense(output_size, activation='softmax')  # Output layer with softmax activation function for probability distribution.
])

# Compile the model with the specified configurations
model.compile(optimizer='adam',  # Optimizer to use
              loss='sparse_categorical_crossentropy',  # Loss function for classification
              metrics=['accuracy'])  # Metric to evaluate during training

batch_size = 100  # Number of samples per gradient update
max_epochs = 100  # Maximum number of epochs for training

# Stop once the validation loss has not improved for 2 consecutive epochs.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# validation loss; without it the model would keep the weights from the last
# epoch, i.e. from after validation loss had already stopped improving.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)

# Train the model on the training data with validation
model.fit(train_inputs, train_targets,  # Training data and labels
          batch_size=batch_size,  # Batch size for training
          epochs=max_epochs,  # Maximum number of epochs
          callbacks=[early_stopping],  # Callbacks to use during training
          validation_data=(validation_inputs, validation_targets),  # Validation data and labels
          verbose=2)  # Verbosity mode

# %% [markdown]
# ### Test Model

# %%
# Score the trained model on the held-out test split. The model was compiled
# with a single metric, so evaluate() yields the loss followed by the accuracy.
test_metrics = model.evaluate(test_inputs, test_targets)
test_loss, test_accuracy = test_metrics

# %%
# Report the loss as-is and the accuracy as a percentage, both with two decimals.
report_template = 'Test loss: {0:.2f}. Test accuracy: {1:.2f}%'
print(report_template.format(test_loss, test_accuracy * 100))

# %%
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

# Softmax outputs for every test sample: one row per sample, one column per class.
test_predictions = model.predict(test_inputs)
# Hard class label = index of the column with the highest probability.
test_predicted_classes = test_predictions.argmax(axis=1)

# Label-based KPIs, all computed from the hard class predictions.
precision, recall, f1 = (
    score_fn(test_targets, test_predicted_classes)
    for score_fn in (precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(test_targets, test_predicted_classes)

# AUC-ROC is a ranking metric, so it is fed the predicted probability of
# class 1 (second softmax column) rather than the hard labels.
roc_auc = roc_auc_score(test_targets, test_predictions[:, 1])

# Print the KPIs
for kpi_template, kpi_value in (
    ('Precision: {:.2f}', precision),
    ('Recall: {:.2f}', recall),
    ('F1 Score: {:.2f}', f1),
):
    print(kpi_template.format(kpi_value))
print('Confusion Matrix:\n', conf_matrix)
print('AUC-ROC: {:.2f}'.format(roc_auc))

# %%
# Summary
# Precision (0.88): 88% of the customers predicted to buy again actually did buy again.
# Recall (0.77): 77% of the customers who actually bought again were correctly identified by the model.
# F1 Score (0.82): A good balance between precision and recall.
# Confusion Matrix: Provides a detailed breakdown of true positives, false positives, true negatives, and false negatives.
    # True Negatives (TN): 194 - The number of customers who were correctly identified as not buying again.
    # False Positives (FP): 25 - The number of customers who were incorrectly identified as buying again, but actually did not buy again.
    # False Negatives (FN): 52 - The number of customers who were incorrectly identified as not buying again, but actually did buy again.
    # True Positives (TP): 177 - The number of customers who were correctly identified as buying again.
# AUC-ROC (0.92): Excellent ability of the model to distinguish between classes.
    # AUC-ROC values range from 0 to 1.
    # An AUC-ROC value of 0.5 indicates a model with no discrimination ability, equivalent to random guessing.
    # An AUC-ROC value of 1.0 indicates a perfect model.
    # In your case, an AUC-ROC of 0.92 means that the model has excellent discrimination ability and is very good at distinguishing between customers who will buy again and those who will not.




1796.0 3579 0.5018161497625034
210.0 447 0.4697986577181208
231.0 448 0.515625
Epoch 1/100
36/36 - 2s - 43ms/step - accuracy: 0.6502 - loss: 0.6034 - val_accuracy: 0.7405 - val_loss: 0.5031
Epoch 2/100
36/36 - 0s - 3ms/step - accuracy: 0.7564 - loss: 0.4689 - val_accuracy: 0.7808 - val_loss: 0.4213
Epoch 3/100
36/36 - 0s - 3ms/step - accuracy: 0.7795 - loss: 0.4202 - val_accuracy: 0.7875 - val_loss: 0.3888
Epoch 4/100
36/36 - 0s - 2ms/step - accuracy: 0.7899 - loss: 0.3914 - val_accuracy: 0.7919 - val_loss: 0.3670
Epoch 5/100
36/36 - 0s - 3ms/step - accuracy: 0.7946 - loss: 0.3773 - val_accuracy: 0.8166 - val_loss: 0.3498
Epoch 6/100
36/36 - 0s - 3ms/step - accuracy: 0.8039 - loss: 0.3666 - val_accuracy: 0.7964 - val_loss: 0.3453
Epoch 7/100
36/36 - 0s - 3ms/step - accuracy: 0.8111 - loss: 0.3573 - val_accuracy: 0.8054 - val_loss: 0.3423
Epoch 8/100
36/36 - 0s - 3ms/step - accuracy: 0.8192 - loss: 0.3519 - val_accuracy: 0.8322 - val_loss: 0.3306
Epoch 9/100
36/36 - 0s - 5ms/step - accu