In [2]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler


import tensorflow as tf

import pickle

In [5]:
# Column layout of Audiobooks_data.csv, in file order:
#   id, book length (overall / avg), price (overall / avg), review flag,
#   review score, minutes listened, completion, support requests,
#   last visit minus purchase date, target.
# NOTE(review): `columns` is kept for reference only; nothing visible in this
# notebook reads it.
columns = [
    'id',
    'book_length_overall',
    'book_length_avg',
    'price_overall',
    'price_avg',
    'review_num',
    'review',
    'minutes_listened',
    'completion',
    'support_requests',
    'last_visit',
    'target',
]

raw_data = np.loadtxt('Audiobooks_data.csv', delimiter=',')

# Drop the first column (id) and peel off the last column (target);
# everything in between is a model input.
unscaled_inputs_all, targets_all = raw_data[:, 1:-1], raw_data[:, -1]


Balance dataset

In [8]:
# Show how many rows carry each target value before balancing.
class_labels, class_counts = np.unique(targets_all, return_counts=True)
class_labels, class_counts

(array([0., 1.]), array([11847,  2237]))

In [9]:
# Undersample class 0 so both classes have the same number of rows.
# Targets are 0/1, so their sum is the number of class-1 rows.
num = int(np.sum(targets_all))

# Positions of every class-0 row, in file order. The first `num` of them are
# kept; every later one is dropped. This selects exactly the rows the original
# counter loop selected, without a Python-level pass over every sample.
# NOTE(review): keeping the *first* `num` zeros is order-dependent; if the CSV
# is sorted in any meaningful way the retained zeros may not be representative.
zero_positions = np.flatnonzero(targets_all == 0)
indices_to_remove = zero_positions[num:]

balanced_inputs = np.delete(unscaled_inputs_all, indices_to_remove, axis=0)
balanced_targets = np.delete(targets_all, indices_to_remove, axis=0)

In [10]:
# Sanity check: the number of class-1 rows must be unchanged by balancing.
print(balanced_targets.sum())

2237.0


In [11]:
# Standardise every input column and keep the fitted scaler so the same
# transformation can be re-applied to new data later.
# NOTE(review): the scaler is fitted on the whole balanced set *before* the
# train/validation/test split performed further down, so validation and test
# rows contribute to the scaling statistics. Consider fitting on the training
# rows only.
scaler = StandardScaler()
scaled_inputs = scaler.fit_transform(balanced_inputs)

# Use a context manager so the file handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open('dl_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [15]:
# Shuffle rows before splitting so each split is drawn from the whole dataset.
# A fixed seed makes the permutation, and therefore the saved splits,
# reproducible under Restart & Run All.
SHUFFLE_SEED = 42
shuffle_rng = np.random.default_rng(SHUFFLE_SEED)
shuffled_indices = shuffle_rng.permutation(scaled_inputs.shape[0])

# Apply the same permutation to inputs and targets so rows stay aligned.
shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = balanced_targets[shuffled_indices]

In [16]:
# Split the shuffled data 80/10/10 into train, validation and test sets.

samples_count = shuffled_inputs.shape[0]

train_count = int(0.8*samples_count)
validation_count = int(0.1*samples_count)
# The test set takes whatever is left after the two truncated counts above.
test_count = samples_count - train_count - validation_count

# Cut both arrays at the same two boundaries so inputs and targets stay aligned.
split_points = [train_count, train_count + validation_count]
train_inputs, valid_inputs, test_inputs = np.split(shuffled_inputs, split_points)
train_targets, valid_targets, test_targets = np.split(shuffled_targets, split_points)

# For each split print: number of class-1 rows, split size, class-1 share.
# A share close to 0.5 means the split is still balanced.
for split_targets in (train_targets, valid_targets, test_targets):
    positives = np.sum(split_targets)
    split_size = split_targets.shape[0]
    print(positives, split_size, positives/split_size)


1807.0 3579 0.504889633975971
214.0 447 0.47874720357941836
216.0 448 0.48214285714285715


In [17]:
# Persist each split as an .npz archive holding an `inputs` and a `targets`
# array (np.savez appends the .npz suffix to the given name).
split_archives = (
    ('Audiobooks_data_train', train_inputs, train_targets),
    ('Audiobooks_data_validation', valid_inputs, valid_targets),
    ('Audiobooks_data_test', test_inputs, test_targets),
)
for archive_path, split_inputs, split_targets in split_archives:
    np.savez(archive_path, inputs=split_inputs, targets=split_targets)

In [20]:
def _load_split(path):
    """Read one saved split and return ``(inputs, targets)``.

    Inputs are cast to float and targets to int. The archive is opened in a
    context manager so its file handle is closed as soon as both arrays have
    been copied out; ``astype`` returns new arrays, so they remain valid after
    the archive is closed.
    """
    with np.load(path) as archive:
        return archive['inputs'].astype('float'), archive['targets'].astype('int')


train_inputs, train_targets = _load_split('Audiobooks_data_train.npz')
validation_inputs, validation_targets = _load_split('Audiobooks_data_validation.npz')
test_inputs, test_targets = _load_split('Audiobooks_data_test.npz')

Model

In [21]:
# Two softmax output units: one probability per target class (0 and 1).
output_size = 2

hidden_layer_size = 50

# Feed-forward classifier: two ReLU hidden layers followed by a softmax layer.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
    tf.keras.layers.Dense(output_size, activation='softmax')
])

# Targets are integer class ids (not one-hot), hence the sparse loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

batch_size = 100

# Upper bound only; early stopping normally ends training much sooner.
max_epochs = 100

# Stop once validation loss has not improved for 2 consecutive epochs, and roll
# the model back to the weights of the best epoch. Without
# restore_best_weights the model would keep the weights from the last epoch,
# i.e. from after validation loss had already started to degrade.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)

model.fit(train_inputs, train_targets,
          batch_size=batch_size,
          epochs=max_epochs,
          callbacks=[early_stop],
          validation_data=(validation_inputs, validation_targets),
          verbose=2)

Epoch 1/100
36/36 - 1s - 41ms/step - accuracy: 0.7471 - loss: 0.5568 - val_accuracy: 0.8770 - val_loss: 0.4181
Epoch 2/100
36/36 - 0s - 4ms/step - accuracy: 0.8818 - loss: 0.3592 - val_accuracy: 0.8859 - val_loss: 0.3254
Epoch 3/100
36/36 - 0s - 5ms/step - accuracy: 0.8871 - loss: 0.3047 - val_accuracy: 0.8881 - val_loss: 0.3032
Epoch 4/100
36/36 - 0s - 7ms/step - accuracy: 0.8919 - loss: 0.2894 - val_accuracy: 0.8881 - val_loss: 0.2933
Epoch 5/100
36/36 - 0s - 4ms/step - accuracy: 0.8977 - loss: 0.2794 - val_accuracy: 0.8904 - val_loss: 0.2833
Epoch 6/100
36/36 - 0s - 4ms/step - accuracy: 0.9000 - loss: 0.2707 - val_accuracy: 0.8949 - val_loss: 0.2771
Epoch 7/100
36/36 - 0s - 4ms/step - accuracy: 0.9000 - loss: 0.2648 - val_accuracy: 0.8971 - val_loss: 0.2728
Epoch 8/100
36/36 - 0s - 5ms/step - accuracy: 0.9008 - loss: 0.2594 - val_accuracy: 0.8949 - val_loss: 0.2677
Epoch 9/100
36/36 - 0s - 7ms/step - accuracy: 0.9025 - loss: 0.2553 - val_accuracy: 0.9016 - val_loss: 0.2649
Epoch 10/

<keras.src.callbacks.history.History at 0x7c3fffd71c10>

In [22]:
# Score the trained model on the held-out test split. With a single
# 'accuracy' metric compiled in, evaluate() yields [loss, accuracy].
evaluation_metrics = model.evaluate(test_inputs, test_targets)
test_loss, test_accuracy = evaluation_metrics

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9125 - loss: 0.2387 


In [23]:
# Softmax outputs for the test split: one row per sample, one column per class.
test_probabilities = model.predict(test_inputs)
test_probabilities

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


array([[8.86578679e-01, 1.13421202e-01],
       [9.99997556e-01, 2.42557735e-06],
       [9.99685764e-01, 3.14234785e-04],
       [9.99884307e-01, 1.15663075e-04],
       [1.68980077e-01, 8.31019938e-01],
       [1.89522257e-06, 9.99998033e-01],
       [9.99999940e-01, 1.19294334e-08],
       [9.99999940e-01, 1.36748737e-11],
       [8.22661281e-01, 1.77338734e-01],
       [1.14657588e-01, 8.85342360e-01],
       [8.46761391e-02, 9.15323794e-01],
       [9.92869496e-01, 7.13052833e-03],
       [3.62688382e-07, 9.99999583e-01],
       [9.59256232e-01, 4.07436639e-02],
       [1.34394392e-02, 9.86560464e-01],
       [9.99999702e-01, 2.05061085e-07],
       [3.14101487e-01, 6.85898483e-01],
       [7.52559081e-02, 9.24744070e-01],
       [1.02144515e-09, 9.99999940e-01],
       [9.94641781e-01, 5.35806548e-03],
       [7.94784367e-01, 2.05215618e-01],
       [9.27163780e-01, 7.28362128e-02],
       [9.27946925e-01, 7.20530748e-02],
       [8.98063362e-01, 1.01936609e-01],
       [9.135153

In [25]:
# Take the class-1 probability column and round it to a hard 0./1. label.
class_one_probability = model.predict(test_inputs)[:, 1]
class_one_probability.round(0)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


array([0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0., 1., 0., 1., 0., 1.,
       1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 0.,
       1., 1., 1., 1., 0., 0., 0., 1., 0., 1., 1., 0., 0., 1., 1., 0., 1.,
       0., 0., 1., 1., 1., 0., 0., 0., 1., 0., 1., 1., 1., 1., 1., 1., 0.,
       0., 0., 1., 0., 0., 1., 1., 0., 0., 1., 0., 1., 1., 1., 1., 1., 1.,
       1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1., 1., 0., 1.,
       1., 1., 0., 1., 1., 0., 0., 0., 1., 0., 1., 1., 1., 0., 0., 0., 0.,
       0., 0., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 0., 0.,
       0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0.,
       1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0.,
       1., 0., 1., 1., 0., 1., 0., 1., 1., 0., 1., 0., 1., 0., 0., 1., 0.,
       0., 1., 0., 0., 1., 1., 1., 0., 0., 1., 0., 0., 1., 1., 1., 0., 0.,
       1., 1., 0., 0., 0., 0., 1., 1., 0., 1., 0., 1., 0., 0., 1., 1., 1.,
       1., 0., 1., 1., 0.

In [26]:
# Predicted class id per test sample: index of the larger softmax output.
model.predict(test_inputs).argmax(axis=1)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


array([0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0,

In [27]:
# Persist the trained network to disk so it can be reloaded without retraining.
# NOTE(review): the '.h5' suffix selects Keras' HDF5 format, which recent Keras
# releases appear to treat as legacy in favour of '.keras'. Confirm before
# renaming, since any consumer of this notebook may load 'dl_model.h5' by name.
model.save('dl_model.h5')

