In [3]:
import numpy as np
import tensorflow as tf
import pandas as pd

### Import data

In [4]:
#load the full training dataset in variable called npz
npz = np.load("Data/Audiobooks_data_train.npz")

# we stored our inputs and targets in the dataset, now assigned to npz, with keywords inputs and target
# We therefore can extract the inputs and targets by indexing on the variable npz and set the data types
train_inputs = npz['inputs'].astype(np.float)
train_targets = npz['targets'].astype(np.int)

#load the full validation dataset in variable called npz
npz = np.load("Data/Audiobooks_data_validation.npz")

# store validation inputs and targets
validation_inputs = npz['inputs'].astype(np.float)
validation_targets = npz['targets'].astype(np.int)

#load the full testing dataset in variable called npz
npz = np.load("Data/Audiobooks_data_test.npz")

# store testing inputs and targets
test_inputs = npz['inputs'].astype(np.float)
test_targets = npz['targets'].astype(np.int)

In [5]:
# Testing the balance of the target datase
train_converts = np.where(train_targets == 1)[0].shape[0]
train_nonconverts = np.where(train_targets == 0)[0].shape[0]
validation_converts = np.where(validation_targets == 1)[0].shape[0]
validation_nonconverts = np.where(validation_targets == 0)[0].shape[0]
test_converts = np.where(test_targets == 1)[0].shape[0]
test_nonconverts = np.where(test_targets == 0)[0].shape[0]

print(f' There are {train_nonconverts} non-converts and {train_converts} converts in the training dataset targets for a ratio of {train_nonconverts/train_converts}.')
print(f' There are {validation_nonconverts} non-converts and {validation_converts} converts in the validation dataset targets for a ratio of {validation_nonconverts/validation_converts} .')
print(f' There are {test_nonconverts} non-converts and {test_converts} converts in the testing dataset targets for a ratio of {test_nonconverts/test_converts}.')






 There are 1796 non-converts and 1783 converts in the training dataset targets for a ratio of 1.007291082445317.
 There are 215 non-converts and 232 converts in the validation dataset targets for a ratio of 0.9267241379310345 .
 There are 226 non-converts and 222 converts in the testing dataset targets for a ratio of 1.018018018018018.



### Model


In [6]:
# Taking model from MNIST project and changing some parameters and hyperparameters
input_size = 10
output_size = 2
hidden_layer_size = 75

model = tf.keras.Sequential([
                    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
                    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
                    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
                    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
                    tf.keras.layers.Dense(output_size,activation = 'softmax')   
        ])

#Optimizer and loss function
model.compile(optimizer='adam',loss = 'sparse_categorical_crossentropy',metrics=['accuracy'])


batch_size = 100
max_epochs = 100

# This earlystopping function in tf.keras.callbacks will monitor validation loss and stop the trianing process
# the first time the validation loss starts increasing
early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)

model.fit(train_inputs,
            train_targets,
            batch_size=batch_size,
            epochs = max_epochs,
            callbacks = [early_stopping],
            validation_data = (validation_inputs,validation_targets),
            verbose=2)

Train on 3579 samples, validate on 447 samples
Epoch 1/100
3579/3579 - 1s - loss: 7.8961 - accuracy: 0.6373 - val_loss: 5.9759 - val_accuracy: 0.6510
Epoch 2/100
3579/3579 - 0s - loss: 3.4886 - accuracy: 0.6871 - val_loss: 1.3544 - val_accuracy: 0.7338
Epoch 3/100
3579/3579 - 0s - loss: 1.8547 - accuracy: 0.7212 - val_loss: 0.7283 - val_accuracy: 0.7696
Epoch 4/100
3579/3579 - 0s - loss: 2.2786 - accuracy: 0.7058 - val_loss: 2.9689 - val_accuracy: 0.6823
Epoch 5/100
3579/3579 - 0s - loss: 1.6343 - accuracy: 0.7290 - val_loss: 3.5485 - val_accuracy: 0.6488


<tensorflow.python.keras.callbacks.History at 0x64a3aa080>

### Testing the model 

In [7]:
test_loss, test_accuracy = model.evaluate(test_inputs,test_targets)



In [8]:
print(f'Test Loss: {test_loss}, Test Accuracy: {round(100*test_accuracy,2)}%')

Test Loss: 3.258535248892648, Test Accuracy: 66.96%


In [9]:
customers = np.loadtxt('Data/original.csv',delimiter=',')
customers

array([[9.9400e+02, 1.6200e+03, 1.6200e+03, ..., 5.0000e+00, 9.2000e+01,
        0.0000e+00],
       [1.1430e+03, 2.1600e+03, 2.1600e+03, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [2.0590e+03, 2.1600e+03, 2.1600e+03, ..., 0.0000e+00, 3.8800e+02,
        0.0000e+00],
       ...,
       [3.1134e+04, 2.1600e+03, 2.1600e+03, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [3.2832e+04, 1.6200e+03, 1.6200e+03, ..., 0.0000e+00, 9.0000e+01,
        0.0000e+00],
       [2.5100e+02, 1.6740e+03, 3.3480e+03, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00]])

In [18]:
predictions = []
targets = customers[:,-1]
for i in range(customers.shape[0]-10000):
    predictions.append(model.predict_classes([[[[customers[i,1:-1]]]]]))

In [37]:
pred_df = pd.DataFrame([predictions,targets]).T
pred_df.columns = ["predictions","actuals"]
pred_df

Unnamed: 0,predictions,actuals
0,[[[0]]],0
1,[[[0]]],0
2,[[[0]]],0
3,[[[0]]],0
4,[[[0]]],0
...,...,...
14079,,0
14080,,0
14081,,0
14082,,0
