In [1]:
import numpy as np
import tensorflow as tf 
from sklearn import preprocessing 

## Preprocessing data.

In [2]:
raw_csv_data = np.loadtxt('Audiobooks_data.csv',delimiter=',')
raw_csv_data.shape

(14084, 12)

In [3]:
unscaled_inputs_all = raw_csv_data[:,1:-1]
target_all = raw_csv_data[:,-1] #will only contain column 
unscaled_inputs_all.shape

(14084, 10)

In [4]:
target_all.shape

(14084,)

### Balancing data.


In [5]:
num_one_targets = int(np.sum(target_all))
zero_targets_counter = 0
indices_to_remove = []

for i in range(target_all.shape[0]):
    if target_all[i] == 0 : 
        zero_targets_counter += 1
        if zero_targets_counter > num_one_targets:
            indices_to_remove.append(i) #will store all the indices after which the number of 0 = number of 1.
            
unscaled_input_equal_prior = np.delete(unscaled_inputs_all,indices_to_remove,axis=0) #We will delete that row.
target_equal_prior = np.delete(target_all,indices_to_remove,axis=0) #Doing the same for targets.

### Standardizing inputs. 

In [6]:
scaled_inputs = preprocessing.scale(unscaled_input_equal_prior)
scaled_inputs

array([[ 0.21053387, -0.18888517,  1.97823887, ...,  4.80955413,
        11.83828419,  0.09415043],
       [ 1.27894497,  0.41646744, -0.39082475, ..., -0.41569922,
        -0.20183481, -0.80255852],
       [ 1.27894497,  0.41646744, -0.39082475, ..., -0.41569922,
        -0.20183481,  2.979214  ],
       ...,
       [ 1.27894497,  0.41646744, -0.39082475, ..., -0.41569922,
        -0.20183481, -0.7440775 ],
       [ 0.31737498,  1.7482432 ,  0.04679395, ..., -0.41569922,
        -0.20183481, -0.80255852],
       [ 0.31737498,  1.7482432 , -0.39082475, ..., -0.41569922,
        -0.20183481, -0.80255852]])

## Shuffling the data.
### We do this to completely randomize the inputs.

In [7]:
shuffled_indices = np.arange(scaled_inputs.shape[0])
np.random.shuffle(shuffled_indices) #shuffling the list of indices

shuffled_inputs=scaled_inputs[shuffled_indices]
shuffled_targets=target_equal_prior[shuffled_indices]

## Splitting the data into train,validation and test.

In [8]:
total_sample_count = shuffled_inputs.shape[0]

train_sample_count = int(0.8*total_sample_count)
validation_sample_count = int(0.1*total_sample_count)
test_sample_count = total_sample_count - train_sample_count - validation_sample_count
test_sample_count

448

In [9]:
train_inputs=shuffled_inputs[:train_sample_count]
train_targets=shuffled_targets[:train_sample_count]

validation_inputs=shuffled_inputs[train_sample_count:train_sample_count+validation_sample_count]
validation_targets=shuffled_targets[train_sample_count:train_sample_count+validation_sample_count]

test_inputs=shuffled_inputs[train_sample_count+validation_sample_count:]
test_targets=shuffled_targets[train_sample_count+validation_sample_count:]

In [10]:
print(np.sum(train_targets), train_sample_count, (np.sum(train_targets)/train_sample_count*100.00))
print(np.sum(validation_targets), validation_sample_count, (np.sum(validation_targets)/validation_sample_count*100.00))
print(np.sum(test_targets), test_sample_count, (np.sum(test_targets)/test_sample_count*100.00))

#50-50 is balanced.

1786.0 3579 49.90220732048058
221.0 447 49.44071588366891
230.0 448 51.33928571428571


### saving .npz file

In [11]:
np.savez('Audiobooks_data_train',inputs=train_inputs,targets=train_targets)
np.savez('Audiobooks_data_validation',inputs=validation_inputs,targets=validation_targets)
np.savez('Audiobooks_data_test',inputs=test_inputs,targets=test_targets)

### Extracting the files.

In [12]:
npz = np.load('Audiobooks_data_train.npz')

train_inputs = npz['inputs'].astype(np.float) #just to make sure that we get float values
train_targets = npz['targets'].astype(np.int) #same here, but for int.

npz = np.load('Audiobooks_data_validation.npz')

validation_inputs = npz['inputs'].astype(np.float) #just to make sure that we get float values
validation_targets = npz['targets'].astype(np.int)

npz = np.load('Audiobooks_data_test.npz')

test_inputs = npz['inputs'].astype(np.float) #just to make sure that we get float values
test_targets = npz['targets'].astype(np.int)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  train_inputs = npz['inputs'].astype(np.float) #just to make sure that we get float values
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  train_targets = npz['targets'].astype(np.int) #same here, but for int.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  validation_inputs = npz['inputs'].astype(np.float) #just to make sure that we get float values
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  validation_targets = npz['targets'].astype(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_inputs = npz['inputs'].astype(np.float) #just to make sure that we g

## Model 

In [13]:
input_size = 10 #Depends on 10 factors.
output_size = 2 #Since we to see if the customer will return or not
hidden_layer_size = 100 #This was initially set to 50 but then I have changed to different values like 150,100 but found this...
#as the best one beacuse this gives the val_acc and test_acc very close which means that there is overfitting of the model...
#with respect to the validation data.

model = tf.keras.Sequential([
                           
                            tf.keras.layers.Dense(hidden_layer_size,activation='relu'), #Layer 2 - hidden
                            tf.keras.layers.Dense(hidden_layer_size,activation='relu'), #Layer 3 - hidden
                            tf.keras.layers.Dense(output_size,activation='softmax')     #Layer 4 - output
                            ]) #Notice for the activation we ues softmax, Since we want to give probality.

model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

In [14]:
BATCH_SIZE = 200
MAX_EPOCHS = 100

early_stopping = tf.keras.callbacks.EarlyStopping(patience=5) #This will help us early stop the model to prevent overfitting...
#wrt the training data provided to the model. The patience parameter will run that many amount of epochs without any...
#restriction. If let's say we keep the brackets empty. it will start to assess from the start itself.

model.fit(train_inputs,
          train_targets,
          batch_size=BATCH_SIZE,
          epochs=MAX_EPOCHS,
          callbacks=[early_stopping],
          validation_data=(validation_inputs,validation_targets),
          
          verbose=2)

Epoch 1/100
18/18 - 1s - loss: 0.5919 - accuracy: 0.6949 - val_loss: 0.5004 - val_accuracy: 0.7718 - 600ms/epoch - 33ms/step
Epoch 2/100
18/18 - 0s - loss: 0.4635 - accuracy: 0.7681 - val_loss: 0.4271 - val_accuracy: 0.8054 - 39ms/epoch - 2ms/step
Epoch 3/100
18/18 - 0s - loss: 0.4092 - accuracy: 0.7907 - val_loss: 0.3929 - val_accuracy: 0.8009 - 36ms/epoch - 2ms/step
Epoch 4/100
18/18 - 0s - loss: 0.3867 - accuracy: 0.7941 - val_loss: 0.3698 - val_accuracy: 0.8322 - 34ms/epoch - 2ms/step
Epoch 5/100
18/18 - 0s - loss: 0.3737 - accuracy: 0.7963 - val_loss: 0.3619 - val_accuracy: 0.8277 - 35ms/epoch - 2ms/step
Epoch 6/100
18/18 - 0s - loss: 0.3638 - accuracy: 0.8072 - val_loss: 0.3474 - val_accuracy: 0.8322 - 34ms/epoch - 2ms/step
Epoch 7/100
18/18 - 0s - loss: 0.3585 - accuracy: 0.7997 - val_loss: 0.3506 - val_accuracy: 0.8255 - 36ms/epoch - 2ms/step
Epoch 8/100
18/18 - 0s - loss: 0.3525 - accuracy: 0.8086 - val_loss: 0.3412 - val_accuracy: 0.8523 - 34ms/epoch - 2ms/step
Epoch 9/100
18

<keras.src.callbacks.History at 0x176065c0a90>

### Testing the model.

In [15]:
test_loss, test_acc = model.evaluate(test_inputs,test_targets)



In [16]:
test_loss

0.3113090395927429

In [17]:
print("Test accuracy : " +str(test_acc*100.00))

Test accuracy : 81.91964030265808
