In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file found under the read-only input directory,
# then print them one per line.
input_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/audiobooksdata/Audiobooks_data.csv


# In this Notebook we're going to Explore a DataSet covering 2 Years of activity (2 and a Half Years for the Targets) from an AudioBooks Selling Company, and build a Predictive Model for Forecasting whether an Existing Customer is likely to purchase AudioBooks again through this platform during the 6 Months following the end of the Data Collection Period.

# Basics

* **Importing the Libraries.**

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

* **Importing the DataSet.**

In [3]:
# Location of the raw customer table inside the Kaggle input directory.
DataPath = "/kaggle/input/audiobooksdata/Audiobooks_data.csv"

# Load the whole CSV into a DataFrame with pandas' default parsing options.
df = pd.read_csv(DataPath)

In [4]:
# Display the loaded frame to check its columns and a few rows from each end.
df

Unnamed: 0,ID,BookLength(mins)Overall,BookLength(mins)Avg,PriceOverall,PriceAvg,Review,Review10/10,MinutesListened,Completion,SupportRequests,LastVisitedMinusPurchaseDate,Targets
0,873,2160.0,2160,10.13,10.13,0,8.91,0.00,0.0,0,0,1
1,611,1404.0,2808,6.66,13.33,1,6.50,0.00,0.0,0,182,1
2,705,324.0,324,10.13,10.13,1,9.00,0.00,0.0,1,334,1
3,391,1620.0,1620,15.31,15.31,0,9.00,0.00,0.0,0,183,1
4,819,432.0,1296,7.11,21.33,1,9.00,0.00,0.0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
14079,27398,2160.0,2160,7.99,7.99,0,8.91,0.00,0.0,0,54,0
14080,28220,1620.0,1620,5.33,5.33,1,9.00,0.61,0.0,0,4,0
14081,28671,1080.0,1080,6.55,6.55,1,6.00,0.29,0.0,0,29,0
14082,31134,2160.0,2160,6.14,6.14,0,8.91,0.00,0.0,0,0,0


# DataSet Description:

* **ID**

Numerical; the Customer ID.

* **BookLength(mins)Overall**

Numerical; Sum of the AudioBooks Length of all Purchases.

* **BookLength(mins)Avg**

Numerical; Average Length of the AudioBooks Purchased.

* **PriceOverall**

Numerical; Sum of the Price Paid for all the AudioBooks.

* **PriceAvg**

Numerical; Average Price Paid per AudioBook.

* **Review**

Boolean; Whether the Customer left a Review or not. (1 Yes / 0 No)

* **Review10/10**

Numerical; Average Review of a Customer, if Above the Status Quo (8.91) then it's a Positive Review Feeling.

* **MinutesListened**

Numerical; Total Minutes Listened of AudioBooks.

* **Completion**

Numerical; Total Minutes Listened Divided by the Book Length in Minutes Overall. (Floating from 0 to 1 and can be converted as a Percentage -> 1 == 100%)

* **SupportRequests**

Numerical; Number of Times the Customer contacted the Support.

* **LastVisitedMinusPurchaseDate**

Numerical; The Last Visited session minus the First Purchase Date in Minutes, the higher the Number, the Better, since it means the Customer is actively using the Platform.

* **Targets**

Boolean; Whether the Customer Purchased another AudioBook in the Following 6 Months After the Data Gathering Period of 2 Years. (1 Yes / 0 No)

# Data PreProcessing

* **Preparing the DataSet.**

In [5]:
# Columns that must not reach the model:
#   * "ID" is the customer identifier, not a behavioural feature.
#   * "Unnamed: 0" is the row index that was saved into the CSV. In the preview of df the
#     first rows all have Targets == 1 and the last rows all have Targets == 0, so this
#     index is ordered like the label and would leak it into the Inputs.
# errors = "ignore" keeps this cell working if the CSV is ever loaded without that index column.
Features = df.drop(columns = ["ID"]).drop(columns = ["Unnamed: 0"], errors = "ignore")

# Every remaining column (features + Targets); only its length is needed later on.
AllAsArray = np.asarray(Features)

# Model inputs: the remaining feature columns, without the label.
Inputs = np.asarray(Features.drop(columns = ["Targets"]))

# Labels: 1 if the customer purchased again, 0 otherwise.
Targets = np.asarray(df["Targets"])

* **Balancing the DataSet.**

In [6]:
#Counting how many Targets belong to each Class.

#A Target counts as a "1" when it is non-zero; everything else is a "0".
TargetsOnes = int((Targets != 0).sum())
TargetsZeros = Targets.size - TargetsOnes

In [7]:
#Now we'll Balance the Dataset by removing the Extra 0s.

#Row positions of every 0-Target, in their original order.
ZeroRows = np.flatnonzero(Targets == 0)

#Keep the first TargetsOnes zeros and mark only the surplus zeros for removal.
#Rows with Target 1 are never removed, wherever they sit in the file
#(the previous loop dropped every row after the cutoff, 1s included).
#If there are no more 0s than 1s the slice is empty and nothing is removed.
IdxRemove = ZeroRows[TargetsOnes:].tolist()

InputsBalanced = np.delete(Inputs, IdxRemove, axis = 0)
TargetsBalanced = np.delete(Targets, IdxRemove, axis = 0)

* **Standardizing the Inputs.**

In [8]:
from sklearn.preprocessing import scale as S

#Standardize every Input Column (axis 0) to Zero Mean and Unit Variance.
#The keyword arguments spell out scale's defaults, so the result is unchanged.
#NOTE(review): the statistics are computed on all balanced rows, i.e. before the
#Train/Validation/Test Split, so the held-out rows influence the scaling.
ScaledInputs = S(InputsBalanced, axis = 0, with_mean = True, with_std = True)

* **Shuffling the Data.**

In [9]:
#Fixed Seed so that the Shuffle (and therefore the Split and every Metric below)
#is the same on every Restart & Run All.
SEED = 42
Rng = np.random.default_rng(SEED)

#A Random Permutation of the Row Positions, applied identically to Inputs and Targets
#so that each Row keeps its own Label.
IdxShuffle = Rng.permutation(len(ScaledInputs))

ShuffledInputs = ScaledInputs[IdxShuffle]
ShuffledTargets = TargetsBalanced[IdxShuffle]

* **Splitting the DataSet. (Train/Validation/Test)**

In [10]:
#sklearn's train_test_split could do this too; here the Split is done by hand.

Samples = len(ShuffledInputs)

#80/10/10 Split: Train gets 80%, Validation 10%, Test whatever is left (about 10%).
TrainSamples = int(0.8*Samples)
ValTestSamples = int(0.1*Samples)

#Row positions where Train ends and where Validation ends.
SplitPoints = [TrainSamples, TrainSamples + ValTestSamples]

#Cutting Inputs and Targets at the same positions keeps every Row paired with its Label.
TrainX, ValX, TestX = np.split(ShuffledInputs, SplitPoints)
TrainY, ValY, TestY = np.split(ShuffledTargets, SplitPoints)

* **Saving the Data in Tensor Format.**

In [11]:
#One .npz archive per Split, each holding an "Inputs" and a "Targets" array.
SplitsToSave = {
    "AudiobooksTrain": (TrainX, TrainY),
    "AudiobooksVal": (ValX, ValY),
    "AudiobooksTest": (TestX, TestY),
}

for FileName, (SplitInputs, SplitTargets) in SplitsToSave.items():
    np.savez(FileName, Inputs = SplitInputs, Targets = SplitTargets)

# Modeling

In [12]:
import os

# Walk the working directory and print the full path of every file in it,
# to confirm that the three .npz archives were written.
working_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/working')
    for name in names
)
for path in working_files:
    print(path)

/kaggle/working/AudiobooksVal.npz
/kaggle/working/__notebook__.ipynb
/kaggle/working/AudiobooksTrain.npz
/kaggle/working/AudiobooksTest.npz


* **Loading the Data.**

In [13]:
#Each archive stores two arrays, "Inputs" and "Targets"; a small helper loads both at once.

def LoadSplit(Path):
    """Load one saved split and return (Inputs, Targets) as float64 arrays.

    Path: file name of an .npz archive containing "Inputs" and "Targets" arrays.

    The archive is opened once and closed on exit of the with-block. np.float64 is
    used explicitly because the np.float_ alias was removed in NumPy 2.0.
    """
    with np.load(Path) as Archive:
        SplitInputs = Archive["Inputs"].astype(np.float64) #Ensuring Values are Float Type.
        SplitTargets = Archive["Targets"].astype(np.float64)
    return SplitInputs, SplitTargets

TrainInputs, TrainTargets = LoadSplit("AudiobooksTrain.npz")

ValInputs, ValTargets = LoadSplit("AudiobooksVal.npz")

TestInputs, TestTargets = LoadSplit("AudiobooksTest.npz")

* **Defining the Model.**

In [14]:
import tensorflow as tf

#InputLayerSize is declared for reference only: it is not passed to any Layer in this cell.
InputLayerSize = 10
OutputLayerSize = 2
HiddenLayersSize = 50

#Two Hidden Layers of the same Size with the same Activation Function.
#Each Dense Layer computes the Dot Product of its Inputs and Weights, adds the Bias,
#and then applies the Activation Function (ReLU here).
HiddenLayers = [tf.keras.layers.Dense(HiddenLayersSize, activation = "relu") for _ in range(2)]

#Output Layer with OutputLayerSize (2) Units, one per Class.
#SoftMax turns the Values into Probabilities.
OutputLayer = tf.keras.layers.Dense(OutputLayerSize, activation = "softmax")

#Defining the Model: the Hidden Layers followed by the Output Layer.
Model = tf.keras.Sequential(HiddenLayers + [OutputLayer])

#Setting the Optimizer and the Loss Function.
#SparseCategoricalCrossEntropy takes the Class Labels directly (0 / 1),
#so the Targets do not have to be One Hot Encoded by hand.
Model.compile(optimizer = "adam", loss = "sparse_categorical_crossentropy", metrics = ["accuracy"])

In [15]:
BatchSize = 100
Epochs = 100

#Early Stopping ends Training when the Validation Loss stops improving.
#"patience" is the Number of Epochs without improvement that are tolerated before stopping.
#With patience > 0 the last Epochs run are, by construction, not the best ones, so
#restore_best_weights = True rolls the Model back to the Weights of the Epoch with the lowest
#Validation Loss; otherwise the Model would be evaluated with the Weights of the last Epoch run.
EarlyStopping = tf.keras.callbacks.EarlyStopping(monitor = "val_loss", patience = 3, restore_best_weights = True)

#When using Arrays instead of Tensors, Batching is done by TensorFlow from the Batch Size given to the Fit Method.
Model.fit(TrainInputs, TrainTargets, batch_size = BatchSize, epochs = Epochs, validation_data = (ValInputs, ValTargets), callbacks = [EarlyStopping] , verbose = 2)

Epoch 1/100
36/36 - 1s - loss: 0.5861 - accuracy: 0.7332 - val_loss: 0.4613 - val_accuracy: 0.8523 - 1s/epoch - 32ms/step
Epoch 2/100
36/36 - 0s - loss: 0.3788 - accuracy: 0.8731 - val_loss: 0.3553 - val_accuracy: 0.8635 - 78ms/epoch - 2ms/step
Epoch 3/100
36/36 - 0s - loss: 0.3121 - accuracy: 0.8849 - val_loss: 0.3296 - val_accuracy: 0.8680 - 79ms/epoch - 2ms/step
Epoch 4/100
36/36 - 0s - loss: 0.2949 - accuracy: 0.8885 - val_loss: 0.3193 - val_accuracy: 0.8770 - 76ms/epoch - 2ms/step
Epoch 5/100
36/36 - 0s - loss: 0.2825 - accuracy: 0.8924 - val_loss: 0.3060 - val_accuracy: 0.8814 - 79ms/epoch - 2ms/step
Epoch 6/100
36/36 - 0s - loss: 0.2728 - accuracy: 0.8952 - val_loss: 0.3005 - val_accuracy: 0.8792 - 79ms/epoch - 2ms/step
Epoch 7/100
36/36 - 0s - loss: 0.2639 - accuracy: 0.9014 - val_loss: 0.2940 - val_accuracy: 0.8859 - 74ms/epoch - 2ms/step
Epoch 8/100
36/36 - 0s - loss: 0.2597 - accuracy: 0.9011 - val_loss: 0.2973 - val_accuracy: 0.8859 - 77ms/epoch - 2ms/step
Epoch 9/100
36/36

<keras.callbacks.History at 0x7fe8292c40d0>

* **Testing the Model.**

In [16]:
#Evaluate once on the held-out Test Split; evaluate returns the Loss followed by
#the "accuracy" Metric the Model was compiled with.
Loss, Accuracy = Model.evaluate(x = TestInputs, y = TestTargets)

