### Data preprocessing

In [18]:
import numpy as np
from sklearn import preprocessing

In [14]:
# Read the whole CSV into a single 2-D float array (rows = samples).
raw_csv_data = np.loadtxt('data/Audiobooks-data.csv', delimiter=',')

# The last column holds the targets ...
targets_all = raw_csv_data[:, -1]
# ... and the inputs are every column except the first and the last.
unscaled_inputs_all = raw_csv_data[:, 1:-1]

# Sanity check: both arrays must have the same number of rows.
unscaled_inputs_all.shape, targets_all.shape

((14084, 10), (14084,))

#### Balance the dataset

In [12]:
# Balance the classes by undersampling the zero-target rows: every non-zero
# row is kept, and only the first `num_one_targets` zero rows (in file order)
# survive. Assumes targets are binary 0/1, so their sum is the count of ones.
num_one_targets = int(np.sum(targets_all))

# Row indices of all zero-target samples, in ascending order.
zero_target_indices = np.flatnonzero(targets_all == 0)

# Every zero beyond the first `num_one_targets` is surplus and gets dropped.
# If there are fewer zeros than ones this slice is empty and nothing is removed.
indices_to_remove = zero_target_indices[num_one_targets:]

unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis=0)  # axis=0 -> rows
targets_equal_priors = np.delete(targets_all, indices_to_remove, axis=0)

unscaled_inputs_equal_priors.shape, targets_equal_priors.shape

((4474, 10), (4474,))

#### Standardize the inputs

In [15]:
# Standardize every feature column (axis=0) with scikit-learn's `scale`.
# NOTE(review): the scaling statistics are computed over all balanced samples,
# i.e. before the train/validation/test split further down.
scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors, axis=0)

#### Shuffle the data
* Shuffle the rows so that each batch holds a representative mix of samples; this prevents batches of homogeneous data.
* `np.arange([start,] stop)` is a function that returns evenly spaced values within a given interval.

In [17]:
# Use a dedicated, seeded generator so the shuffle -- and therefore the
# train/validation/test split derived from it -- is identical on every
# Restart & Run All. Change SHUFFLE_SEED to draw a different permutation.
SHUFFLE_SEED = 42
shuffle_rng = np.random.default_rng(SHUFFLE_SEED)

# One index per row, permuted in place.
shuffled_indices = np.arange(scaled_inputs.shape[0])
shuffle_rng.shuffle(shuffled_indices)

# Apply the same permutation to inputs and targets so rows stay aligned.
shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_equal_priors[shuffled_indices]

#### Split the dataset into train, validation, and test

In [19]:
samples_count = shuffled_inputs.shape[0]

# 80% train / 10% validation; the test set takes whatever is left, so the
# three sizes always add up to `samples_count` despite the int() truncation.
train_samples_count = int(0.8*samples_count)
validation_samples_count = int(0.1*samples_count)
test_samples_count = samples_count - train_samples_count - validation_samples_count

# Slice boundaries inside the shuffled arrays.
train_end = train_samples_count
validation_end = train_end + validation_samples_count

# Contiguous slices: [0, train_end) | [train_end, validation_end) | [validation_end, end)
train_inputs, train_targets = shuffled_inputs[:train_end], shuffled_targets[:train_end]
validation_inputs, validation_targets = (
    shuffled_inputs[train_end:validation_end],
    shuffled_targets[train_end:validation_end],
)
test_inputs, test_targets = shuffled_inputs[validation_end:], shuffled_targets[validation_end:]

# Per split: sum of targets, number of samples, and their ratio
# (the ratio should stay close to 0.5 after balancing).
for split_targets, split_size in (
    (train_targets, train_samples_count),
    (validation_targets, validation_samples_count),
    (test_targets, test_samples_count),
):
    target_sum = np.sum(split_targets)
    print(target_sum, split_size, target_sum/split_size)

1774.0 3579 0.4956691813355686
219.0 447 0.4899328859060403
244.0 448 0.5446428571428571


#### Save the three datasets in *.npz

In [20]:
# Write each split to its own .npz archive (np.savez appends the extension),
# storing the arrays under the keys 'inputs' and 'targets'.
# NOTE: 'Audioboos_data_test' is misspelled, but the loading cell reads
# 'Audioboos_data_test.npz', so the name must stay exactly as it is here.
for npz_stem, inputs_to_save, targets_to_save in (
    ('Audiobooks_data_train', train_inputs, train_targets),
    ('Audiobooks_data_validation', validation_inputs, validation_targets),
    ('Audioboos_data_test', test_inputs, test_targets),
):
    np.savez(npz_stem, inputs=inputs_to_save, targets=targets_to_save)

### Create the machine learning algorithm

In [21]:
import tensorflow as tf

#### Data

In [24]:
def _load_split(npz_path):
    """Load one saved split and return it as ``(inputs, targets)``.

    Parameters
    ----------
    npz_path : str
        Path to an ``.npz`` archive containing the arrays ``'inputs'`` and
        ``'targets'``.

    Returns
    -------
    tuple of numpy.ndarray
        ``inputs`` cast to float and ``targets`` cast to int.
    """
    # The builtins `float` / `int` are what the removed aliases `np.float` /
    # `np.int` pointed to; the aliases raise AttributeError on NumPy >= 1.24.
    # The `with` block closes the archive's file handle once both arrays
    # have been read (`astype` returns independent copies).
    with np.load(npz_path) as npz:
        return npz['inputs'].astype(float), npz['targets'].astype(int)


# Train data
train_inputs, train_targets = _load_split('Audiobooks_data_train.npz')

# Validation data
validation_inputs, validation_targets = _load_split('Audiobooks_data_validation.npz')

# Test data (the misspelled file name matches the one written by the save cell)
test_inputs, test_targets = _load_split('Audioboos_data_test.npz')


#### Create the model
Outline, optimizers, loss, early stopping and training