# Import the data


In [21]:
import numpy as np
from sklearn import preprocessing

# Load the whole CSV into one numeric array.
path = 'Audiobooks_data.csv'
data_raw = np.loadtxt(fname=path, delimiter=',')

# Column layout: the first column is the ID and the last column is the target.
# Everything in between is used as model input.
feature_columns = slice(1, -1)
target_column = -1

# All rows, without the ID column and without the target column.
unscaled_data_raw = data_raw[:, feature_columns]
# All rows, target column only.
unscaled_targets_raw = data_raw[:, target_column]

# Preprocessing

## Balance Data

In [44]:
# Balance the two classes: keep every class-1 row and only the first
# `total_one_category_count` class-0 rows (in file order), dropping the rest.
# Targets are assumed to be 0/1, so their sum is the number of class-1 rows.
total_one_category_count = int(np.sum(unscaled_targets_raw))

# Positions of all class-0 rows, in their original order.
zero_indices = np.flatnonzero(unscaled_targets_raw == 0)
zero_count = int(zero_indices.size)

# Every class-0 row beyond the first `total_one_category_count` is surplus.
# If class 0 is not the larger class, this list is empty and nothing is removed.
indices_remove = zero_indices[total_one_category_count:].tolist()

balance_data_raw = np.delete(unscaled_data_raw, indices_remove, axis=0)
balance_target_raw = np.delete(unscaled_targets_raw, indices_remove, axis=0)

# Sanity check: class-1 rows should make up about half of the balanced
# inputs and of the balanced targets.
print(total_one_category_count/balance_data_raw.shape[0])
print(total_one_category_count/balance_target_raw.shape[0])

# Both ratios should come out at about 0.50.

0.5
0.5


## Shuffle Data

In [53]:
# Draw one random ordering of the row positions and apply that same ordering
# to inputs and targets, so each input row stays paired with its target.
# No seed is set in this cell, so the ordering comes from NumPy's global RNG state.
row_count = len(balance_data_raw)
shuffled_indices = np.arange(row_count)
np.random.shuffle(shuffled_indices)

shuffle_data, shuffle_target = (
    rows[shuffled_indices] for rows in (balance_data_raw, balance_target_raw)
)

## Scale Data

In [55]:
# Standardize the shuffled inputs with sklearn's preprocessing.scale (per its
# documentation, with default arguments each feature column is centered to
# zero mean and scaled to unit variance). Targets are left untouched.
# NOTE(review): the statistics are computed over all rows, before the
# train/validation/test split, so held-out rows influence the scaling.
# Consider deriving mean/std from the training rows only - confirm intent.
scaled_data = preprocessing.scale(shuffle_data)

## Setup training, validation, and test data

In [83]:
# Split sizes: 10% of the rows for validation, 10% for testing, and whatever
# remains (about 80%) for training.
sample_count = scaled_data.shape[0]
validation_count = int(sample_count * 0.1)
test_count = int(sample_count * 0.1)
training_count = sample_count - validation_count - test_count

for label, count in (
    ('Training Count: ', training_count),
    ('Testing Count: ', test_count),
    ('Validation Count: ', validation_count),
):
    print(label + str(count))

# Row ranges for each split, taken in order from the (already shuffled) data:
# validation first, then test, then everything left over for training.
validation_rows = slice(0, validation_count)
test_rows = slice(validation_count, validation_count + test_count)
train_rows = slice(validation_count + test_count, None)

validation_data = scaled_data[validation_rows]
validation_targets = shuffle_target[validation_rows]

test_data = scaled_data[test_rows]
test_targets = shuffle_target[test_rows]

train_data = scaled_data[train_rows]
train_targets = shuffle_target[train_rows]

# Sanity checks on the result:
#   1. validation and test should each hold about 10% of the rows (training ~80%);
#   2. each split should be roughly half class 1 and half class 0.
for label, split_data in (
    ('validation data percent: ', validation_data),
    ('testing data percent: ', test_data),
    ('training data percent: ', train_data),
):
    print(label + str(split_data.shape[0] / sample_count))

for label, split_targets in (
    ('validation category 1 is : ', validation_targets),
    ('testing category 1 is : ', test_targets),
    ('training category 1 is : ', train_targets),
):
    print(label + str(np.sum(split_targets) / split_targets.shape[0]))

Training Count: 3580
Testing Count: 447
Validation Count: 447
validation data percent: 0.09991059454626733
testing data percent: 0.09991059454626733
training data percent: 0.8001788109074653
validation category 1 is : 0.5190156599552572
testing category 1 is : 0.5302013422818792
training category 1 is : 0.49385474860335193


## Create NPZ from data

In [None]:
# Persist each split as its own .npz archive (np.savez appends the ".npz"
# extension). The original call passed no arrays, which writes an empty archive.
# Each file stores the scaled inputs under 'inputs' and the labels under 'targets'.
np.savez('audiobooks_data_train', inputs=train_data, targets=train_targets)
np.savez('audiobooks_data_validation', inputs=validation_data, targets=validation_targets)
np.savez('audiobooks_data_test', inputs=test_data, targets=test_targets)