# GENERATION OF TRAINING AND TEST SETS

In [1]:
# Imports
import scipy.io as sio
import numpy as np

# Read the MATLAB file that holds the face dataset
mat_content = sio.loadmat('face.mat')
# Pull out the data matrix (key 'X') and the class labels (key 'l')
face_data, face_label = mat_content['X'], mat_content['l']

# Stack the labels underneath the data along axis 0, giving array 'faces'
faces = np.concatenate((face_data, face_label), axis=0)

# Swap the axes so that later cells can shuffle and slice along rows;
# after the transpose the label sits in the last column of each row
faces_transposed = np.transpose(faces)

In the following section, we will split the dataset into a training and a test set. This will be done by randomly sampling over all datapoints. 80% of the samples will be used for training and 20% will be used for testing.

In [2]:
# Divide the data set into training and testing sets: 80% is training, 20% is testing
SEED = 42             # fixed seed so the split is reproducible after a kernel restart
TRAIN_FRACTION = 0.8  # share of all samples assigned to the training set

# Draw a shuffled copy of the rows (one sample per row, which is why we transposed).
# Working on a copy leaves faces_transposed untouched, so re-running this cell
# always produces the same split.
rng = np.random.default_rng(SEED)
faces_shuffled = rng.permutation(faces_transposed)

# Derive the split point from the data instead of hard-coding the row index
n_train = int(round(TRAIN_FRACTION * len(faces_shuffled)))
train, test = faces_shuffled[:n_train, :], faces_shuffled[n_train:, :]
assert len(train) + len(test) == len(faces_transposed)

# Store the new datasets. NOTE: np.savetxt only gzip-compresses when the file
# name ends in '.gz', so these '.gzip' files are written as plain text. The
# names are kept unchanged so that existing readers of these files keep working.
np.savetxt('split_whole_train.gzip', train)
np.savetxt('split_whole_test.gzip', test)

We will repeat the process of splitting the data into a training and a test set. However, this time we will randomly take 80% of the datapoints from each class for training and use the remaining 20% of each class for testing.

In [3]:
# Create training and test data by sampling within each class (stratified split)
SEED = 42             # fixed seed so the split is reproducible after a kernel restart
TRAIN_FRACTION = 0.8  # share of each class assigned to the training set

rng = np.random.default_rng(SEED)
labels = faces_transposed[:, -1]  # the class label is the last column of every row

train_parts, test_parts = [], []
# np.unique returns the labels sorted, so the output rows stay grouped by class
# in ascending label order
for label in np.unique(labels):
    # Boolean indexing returns a copy, so permuting it never touches faces_transposed
    class_samples = rng.permutation(faces_transposed[labels == label, :])
    # Size the per-class split from the data instead of assuming 8 + 2 samples
    n_train = int(round(TRAIN_FRACTION * len(class_samples)))
    train_parts.append(class_samples[:n_train, :])
    test_parts.append(class_samples[n_train:, :])

# Stack the per-class pieces; cast to float64 so the arrays keep the dtype that
# the earlier np.zeros-based preallocation produced
train_class = np.vstack(train_parts).astype(np.float64)
test_class = np.vstack(test_parts).astype(np.float64)
assert len(train_class) + len(test_class) == len(faces_transposed)

# Store the new datasets. NOTE: np.savetxt only gzip-compresses when the file
# name ends in '.gz', so these '.gzip' files are written as plain text. The
# names are kept unchanged so that existing readers of these files keep working.
np.savetxt('split_class_train.gzip', train_class)
np.savetxt('split_class_test.gzip', test_class)