# Data preprocessing

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cmx
import matplotlib.colors as colors
from mpl_toolkits.mplot3d import Axes3D
import cv2
import os

# parameters
# Number of image paths generated per PIE subject (files 1.jpg .. 170.jpg).
NUM_EACH_SUBJECT = 170
# Number of selfie image paths generated (selfimg/1.jpg .. selfimg/10.jpg).
NUM_SELFIES = 10
# Fraction of every image list that is assigned to the training split.
TRAIN_TEST_RATIO = 0.7
# Seed used for every random selection so that the splits are reproducible.
RANDOM_SEED = 22
# Seed NumPy's global random generator once at import time.
np.random.seed(RANDOM_SEED)

def choose_random_idx(num, vmin, vmax, seed=RANDOM_SEED):
    """Reproducibly pick ``num`` distinct integers from ``[vmin, vmax)``.

    NumPy's global random generator is re-seeded with ``seed`` on every
    call, so the same arguments always yield the same selection.

    Args:
        num: How many integers to keep from the shuffled range.
        vmin: Smallest candidate value (inclusive).
        vmax: Upper bound of the candidate values (exclusive).
        seed: Seed passed to ``np.random.seed`` before shuffling.

    Returns:
        A list with the selected integers in ascending order.
    """
    np.random.seed(seed)
    candidates = list(range(vmin, vmax))
    shuffled = np.random.permutation(candidates)
    # Keep the first ``num`` shuffled values and report them in sorted order.
    chosen = list(shuffled[:num])
    chosen.sort()
    return chosen

def get_train_test_list(input_list, ratio, seed=RANDOM_SEED):
    """Randomly split ``input_list`` into a training and a test list.

    ``round(len(input_list) * ratio)`` positions are drawn with
    ``choose_random_idx``; the items at those positions form the training
    list and all remaining items form the test list. Both lists keep the
    original relative order of the items.

    Args:
        input_list: Sequence of items to split.
        ratio: Fraction of the items that goes into the training list.
        seed: Seed forwarded to ``choose_random_idx``. (It was previously
            ignored in favour of the global ``RANDOM_SEED``.)

    Returns:
        A ``(train_list, test_list)`` tuple.
    """
    total = len(input_list)
    # A set gives O(1) membership tests while partitioning below.
    train_idx = set(
        choose_random_idx(num=round(total * ratio), vmin=0, vmax=total, seed=seed)
    )

    train_list = [item for i, item in enumerate(input_list) if i in train_idx]
    test_list = [item for i, item in enumerate(input_list) if i not in train_idx]
    return train_list, test_list

def get_pie_list(data_idx):
    """Build the PIE image path lists for the given subjects.

    For every subject id in ``data_idx`` the paths
    ``PIE/<subject>/1.jpg`` .. ``PIE/<subject>/<NUM_EACH_SUBJECT>.jpg`` are
    generated and split per subject with ``get_train_test_list``.

    Args:
        data_idx: Iterable of subject ids.

    Returns:
        A ``(all_paths, train_paths, test_paths)`` tuple of lists.
    """
    all_paths, train_paths, test_paths = [], [], []

    for subject in data_idx:
        subject_paths = []
        for img_no in range(1, NUM_EACH_SUBJECT + 1):
            subject_paths.append('PIE/' + str(subject) + '/' + str(img_no) + '.jpg')

        train_part, test_part = get_train_test_list(
            subject_paths, ratio=TRAIN_TEST_RATIO, seed=RANDOM_SEED
        )
        all_paths += subject_paths
        train_paths += train_part
        test_paths += test_part

    return all_paths, train_paths, test_paths

def get_self_list():
    """Build the selfie image path lists.

    Generates ``selfimg/1.jpg`` .. ``selfimg/<NUM_SELFIES>.jpg`` and splits
    them with ``get_train_test_list``.

    Returns:
        A ``(all_paths, train_paths, test_paths)`` tuple of lists.
    """
    selfie_paths = []
    for number in range(1, NUM_SELFIES + 1):
        selfie_paths.append('selfimg/' + str(number) + '.jpg')

    train_part, test_part = get_train_test_list(
        selfie_paths, ratio=TRAIN_TEST_RATIO, seed=RANDOM_SEED
    )
    return selfie_paths, train_part, test_part

def get_img_vector(input_list):
    """Load images as flattened grayscale vectors together with their labels.

    The label is derived from the path: ``PIE/<subject>/<n>.jpg`` is
    labelled with the integer ``<subject>`` and ``selfimg/<n>.jpg`` is
    labelled ``0``.

    Args:
        input_list: Sequence of image paths using ``/`` as separator.

    Returns:
        A tuple ``(img_a, labels_a)``: ``img_a`` has one row per image
        (each grayscale image flattened), ``labels_a`` holds the matching
        integer labels. For an empty ``input_list`` an array of shape
        ``(0, 0)`` and an empty label array are returned.

    Raises:
        ValueError: If a path does not start with ``PIE/`` or ``selfimg/``.
        FileNotFoundError: If an image cannot be read from disk.
    """
    images = []
    labels = []
    for path in input_list:
        parts = path.split('/')

        # Resolve the label before touching the image so that an image is
        # never stored without a label (which would misalign both arrays).
        if parts[0] == 'PIE':
            label = int(parts[1])
        elif parts[0] == 'selfimg':
            label = 0  # all selfies share the label 0
        else:
            raise ValueError('Error: Wrong path list! Unexpected path: ' + repr(path))

        img = cv2.imread(path)
        if img is None:
            # cv2.imread signals a missing/unreadable file by returning None.
            raise FileNotFoundError('Could not read image: ' + repr(path))

        images.append(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY))
        labels.append(label)

    if not images:
        # reshape(0, -1) is ambiguous for an empty array, so build it directly.
        return np.empty((0, 0), dtype=np.uint8), np.array(labels, dtype=int)

    # NOTE(review): assumes all images share the same size — otherwise the
    # stacked array cannot be reshaped into one row per image.
    img_a = np.array(images).reshape(len(images), -1)
    labels_a = np.array(labels)

    return img_a, labels_a

#============================================================
# Randomly select 25 subject ids out of the range 1..67.
data_idx = choose_random_idx(num=25, vmin=1, vmax=68, seed=RANDOM_SEED)

# Path lists (all / train / test) for the chosen PIE subjects and the selfies.
pie_list, pie_train_list, pie_test_list = get_pie_list(data_idx)
self_list, self_train_list, self_test_list = get_self_list()

# list of paths to all images of interest
list_img = pie_list + self_list
# Combined splits: PIE paths first, selfie paths appended at the end.
train_list = pie_train_list + self_train_list
test_list = pie_test_list + self_test_list


# Summary of the selected subjects and the size of every split.
print('data_idx',data_idx)
print('Number of PIE images:', len(pie_list))
print('Number of PIE train images:', len(pie_train_list))
print('Number of PIE test images:', len(pie_test_list))
print('Number of self images:', len(self_list))
print('Number of self train images:', len(self_train_list))
print('Number of self test images:', len(self_test_list))
print('Number of whole train images:', len(train_list))
print('Number of whole test images:', len(test_list))

data_idx [1, 2, 4, 7, 13, 14, 16, 17, 22, 23, 26, 27, 29, 33, 36, 43, 47, 50, 52, 53, 57, 58, 63, 66, 67]
Number of PIE images: 4250
Number of PIE train images: 2975
Number of PIE test images: 1275
Number of self images: 10
Number of self train images: 7
Number of self test images: 3
Number of whole train images: 2982
Number of whole test images: 1278


# PCA preprocessing (with dimensionality of 80 and 200)

In [2]:
# Use original method
def PCA_transform(X, n_components):
    """Project ``X`` onto its leading principal components.

    The principal axes are the eigenvectors of the covariance matrix of
    the mean-centered data, ordered by decreasing eigenvalue.

    Args:
        X: 2-D array with one sample per row and one feature per column.
        n_components: Number of leading principal components to keep.

    Returns:
        A tuple ``(X_pca, X_mean, eigenvectors)``:
        ``X_pca`` is the centered data projected onto the kept components,
        ``X_mean`` is the per-feature mean that was subtracted, and
        ``eigenvectors`` holds the kept principal axes as columns, so new
        data can be projected with ``(data - X_mean) @ eigenvectors``.
    """
    # Compute the covariance matrix of the centered data.
    m = X.shape[0]
    X_mean = np.mean(X, axis=0)
    X_centered = X - X_mean
    covariance_matrix = 1 / m * np.dot(X_centered.T, X_centered)

    # The covariance matrix is symmetric, so use eigh: unlike eig it always
    # returns real eigenvalues/eigenvectors (no spurious complex parts from
    # round-off) and is faster.
    eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)

    # Sort by descending eigenvalue and keep the first n_components axes.
    order = np.argsort(eigenvalues)[::-1][:n_components]
    eigenvectors = eigenvectors[:, order]

    X_pca = X_centered @ eigenvectors
    return X_pca, X_mean, eigenvectors

# Use SVD
def PCA_SVD_transform(X, n_components):
    """Project ``X`` onto its leading principal components using an SVD.

    The data is mean-centered along axis 0, decomposed with a reduced
    singular value decomposition, and projected onto the first
    ``n_components`` right singular vectors.

    Args:
        X: 2-D array with one sample per row.
        n_components: Number of leading components to keep.

    Returns:
        The centered data projected onto the kept components.
    """
    center = np.mean(X, axis=0)
    deviations = X - center
    # Only the right singular vectors (rows of the last factor) are needed.
    _, _, right_vectors = np.linalg.svd(deviations, full_matrices=False)
    basis = right_vectors[:n_components].T
    return deviations @ basis


# SVM

In [4]:
from libsvm.svm import svm_problem, svm_parameter
from libsvm.svmutil import svm_train, svm_predict

def SVM_PCA_preprocess(data, label, test_data, test_label, C, dimensionality):
    """Train and evaluate libsvm classifiers on raw or PCA-reduced data.

    For every penalty value in ``C`` a model is trained with the libsvm
    options ``-s 0 -t 0 -c <c>`` and evaluated with ``svm_predict``, which
    reports the accuracy on the test set. Nothing is returned.

    Args:
        data: Training samples, one row per sample.
        label: Training labels.
        test_data: Test samples, one row per sample.
        test_label: Test labels.
        C: Iterable of penalty parameters to try.
        dimensionality: Either the string ``'raw'`` (use the samples as
            they are) or an iterable of PCA dimensionalities to try.
    """
    use_raw = isinstance(dimensionality, str) and dimensionality == 'raw'

    dims = []
    train_proj = test_proj = None
    if not use_raw:
        dims = list(dimensionality)
        if dims:
            # The PCA basis does not depend on C, and PCA_transform keeps the
            # first n_components axes in descending-eigenvalue order, so the
            # basis for a smaller d is a column prefix of the largest one.
            # Compute the decomposition once and slice it below instead of
            # repeating it for every (C, d) pair.
            train_proj, mean, eigenv = PCA_transform(data, n_components=max(dims))
            test_proj = (test_data - mean) @ eigenv

    for c in C:
        print('penalty parameter C = ', c)
        param = ('-s 0 -t 0 -c ' + str(c))
        if use_raw:
            print('raw face images (vectorized)')
            prob = svm_problem(label, data)
            model = svm_train(prob, param)
            svm_predict(test_label, test_data, model)
        else:
            for d in dims:
                print('dimensionality is ',d)
                # prepare dataset: keep the first d principal components
                data_new = np.ascontiguousarray(train_proj[:, :d])
                test_data_new = np.ascontiguousarray(test_proj[:, :d])
                # train svm model
                prob = svm_problem(label, data_new)
                model = svm_train(prob, param)
                # predict test dataset
                svm_predict(test_label, test_data_new, model)

#=====================================================================================
C = [0.01, 0.1, 1] # penalty parameters to try
D = [10, 80, 200] # PCA dimensionalities to try

# Load the flattened grayscale images and labels for both splits.
train_data, train_label = get_img_vector(train_list)
test_data, test_label = get_img_vector(test_list)

# First evaluate on the raw image vectors, then on the PCA-reduced features.
SVM_PCA_preprocess(train_data, train_label, test_data, test_label, C, dimensionality='raw')
SVM_PCA_preprocess(train_data, train_label, test_data, test_label, C, dimensionality=D)

penalty parameter C =  0.01
raw face images (vectorized)
Accuracy = 99.2958% (1269/1278) (classification)
penalty parameter C =  0.1
raw face images (vectorized)
Accuracy = 99.2958% (1269/1278) (classification)
penalty parameter C =  1
raw face images (vectorized)
Accuracy = 99.2958% (1269/1278) (classification)
penalty parameter C =  0.01
dimensionality is  10
Accuracy = 77.543% (991/1278) (classification)
dimensionality is  80
Accuracy = 98.9828% (1265/1278) (classification)
dimensionality is  200
Accuracy = 99.2958% (1269/1278) (classification)
penalty parameter C =  0.1
dimensionality is  10
Accuracy = 77.3865% (989/1278) (classification)
dimensionality is  80
Accuracy = 98.9828% (1265/1278) (classification)
dimensionality is  200
Accuracy = 99.2958% (1269/1278) (classification)
penalty parameter C =  1
dimensionality is  10
Accuracy = 74.3349% (950/1278) (classification)
dimensionality is  80
Accuracy = 98.9828% (1265/1278) (classification)
dimensionality is  200
Accuracy = 99.295