In [1]:
# import necessary libraries

# import built-in libraries
import os
import pandas as pd
import numpy as np
from glob import glob
import re

# import functions from tslearn
from tslearn.utils import to_time_series_dataset
from tslearn.preprocessing import TimeSeriesScalerMinMax
from tslearn.svm import TimeSeriesSVC

# import functions from sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

In [2]:
# a function that aggregates all csv files
def aggregate_data(csv_dir, label):
    
    # get the list of all csv files
    csv_lst = glob(os.path.join(csv_dir, '*.csv'))
    
    # compile a regular expression to extract
    # the digits part of the file name
    r = re.compile('\D*(\d*).csv')
    
    # create a lambda expression with this regular expression
    extract_num = lambda x:int(r.search(x).group(1))
    
    # sort the file name in ascending order
    csv_lst.sort(key=extract_num)
                               
    # create an empty dataframe to hold the results
    df = pd.DataFrame()
    
    # process each csv file in turn
    for csv in csv_lst:
        
        # read in the data from this csv file
        df_tmp = pd.read_csv(csv)
        
        # strip the white spaces before and after the column names
        df_tmp.columns = df_tmp.columns.str.strip()
        
        # add a column to denote the video id
        df_tmp['vid'] = extract_num(csv)
        
        # add another column to denote the label
        df_tmp['label'] = label
        
        # append this dataframe to the aggregated dataframe
        df = df.append(df_tmp, ignore_index=True)
        
        # reset the index without keeping the old one
        df.reset_index(drop=True, inplace=True)
    
    # return the resulting dataframe
    return df

In [3]:
# a function that sanitizes data
def sanitize_data(df):
    
    # copy the dataset to tmp
    df_tmp = df.copy()
    
    # create a filter that filters out unsuccessful entries
    filt1 = df_tmp['success'] != 1
    
    # remove unsuccessful entries
    df_tmp.drop(index=df.loc[filt1].index, inplace=True)
    
    # create a filter that filters out low-confidence entries
    filt2 = df_tmp['confidence'] < 0.8
    
    # remove low-confidence entries
    df_tmp.drop(index=df_tmp.loc[filt2].index, inplace=True)
    
    # reset the index without keeping the old one
    df_tmp.reset_index(drop=True, inplace=True)
    
    # return the resulting dataframe
    return df_tmp

In [4]:
# set the root folder of the dataset
root = 'dataset'

# set the folder that contains the dataset for happy smiles
dir_happy = os.path.join(root,'happy_frames_openface')

# set the folder that contains the dataset for nervous smiles
dir_nervous = os.path.join(root,'happy_frames_openface')

# aggregate the data for happy smiles
# the label for happy smiles is 1
df_happy = aggregate_data(dir_happy, 1)

# aggregate the data for nervous smiles
# the label for nervous smiles is 0
df_nervous = aggregate_data(dir_nervous, 0)

In [5]:
# data sanitizations

# the data for the happy smiles
df_happy = sanitize_data(df_happy)

# the data for the nervous smiles
df_nervous = sanitize_data(df_nervous)

# save to files
df_happy.to_csv('happy_smiles.csv')
df_nervous.to_csv('nervous_smiles.csv')

In [6]:
# a function that groups frames 
# belonging to each video
def group_featrues(df_features):
    
    # create an empty list to hold the results
    lst = []
    
    # process each entry by video id
    for vid in df_features['vid'].unique():
        
        # create a filter that leave frames
        # belonging to this vid alone
        filt = df_features['vid'] == vid
        
        # extract the frames belonging to this vid
        # convert to ndarray and add to the list
        lst.append(df_features[filt].drop(['vid'],axis=1).to_numpy())
    
    # return the results as a ndarray
    return np.array(lst)

In [7]:
# feature extraction

# selected features for classification
features = ['pose_Tx', 'pose_Ty', 'pose_Tz', 'AU06_r', 'AU12_r', 'AU26_r','vid']

# extract features for happy smiles
df_happy_features = df_happy.loc[:,features]

# extract features for nervous smiles
df_nervous_features = df_nervous.loc[:,features]

# group features to a ndarray
np_happy_features = group_featrues(df_happy_features)
np_nervous_features = group_featrues(df_nervous_features)

# create a combined features
X = np.concatenate([np_happy_features,np_nervous_features])

# create a combined labels
y = np.concatenate([np.ones(np_happy_features.shape[0]), np.zeros(np_nervous_features.shape[0])])

In [10]:
# split the dataset into the training set and test set
X_train,X_test,y_train,y_test = train_test_split(X, y, shuffle=True, random_state=16, test_size = 0.2)

# convert these features to time series

# features from the training set 
X_train = to_time_series_dataset(X_train)

# features from the test set
X_test = to_time_series_dataset(X_test)

# feature scaling

# features from the training set 
#X_train = TimeSeriesScalerMinMax().fit_transform(X_train)

# features from the test set
#X_test = TimeSeriesScalerMinMax().fit_transform(X_test)

In [11]:
# model selection

# create a kNN model with the metric of DTW 
svc = TimeSeriesSVC()

# possible k values
pgrid = {'kernel': ['gak', 'poly', 'rbf'], 'C':list(range(1,5))}

# 5-fold cross-validation for training
cv = KFold(n_splits=5, shuffle=True, random_state=0)

# create a searcher for the optimal value of k
grid_search = GridSearchCV(estimator=svc, n_jobs=6, param_grid=pgrid, cv=cv)

# SVM parameter tuning with the training set
grid_search.fit(X_train, y_train)

# get the optimal kernel from the grid search
k = grid_search.best_params_['kernel']

# get the optimal kernel
print(f'The optimal kernel for the kernel SVM is {k}')

# get the optimal C, penalty parameter C of the error term
# from the grid search
c = grid_search.best_params_['C']

# get the optimal kernel
print(f'The optimal C for the kernel SVM is {c}')

The optimal kernel for the kernel SVM is gak
The optimal C for the kernel SVM is 1
