### README
This script builds and tests a baseline SVM model. The intent of this model is to
serve as a goal for our CNN+BiLSTM model to beat.

In [3]:
# Import required libraries
import pandas as pd # for holding data in a dataframe
import os # for manipulating directories and listing files
from matplotlib import pyplot as plt #visualization
import numpy as np # linear algebra and array work
from sklearn import svm # Model for training
from sklearn.model_selection import train_test_split #Splitting data
from sklearn.metrics import accuracy_score, classification_report #accuracy report

In [54]:
def read_csvs(dirPath,marker):
    '''
    Read all the .csv files in the specified directory and return them as a
    single merged dataframe with a reset index.

    Files are processed in numeric order, where the number is the integer
    between the first character of the file name and the '.csv' extension
    (so 'v2.csv' comes before 'v10.csv'). A .csv name that does not fit that
    pattern makes int() raise ValueError. Every row of a file receives a
    'video' column holding the file's position in that ordering followed by
    `marker` (e.g. '0h', '1h', ...).

    Parameters
    ----------
    dirPath : str
        Directory holding the .csv files. A trailing path separator is
        optional.
    marker : str
        Text appended to the video number to form the 'video' value.

    Returns
    -------
    pandas.DataFrame
        Rows of all files stacked in processing order, indexed 0..n-1.

    Raises
    ------
    ValueError
        If the directory contains no .csv files.
    '''
    # Keep only the .csv files and sort them numerically
    csv_names = [name for name in os.listdir(dirPath) if name.endswith('.csv')]
    csv_names.sort(key=lambda name: int(name[1:-4]))

    # Fail with a clear message instead of an unbound-variable error later on
    if not csv_names:
        raise ValueError('No .csv files found in ' + repr(dirPath))

    # Read every file and tag its rows with the video number + marker
    frames = []
    for inx, name in enumerate(csv_names):
        # os.path.join works whether or not dirPath ends with a separator
        frame = pd.read_csv(os.path.join(dirPath, name))
        frame['video'] = str(inx) + marker
        frames.append(frame)

    # One concat at the end replaces the repeated DataFrame.append calls
    # (DataFrame.append no longer exists in pandas 2.0+) and avoids re-copying
    # the accumulated rows on every iteration.
    return pd.concat(frames).reset_index(drop=True)

def keys_to_array(df, keys):
    '''
    Build the feature array and label array for a chosen set of videos.

    Rows of `df` whose 'video' value appears in `keys` are selected. The
    'isNerv' column of those rows becomes y, and every remaining column
    except 'video' becomes X. The input dataframe is left unchanged.

    Returns the tuple (X, y) of arrays, ready to be passed to a learning
    algorithm or used for testing.
    '''
    # Boolean mask marking the rows that belong to the requested videos
    wanted = df['video'].isin(keys)
    subset = df[wanted]

    # Labels come from 'isNerv'; features are whatever is left once the
    # video key and the label have been removed
    y = subset['isNerv'].values
    X = subset.drop(columns=['video', 'isNerv']).values

    return X, y


In [17]:
# Happy data: merge every csv in the folder and label all rows isNerv = 0
happyPath = 'dataset/happy_frames_openface/'
happy_df = read_csvs(happyPath, 'h').assign(isNerv=0)

# Nervous data: same procedure, labelled isNerv = 1
nervousPath = 'dataset/nervous_frames_openface/'
nervous_df = read_csvs(nervousPath, 'n').assign(isNerv=1)

In [20]:
# Stack the happy and nervous rows into one dataframe. pd.concat is used
# because DataFrame.append no longer exists in pandas 2.0+. Row labels are
# kept as they were (no ignore_index), matching what append produced, so the
# labels of the two halves overlap.
merged_df = pd.concat([happy_df, nervous_df])

In [22]:
# Columns to keep from the imported dataset: the AU features, followed by the
# video key and the class label. The leading space in each AU name is part of
# the column name being looked up.
cols = [
    ' AU01_r', ' AU02_r', ' AU04_r', ' AU05_r', ' AU06_r', ' AU07_r',
    ' AU09_r', ' AU10_r', ' AU12_r', ' AU14_r', ' AU15_r', ' AU17_r',
    ' AU20_r', ' AU23_r', ' AU25_r', ' AU26_r', ' AU45_r',
    'video', 'isNerv',
]
# Selecting with a list of labels gives a new dataframe holding only those columns
merged_df = merged_df[cols]

In [29]:
# Distinct values of the 'video' column; these keys are what gets split
X_keys = merged_df.loc[:, 'video'].unique()

In [47]:
# Divide the video keys into a train group and a test group; the fixed
# random_state makes the division repeatable
key_split = train_test_split(X_keys, random_state=1)
X_train_keys, X_test_keys = key_split

In [58]:
# Convert each group of video keys into its feature array and label array
(X_train, y_train), (X_test, y_test) = (
    keys_to_array(merged_df, key_group)
    for key_group in (X_train_keys, X_test_keys)
)

In [73]:
'''
Initialize a default model with an RBF kernel, then train and test the model.
We understand that not tuning gamma and C will lead to worse performance.
However, we have already performed ~5 hours of hyperparameter tuning with
minimal improvements to scores. Tuning of the SVM leads to the SVM simply
predicting 0 for everything. Further, this test is just a baseline
to serve as a goal for our NN implementation to beat.
'''
# Build the RBF-kernel classifier and train it on the training arrays;
# fit() returns the estimator itself, so SVM is the fitted model
SVM = svm.SVC(kernel='rbf').fit(X_train, y_train)

# Predict labels for the held-out arrays
y_pred = SVM.predict(X_test)

# Print the per-class precision / recall / f1 summary
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.68      0.82      0.75      3731
           1       0.41      0.24      0.30      1895

    accuracy                           0.63      5626
   macro avg       0.55      0.53      0.52      5626
weighted avg       0.59      0.63      0.60      5626

