# Video analysis on the MOUD dataset

This notebook contains a model to evaluate facial behaviors from videos from the MOUD dataset obtaining and processing data obtained from OpenFace toolkit. LINK: https://github.com/TadasBaltrusaitis/OpenFace

In [1]:
# The path of the train and test transcriptions
# The data is seperated in an 80-20 ratio and the test directory is untouched. 
train_path = r"C:\Users\Roshan Sridhar\Google Drive\Documents\NYU\GILAB\MMML\Datasets\MOUD\VideoReviews\transcriptions\train\*.csv"
test_path = r"C:\Users\Roshan Sridhar\Google Drive\Documents\NYU\GILAB\MMML\Datasets\MOUD\VideoReviews\transcriptions\test\*.csv"

In [2]:
import glob
import numpy as np
import pandas as pd
import sklearn
import scipy

In [3]:
# funcion to combine multiple speech, annotation columns to one and drop rest of columns
def clean_moud(df_name):
    if 'Speech' not in df_name.columns:
        df_name['Speech'] = ''    
    if 'speech' in df_name.columns:
        df_name['Speech'] = df_name[['Speech','speech']].fillna('').sum(axis=1)   
    if 'transcription' in df_name.columns:
        df_name['Speech'] = df_name[['Speech','transcription']].fillna('').sum(axis=1)

    if 'sentimentAnnotation' not in df_name.columns:
        df_name['sentimentAnnotation'] = 0    
    if 'sentimentAnnotations' in df_name.columns:
        df_name['sentimentAnnotation'] = df_name[['sentimentAnnotation','sentimentAnnotations']].fillna(0).sum(axis=1)
    if 'sentimentannotations' in df_name.columns:
        df_name['sentimentAnnotation'] = df_name[['sentimentAnnotation','sentimentannotations']].fillna(0).sum(axis=1)
    
    return df_name

In [4]:
# funcion to append all utterances to dataframe
def create_data_df(df_name,data_path):
    '''
    Returns a text dataframe with two columns 'Speech' and 'sentimentAnnotation'
    Returns a sparse matrix of video features to be combined with the text tfidf later'
    '''
    # Creating video df
    v_cols = []
    skeleton_path = r'C:\Users\Roshan Sridhar\Google Drive\Documents\NYU\GILAB\MMML\Python\MOUD\Text_Video\video_skeleton.csv'
    df_v = pd.DataFrame(pd.read_csv(skeleton_path, sep = ','))
    df_v = df_v.drop([df_v.columns.values[0]],axis=1)

    for f in glob.glob(data_path):
        
        # TEXT 
        # append speech utterances to text dataframe 
        df_name = df_name.append(pd.read_csv(f,sep=';'),ignore_index=True)
        
        # VIDEO
        # Create sparse video matrix for each file consecutively while creating text dataframe
        # It is done at this particular point to extract time related groups before the starttime and endtimes are lost
        
        # Creating a temporary text df to get times and clean
        df_name_temp = pd.read_csv(f,sep=';')

        df_name_temp = clean_moud(df_name_temp)
        
        # Remove neutral annotations
        df_name_temp = df_name_temp.query('sentimentAnnotation != 0')
         
        # Creating a df of the corredponding OpenFace features file 
        v_name = r"C:\Users\Roshan Sridhar\Google Drive\Documents\NYU\GILAB\MMML\Datasets\MOUD\OpenFaceFeatures" + "\\" + f.rsplit("\\",1)[1].split(".")[0] + ".mp4.csv"
        df_v_name = pd.read_csv(v_name, sep = ", ", engine = "python")
    
        # Splitting the video data by utterances
        for starttime,endtime in zip(df_name_temp['#starttime'],df_name_temp['#endtime']):    
            # Generate mean and standard deviation upto endtime of utterance, new df because columns need to be dropped
            df_v_name_temp = df_v_name.query('timestamp >='+str(starttime)+'& timestamp <='+str(endtime)).agg(['mean','std'])
            # Drop unwanted labels after querying because timestamp is required to filter in prev line
            df_v_name_temp.drop(['frame','timestamp','confidence','success'], axis = 1)
            # append single row of means and stds to the main dataframe
            
            df_v.loc[len(df_v)] = np.array(df_v_name_temp).ravel()

    # TEXT 
    # combine multiple speech, annotation columns to one and drop rest of columns
    df_name = clean_moud(df_name)
    
    # Remove neutral annotations
    df_name = df_name.query('sentimentAnnotation != 0')
    
    df_name = df_name[['Speech','sentimentAnnotation']].reset_index(drop=True)  
    
    return df_name, df_v

In [5]:
df = pd.DataFrame()
df_t = pd.DataFrame()

# Clean dataframe and create sparse video matrix
df, v_train = create_data_df(df,train_path)
df_t, v_test = create_data_df(df_t,test_path)

#
v_train_sparse = scipy.sparse.csr_matrix(v_train.values)
v_test_sparse = scipy.sparse.csr_matrix(v_test.values)

v_train.head()

Unnamed: 0,frame_mean,timestamp_mean,confidence_mean,success_mean,gaze_0_x_mean,gaze_0_y_mean,gaze_0_z_mean,gaze_1_x_mean,gaze_1_y_mean,gaze_1_z_mean,...,AU12_c_std,AU14_c_std,AU15_c_std,AU17_c_std,AU20_c_std,AU23_c_std,AU25_c_std,AU26_c_std,AU28_c_std,AU45_c_std
0,55.5,1.818485,0.981728,1.0,0.157933,0.22877,-0.960507,-0.067107,0.229786,-0.970847,...,0.0,0.0,0.0,0.13422,0.0,0.394816,0.260877,0.408521,0.0,0.209252
1,199.0,6.606607,0.927676,0.932203,0.136627,0.213596,-0.964651,-0.072389,0.204741,-0.974137,...,0.0,0.195448,0.34926,0.46175,0.208327,0.381166,0.442871,0.365956,0.129447,0.496392
2,357.0,11.878534,0.980742,1.0,0.147945,0.216788,-0.964866,-0.079685,0.213898,-0.973514,...,0.270928,0.498878,0.186892,0.48564,0.454361,0.186892,0.501795,0.500755,0.186892,0.427034
3,521.5,17.367371,0.975868,1.0,0.166863,0.217611,-0.961535,-0.060091,0.236668,-0.969625,...,0.0,0.0,0.365606,0.32705,0.483651,0.0,0.404961,0.314373,0.338926,0.499066
4,657.5,21.90525,0.981475,1.0,0.158007,0.217275,-0.963203,-0.066873,0.22761,-0.971425,...,0.0,0.0,0.329243,0.398733,0.110432,0.188897,0.481047,0.495691,0.155207,0.416463


### Data cleaning and text preprocessing

This section 'Data cleaning and text preprocessing' is to preprocess the text for text+video analysis

In [6]:
# from https://www.kaggle.com/c/word2vec-nlp-tutorial/
import re
from bs4 import BeautifulSoup
import nltk

# execute the following commented step to install the data packages if you don't already have it  
# nltk.download()

from nltk.corpus import stopwords

# resuable function to convert raw speech to preprocessed
def utterance_to_words(raw_utterance):
    # 1. Removing any HTML elements
    utterance_text = BeautifulSoup(raw_utterance, "lxml").get_text()
    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", utterance_text) 
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()
    # 4. convert the stop words to a set
    stops = set(stopwords.words("spanish"))
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]
    # 6. Join the words back into one string separated by space, and return the result.
    return( " ".join( meaningful_words ))

# applying the function to the speech column
df['Speech'] = df['Speech'].apply(lambda x: utterance_to_words(x))
df_t['Speech'] = df_t['Speech'].apply(lambda x: utterance_to_words(x))

In [7]:
# from sklearn.model_selection import train_test_split

# # splitting dataset into train and test in stratified fashion and a ratio of 80% - 20%
# X, y = df[['Speech']],df[['sentimentAnnotation']]
# X_trn, X_tst, y_trn, y_tst = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_trn, y_trn = df[['Speech']],df[['sentimentAnnotation']]
X_tst, y_tst = df_t[['Speech']],df_t[['sentimentAnnotation']]
df.head()

Unnamed: 0,Speech,sentimentAnnotation
0,habia visto resenas decian picaba usabas,-1.0
1,verdad si use vez t arde asi usas arde ojo,-1.0
2,dije puede ser posible deseaba arde voy poder ...,-1.0
3,tambien tira poquito pelo hagan cuenta quebra ...,-1.0
4,igual lavadas dejado tirar,1.0


### Utterance level video-ONLY analysis

The following 'video-only analysis' code is present here due to dependency of timestamps from text dataset.
This section performs analysis on only the video features extracted. 

The next section 'Machine Learning' contains both the video and text stacked using the 'early fusion' method. (See section 6.1 https://arxiv.org/pdf/1705.09406.pdf)

In [8]:
from sklearn import svm
# SVM model creation and fitting train vector to annotations
model_tf_v = svm.SVC(kernel='linear', C=1, gamma=1).fit(v_train_sparse,y_trn['sentimentAnnotation'].values)

# generate predictions
predicted_tf_v = model_tf_v.predict(v_test_sparse)

In [9]:
# Classification report
from sklearn.metrics import classification_report
print(classification_report(y_tst['sentimentAnnotation'].values, predicted_tf_v))

#create df to show results
disp = y_tst.reset_index(drop=True).join(pd.DataFrame(predicted_tf_v,columns=['Prediction']))
disp = disp.join(pd.DataFrame(disp['sentimentAnnotation']==disp['Prediction'],columns=['Right/Wrong']))
scores = model_tf_v.score(v_test_sparse,y_tst['sentimentAnnotation'].values)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print("Mean sentiment: {!r}. Predicted mean sentiment: {!r}.".format('Positive' if disp['sentimentAnnotation'].mean()>=0 else 'Negative','Positive' if disp['Prediction'].mean()>=0 else 'Negative'))
disp.head()

             precision    recall  f1-score   support

         -1       0.51      0.49      0.50        41
          1       0.62      0.65      0.64        54

avg / total       0.58      0.58      0.58        95

Accuracy: 0.58 (+/- 0.00)
Mean sentiment: 'Positive'. Predicted mean sentiment: 'Positive'.


Unnamed: 0,sentimentAnnotation,Prediction,Right/Wrong
0,1,1.0,True
1,1,-1.0,False
2,-1,1.0,False
3,1,1.0,True
4,-1,1.0,False


In [10]:
# cross validation of training set
from sklearn.model_selection import cross_val_score
clf_cv = svm.SVC(kernel='linear', C=1, gamma=1)
scores = cross_val_score(clf_cv, v_train_sparse, y_trn['sentimentAnnotation'].values, cv=10)
scores
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.49 (+/- 0.26)


In [11]:
from sklearn.linear_model import LogisticRegression
model_lr = LogisticRegression().fit(v_train_sparse,y_trn['sentimentAnnotation'].values)
# generate predictions
predicted_lr = model_lr.predict(v_test_sparse)
# Classification report
print(classification_report(y_tst['sentimentAnnotation'].values, predicted_lr))

             precision    recall  f1-score   support

         -1       0.57      0.56      0.57        41
          1       0.67      0.69      0.68        54

avg / total       0.63      0.63      0.63        95



In [12]:
from sklearn.tree import DecisionTreeClassifier
model_dt = DecisionTreeClassifier().fit(v_train_sparse,y_trn['sentimentAnnotation'].values)
# generate predictions
predicted_dt = model_dt.predict(v_test_sparse)
# Classification report
print(classification_report(y_tst['sentimentAnnotation'].values, predicted_dt))

             precision    recall  f1-score   support

         -1       0.42      0.32      0.36        41
          1       0.56      0.67      0.61        54

avg / total       0.50      0.52      0.50        95



In [13]:
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier().fit(v_train_sparse,y_trn['sentimentAnnotation'].values)
# generate predictions
predicted_rf = model_rf.predict(v_test_sparse)
# Classification report
print(classification_report(y_tst['sentimentAnnotation'].values, predicted_rf))

             precision    recall  f1-score   support

         -1       0.46      0.54      0.49        41
          1       0.60      0.52      0.55        54

avg / total       0.54      0.53      0.53        95



In [14]:
# Classification report
from sklearn.metrics import classification_report
print(classification_report(y_tst['sentimentAnnotation'].values, predicted_tf_v))
print(classification_report(y_tst['sentimentAnnotation'].values, predicted_lr))
print(classification_report(y_tst['sentimentAnnotation'].values, predicted_dt))
print(classification_report(y_tst['sentimentAnnotation'].values, predicted_rf))

             precision    recall  f1-score   support

         -1       0.51      0.49      0.50        41
          1       0.62      0.65      0.64        54

avg / total       0.58      0.58      0.58        95

             precision    recall  f1-score   support

         -1       0.57      0.56      0.57        41
          1       0.67      0.69      0.68        54

avg / total       0.63      0.63      0.63        95

             precision    recall  f1-score   support

         -1       0.42      0.32      0.36        41
          1       0.56      0.67      0.61        54

avg / total       0.50      0.52      0.50        95

             precision    recall  f1-score   support

         -1       0.46      0.54      0.49        41
          1       0.60      0.52      0.55        54

avg / total       0.54      0.53      0.53        95



RFE is used to select most useful features in the data. After studying the columns carefully we see that most of the features show us position of the face in the video (for eg. X, Y and Z co-ordinates which hinder the learning proess by providing the model misguided information (i.e. the model thinks that the position of the face matters in the sentiment and tries to generalize using that.) A new file with selected features like AU or Action Units are considered to provide a more robust and better model in 'Feature Selection' notebook.

In [15]:
#RFE
from sklearn.feature_selection import RFE
rfe = RFE(model_lr,1)
fit = rfe.fit(v_train,y_trn['sentimentAnnotation'].values)
# print("Num of features:",fit.n_features_)
dict(zip(fit.ranking_,v_train.columns))
# print("Selected Features")
# print(fit.support_)

Num of features: 1


{1: 'AU45_r_std',
 2: 'AU01_c_std',
 3: 'AU17_r_std',
 4: 'AU23_c_mean',
 5: 'AU09_r_std',
 6: 'AU14_r_std',
 7: 'AU04_r_mean',
 8: 'Z_50_std',
 9: 'pose_Tz_std',
 10: 'AU28_c_mean',
 11: 'p_18_mean',
 12: 'p_9_std',
 13: 'Y_4_std',
 14: 'y_5_std',
 15: 'y_56_std',
 16: 'Y_7_std',
 17: 'AU10_c_std',
 18: 'y_64_std',
 19: 'p_4_std',
 20: 'y_9_std',
 21: 'Y_31_std',
 22: 'y_54_std',
 23: 'p_11_mean',
 24: 'x_35_mean',
 25: 'x_24_mean',
 26: 'X_45_mean',
 27: 'X_58_mean',
 28: 'AU25_c_mean',
 29: 'Z_50_mean',
 30: 'Z_8_mean',
 31: 'y_65_std',
 32: 'Y_60_mean',
 33: 'Y_3_mean',
 34: 'X_59_mean',
 35: 'Y_21_std',
 36: 'p_13_std',
 37: 'x_4_std',
 38: 'X_2_std',
 39: 'x_31_std',
 40: 'X_35_std',
 41: 'x_0_std',
 42: 'p_22_mean',
 43: 'pose_Ty_std',
 44: 'Y_8_std',
 45: 'Z_57_std',
 46: 'Z_0_std',
 47: 'Z_31_std',
 48: 'AU05_c_mean',
 49: 'Z_41_std',
 50: 'Z_20_std',
 51: 'AU28_c_std',
 52: 'y_4_std',
 53: 'p_6_mean',
 54: 'x_44_mean',
 55: 'x_8_mean',
 56: 'Y_10_mean',
 57: 'AU04_c_std',
 58

### Machine learning

In [16]:
# countVectorizer initialization
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             lowercase = True,    \
                             max_features = 5000) 

# create bag of words vector for the training set using countVectorizer
train_data_features = vectorizer.fit_transform(X_trn['Speech'].values)

In [17]:
# transformation of test data
test_data_features = vectorizer.transform(X_tst['Speech'].values)

In [18]:
# tf-idf transformer initialization
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()

# create tfidf transformed vector for the training set using tf-idf transformer
X_train_tfidf = tfidf_transformer.fit_transform(train_data_features)
X_test_tfidf = tfidf_transformer.transform(test_data_features)

In [19]:
# Stacking Video to Text
train_data_features_v = scipy.sparse.hstack([X_train_tfidf, v_train_sparse])
test_data_features_v = scipy.sparse.hstack([X_test_tfidf, v_test_sparse])

In [20]:
# SVM model creation and fitting train vector to annotations
from sklearn import svm
model_tf = svm.SVC(kernel='linear', C=1, gamma=1).fit(train_data_features_v,y_trn['sentimentAnnotation'].values)

# generate predictions
predicted_tf = model_tf.predict(test_data_features_v)

# Classification report
from sklearn.metrics import classification_report
print(classification_report(y_tst['sentimentAnnotation'].values, predicted_tf))

#create df to show results
disp = X_tst.join(y_tst).reset_index(drop=True).join(pd.DataFrame(predicted_tf,columns=['Prediction']))
disp = disp.join(pd.DataFrame(disp['sentimentAnnotation']==disp['Prediction'],columns=['Right/Wrong']))
scores = model_tf.score(test_data_features_v,y_tst['sentimentAnnotation'].values)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print("Mean sentiment: {!r}. Predicted mean sentiment: {!r}.".format('Positive' if disp['sentimentAnnotation'].mean()>=0 else 'Negative','Positive' if disp['Prediction'].mean()>=0 else 'Negative'))
disp.head()

             precision    recall  f1-score   support

         -1       0.54      0.51      0.53        41
          1       0.64      0.67      0.65        54

avg / total       0.60      0.60      0.60        95

Accuracy: 0.60 (+/- 0.00)
Mean sentiment: 'Positive'. Predicted mean sentiment: 'Positive'.


Unnamed: 0,Speech,sentimentAnnotation,Prediction,Right/Wrong
0,verdad recomiendo,1,1.0,True
1,eh,1,1.0,True
2,leido nunca ningun libro zombies,-1,1.0,False
3,peliculas tal tampoco suelen hacer mucha graci...,1,1.0,True
4,verdad gusta hablando,-1,1.0,False


In [21]:
# cross validation of training set
from sklearn.model_selection import cross_val_score
clf_cv = svm.SVC(kernel='linear', C=1, gamma=1)
scores = cross_val_score(clf_cv, train_data_features_v, y_trn['sentimentAnnotation'].values, cv=10)
scores
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.49 (+/- 0.26)


Logistic Regression

In [22]:
# LR model creation and fitting train vector to annotations
model_tf = LogisticRegression().fit(train_data_features_v,y_trn['sentimentAnnotation'].values)
# generate predictions
predicted_tf = model_tf.predict(test_data_features_v)

# Classification report
from sklearn.metrics import classification_report
print(classification_report(y_tst['sentimentAnnotation'].values, predicted_tf))

#create df to show results
disp = X_tst.join(y_tst).reset_index(drop=True).join(pd.DataFrame(predicted_tf,columns=['Prediction']))
disp = disp.join(pd.DataFrame(disp['sentimentAnnotation']==disp['Prediction'],columns=['Right/Wrong']))
scores = model_tf.score(test_data_features_v,y_tst['sentimentAnnotation'].values)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print("Mean sentiment: {!r}. Predicted mean sentiment: {!r}.".format('Positive' if disp['sentimentAnnotation'].mean()>=0 else 'Negative','Positive' if disp['Prediction'].mean()>=0 else 'Negative'))
disp.head()

             precision    recall  f1-score   support

         -1       0.57      0.56      0.57        41
          1       0.67      0.69      0.68        54

avg / total       0.63      0.63      0.63        95

Accuracy: 0.63 (+/- 0.00)
Mean sentiment: 'Positive'. Predicted mean sentiment: 'Positive'.


Unnamed: 0,Speech,sentimentAnnotation,Prediction,Right/Wrong
0,verdad recomiendo,1,1.0,True
1,eh,1,-1.0,False
2,leido nunca ningun libro zombies,-1,1.0,False
3,peliculas tal tampoco suelen hacer mucha graci...,1,1.0,True
4,verdad gusta hablando,-1,1.0,False


In [23]:
# cross validation of training set
from sklearn.model_selection import cross_val_score
clf_cv = svm.SVC(kernel='linear', C=1, gamma=1)
scores = cross_val_score(clf_cv, X_train_tfidf, y_trn['sentimentAnnotation'].values, cv=10)
scores
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.64 (+/- 0.15)
