# Emotion Recognition Project

## MAIZA Hichem
### M2 ATAL

In [1]:
import os
import glob
import sklearn
import pandas as pd
import numpy as np 
import arff
import re
import time
from scipy.stats import kurtosis, skew

# Define paths to the generated .arff and their labels

In [2]:
# train list files
AIBO_O_DIR = np.sort(glob.glob("/home/maiza/Bureau/Projets/emotion/AIBO/wav/AIBO-O/MFCCs/*"))
#test list files
AIBO_M_DIR = np.sort(glob.glob("/home/maiza/Bureau/Projets/emotion/AIBO/wav/AIBO-M/MFCCs/*"))
# labels : case of classes
labels_2cl = "/home/maiza/Bureau/Projets/emotion/AIBO/wav/chunk_labels_2cl_corpus.txt"
# labels : case of 5 classes
labels_5cl = "/home/maiza/Bureau/Projets/emotion/AIBO/wav/chunk_labels_5cl_corpus.txt"

# Read the .arff Files and extract the genrated MFCCs Vectors

In [3]:
def Extract_MFCC_Filename(MFCC, file_name ,list_file_dir):
    
    '''
    Function to extract MFCCs, and Files names 
    
    '''
    
    for file in list_file_dir:
        with open(file) as f:
            file_name.append(re.sub('.arff','',os.path.basename(file)))
            MFCC.append(np.array(f.readlines()[-1].split(',')[1:-1], dtype='float'))
    return MFCC, file_name

def Generate_MFCC_Data_Frame(train_list_file_dir, test_list_file_dir):
    
    '''
    Execute Extract_MFCC_Filename(...) function and then put MFCCs of 
    the training and test list file in one data frame
    
    Input-  train_list_file_dir
            test_list_file_dir
            
    output  Data Frame 
    '''
    
    tic = time.process_time()
    MFCC = []
    file_name = [] 
    
    MFCC, file_name = Extract_MFCC_Filename(MFCC, file_name ,train_list_file_dir)
    MFCC, file_name = Extract_MFCC_Filename(MFCC, file_name ,test_list_file_dir)
    
    train_test_data_set = {
        'file_name': file_name,
        'MFCC': MFCC
    }
    
    train_test_data_set = pd.DataFrame(train_test_data_set)
    toc = time.process_time() 
    print ('Executing_time_in_ms = ' + str(1000*(toc-tic)))
    
    return train_test_data_set
    

In [4]:
train_test_data_set = Generate_MFCC_Data_Frame(AIBO_O_DIR, AIBO_M_DIR)

Executing_time_in_ms = 4359.801334


In [5]:
train_test_data_set.head()

Unnamed: 0,MFCC,file_name
0,"[0.005667331, 0.0003117731, 0.005355558, 53.0,...",Ohm_01_015_00
1,"[0.01041963, 0.0002702086, 0.01014942, 61.0, 3...",Ohm_01_016_00
2,"[0.01909119, 0.0002510243, 0.01884016, 60.0, 1...",Ohm_01_017_00
3,"[0.01374097, 0.0002331673, 0.0135078, 66.0, 20...",Ohm_01_018_00
4,"[0.001530198, 0.0002304439, 0.001299754, 49.0,...",Ohm_01_018_01


# A function to read labels than reorgnize them in dictionnary structures

In [6]:
def read_labels(labels_file , c) : 
    '''
    Read labels file and return a data frame 
    
    Input- labels_file name
           c number of classes it can be 2 or 5  
    '''
    
    file_name = [] 
    classe = []
    value = []
    
    with open(labels_file) as labels :
        
        lines = labels.readlines()
        
        for line in lines :
            line = line.strip('\n').split(' ')
            file_name.append(line[0] ) 
            classe.append(line[1])
            value.append(line[2])
            
        dic = {
            'file_name':file_name, 
            'classe_{}'.format(c):classe,
            'value_{}'.format(c): value
        }
        
        data_frame = pd.DataFrame(dic)
        
    return data_frame

# Extract Both Dictionnary and reorgnize them 

In [7]:
data_frame_2c = read_labels (labels_2cl, 2)
data_frame_2c.head()

Unnamed: 0,classe_2,file_name,value_2
0,IDL,Mont_01_000_00,1.0
1,IDL,Mont_01_001_00,1.0
2,IDL,Mont_01_001_01,1.0
3,IDL,Mont_01_004_00,0.9
4,IDL,Mont_01_005_00,1.0


In [8]:
data_frame_5c = read_labels (labels_5cl, 5)
data_frame_5c.head()

Unnamed: 0,classe_5,file_name,value_5
0,N,Mont_01_000_00,1.0
1,N,Mont_01_001_00,1.0
2,N,Mont_01_001_01,1.0
3,N,Mont_01_004_00,0.9
4,N,Mont_01_005_00,1.0


In [9]:
complete_file = {
    
    'file_name': train_test_data_set['file_name'],
    'MFCC': train_test_data_set['MFCC'],
    'classe_5c': data_frame_5c['classe_5'],
    'classe_2c': data_frame_2c['classe_2'],
    'value_5c': data_frame_5c['value_5'],
    'value_2c': data_frame_2c['value_2'],
    
    # features
    'mean': [np.mean(i) for i in train_test_data_set['MFCC']], 
    'std' : [np.std(i) for i in train_test_data_set['MFCC']], 
    'skewness': [skew(i) for i in train_test_data_set['MFCC']],
    'kurtosis': [kurtosis(i) for i in train_test_data_set['MFCC']]
    
}

complete_file_data_frame = pd.DataFrame(complete_file)
complete_file_data_frame.head()

Unnamed: 0,MFCC,classe_2c,classe_5c,file_name,kurtosis,mean,skewness,std,value_2c,value_5c
0,"[0.005667331, 0.0003117731, 0.005355558, 53.0,...",IDL,N,Ohm_01_015_00,357.183887,22.074596,18.693648,185.900529,1.0,1.0
1,"[0.01041963, 0.0002702086, 0.01014942, 61.0, 3...",IDL,N,Ohm_01_016_00,370.521908,31.192448,19.195588,324.308852,1.0,1.0
2,"[0.01909119, 0.0002510243, 0.01884016, 60.0, 1...",IDL,N,Ohm_01_017_00,353.02596,34.716302,18.528228,271.927149,1.0,1.0
3,"[0.01374097, 0.0002331673, 0.0135078, 66.0, 20...",IDL,N,Ohm_01_018_00,351.132968,23.971229,18.470221,191.195626,0.9,0.9
4,"[0.001530198, 0.0002304439, 0.001299754, 49.0,...",IDL,N,Ohm_01_018_01,367.001161,34.619512,19.073093,394.509343,1.0,1.0


In [10]:
train_data_set = complete_file_data_frame.iloc[:len(AIBO_O_DIR)]
test_data_set = complete_file_data_frame.iloc[len(AIBO_O_DIR):]

In [11]:
def to_matrix(data):
    
    '''
    
    Sklearn accept vectors of shape (X,Y), so MFCCs has to be in matrix format, this function accept a Data Frame vector as
    Input and than output a matrix as output 
    
    Input- data Frame vector 
    output- Matrix format 
    
    train_data_frame['MFCC'][0] : is the number of columns it's the same in two cases :) 
    
    '''
    
    mfcc = np.zeros([len(data['MFCC']), len(train_data_set['MFCC'][0])]) 
    for index, value in enumerate(data['MFCC']):
        mfcc[index,:] = value
    return mfcc

def compute_metrics(true_data, prediction, average = None) :
    '''
    A function to compute metrics results
    
    '''
    
    from sklearn.metrics import accuracy_score, precision_score, recall_score
    
    acc = accuracy_score(np.array(true_data), prediction) 

    precision = precision_score(np.array(true_data), prediction, average= average)

    recall = recall_score(np.array(true_data), prediction ,average = average)

    print('the accuracy is equal to '+ str(acc))
    print('the precision is equal to ' + str (precision))
    print('the recall is equal to ' + str(recall))
    
def to_binary (data) : 
    a = [] 
    for i in data:
        if i == 'NEG':
            a.append(0)
        if i == 'IDL':
            a.append(1)
    return np.array(a) 

def to_digits(data, unique_labels_5c):
        
    a = [] 
    for i in data:
        if i == unique_labels_5c[0]:
            a.append(0)
        if i == unique_labels_5c[1]:
            a.append(1)
        if i == unique_labels_5c[2]:
            a.append(2)
        if i == unique_labels_5c[3]:
            a.append(3)
        if i == unique_labels_5c[4]:
            a.append(4)
            
    return np.array(a) 

# APPROACHES

In [12]:
train_mfcc = to_matrix(train_data_set)
test_mfcc = to_matrix(test_data_set)

# Naive Bayes Classifier 

In [16]:
from sklearn import naive_bayes
from sklearn.naive_bayes import GaussianNB
from scipy import sparse
model = GaussianNB ()
model.fit(train_mfcc, np.array(train_data_set['classe_2c'])) 
prediction = model.predict(test_mfcc)
compute_metrics(np.array(test_data_set['classe_2c']), prediction,  average='weighted')   

the accuracy is equal to 0.631706430907
the precision is equal to 0.552596642067
the recall is equal to 0.631706430907


# Logistic Regression 

In [17]:
from sklearn import linear_model

model = linear_model.LogisticRegression() 
model.fit(train_mfcc, np.array(train_data_set['classe_2c'])) 
prediction = model.predict(test_mfcc)
compute_metrics(np.array(test_data_set['classe_2c']), prediction,  average='weighted')

the accuracy is equal to 0.648540632191
the precision is equal to 0.537911100454
the recall is equal to 0.648540632191


# SVM Classifier

In [18]:
# test Normalizing features
mean_train_mfcc = train_mfcc/np.mean(train_mfcc, axis = 1).reshape(-1,1)
mean_test_mfcc  = test_mfcc/np.mean(test_mfcc, axis = 1).reshape(-1,1)

In [19]:
from sklearn import svm 

model = svm.SVC()
model.fit(train_mfcc, np.array(train_data_set['classe_5c'])) 
prediction = model.predict(test_mfcc)

In [33]:
compute_metrics(np.array(test_data_set['classe_5c']), prediction, average='weighted')  

the accuracy is equal to 0.560615235558
the precision is equal to 0.314289442339
the recall is equal to 0.560615235558


  'precision', 'predicted', average, warn_for)


# GMM  

In [14]:
train_binary_label = to_binary(train_data_set['classe_2c'])
test_binary_label = to_binary(test_data_set['classe_2c'])

from sklearn import mixture
g = mixture.GaussianMixture(n_components=2) 
g.fit(train_mfcc,train_binary_label)
prediction = g.predict(test_mfcc)

In [15]:
compute_metrics(test_binary_label, prediction, average='weighted')   

the accuracy is equal to 0.63122199346
the precision is equal to 0.555437498747
the recall is equal to 0.63122199346
