In [None]:
import soundfile # to read audio file
import numpy as np
import librosa # to extract speech features
import glob
import os
import re

In [None]:
####function for extracting speech features####

def extract_feature(file_name, **kwargs):
    """
    Extract a 1-D feature vector from the audio file `file_name`.

    Every requested feature is computed per frame, averaged over time and
    appended to the result in the fixed order mfcc, chroma, mel.

    Keyword flags (all optional, treated as off when omitted):
        - mfcc:   40 MFCC coefficients
        - chroma: chromagram computed from the STFT magnitude
        - mel:    mel-scaled spectrogram

    Returns:
        np.ndarray: concatenation of the selected time-averaged features;
        an empty array when no flag is set.
    """
    mfcc = kwargs.get("mfcc")
    chroma = kwargs.get("chroma")
    mel = kwargs.get("mel")

    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate

    # soundfile yields multi-channel audio as (frames, channels) whereas
    # librosa expects (channels, frames); the transpose is a no-op for 1-D mono.
    X = librosa.to_mono(X.T)

    result = np.array([])
    if mfcc:
        mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
        result = np.hstack((result, mfccs))

    if chroma:
        # The STFT magnitude is only needed for the chroma feature.
        stft = np.abs(librosa.stft(X))
        chroma_mean = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
        result = np.hstack((result, chroma_mean))

    if mel:
        # Audio must be passed by keyword: recent librosa releases reject it positionally.
        mel_mean = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
        result = np.hstack((result, mel_mean))

    return result

In [None]:
# The four emotions used for the speech model. Each entry doubles as the name of
# the folder that is globbed for .wav files, so the spelling "Neutra" must stay as is.
AVAILABLE_EMOTIONS = {"Anger", "Happiness", "Neutra", "Sadness"}

In [None]:
R, q = [], []   # R: one speech feature vector per clip; q: [file name, emotion] per clip
count = 0       # number of clips processed

# Load the speech modality: for every emotion folder, read the session 1 clips
# and then the session 2 clips, each group in sorted file-name order.
for emotion in AVAILABLE_EMOTIONS:
    for session in ("Ses01", "Ses02"):
        for file in sorted(glob.glob(os.path.join(emotion, session + "*.wav"))):
            features = extract_feature(file, mfcc=True, chroma=True, mel=True)
            R.append(features)
            # The emotion is the folder being scanned; take it from the loop
            # variable instead of splitting the path on a hard-coded "/".
            q.append([os.path.basename(file), emotion])
            count += 1

print(count)

In [None]:
# Turn the collected Python lists into arrays: R as float32 features,
# q as an array of [file name, emotion] rows.
R = np.array(R).astype('float32')
q = np.array(q)

# Replace every emotion name in q by its class id (Anger=0, Happiness=1,
# Sadness=2, Neutra=3); the ids are cast to integers in a later cell.
for emotion_name, class_id in (("Anger", 0), ("Happiness", 1), ("Sadness", 2), ("Neutra", 3)):
    q[q == emotion_name] = class_id

In [None]:
# Put the [file name, label] columns next to the feature columns so that a
# single sort keeps every label attached to its feature vector.
arr = np.concatenate((q, R), axis=1)

# Rows are compared as Python lists, i.e. ordered by their first entry (the file name).
li = np.array(sorted(arr.tolist()))

# Column 1 holds the label, columns 2 onwards hold the speech features.
l = li[:, 1].astype('int64')
M = li[:, 2:].astype('float32')

In [None]:
#### extracting text features ####

# Session 1 transcripts: collect one [utterance id, text] pair per transcript line
# (all emotions are read here; the four target emotions are selected in a later cell).
y = []
for speaker in ("F", "M"):
    for i in range(1, 9):
        for file in glob.glob("Ses01" + speaker + "_impro0" + str(i) + ".txt"):
            # The context manager closes the handle even if parsing fails.
            with open(file) as f:
                for line in f:
                    # Only lines that start with an utterance id are transcript lines.
                    if re.search("^Ses0", line) is None:
                        continue
                    # Split into id, second field and remaining text; the second
                    # field is not needed (presumably a timestamp — verify).
                    p = re.split(r"\s", line, maxsplit=2)
                    del p[1]
                    y.append(p)

In [None]:
# Session 2 transcripts: same parsing as for session 1; the pairs are appended
# to the list `y` that the session 1 cell created.
for speaker in ("F", "M"):
    for i in range(1, 9):
        for file in glob.glob("Ses02" + speaker + "_impro0" + str(i) + ".txt"):
            # The context manager closes the handle even if parsing fails.
            with open(file) as f:
                for line in f:
                    # Only lines that start with an utterance id are transcript lines.
                    if re.search("^Ses0", line) is None:
                        continue
                    # Split into id, second field and remaining text; the second
                    # field is not needed (presumably a timestamp — verify).
                    p = re.split(r"\s", line, maxsplit=2)
                    del p[1]
                    y.append(p)
# y now contains [utterance id, text] for the transcripts of sessions 1 and 2 (all emotions)

In [None]:
print(type(y))
# x: a sorted copy of the [utterance id, text] pairs, ordered by utterance id;
# y itself is left untouched.
x = list(y)
x.sort()

In [None]:
# All ten emotion folders that are scanned for .wav files when pairing every
# utterance with its emotion; the first four are the ones kept for training.
AVAILABLE_EMO = {
    "Anger", "Happiness", "Neutra", "Sadness",
    "Other", "Surprise", "Disgust", "Fear", "Excited", "Frustration",
}

In [None]:
# Build [utterance id, emotion] pairs from the speech files of all ten emotion
# folders: session 1 first, then session 2, each folder in sorted file order.
y = []
for session in ("Ses01", "Ses02"):
    for emotion in AVAILABLE_EMO:
        for file in sorted(glob.glob(os.path.join(str(emotion), session + "*.wav"))):
            # Utterance id = file name without folder and without the ".wav" suffix.
            utterance_id = os.path.splitext(os.path.basename(file))[0]
            # The emotion is the folder being scanned, so no path splitting is needed.
            y.append([utterance_id, str(emotion)])

In [None]:
import numpy as np

# Order the [utterance id, emotion] pairs by utterance id, like x.
y = sorted(y)

# Keep only the second column of each table: X = transcript text, Y = emotion name.
X = np.array(x)[:, 1]
Y = np.array(y)[:, 1]

In [None]:
# Sanity check: the text column X and the emotion column Y are paired row by row
# in the next cell, so both shapes are expected to match.
print(X.shape,Y.shape)

In [None]:
# Keep only the utterances whose emotion is one of the four target classes.
x, y = [], []   # x: transcript texts, y: matching emotion names
count = 0       # number of utterances kept
EMOT = {
    "Anger",
    "Happiness",
    "Neutra",
    "Sadness"
}
# Walk over the paired rows instead of a hard-coded row count, so the cell works
# for any dataset size.
# NOTE(review): row i of X and row i of Y are assumed to describe the same
# utterance (both were sorted by utterance id) — confirm the two tables line up.
for text, emotion in zip(X, Y):
    if emotion in EMOT:
        x.append(text)
        y.append(emotion)
        count += 1

y = np.array(y)
x = np.array(x)

In [None]:
import collections
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from nltk.stem import PorterStemmer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder

from keras import models
from keras import layers
from keras import regularizers
# English stop-word list consulted by remove_stopwords
# (presumably requires the NLTK stopwords corpus to be installed — verify).
stopwords_list = stopwords.words('english')
# Stemmer instance used when normalising the transcripts
porter = PorterStemmer()

In [None]:
# Number of words kept in the tokenizer dictionary
NB_WORDS = 100000
# Size of the validation set
VAL_SIZE = 1000
# Number of epochs training usually starts with
NB_START_EPOCHS = 18
# Batch size for mini-batch gradient descent
BATCH_SIZE = 512
# Maximum number of words in a sequence
MAX_LEN = 20
# Dimensionality of the GloVe word embeddings
GLOVE_DIM = 50

In [None]:
# Load the pre-trained GloVe vectors into emb_dict: word -> float32 vector.
glove_file = 'glove.6B.' + str(GLOVE_DIM) + 'd.txt'
f = 'glove.6B/'
emb_dict = {}
# GloVe files are UTF-8 text; the encoding is set explicitly so loading does not
# depend on the platform default, and the context manager always closes the file.
with open(f + glove_file, encoding="utf-8") as glove:
    for line in glove:
        # Each line is the word followed by its vector components.
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        emb_dict[word] = vector

In [None]:
# Build the vocabulary from the transcripts in `x`: text is lower-cased, the
# characters listed in `filters` are stripped and tokens are split on spaces.
# NOTE(review): `x` is tokenized exactly as it is at this point; the cell that
# applies remove_stopwords/stemming to `x` comes later in the notebook, so it
# does not influence the sequences built here — confirm this is intended.
tk = Tokenizer(num_words=NB_WORDS,
               filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
               lower=True,
               split=" ")
tk.fit_on_texts(x)

# Map every transcript to its list of word indices
x_train_seq = tk.texts_to_sequences(x)

# Bring every sequence to a common length of MAX_LEN entries
x_train_seq_trunc = pad_sequences(x_train_seq, maxlen=MAX_LEN)


In [None]:
# Row i of emb_matrix is the GloVe vector of the word with tokenizer index i.
emb_matrix = np.zeros((NB_WORDS, GLOVE_DIM))

for w, i in tk.word_index.items():
    # word_index holds every word of the training data; stop as soon as an
    # index falls outside the NB_WORDS rows of the matrix.
    if i >= NB_WORDS:
        break
    vect = emb_dict.get(w)
    # Words that are missing from GloVe keep their all-zero row.
    if vect is not None:
        emb_matrix[i] = vect

In [None]:
def remove_stopwords(input_text):
    """
    Return `input_text` without stop words and without one-character tokens.

    Tokens are obtained by splitting on whitespace and compared against the
    module-level `stopwords_list` case-insensitively, so capitalised stop words
    such as "The" are removed too. Words in the whitelist are always kept, and
    surviving words keep their original casing.
    """
    # Some words which might indicate a certain sentiment are kept via a whitelist
    whitelist = ["n't", "not", "no"]
    clean_words = []
    for word in input_text.split():
        lowered = word.lower()  # the stop-word list is lower-case
        if len(word) > 1 and (lowered not in stopwords_list or lowered in whitelist):
            clean_words.append(word)
    return " ".join(clean_words)

# Clean every transcript in place: drop stop words, then stem word by word.
# The stemmer works on single words, so handing it the whole sentence would
# only touch the end of the string.
# NOTE(review): the tokenizer cell above already built its sequences from `x`
# before this clean-up runs — confirm the intended cell order.
for i in range(x.shape[0]):
    cleaned = remove_stopwords(x[i])
    x[i] = " ".join(porter.stem(word) for word in cleaned.split())

In [None]:
# Replace the emotion names in y by their class ids (Anger=0, Happiness=1,
# Sadness=2, Neutra=3), then convert the array to integers.
for emotion_name, class_id in (("Anger", 0), ("Happiness", 1), ("Sadness", 2), ("Neutra", 3)):
    y[y == emotion_name] = class_id
y = y.astype('int64')
y.shape

In [None]:
# Split the text modality into train and test parts (25% test):
# x_train_seq_trunc holds the padded word-index sequences and y the integer
# emotion labels. random_state is fixed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(x_train_seq_trunc,y,test_size=0.25,random_state=4)
x_train.shape

In [None]:
# Split the speech modality into train and test parts (25% test):
# M holds the speech feature matrix and l the integer emotion labels.
# NOTE(review): test_size and random_state match the text split, presumably so
# both modalities are partitioned the same way; that requires M and
# x_train_seq_trunc to have the same number of rows in the same order — confirm.
M_train, M_test, l_train, l_test = train_test_split(M,l,test_size=0.25,random_state=4)