In [1]:
import librosa
import numpy as np
import pandas as pd
import os
import tensorflow as tf




In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [3]:
emotions=['angry','disgust','fear','happy','neutral','sad','surprise']

In [4]:
main_path="C:\\Users\\navee\\NullClass-Tasks\\Emotion_Detection_voice\\audio\\train"

In [5]:
def load_extract_features(folder_path,emotion_class):
    features=[]
    labels=[]
    
    for file_name in os.listdir(folder_path):
        file_path=os.path.join(folder_path,file_name)
        audio_data,sampling_rate=librosa.load(file_path)
        
        mfccs=librosa.feature.mfcc(y=audio_data,sr=sampling_rate,n_mfcc=10)
        chroma=librosa.feature.chroma_stft(y=audio_data,sr=sampling_rate)
        mel=librosa.feature.melspectrogram(y=audio_data,sr=sampling_rate)
        
        mfccs_flat=np.mean(mfccs,axis=1)
        chroma_flat=np.mean(chroma,axis=1)
        mel_flat=np.mean(mel,axis=1)
        
        audio_features=np.concatenate([mfccs_flat,chroma_flat,mel_flat])
        
        features.append(audio_features)
        labels.append(emotion_class)
    
    return features,labels

In [6]:
all_features=[]
all_labels=[]

In [7]:
for emotion in emotions:
    emotion_folder=os.path.join(main_path,emotion)
    features,labels=load_extract_features(emotion_folder,emotion)
    all_features.append(features)
    all_labels.append(labels)

In [8]:
all_features=np.concatenate(all_features)

In [9]:
all_labels=np.concatenate(all_labels)

In [10]:
x_audio=np.array(all_features)
y_audio=np.array(all_labels)

In [11]:
x_audio.shape

(2496, 150)

In [12]:
y_audio.shape

(2496,)

In [13]:
for i in range(len(y_audio)):
    if y_audio[i]=='angry':
        y_audio[i]=0
    
    elif y_audio[i]=='disgust':
        y_audio[i]=1
    
    elif y_audio[i]=='fear':
        y_audio[i]=2
    
    elif y_audio[i]=='happy':
        y_audio[i]=3
    
    elif y_audio[i]=='neutral':
        y_audio[i]=4
    
    elif y_audio[i]=='sad':
        y_audio[i]=5
        
    elif y_audio[i]=='surprise':
        y_audio[i]=6

In [14]:
x_audio=x_audio.reshape((x_audio.shape[0],1,x_audio.shape[1]))

In [15]:
x_audio.shape

(2496, 1, 150)

In [16]:
y_audio=tf.keras.utils.to_categorical(y_audio,num_classes=7)

In [17]:
y_audio=np.argmax(y_audio,axis=1)

In [18]:
train_data=tf.keras.utils.image_dataset_from_directory("images\\train",
                                                      image_size=(48,48),
                                                      color_mode='grayscale',
                                                      label_mode='categorical',
                                                      batch_size=32,
                                                      shuffle=True)

Found 28709 files belonging to 7 classes.


In [19]:
x_image,y_image=[images for images,_ in train_data],[labels for _,labels in train_data]

In [20]:
x_image=np.concatenate(x_image)
y_image=np.concatenate(y_image)

In [21]:
y_image=np.argmax(y_image,axis=1)

In [22]:
angry=[]
disgust=[]
fear=[]
happy=[]
neutral=[]
sad=[]
surprise=[]

In [23]:
for i in range(len(x_image)):
    if y_image[i]==0 and len(angry)<384:
        angry.append(x_image[i])
        
    elif y_image[i]==1 and len(disgust)<384:
        disgust.append(x_image[i])
        
    elif y_image[i]==2 and len(fear)<384:
        fear.append(x_image[i])
        
    elif y_image[i]==3 and len(happy)<384:
        happy.append(x_image[i])
        
    elif y_image[i]==4 and len(neutral)<192:
        neutral.append(x_image[i])
        
    elif y_image[i]==5 and len(sad)<384:
        sad.append(x_image[i])
        
    elif y_image[i]==6 and len(surprise)<384:
        surprise.append(x_image[i])

In [24]:
x_image=angry+disgust+fear+happy+neutral+sad+surprise

In [25]:
x_image=np.array(x_image)

In [26]:
y_image=[]

In [27]:
y_image=[0]*384+[1]*384+[2]*384+[3]*384+[4]*192+[5]*384+[6]*384

In [28]:
count =0
for i in range(len(y_audio)):
    if y_audio[i]!=y_image[i]:
        count+=1

if count==0:
    print("both are same")
else:
    print("both are not same")

both are same


In [29]:
y=tf.keras.utils.to_categorical(y_image)

In [30]:
x_audio_tr,x_audio_te,x_image_tr,x_image_te,y_tr,y_te=train_test_split(x_audio,x_image,y,test_size=0.2,random_state=0)

In [31]:
y_te

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]], dtype=float32)

In [32]:
audio_input=tf.keras.layers.Input(shape=(1,150),name='audio_input')

model_lstm=tf.keras.models.Sequential(name='model_lstm')

model_lstm.add(tf.keras.layers.LSTM(64,return_sequences=True))

model_lstm.add(tf.keras.layers.LSTM(128,return_sequences=True))

model_lstm.add(tf.keras.layers.LSTM(256))

model_lstm.add(tf.keras.layers.Dense(128, activation='relu'))
model_lstm.add(tf.keras.layers.Dense(64, activation='relu'))
model_lstm.add(tf.keras.layers.Dense(7, activation='softmax'))

audio_output=model_lstm(audio_input)




In [33]:
image_input=tf.keras.layers.Input(shape=(48,48,1),name='image_input')

model_cnn=tf.keras.models.Sequential()

model_cnn.add(tf.keras.layers.Conv2D(filters=64,kernel_size=3,activation='relu'))
model_cnn.add(tf.keras.layers.BatchNormalization())
model_cnn.add(tf.keras.layers.Dropout(0.2))
model_cnn.add(tf.keras.layers.MaxPool2D(pool_size=(2,2),strides=2))

model_cnn.add(tf.keras.layers.Conv2D(filters=128,kernel_size=3,activation='relu'))
model_cnn.add(tf.keras.layers.MaxPool2D(pool_size=(2,2),strides=2))

model_cnn.add(tf.keras.layers.Conv2D(filters=256,kernel_size=3,activation='relu'))
model_cnn.add(tf.keras.layers.MaxPool2D(pool_size=(2,2),strides=2))

model_cnn.add(tf.keras.layers.Flatten())

model_cnn.add(tf.keras.layers.Dense(128,activation='relu'))
model_cnn.add(tf.keras.layers.Dense(64,activation='relu'))
model_cnn.add(tf.keras.layers.Dense(7,activation='softmax'))

image_output=model_cnn(image_input)




In [34]:
merged=tf.keras.layers.Concatenate()([audio_output,image_output])

In [35]:
final_output=tf.keras.layers.Dense(7,activation='softmax',name='final_output')(merged)

In [36]:
combined_model=tf.keras.models.Model(inputs=[audio_input,image_input],outputs=[final_output])

In [37]:
combined_model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])




In [38]:
combined_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 audio_input (InputLayer)    [(None, 1, 150)]             0         []                            
                                                                                                  
 image_input (InputLayer)    [(None, 48, 48, 1)]          0         []                            
                                                                                                  
 model_lstm (Sequential)     (None, 7)                    589703    ['audio_input[0][0]']         
                                                                                                  
 sequential (Sequential)     (None, 7)                    903047    ['image_input[0][0]']         
                                                                                              

In [39]:
checkpoint=tf.keras.callbacks.ModelCheckpoint('Model_Integrated_weights.h5',monitor='val_accuracy',mode='max',save_best_only=True)

In [40]:
combined_model.fit([x_audio_tr,x_image_tr], y_tr, epochs=300, batch_size=32, validation_data=([x_audio_te,x_image_te],y_te),callbacks=checkpoint)

Epoch 1/300


Epoch 2/300


  saving_api.save_model(


Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300


Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78/300
Epoch 79/300
Epoch 80/300
Epoch 81/300
Epoch 82/300
Epoch 83/300
Epoch 84/300
Epoch 85/300
Epoch 86/300
Epoch 87/300
Epoch 88/300
Epoch 89/300
Epoch 90/300
Epoch 91/300
Epoch 92/300
Epoch 93/300
Epoch 94/300
Epoch 95/300
Epoch 96/300
Epoch 97/300
Epoch 98/300
Epoch 99/300
Epoch 100/300
Epoch 101/300
Epoch 102/300
Epoch 103/300
Epoch 104/300
Epoch 105/300
Epoch 106/300
Epoch 107/300
Epoch 108/300
Epoch 109/300
Epoch 110/300
Epoch 111/300
Epoch 112/300
Epoch 113/300
Epoch 114/300
Epoch 115/300


Epoch 116/300
Epoch 117/300
Epoch 118/300
Epoch 119/300
Epoch 120/300
Epoch 121/300
Epoch 122/300
Epoch 123/300
Epoch 124/300
Epoch 125/300
Epoch 126/300
Epoch 127/300
Epoch 128/300
Epoch 129/300
Epoch 130/300
Epoch 131/300
Epoch 132/300
Epoch 133/300
Epoch 134/300
Epoch 135/300
Epoch 136/300
Epoch 137/300
Epoch 138/300
Epoch 139/300
Epoch 140/300
Epoch 141/300
Epoch 142/300
Epoch 143/300
Epoch 144/300
Epoch 145/300
Epoch 146/300
Epoch 147/300
Epoch 148/300
Epoch 149/300
Epoch 150/300
Epoch 151/300
Epoch 152/300
Epoch 153/300
Epoch 154/300
Epoch 155/300
Epoch 156/300
Epoch 157/300
Epoch 158/300
Epoch 159/300
Epoch 160/300
Epoch 161/300
Epoch 162/300
Epoch 163/300
Epoch 164/300
Epoch 165/300
Epoch 166/300
Epoch 167/300
Epoch 168/300
Epoch 169/300
Epoch 170/300
Epoch 171/300
Epoch 172/300
Epoch 173/300
Epoch 174/300
Epoch 175/300
Epoch 176/300
Epoch 177/300
Epoch 178/300
Epoch 179/300
Epoch 180/300
Epoch 181/300
Epoch 182/300
Epoch 183/300
Epoch 184/300
Epoch 185/300
Epoch 186/300
Epoch 

Epoch 229/300
Epoch 230/300
Epoch 231/300
Epoch 232/300
Epoch 233/300
Epoch 234/300
Epoch 235/300
Epoch 236/300
Epoch 237/300
Epoch 238/300
Epoch 239/300
Epoch 240/300
Epoch 241/300
Epoch 242/300
Epoch 243/300
Epoch 244/300
Epoch 245/300
Epoch 246/300
Epoch 247/300
Epoch 248/300
Epoch 249/300
Epoch 250/300
Epoch 251/300
Epoch 252/300
Epoch 253/300
Epoch 254/300
Epoch 255/300
Epoch 256/300
Epoch 257/300
Epoch 258/300
Epoch 259/300
Epoch 260/300
Epoch 261/300
Epoch 262/300
Epoch 263/300
Epoch 264/300
Epoch 265/300
Epoch 266/300
Epoch 267/300
Epoch 268/300
Epoch 269/300
Epoch 270/300
Epoch 271/300
Epoch 272/300
Epoch 273/300
Epoch 274/300
Epoch 275/300
Epoch 276/300
Epoch 277/300
Epoch 278/300
Epoch 279/300
Epoch 280/300
Epoch 281/300
Epoch 282/300
Epoch 283/300
Epoch 284/300
Epoch 285/300
Epoch 286/300
Epoch 287/300
Epoch 288/300
Epoch 289/300
Epoch 290/300
Epoch 291/300
Epoch 292/300
Epoch 293/300
Epoch 294/300
Epoch 295/300
Epoch 296/300
Epoch 297/300
Epoch 298/300
Epoch 299/300
Epoch 

<keras.src.callbacks.History at 0x1e53606e010>

In [41]:
combined_model.load_weights('Model_Integrated_weights.h5')

In [42]:
combined_model.evaluate([x_audio_te,x_image_te],y_te)



[1.1744788885116577, 0.6060000061988831]

In [43]:
model_json=combined_model.to_json()
with open("model_integrated_a.json","w") as json_file:
    json_file.write(model_json)