In [1]:
!pip install transformers
!pip install tensorboardx
!pip install chardet

Collecting chardet
  Downloading chardet-5.2.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.4/199.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: chardet
Successfully installed chardet-5.2.0


# Necessary imports

In [2]:
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.backend_bases import RendererBase
from scipy import signal
from scipy.io import wavfile
import os
from PIL import Image
from scipy.fftpack import fft
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils import data
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from tensorboardX import SummaryWriter
import csv
# from file_util import *



# File util

In [3]:
#-*- coding: utf-8 -*-


import os
import chardet

def file_search(dirname, ret, list_avoid_dir=None):
    '''
    Recursively collect the path of every file found below a directory.

    dirname        : path that needs to be searched
    ret            : list that receives the files in dirname (recursive);
                     it is appended to in place and nothing is returned
    list_avoid_dir : directory names (not full paths) that are skipped
                     together with everything below them; None skips nothing
    usage          :
        list_files = []
        file_search(dirname, list_files)
    '''
    # A None default avoids sharing one mutable list object between calls.
    skip_names = () if list_avoid_dir is None else list_avoid_dir

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # traversal (and therefore the order of `ret`) reproducible.
    for filename in sorted(os.listdir(dirname)):
        full_filename = os.path.join(dirname, filename)

        if os.path.isdir(full_filename):
            # `filename` is already the last path component, so the check
            # does not depend on '/' being the path separator.
            if filename in skip_names:
                continue
            file_search(full_filename, ret, skip_names)
        else:
            ret.append(full_filename)

            

def find_encoding(filename):
    '''
    Guess the character encoding of a file with chardet.

    filename : filename (inc. path) that will be inspected
    returns  : the 'encoding' entry of the result of chardet.detect()
               applied to the file's raw bytes
    '''
    # The context manager closes the handle even if reading fails; the
    # previous open(...).read() never closed the file explicitly.
    with open(filename, 'rb') as raw_file:
        rawdata = raw_file.read()
    return chardet.detect(rawdata)['encoding']
            
def create_folder(dir_name):
    '''
    Create a directory, including missing parents, unless the path exists.

    dir_name : dir_name (inc. path) that will be created ( full-path name )
    '''
    if not os.path.exists(dir_name):
        # exist_ok=True covers the window between the check above and the
        # creation: if the directory appears in between, makedirs no longer
        # raises FileExistsError.
        os.makedirs(dir_name, exist_ok=True)

In [4]:

# Gather every file below each session's sentences/wav directory into one list.
IEMOCAP_ROOT = '/kaggle/input/iemocapfullrelease/IEMOCAP_full_release/'
list_files = []
for session_number in range(1, 6):
    sess_name = 'Session' + str(session_number)
    path = IEMOCAP_ROOT + sess_name + '/sentences/wav/'
    file_search(path, list_files)
    # The count is cumulative: files gathered so far across all sessions.
    print (sess_name + ", #sum files: " + str(len(list_files)))
# Sorting once after the loop gives the same final order as re-sorting the
# growing list on every iteration.
list_files = sorted(list_files)

Session1, #sum files: 1820
Session2, #sum files: 3633
Session3, #sum files: 5769
Session4, #sum files: 7873
Session5, #sum files: 10043


# Attention Based Fully Convolutional Network for Speech Emotion Recognition

In [5]:
def audio2spectrogram(filepath):
    """Read a WAV file and return its log spectrogram.

    filepath : path of the WAV file, read through scipy's wavfile.read with
               mmap=True
    returns  : the second item returned by log_specgram for the samples and
               sample rate read from the file
    """
    rate_and_samples = wavfile.read(filepath, mmap=True)
    sample_rate, samples = rate_and_samples
    log_spec = log_specgram(samples, sample_rate)[1]
    return log_spec
    
def audio2wave(filepath):
    """Plot the samples of a WAV file on a new 5x5 matplotlib figure."""
    plt.figure(figsize=(5, 5))
    # wavfile.read returns (sample rate, samples); only the samples are drawn.
    waveform = wavfile.read(filepath, mmap=True)[1]
    plt.plot(waveform)
    

def log_specgram(audio, sample_rate, window_size=40,
                 step_size=20, eps=1e-10):
    """Compute a log spectrogram with a Hann window.

    audio       : 1-D array of samples
    sample_rate : sampling rate in Hz
    window_size : analysis window length in milliseconds
    step_size   : hop between consecutive windows in milliseconds
    eps         : added before the logarithm so zero power stays finite
    returns     : (freqs, log_spec) where log_spec is the transposed
                  spectrogram as float32, i.e. one row per window and one
                  column per frequency bin
    raises      : ValueError if the hop is not between one sample and the
                  window length

    Earlier the step was handed to SciPy as the *overlap*, which gives the
    intended hop only when step_size is exactly half of window_size (true for
    the defaults, so default calls are unchanged).
    """
    nperseg = int(round(window_size * sample_rate / 1e3))
    step = int(round(step_size * sample_rate / 1e3))
    if not 0 < step <= nperseg:
        raise ValueError(
            'step_size must be positive and not larger than window_size')
    # SciPy wants the number of shared samples, not the hop.
    noverlap = nperseg - step
    freqs, _, spec = signal.spectrogram(audio,
                                    fs=sample_rate,
                                    window='hann',
                                    nperseg=nperseg,
                                    noverlap=noverlap,
                                    detrend=False)
    return freqs, np.log(spec.T.astype(np.float32) + eps)

N_CHANNELS = 3
def get_3d_spec(Sxx_in, moments=None):
    """Stack a 2-D array with its first and second differences along axis 1.

    Sxx_in  : 2-D array of shape (h, w)
    moments : optional 6-tuple (base_mean, base_std, delta_mean, delta_std,
              delta2_mean, delta2_std) used to normalise the three channels;
              None leaves them unscaled (means 0, stds 1)
    returns : array of shape (h, w, 3) holding, in order, the normalised
              input, its first difference and its second difference

    Each difference is taken between neighbouring columns and its first
    column is repeated once so the result keeps the input width.
    NOTE(review): the notebook passes log_specgram output here, whose rows
    are windows and columns frequency bins, so the differences run along the
    frequency axis - confirm this is intended.
    """
    if moments is None:
        moments = (0, 1, 0, 1, 0, 1)
    (base_mean, base_std, delta_mean, delta_std,
         delta2_mean, delta2_std) = moments
    h, w = Sxx_in.shape

    def _padded_diff(arr):
        # Column-to-column difference, left-padded with its own first column.
        step = arr[:, 1:] - arr[:, :-1]
        lead = step[:, 0].reshape((h, -1))
        return np.concatenate([lead, step], axis=1)

    raw_delta = _padded_diff(Sxx_in)
    raw_delta2 = _padded_diff(raw_delta)

    channels = (
        (Sxx_in - base_mean) / base_std,
        (raw_delta - delta_mean) / delta_std,
        (raw_delta2 - delta2_mean) / delta2_std,
    )
    return np.concatenate([ch.reshape((h, w, 1)) for ch in channels], axis=2)

In [6]:
import pandas as pd
# Load the pre-processed metadata table; the feature-extraction loop below
# looks up its 'sessionID', 'label' and 'text' columns.
df=pd.read_csv('/kaggle/input/emotion-recognition/processed_data.csv')

In [7]:
# Coerce the 'label' column to a numeric dtype; the feature-extraction loop
# compares each label against -1.
df['label'] = pd.to_numeric(df['label'])

# *EXTRACT THE MFCC FEATURE USING LIBROSA*

In [8]:
# Build one feature record per labelled .wav file in list_files.
# no_rows, sprectrogram_shape and bookmark are not used by this loop; they
# are kept in case other cells read them.
no_rows=len(list_files)
index=0
sprectrogram_shape=[]
docs = []
bookmark=0
extraLabel=0
WAV_SUFFIX = '.wav'
for everyFile in list_files:
  base_name = os.path.basename(everyFile)
  if not base_name.endswith(WAV_SUFFIX):
    continue
  # Remove exactly the '.wav' suffix. str.strip('.wav') strips any of the
  # characters '.', 'w', 'a', 'v' from BOTH ends and can eat part of the name.
  filename = base_name[:-len(WAV_SUFFIX)]
  # Filter the metadata once and read both columns from the matching rows.
  matches = df.loc[df['sessionID']==filename]
  lable = matches['label'].values[0]
  text = matches['text'].values[0]
  print('label',lable)
  if lable == -1:
    # Label -1 marks rows that are excluded from the dataset.
    extraLabel=extraLabel+1
    print('extraLabel',extraLabel)
    continue
  # Spectrogram features are used; MFCC extraction via librosa was tried
  # here earlier and is not part of the pipeline.
  spector = get_3d_spec(audio2spectrogram(everyFile))
  # Move the 3 stacked channels to the front: (h, w, 3) -> (3, h, w).
  npimg = np.transpose(spector,(2,0,1))
  input_tensor = torch.tensor(npimg)
  input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model
  docs.append({
     'fileName':filename,
     'text':text,
     'sprectrome':input_batch,
     'label':lable
          })
  index+=1
  print('index',index)

label 3
index 1
label 3
index 2
label 3
index 3
label -1
extraLabel 1
label -1
extraLabel 2
label 3
index 4
label -1
extraLabel 3
label -1
extraLabel 4
label -1
extraLabel 5
label -1
extraLabel 6
label -1
extraLabel 7
label -1
extraLabel 8
label 0
index 5
label -1
extraLabel 9
label 3
index 6
label -1
extraLabel 10
label -1
extraLabel 11
label -1
extraLabel 12
label -1
extraLabel 13
label -1
extraLabel 14
label -1
extraLabel 15
label -1
extraLabel 16
label -1
extraLabel 17
label -1
extraLabel 18
label -1
extraLabel 19
label -1
extraLabel 20
label -1
extraLabel 21
label 0
index 7
label -1
extraLabel 22
label 0
index 8
label 2
index 9
label 2
index 10
label 2
index 11
label 3
index 12
label 2
index 13
label 2
index 14
label 3
index 15
label -1
extraLabel 23
label 2
index 16
label -1
extraLabel 24
label 2
index 17
label -1
extraLabel 25
label 2
index 18
label 2
index 19
label 2
index 20
label 2
index 21
label 3
index 22
label 2
index 23
label 3
index 24
label 2
index 25
label 2
index 26
l

In [9]:
class ModifiedAlexNet(nn.Module):
    """AlexNet-style convolution stack with a summed-feature linear head.

    The convolutional features are summed over every spatial position (a
    plain, unweighted sum) and the resulting 256-value vector is fed to a
    dropout + linear classifier. The module returns raw scores; the softmax
    attribute is created but not applied in forward().
    """

    def __init__(self, num_classes=4):
        """Build the layers; num_classes is the width of the output layer."""
        super().__init__()
        self.num_classes = num_classes

        conv_stack = [
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        ]
        self.features = nn.Sequential(*conv_stack)

        head = [nn.Dropout(0.5), nn.Linear(256, num_classes)]
        self.classifier = nn.Sequential(*head)

        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return class scores of shape (batch, num_classes) for input x."""
        feature_maps = self.features(x)
        # Collapse the two spatial axes into one, then add up all positions
        # so each of the 256 channels contributes a single value.
        pooled = feature_maps.flatten(start_dim=2).sum(dim=2)
        return self.classifier(pooled)
def modifiedAlexNet(pretrained=False, progress=True, **kwargs):
    """Create a ModifiedAlexNet, optionally with pretrained AlexNet features.

    pretrained : when True, download the AlexNet checkpoint and copy its
                 convolutional ('features.*') weights into the new model
    progress   : forwarded to the downloader to show a progress bar
    **kwargs   : forwarded to the ModifiedAlexNet constructor
    returns    : the constructed model

    The pretrained path used to reference two names that are never defined
    in this notebook (load_state_dict_from_url and model_urls) and loaded
    the full checkpoint strictly, although its classifier layers do not
    exist in ModifiedAlexNet. Only the feature weights are loaded now; the
    classifier keeps its fresh initialisation.
    """
    model_modified = ModifiedAlexNet(**kwargs)
    if pretrained:
        # Imported here so the cell needs no new top-level import.
        from torch.hub import load_state_dict_from_url

        alexnet_url = 'https://download.pytorch.org/models/alexnet-owt-7be5be79.pth'
        state_dict = load_state_dict_from_url(alexnet_url, progress=progress)
        feature_weights = {key: value for key, value in state_dict.items()
                           if key.startswith('features.')}
        # strict=False because the classifier entries are intentionally absent.
        model_modified.load_state_dict(feature_weights, strict=False)
    return model_modified

In [10]:
# Single-slot buffer holding the most recent output of the hooked text module.
outputs_text = list()
def hook_text(module, input, output):
    """Forward hook: keep only the latest `output` in outputs_text."""
    # Slice assignment replaces the contents while keeping the same list
    # object, so existing references to outputs_text stay valid.
    outputs_text[:] = [output]


# Single-slot buffer holding the most recent output of the hooked audio module.
outputs_audio = list()
def hook_audio(module, input, output):
    """Forward hook: keep only the latest `output` in outputs_audio."""
    outputs_audio[:] = [output]

# new model

In [11]:
from transformers import BertModel, BertTokenizer

In [12]:

class CombinedAudioTextModel(nn.Module):
    """Late-fusion classifier on top of a frozen text and a frozen audio model.

    Both sub-models are loaded from pickled checkpoints, frozen, and tapped
    with forward hooks (hook_text on text_model.bert.pooler, hook_audio on
    audio_model.features). forward() runs both sub-models, reads the hooked
    outputs from the module-level outputs_text / outputs_audio buffers,
    concatenates them and applies dropout plus one linear layer. Only that
    linear layer is trainable.

    num_classes      : width of the output layer
    text_model_path  : checkpoint of the text model (default: original path)
    audio_model_path : checkpoint of the audio model (default: original path)
    """

    def __init__(self, num_classes=4,
                 text_model_path='/kaggle/input/emotion-recognition/model_text.pt',
                 audio_model_path='/kaggle/input/emotion-recognition/model_audio_new_opt.pt'):
        super(CombinedAudioTextModel, self).__init__()
        self.num_classes=num_classes
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        # NOTE(review): torch.load unpickles whole model objects here, which
        # can execute arbitrary code - only load checkpoints you trust.
        # NOTE(review): recent PyTorch releases default to weights_only=True,
        # which rejects whole-module pickles - confirm against the installed
        # version.
        # map_location='cpu' lets the checkpoints load on machines without a
        # GPU; callers move the combined model to their device afterwards.
        self.text_model=torch.load(text_model_path, map_location='cpu')
        self.audio_model=torch.load(audio_model_path, map_location='cpu')

        self.text_model.bert.pooler.register_forward_hook(hook_text)
        self.audio_model.features.register_forward_hook(hook_audio)

        for param in self.text_model.parameters():
          param.requires_grad = False
        for param in self.audio_model.parameters():
          param.requires_grad = False

        # Frozen feature extractors must not apply dropout.
        self.text_model.eval()
        self.audio_model.eval()

        self.dropout = nn.Dropout(.5)
        # 1024 inputs = hooked text output + summed audio feature channels.
        # NOTE(review): presumably 768 (BERT pooler) + 256 (audio feature
        # channels) - confirm against the loaded checkpoints.
        self.linear = nn.Linear(1024, num_classes)

        # Not applied in forward(); kept so existing attribute access works.
        self.softmax = nn.Softmax(dim=1)

    def train(self, mode=True):
        """Set the training mode of the fusion head only.

        nn.Module.train() would also switch the frozen sub-models to train
        mode and activate their dropout, so the hooked embeddings would
        change randomly from call to call. They are forced back to eval mode.
        """
        super(CombinedAudioTextModel, self).train(mode)
        self.text_model.eval()
        self.audio_model.eval()
        return self

    def forward(self,text,audio):
        """Return class scores for one batch of token ids and spectrograms."""
        # The sub-model return values are ignored; the forward hooks store the
        # intermediate outputs in outputs_text / outputs_audio.
        self.text_model(text)
        self.audio_model(audio)
        audio_embed=outputs_audio[0]
        text_embed=outputs_text[0]
        # Merge the spatial axes and sum over all positions (plain sum).
        audio_embed=torch.flatten(audio_embed, start_dim=2)
        audio_embed=torch.sum(audio_embed, dim=2)
        concat_embded=torch.cat((text_embed,audio_embed),1)
        x=self.dropout(concat_embded)
        x=self.linear(x)
        return x

In [13]:
model=CombinedAudioTextModel(num_classes=4)
# Use the GPU when one is present; fall back to the CPU instead of raising on
# machines without CUDA. The cell still displays the model as its result.
model.to('cuda' if torch.cuda.is_available() else 'cpu')

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

CombinedAudioTextModel(
  (text_model): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_

# Check which model parameters are trainable

In [14]:
# List every parameter name; frozen parameters are prefixed with 'no grad'.
for param_name, tensor in model.named_parameters():
    if tensor.requires_grad:
        print(param_name)
        continue
    print('no grad', param_name)

no grad text_model.bert.embeddings.word_embeddings.weight
no grad text_model.bert.embeddings.position_embeddings.weight
no grad text_model.bert.embeddings.token_type_embeddings.weight
no grad text_model.bert.embeddings.LayerNorm.weight
no grad text_model.bert.embeddings.LayerNorm.bias
no grad text_model.bert.encoder.layer.0.attention.self.query.weight
no grad text_model.bert.encoder.layer.0.attention.self.query.bias
no grad text_model.bert.encoder.layer.0.attention.self.key.weight
no grad text_model.bert.encoder.layer.0.attention.self.key.bias
no grad text_model.bert.encoder.layer.0.attention.self.value.weight
no grad text_model.bert.encoder.layer.0.attention.self.value.bias
no grad text_model.bert.encoder.layer.0.attention.output.dense.weight
no grad text_model.bert.encoder.layer.0.attention.output.dense.bias
no grad text_model.bert.encoder.layer.0.attention.output.LayerNorm.weight
no grad text_model.bert.encoder.layer.0.attention.output.LayerNorm.bias
no grad text_model.bert.encoder.

# Train test split

In [15]:
import random
# Fixed seed so the 90/10 split is the same on every run; a single shuffle
# already gives a uniformly random order, so repeating it adds nothing.
SPLIT_SEED = 42
# Shuffle a copy so re-running this cell starts from the same `docs` order.
shuffled_docs = list(docs)
random.Random(SPLIT_SEED).shuffle(shuffled_docs)
total_length=len(shuffled_docs)
train_length=int(.9*total_length)
train_list=shuffled_docs[0:train_length]
test_list=shuffled_docs[train_length:]
print('no of items for train ',len(train_list))
print('no of items for test ',len(test_list))

no of items for train  4977
no of items for test  554


# Smoke-test the model on a single training example

In [16]:
# Run one training example through the model on the CPU and print the scores.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
sample = train_list[12]
label1 = torch.tensor([sample['label']])
text = sample['text']
token_ids = tokenizer.encode(text, add_special_tokens=True)
input_ids = torch.tensor(token_ids).unsqueeze(0)  # Batch size 1
sprectrome = sample['sprectrome']
model.to('cpu')
model.eval()
with torch.no_grad():
    output = model(input_ids, sprectrome)
    print(output)
     

tensor([[-4.6110,  1.3531,  2.8464,  1.3987]])


In [None]:

%load_ext tensorboard
%tensorboard --logdir ./

# optimizer

In [17]:
import torch.optim as optim
# Give Adam only the parameters that can receive gradients. The frozen
# sub-model weights never get a gradient, so registering them only bloats
# the optimizer's parameter groups.
# NOTE: parameters unfrozen after this cell must be added to the optimizer.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.Adam(params=trainable_params, lr=0.0001)
criterion = nn.CrossEntropyLoss()
# Multiplies the learning rate by 0.1 every 30 scheduler steps.
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
writer = SummaryWriter(log_dir='/kaggle/working/content/')

In [18]:
# Train the fusion head one sample at a time (batch size 1).
total_steps = 1
NUM_EPOCHS=2
# Samples whose spectrogram dimension 2 is not larger than this are skipped.
# NOTE(review): presumably guards against clips too short for the audio
# network's strided convolutions and pooling - confirm.
MIN_SPECTROGRAM_SIZE = 65
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model.train()
# `device` is defined in the optimizer cell and falls back to the CPU when no
# GPU is available, unlike the previous hard-coded 'cuda'.
model.to(device)
for epoch in range(NUM_EPOCHS):
  random.shuffle(train_list)
  for every_trainlist in train_list:
    sprectrome=every_trainlist['sprectrome']
    if sprectrome.shape[2] <= MIN_SPECTROGRAM_SIZE:
      continue
    text=every_trainlist['text']
    label1=torch.tensor([every_trainlist['label']]).to(device)
    input_ids = torch.tensor(tokenizer.encode(text, add_special_tokens=True)).unsqueeze(0).to(device)
    sprectrome = sprectrome.to(device)

    optimizer.zero_grad()
    output = model(input_ids,sprectrome)
    loss = criterion(output, label1)
    print('loss',loss.item())
    loss.backward()
    optimizer.step()

    # With batch size 1 the "accuracy" is 1 for a correct prediction, else 0.
    _, preds = torch.max(output, 1)
    accuracy = torch.sum(preds == label1)
    print('accuracy.item()',accuracy.item())
    if total_steps % 10 == 0:
      # Log every 10th step, reusing the values computed above.
      writer.add_scalar('loss', loss.item(), total_steps)
      writer.add_scalar('accuracy', accuracy.item(), total_steps)
    total_steps+=1
  # Step the scheduler after the epoch's optimizer updates. Calling it before
  # the first optimizer.step() skips the first value of the schedule.
  lr_scheduler.step()

loss 3.9563186168670654
accuracy.item() 0
loss 1.7704745531082153
accuracy.item() 0
loss 1.454175591468811
accuracy.item() 0
loss 2.10772967338562
accuracy.item() 0
loss 1.1933374404907227
accuracy.item() 0
loss 1.5555663108825684
accuracy.item() 0
loss 0.779208779335022
accuracy.item() 1
loss 3.9043328762054443
accuracy.item() 0
loss 2.2218210697174072
accuracy.item() 0
loss 5.8310675621032715
accuracy.item() 0
loss 3.6508755683898926
accuracy.item() 0
loss 0.48990440368652344
accuracy.item() 1
loss 1.4755288362503052
accuracy.item() 0
loss 2.299358606338501
accuracy.item() 0
loss 3.2120954990386963
accuracy.item() 0
loss 0.33703023195266724
accuracy.item() 1
loss 0.5518983602523804
accuracy.item() 1
loss 1.9973037242889404
accuracy.item() 0
loss 1.23804771900177
accuracy.item() 0
loss 3.0980348587036133
accuracy.item() 0
loss 3.8754994869232178
accuracy.item() 0
loss 4.113394737243652
accuracy.item() 0
loss 0.177431121468544
accuracy.item() 1
loss 0.4305236339569092
accuracy.item() 1

# Test the model

In [19]:
# Collect true and predicted labels for every usable test sample (on the CPU).
y_actu, y_pred = [], []
model.to('cpu')
model.eval()
for sample in test_list:
    target = torch.tensor([sample['label']])
    spectrogram = sample['sprectrome']
    token_ids = tokenizer.encode(sample['text'], add_special_tokens=True)
    input_ids = torch.tensor(token_ids).unsqueeze(0)
    # Same size filter as the training loop: small spectrograms are skipped.
    if spectrogram.shape[2] <= 65:
        continue
    with torch.no_grad():
        logits = model(input_ids, spectrogram)
        predicted = torch.max(logits, 1)[1]
    y_actu.append(target.numpy()[0])
    y_pred.append(predicted.numpy()[0])

In [20]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the evaluated test samples: y_actu holds the true
# labels and y_pred the predictions collected by the evaluation loop.
confusion_matrix(y_actu, y_pred)

array([[106,   0,   1,   2],
       [  0, 150,   3,   5],
       [  0,   2, 112,   3],
       [  3,   2,   5, 139]])

In [21]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 for the same true/predicted label lists.
print(classification_report(y_actu,y_pred))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97       109
           1       0.97      0.95      0.96       158
           2       0.93      0.96      0.94       117
           3       0.93      0.93      0.93       149

    accuracy                           0.95       533
   macro avg       0.95      0.95      0.95       533
weighted avg       0.95      0.95      0.95       533



In [None]:
# Persist the trained fusion model. Passing the module itself pickles the
# whole object rather than just a state_dict.
# NOTE(review): loading this file presumably requires CombinedAudioTextModel
# and the hook_text / hook_audio functions to be importable - confirm before
# reusing the checkpoint elsewhere.
torch.save(model, '/kaggle/working/Combined_model_audio_text.pt')