In [None]:
# We tried fixing the random seeds, but the results still varied from run to run. We therefore run every experiment 10 times and report the average.
# With this protocol we obtain results comparable to those in the paper, and they show the same tendencies (e.g., enterface performs worst).

In [22]:
import torch
# import pickle
from dataset_classification import Train, Test
from torch.utils.data import DataLoader
from torch.autograd import Variable
from dataset_SNN import Train_Siamese
from model_Siamese import Siamese
from model_classification import FNN
import time
import numpy as np
import math
import sys
from sklearn import metrics
import os
import random
import pandas as pd

import warnings
# Silence all Python warnings for this session, unless the interpreter was
# started with explicit warning options (sys.warnoptions is then non-empty),
# in which case the user's own configuration is respected.
if not sys.warnoptions:
    warnings.simplefilter("ignore")

# Print NumPy arrays in full instead of summarising long ones with "...".
np.set_printoptions(threshold=sys.maxsize)

def max_key(score):
    """Return the key of ``score`` that maps to the largest value.

    When several keys share the largest value, the one that comes first in
    the mapping's iteration order is returned. An empty mapping raises
    ``ValueError`` (propagated from ``max``).
    """
    # Pair every key with its value and let max() compare on the value only;
    # max() keeps the first maximal pair it encounters.
    best_key, _best_value = max(
        zip(score.keys(), score.values()),
        key=lambda pair: pair[1],
    )
    return best_key
    
def _seed_everything(seed):
    """Seed the Python, NumPy and PyTorch global random number generators."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)


def _adapt_siamese(model_path, feature_csv, train_ids, num_sample,
                   lr=0.0005, max_iter=250, batch_size=128):
    """Load the source Siamese checkpoint and train it on pairs from the target data.

    Returns the trained ``Siamese`` network (still in training mode).
    """
    # The dataset is asked for exactly max_iter batches worth of pairs.
    pair_set = Train_Siamese(feature_csv, trainID=train_ids, num_sample=num_sample,
                             epoch=max_iter * batch_size)
    pair_loader = DataLoader(pair_set, batch_size=batch_size, shuffle=False)

    # BCELoss averages over the batch by default; the deprecated
    # `size_average=True` argument requested the same reduction.
    loss_fn = torch.nn.BCELoss()

    net = Siamese()
    # Everything in this function runs on the CPU, so map the checkpoint there;
    # otherwise a checkpoint saved from a GPU cannot be loaded on a CPU-only machine.
    net.load_state_dict(torch.load(model_path, map_location='cpu'))
    net.train()

    optimizer = torch.optim.Adam(net.parameters(), lr=lr)

    for batch_id, (data1, data2, label, prob_return) in enumerate(pair_loader, 1):
        if batch_id > max_iter:
            break
        optimizer.zero_grad()
        output = net(data1, data2)
        loss = loss_fn(output, label)
        loss.backward()
        optimizer.step()

        # Update the sampling prob (the "likelihood" in the paper) every 10
        # iterations. With a single sample per speaker per emotion there is
        # nothing to re-weight, so the update is skipped.
        if num_sample != 1 and batch_id % 10 == 0:
            preds = output.detach()
            batch_len = len(preds)
            prob_tmp = [prob_return[k].tolist() for k in range(batch_len)]
            # Absolute error of each pair's prediction.
            increase_tmp = [abs(preds[k][0] - label[k][0]).item() for k in range(batch_len)]
            pair_set.update_prob(prob_list=prob_tmp, prob_increase=increase_tmp)

    return net


def _init_classifier(net):
    """Create a fresh ``FNN`` and copy into it every weight of ``net`` with a matching name."""
    model = FNN()
    model_dict = model.state_dict()
    shared = {k: v for k, v in net.state_dict().items() if k in model_dict}
    model_dict.update(shared)
    model.load_state_dict(model_dict)
    return model


def _finetune_and_score(model, feature_csv, train_ids, num_sample, max_iter=250):
    """Fine-tune ``model`` with cross-entropy and return its macro-averaged recall on the test set."""
    train_set = Train(feature_csv, trainID=train_ids, num_sample=num_sample, epoch=200)
    train_loader = DataLoader(train_set, batch_size=32, shuffle=False)
    # The test set is built from the split held by the training set, before any
    # training happens (same order as the original code).
    test_set = Test(datas_train=train_set.datas_train, datas_test=train_set.datas_test,
                    num_classes=3, trainID=train_ids, num_sample=num_sample)
    test_loader = DataLoader(test_set, batch_size=1, shuffle=False)

    loss_fn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    model.train()
    # NOTE(review): counting starts at 0 here, so up to max_iter + 1 batches are
    # consumed (the Siamese loop counts from 1). Kept as-is to avoid changing the
    # training schedule - confirm whether this is intended.
    for batch_id, (data, label) in enumerate(train_loader, 0):
        if batch_id > max_iter:
            break
        optimizer.zero_grad()
        output = model(data)
        # reshape(-1) instead of squeeze(): squeeze() turns a final batch of size 1
        # into a 0-dim target, which CrossEntropyLoss rejects for 2-D logits.
        loss = loss_fn(output, label.reshape(-1).long())
        loss.backward()
        optimizer.step()

    correct_list = []
    pred_list = []
    # Switch to inference mode so that layers such as dropout (if FNN has any -
    # not visible from here) do not randomise the predictions, and skip autograd
    # bookkeeping while scoring.
    model.eval()
    with torch.no_grad():
        for test, label in test_loader:
            output = model(test).cpu().numpy()[0]
            pred_list.append(output.argmax())
            correct_list.append(int(label[0]))

    return metrics.recall_score(correct_list, pred_list, average='macro')


def run(model_name, num_repeats=10, sample_counts=range(1, 11), seed=None):
    """Adapt a pre-trained Siamese source model to the target data and print the average recall.

    For every ``g`` in ``sample_counts`` (the number of samples per speaker per
    emotion) the following is repeated ``num_repeats`` times:

    1. load ``SNN_source_models/<model_name>`` into a ``Siamese`` network and
       train it on pairs from ``feature/iemocap_feature_processed.csv``;
    2. copy the matching weights into a fresh ``FNN`` classifier;
    3. fine-tune the classifier in the supervised way (same as MeL-S);
    4. score it with macro-averaged recall on the test set.

    The mean score over the repetitions is printed on one line per ``g``.

    Args:
        model_name: file name of the source checkpoint inside ``SNN_source_models``.
        num_repeats: how many times each setting is repeated before averaging.
            Results vary between runs, hence the averaging.
        sample_counts: iterable of ``g`` values to evaluate.
        seed: optional integer. When given, repetition ``i`` seeds the Python,
            NumPy and PyTorch RNGs with ``seed + i``. This is best-effort only;
            the default ``None`` leaves all RNGs untouched.

    Returns:
        None. Results are printed.

    Raises:
        ValueError: if ``num_repeats`` is smaller than 1.
    """
    if num_repeats < 1:
        raise ValueError('num_repeats must be at least 1')

    trainID = ['Ses01F', 'Ses01M', 'Ses02F', 'Ses02M', 'Ses03F', 'Ses03M', 'Ses04F', 'Ses04M', 'Ses05F', 'Ses05M']
    feature_csv = 'feature/iemocap_feature_processed.csv'
    model_path = os.path.join('SNN_source_models', model_name)

    print(model_name)

    # g is the number of samples per speaker per emotion
    for g in sample_counts:
        score_all = []

        for repetition in range(num_repeats):
            if seed is not None:
                _seed_everything(seed + repetition)

            net = _adapt_siamese(model_path, feature_csv, trainID, g)
            # initialise a new model and copy the weights of the trained SNN into it
            model = _init_classifier(net)
            score_all.append(_finetune_and_score(model, feature_csv, trainID, g))

        # NOTE: the label says "num of speaker" but the value printed is g, the
        # number of samples per speaker per emotion. The text is kept unchanged
        # so that new output stays comparable with previously recorded runs.
        print('num of speaker:', g, sum(score_all) / len(score_all))

In [24]:
# Start from the source checkpoint SNN_source_models/enterface_base.pth, adapt it on
# feature/iemocap_feature_processed.csv and print the macro recall averaged over 10 runs.
# The value printed after "num of speaker:" is g, the number of samples per speaker per emotion.
run('enterface_base.pth')

enterface_base.pth
num of speaker: 1 0.6512944732506367
num of speaker: 2 0.6606824446574845
num of speaker: 3 0.6990602817077493
num of speaker: 4 0.6938101197106606
num of speaker: 5 0.6839032227899295
num of speaker: 6 0.7063660596132216
num of speaker: 7 0.7110691098693207
num of speaker: 8 0.7036592742369651
num of speaker: 9 0.6927425697620886
num of speaker: 10 0.7169763029737501


In [25]:
# Start from the source checkpoint SNN_source_models/crema_d_base.pth, adapt it on
# feature/iemocap_feature_processed.csv and print the macro recall averaged over 10 runs.
# The value printed after "num of speaker:" is g, the number of samples per speaker per emotion.
run('crema_d_base.pth')

crema_d_base.pth
num of speaker: 1 0.6654394584196694
num of speaker: 2 0.7067077439289335
num of speaker: 3 0.7126538511452551
num of speaker: 4 0.7217098975160747
num of speaker: 5 0.728233272366678
num of speaker: 6 0.7244800224566923
num of speaker: 7 0.7249402510538356
num of speaker: 8 0.7283649826776304
num of speaker: 9 0.7369849323790092
num of speaker: 10 0.7422009407453278


In [27]:
# Start from the source checkpoint SNN_source_models/iemocap_base.pth, adapt it on
# feature/iemocap_feature_processed.csv and print the macro recall averaged over 10 runs.
# The value printed after "num of speaker:" is g, the number of samples per speaker per emotion.
run('iemocap_base.pth')

iemocap_base.pth
num of speaker: 1 0.6933386267427688
num of speaker: 2 0.722188810186686
num of speaker: 3 0.7220840704075259
num of speaker: 4 0.7202940292298237
num of speaker: 5 0.7163047805417923
num of speaker: 6 0.7199755012336541
num of speaker: 7 0.7198059947501847
num of speaker: 8 0.7118827561952297
num of speaker: 9 0.7269342452853098
num of speaker: 10 0.7324882876325387


In [31]:
# Start from the source checkpoint SNN_source_models/ravdess_base.pth, adapt it on
# feature/iemocap_feature_processed.csv and print the macro recall averaged over 10 runs.
# The value printed after "num of speaker:" is g, the number of samples per speaker per emotion.
run('ravdess_base.pth')

ravdess_base.pth
num of speaker: 1 0.6940389615234261
num of speaker: 2 0.7086199315819602
num of speaker: 3 0.7075198956064279
num of speaker: 4 0.7096136034966973
num of speaker: 5 0.7099766926621729
num of speaker: 6 0.7168912583192089
num of speaker: 7 0.7340505629136954
num of speaker: 8 0.7233612957541411
num of speaker: 9 0.7302637691055484
num of speaker: 10 0.7308347968730133
