In [None]:
# We tried fixing the random seeds, but the results still varied between runs. We therefore run every experiment 10 times and report the average result.
# These are sample results from running the MeL-S method described in the paper.

In [1]:
import torch
# import pickle
from dataset_classification import Train, Test
from torch.utils.data import DataLoader
from torch.autograd import Variable
from dataset_SNN import Train_Siamese
from model_Siamese import Siamese
from model_classification import FNN
import time
import numpy as np
import math
import sys
from sklearn import metrics
import os
import random
import pandas as pd

import warnings
# Suppress every warning category, but only when no explicit warning options
# were supplied to the interpreter (sys.warnoptions is empty in that case).
if len(sys.warnoptions) == 0:
    warnings.simplefilter(action="ignore", category=Warning)

# Print NumPy arrays in full instead of summarising long ones with "...".
np.set_printoptions(threshold=sys.maxsize)

def max_key(score):
    """Return the key of ``score`` that maps to the largest value.

    When several keys share the largest value, the one that comes first in
    the mapping's iteration order is returned. An empty mapping raises
    ``ValueError``.
    """
    names = list(score.keys())
    values = list(score.values())
    # max() keeps the first maximal candidate, so ties resolve to the earliest key.
    best_position = max(range(len(values)), key=values.__getitem__)
    return names[best_position]
    
def _finetune_siamese(model_path, feature_path, trainID, num_sample, lr, max_iter, batch_size=128):
    """Load a Siamese checkpoint and fine-tune it on sample pairs.

    Args:
        model_path: path of the saved Siamese ``state_dict``.
        feature_path: CSV file handed to ``Train_Siamese``.
        trainID: identifiers forwarded to ``Train_Siamese``.
        num_sample: value forwarded as ``num_sample`` to ``Train_Siamese``.
        lr: Adam learning rate.
        max_iter: maximum number of optimisation steps.
        batch_size: pairs per step.

    Returns:
        The fine-tuned ``Siamese`` network (left in training mode).
    """
    # The dataset is asked for exactly max_iter batches worth of pairs.
    pair_set = Train_Siamese(feature_path, trainID = trainID, num_sample = num_sample, epoch = max_iter*batch_size)
    pair_loader = DataLoader(pair_set, batch_size=batch_size, shuffle=False)

    # `size_average=True` is deprecated; `reduction='mean'` is the equivalent spelling.
    loss_fn = torch.nn.BCELoss(reduction='mean')

    net = Siamese()
    # Nothing in this notebook moves tensors to a GPU, so map the checkpoint to
    # CPU; this also lets a checkpoint saved on a GPU load on a CPU-only machine.
    # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
    net.load_state_dict(torch.load(model_path, map_location='cpu'))
    net.train()

    optimizer = torch.optim.Adam(net.parameters(), lr = lr)

    # Each batch also carries a fourth item (`prob_return` in the dataset's
    # output). It is only needed for the likelihood update
    # (`trainSet.update_prob(...)` every 10 iterations), which is disabled in
    # this variant of the method, so it is ignored here.
    for batch_id, (data1, data2, label, _prob_return) in enumerate(pair_loader, 1):
        if batch_id > max_iter:
            break
        optimizer.zero_grad()
        loss = loss_fn(net(data1, data2), label)
        loss.backward()
        optimizer.step()

    return net


def _init_classifier_from(net):
    """Create a fresh ``FNN`` and copy over every parameter it shares (by name) with ``net``."""
    model = FNN()
    model_dict = model.state_dict()
    # Keep only the entries whose names also exist in the classifier.
    shared = {k: v for k, v in net.state_dict().items() if k in model_dict}
    model_dict.update(shared)
    model.load_state_dict(model_dict)
    return model


def _train_classifier(model, loader, lr, max_iter):
    """Fine-tune ``model`` with cross-entropy for at most ``max_iter`` batches of ``loader``."""
    loss_fn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr = lr)

    model.train()
    # Count batches from 1 so the cap is exactly max_iter steps, matching the
    # Siamese stage (counting from 0 allowed max_iter + 1 steps).
    for batch_id, (data, label) in enumerate(loader, 1):
        if batch_id > max_iter:
            break
        optimizer.zero_grad()
        output = model(data)
        # reshape(-1) flattens the labels to one class index per sample and,
        # unlike squeeze(), still yields a 1-D tensor for a batch of size one.
        loss = loss_fn(output, label.reshape(-1).long())
        loss.backward()
        optimizer.step()


def _evaluate_uar(model, loader):
    """Return the macro-averaged recall of ``model`` over ``loader`` (batch size 1 expected)."""
    # Switch to inference behaviour and skip gradient tracking while predicting.
    model.eval()
    correct_list = []
    pred_list = []
    with torch.no_grad():
        for test, label in loader:
            output = model(test).cpu().numpy()[0]
            pred_list.append(int(output.argmax()))
            correct_list.append(int(label[0]))
    return metrics.recall_score(correct_list, pred_list, average='macro')


def run(model_name, sample_counts=range(1, 11), num_repeats=10):
    """Fine-tune a Siamese checkpoint, transfer it to a classifier and report recall.

    For every ``g`` in ``sample_counts`` (the number of samples per speaker
    per emotion) the pipeline below is repeated ``num_repeats`` times and the
    mean macro-averaged recall is printed:

    1. load ``SNN_source_models/<model_name>`` into a ``Siamese`` network and
       fine-tune it on pairs from ``Train_Siamese``;
    2. copy the weights shared by name into a new ``FNN`` classifier;
    3. fine-tune the classifier on ``Train`` and evaluate it on ``Test``.

    Args:
        model_name: file name of the checkpoint inside ``SNN_source_models/``.
        sample_counts: iterable of ``g`` values to sweep (default 1..10).
        num_repeats: repetitions averaged per ``g`` value; must be >= 1.

    Returns:
        None. Results are printed, one line per ``g`` value.

    Raises:
        ValueError: if ``num_repeats`` is smaller than 1.
    """
    if num_repeats < 1:
        raise ValueError('num_repeats must be at least 1')

    trainID = ['Ses01F', 'Ses01M', 'Ses02F', 'Ses02M', 'Ses03F', 'Ses03M', 'Ses04F', 'Ses04M', 'Ses05F', 'Ses05M']
    feature_path = 'feature/iemocap_feature_processed.csv'
    model_path = 'SNN_source_models/'+model_name

    siamese_lr = 0.0005
    classifier_lr = 0.001
    max_iter = 250

    print (model_name)

    # g is the number of samples per speaker per emotion
    for g in sample_counts:
        score_all = []

        # results vary from run to run, so repeat and report the average
        for _ in range(num_repeats):
            net = _finetune_siamese(model_path, feature_path, trainID, g, siamese_lr, max_iter)

            # initialise a new model and copy the weights of the trained SNN into it
            model = _init_classifier_from(net)

            # fine-tune the model in the supervised way (same as MeL-S)
            trainSet = Train(feature_path, trainID = trainID, num_sample = g, epoch = 200)
            trainLoader = DataLoader(trainSet, batch_size=32, shuffle=False)
            testSet = Test(datas_train = trainSet.datas_train, datas_test = trainSet.datas_test, num_classes = 3, trainID = trainID, num_sample = g)
            testLoader = DataLoader(testSet, batch_size=1, shuffle=False)

            _train_classifier(model, trainLoader, classifier_lr, max_iter)

            score_all.append(_evaluate_uar(model, testLoader))

        # NOTE(review): the label says "num of speaker" but the value printed is g
        # (samples per speaker per emotion); the text is kept unchanged so that
        # it still matches previously saved outputs.
        print ('num of speaker:', g, end = ' ')
        print(sum(score_all)/len(score_all))

In [10]:
# Sweep starting from the Siamese checkpoint SNN_source_models/enterface_base.pth.
run('enterface_base.pth')

enterface_base.pth
num of speaker: 1 0.6431494024490615
num of speaker: 2 0.6660534293699456
num of speaker: 3 0.6703474639945772
num of speaker: 4 0.6776208090056939
num of speaker: 5 0.6880217435448688
num of speaker: 6 0.7058977521044496
num of speaker: 7 0.6910009614359881
num of speaker: 8 0.6911679160835706
num of speaker: 9 0.7065210364978425
num of speaker: 10 0.7140275325945178


In [2]:
# Sweep starting from the Siamese checkpoint SNN_source_models/crema_d_base.pth.
run('crema_d_base.pth')

crema_d_base.pth
num of speaker: 1 0.6527660419486876
num of speaker: 2 0.6853930109838234
num of speaker: 3 0.710318820266236
num of speaker: 4 0.7167962872865705
num of speaker: 5 0.7223353297497266
num of speaker: 6 0.7263359796131653
num of speaker: 7 0.7134556818541636
num of speaker: 8 0.7343079303644416
num of speaker: 9 0.7502104241468316
num of speaker: 10 0.738815821347587


In [19]:
# Sweep starting from the Siamese checkpoint SNN_source_models/iemocap_base.pth.
run('iemocap_base.pth')

iemocap_base.pth
num of speaker: 1 0.6801168003377236
num of speaker: 2 0.6917276857797304
num of speaker: 3 0.7162855333992315
num of speaker: 4 0.7197790782719125
num of speaker: 5 0.7136124562631778
num of speaker: 6 0.7117803743408591
num of speaker: 7 0.7117478962389086
num of speaker: 8 0.718206101550203
num of speaker: 9 0.7185453170095607
num of speaker: 10 0.6998388715611427


In [26]:
# Sweep starting from the Siamese checkpoint SNN_source_models/ravdess_base.pth.
run('ravdess_base.pth')

ravdess_base.pth
num of speaker: 1 0.6929291640668469
num of speaker: 2 0.7011937139393326
num of speaker: 3 0.694432334179494
num of speaker: 4 0.719270577110825
num of speaker: 5 0.7105926506095628
num of speaker: 6 0.7008226584311161
num of speaker: 7 0.7126698575204146
num of speaker: 8 0.7265481969989148
num of speaker: 9 0.7225179574085336
num of speaker: 10 0.7252871323993715
