In [14]:
import numpy as np
import pandas as pd
import csv
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from sklearn.metrics import confusion_matrix, balanced_accuracy_score, f1_score, classification_report, mutual_info_score
from sklearn.cluster import SpectralClustering
from sklearn_extra.cluster import KMedoids
from scipy.spatial.distance import cdist
from sklearn.utils import shuffle
from matplotlib import pyplot as plt
import seaborn as sn
import itertools

In [15]:
def load_multiple_features(feat_path, feat_type):
    """
    Func to load the features and labels from one or more CSV files.

    Every file is read without a header row and with all cells as strings.
    The files are stacked row-wise; the leading `feat_size` columns are taken
    as the embedding and the last column as the label.

    Args:
        feat_path: list of CSV file paths (a single path string is also accepted)
        feat_type: 'emb1' (first 512 columns) or 'emb2' (first 1000 columns)
    return:
        features: [# samples, feat dimension], cast to float
        labels: [# samples], raw values of the last column (not cast)
    raises:
        ValueError: if feat_type is not supported or no file path is given
    """
    feat_sizes = {'emb1': 512, 'emb2': 1000}
    if feat_type not in feat_sizes:
        # Fail before touching the disk; previously this only printed a message
        # and then crashed later because feat_size was never assigned.
        raise ValueError("feat_type not supported: %r (expected one of %s)"
                         % (feat_type, sorted(feat_sizes)))
    feat_size = feat_sizes[feat_type]

    # Iterating over a bare string would yield single characters, so wrap it.
    if isinstance(feat_path, str):
        feat_path = [feat_path]

    tables = [pd.read_csv(fp, delimiter=',', header=None, dtype=str).values
              for fp in feat_path]
    if not tables:
        raise ValueError('feat_path is empty: at least one CSV file is required')
    f = np.concatenate(tables, axis=0)

    return f[:, :feat_size].astype(float), f[:, -1]

In [16]:
def pre_processing(test_embedding, test_labels, sub=None):
    """
    Func to filter invalid instances and unify the labels

    Samples labelled 'x' are dropped, the remaining samples are shuffled with
    a fixed seed (random_state=0), and the labels are mapped to integers:
    '1' (wearer) and 'm' (mixed) become 1; '2' (non-wearer speech),
    'c' (baby crying), 'p' (phone call), 't' (TV) and 'b' (non-vocal
    background) become 2. Any other label is passed through int() unchanged.

    Args:
        test_embedding: array [# samples, feat dimension]
        test_labels: array [# samples] of string labels
        sub: optional subject identifier, used only in the progress message.
            When omitted, a notebook-level `sub` variable is used if one exists.
    return:
        test_embedding_filtered: [# valid samples, feat dimension], shuffled
        test_labels_filtered: [# valid samples], int, same order as the embeddings
    raises:
        ValueError: if a remaining label is neither in the mapping nor an integer
    """
    if sub is None:
        # Existing call sites relied on the loop variable `sub` leaking in from
        # the notebook namespace; keep that working without raising NameError
        # when the function is called on its own.
        sub = globals().get('sub', '')

    test_embedding = np.asarray(test_embedding)
    test_labels = np.asarray(test_labels)

    # obtain valid test sounds
    valid_mask = test_labels != 'x'
    test_embedding_filtered = test_embedding[valid_mask]
    test_labels_filtered = test_labels[valid_mask]

    # shuffle the embeddings (labels are permuted identically)
    test_embedding_filtered, test_labels_filtered = shuffle(
        test_embedding_filtered, test_labels_filtered, random_state=0)

    print('valid test data size:', sub, len(test_labels_filtered),
          'ambiguous (x) size:', len(test_labels) - len(test_labels_filtered))

    # Change test sound labels to a consistent format, 1 for wearer, 2 for non-wearer.
    label_map = {
        '1': 1,   # wearer
        'm': 1,   # mixed counted as wearer
        '2': 2,   # non-wearer speech
        'c': 2,   # baby crying
        'p': 2,   # phone call
        't': 2,   # TV
        'b': 2,   # non-vocal background
    }
    unified = np.array([label_map.get(label, label) for label in test_labels_filtered],
                       dtype=object)
    try:
        test_labels_filtered = unified.astype(int)
    except (ValueError, TypeError) as err:
        unknown = sorted({str(label) for label in unified
                          if not isinstance(label, int)})
        raise ValueError('cannot convert label(s) to int: %s' % unknown) from err

    return test_embedding_filtered, test_labels_filtered

In [17]:
# NOTE(review): pairwise_distances is no longer used in this cell (the unused
# distance matrix was removed); the import is kept in case later cells need it.
from sklearn.metrics.pairwise import pairwise_distances

feat_type = 'emb2'   # use emb1 or emb2
sub_list = ['P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'P10', 'P11', 'P12', 'P13', 'P14', 'P15',
            'P16', 'P17', 'P18']   # study group to test

acc_container = []   # per-subject balanced accuracy
f1_container = []   # per-subject macro F1


def _mean_centroid_similarity(members):
    """Mean cosine similarity between each row of `members` and their centroid.

    Returns NaN for an empty cluster so the comparison below falls through to
    the `else` branch instead of raising.
    """
    if len(members) == 0:
        return np.nan
    centroid = np.mean(members, axis=0).reshape(1, -1)
    # One vectorised call instead of one cosine_similarity call per sample.
    return float(np.mean(cosine_similarity(centroid, members)))


for sub in sub_list:
    # Load data
    test_feat_path = ['./embeddings/%s/%s/call.csv' %(sub, feat_type),
                     './embeddings/%s/%s/dinner.csv' %(sub, feat_type),
                     './embeddings/%s/%s/game.csv' %(sub, feat_type),
                     './embeddings/%s/%s/outdoor.csv' %(sub, feat_type),
                     './embeddings/%s/%s/TV.csv' %(sub, feat_type)]

    test_embedding, test_labels = load_multiple_features(test_feat_path, feat_type = feat_type)
    print('loaded test data size:', sub, len(test_labels))

    # group 16-18 is labeled by int [0, 1] only, so we need to cast something like '1.0' to '1' in labels
    for i in range(len(test_labels)):
        try:
            test_labels[i] = str(int(float(test_labels[i])))
        except (ValueError, TypeError, OverflowError):
            # Non-numeric labels ('x', 'm', 'c', ...) are left as they are.
            pass

    test_embedding_filtered, test_labels_filtered = pre_processing(test_embedding, test_labels)

    # The clustering methods to use
    #clf = KMedoids(n_clusters=2, metric='cosine', max_iter=1000, method='pam', random_state=0)
    clf = SpectralClustering(n_clusters=2, affinity='cosine', eigen_solver='arpack', random_state=0, n_jobs=-1)

    # prediction
    clustered_labels = clf.fit_predict(test_embedding_filtered)
    # Indices are captured before relabelling so the in-place writes below
    # cannot interfere with each other.
    idx_cluster_0 = np.where(clustered_labels == 0)
    idx_cluster_1 = np.where(clustered_labels == 1)

    # compare similarity from center for each cluster
    mean_sim_0 = _mean_centroid_similarity(test_embedding_filtered[idx_cluster_0])
    mean_sim_1 = _mean_centroid_similarity(test_embedding_filtered[idx_cluster_1])

    # bigger s indicates the background cluster
    if mean_sim_0 > mean_sim_1:
        print('cluster1 (smaller sim from centroid) is the foreground')
        clustered_labels[idx_cluster_0] = 2
        clustered_labels[idx_cluster_1] = 1
    else:
        print('cluster0 (smaller sim from centroid) is the foreground')
        clustered_labels[idx_cluster_1] = 2
        clustered_labels[idx_cluster_0] = 1

    acc = balanced_accuracy_score(test_labels_filtered, clustered_labels)
    f1 = f1_score(test_labels_filtered, clustered_labels, average = 'macro')

    # print classification result
    print('sub %s balanced accuracy %f macro f1 %f' %(sub, acc, f1))
    acc_container.append(acc)
    f1_container.append(f1)

print('mean balanced acc all groups:', np.mean(acc_container))
print('mean macro f1 all groups:', np.mean(f1_container))

loaded test data size: P1 6862
valid test data size: P1 6740 ambiguous (x) size: 122
cluster0 (smaller sim from centroid) is the foreground
sub P1 balanced accuracy 0.820521 macro f1 0.804265
loaded test data size: P2 4627
valid test data size: P2 4465 ambiguous (x) size: 162
cluster1 (smaller sim from centroid) is the foreground
sub P2 balanced accuracy 0.839438 macro f1 0.848876
loaded test data size: P3 4927
valid test data size: P3 4847 ambiguous (x) size: 80
cluster1 (smaller sim from centroid) is the foreground
sub P3 balanced accuracy 0.774472 macro f1 0.775020
loaded test data size: P4 6652
valid test data size: P4 6639 ambiguous (x) size: 13
cluster1 (smaller sim from centroid) is the foreground
sub P4 balanced accuracy 0.775307 macro f1 0.813661
loaded test data size: P5 4568
valid test data size: P5 4451 ambiguous (x) size: 117
cluster1 (smaller sim from centroid) is the foreground
sub P5 balanced accuracy 0.854646 macro f1 0.839060
loaded test data size: P6 6424
valid test 