In [None]:
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
from pathlib import Path
import soundfile as sf
import fairseq
import librosa
import torch
import speechbrain as sb
import torch
import os
from speechbrain.pretrained import EncoderClassifier
import torchaudio


from sklearn.decomposition import PCA
from sklearn.preprocessing import Normalizer
from tqdm import tqdm
import hdbscan
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import cosine_distances, pairwise_distances

### Initialize DataFrame

In [None]:
# Build the per-utterance metadata frame from the consensus label file.
# Columns that are filled in by later cells (hubert_len, hdb_label) start out empty.
labels = pd.read_csv('/mnt/sda/hsinghang/dataset/MSP-Podcast-v1.10/labels/labels_consensus.csv')
df = pd.DataFrame(columns=['file_name', 'emotion', 'A', 'V', 'D', 'spk_id', 'gender', 'split', 'wav_path', 'hubert_path', 'hubert_len', 'emb_xvec', 'hdb_label'])
df['file_name'] = labels['FileName']
df['emotion'] = labels['EmoClass']
df['A'] = labels['EmoAct']
df['V'] = labels['EmoVal']
df['D'] = labels['EmoDom']
df['spk_id'] = labels['SpkrID']
df['gender'] = labels['Gender']
df['split'] = labels['Split_Set']

# Modify this so it points to where your wav files are stored.
df['wav_path'] = '/mnt/sda/hsinghang/dataset/MSP-Podcast-v1.10/audios/' + df['file_name']
# Modify the following so they point to where you want to store the extracted features.
# regex=False: '.wav' must be matched literally. With regex matching (the default on
# pandas < 2.0) the '.' is a wildcard and e.g. 'xwav' inside a name would be rewritten too.
df['hubert_path'] = '/mnt/sda/hsinghang/dataset/MSP-Podcast-v1.10/features/hubert/' + df['file_name'].str.replace('.wav', '.pkl', regex=False)
df['emb_xvec'] = '/mnt/sda/hsinghang/dataset/MSP-Podcast-v1.10/features/emb/xvec/' + df['file_name'].str.replace('.wav', '.pkl', regex=False)

### Extract Hubert Features

In [None]:
# Extract HuBERT features for every utterance in `df`, store each feature matrix at
# its `hubert_path`, and record the number of frames per utterance in `hubert_len`.
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.enabled = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# check point path
ckpt_path = '/homes/hsinghang/model/hubert/hubert_base_ls960.pt'
models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([ckpt_path])
model = models[0]
model.to(device)
# Freshly built modules start in training mode; eval() disables dropout / layer-drop
# so the extracted features are deterministic.
model.eval()

wav_list = list(df['wav_path'])
out_list = list(df['hubert_path'])
seq_len = [0] * len(out_list)
for c, wav_path in enumerate(tqdm(wav_list)):
    # librosa resamples to 16 kHz and returns a mono float waveform.
    s, sr = librosa.load(wav_path, sr=16000)
    assert s.ndim == 1, s.ndim
    feats_audio = torch.FloatTensor(s).reshape((1, -1))
    with torch.no_grad():
        feats_audio = feats_audio.to(device)
        z = model.extract_features(feats_audio)[0]
        # Drop only the batch axis so a one-frame utterance keeps its time dimension.
        z = z.cpu().detach().numpy().squeeze(axis=0)
    seq_len[c] = len(z)
    # Make sure the destination directory exists before dumping.
    Path(out_list[c]).parent.mkdir(parents=True, exist_ok=True)

    joblib.dump(z, out_list[c])
    # Periodically release cached GPU memory (only meaningful when running on CUDA).
    if c % 1000 == 0 and device.type == "cuda":
        torch.cuda.empty_cache()
df['hubert_len'] = seq_len


### Extract Speaker Embedding

In [None]:
# Compute one speaker embedding per utterance with the pretrained SpeechBrain
# ECAPA model and store it at the utterance's `emb_xvec` path.
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.enabled = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hand the selected device to SpeechBrain; previously run_opts was built but never
# passed, so the classifier stayed on its default device.
run_opts = {"device": str(device)}
classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", run_opts=run_opts)
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    wav_path = row['wav_path']
    emb_path = row['emb_xvec']
    # NOTE(review): torchaudio.load does not resample; this presumably relies on the
    # audio already being at the model's expected sample rate - confirm.
    signal, fs = torchaudio.load(wav_path)
    with torch.no_grad():
        embs = classifier.encode_batch(signal)
    # Create the output directory on demand, mirroring the HuBERT extraction cell.
    Path(emb_path).parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(embs.cpu().numpy().squeeze(), emb_path)

### Split dataset

In [None]:
# Partition the metadata by the official split column. Each partition is an explicit
# copy so that adding columns later does not operate on a slice of `df`
# (which would raise SettingWithCopyWarning).
train_df = df[df['split'] == 'Train'].copy()
valid_df = df[df['split'] == 'Development'].copy()
test_df = df[df['split'].isin(['Test1', 'Test2'])].copy()
# I think we actually only use Test1 to evaluate fairness, since Test2 contains speakers with unknown IDs and is filtered out by data_loader.py.

### Create cluster label

In [None]:
# Load the stored speaker embeddings of every split, then keep only the utterances
# of known speakers that have at least MIN_SAMPLE utterances in that split.
MIN_SAMPLE = 32


def _load_embeddings(paths):
    """Load one joblib-serialized embedding per path and stack them into an array."""
    return np.array([joblib.load(emb_file) for emb_file in tqdm(paths)])


def _eligible_speakers(spk_ids):
    """Return (speaker ids kept for clustering, per-speaker utterance counts).

    A speaker is kept when it has at least MIN_SAMPLE utterances and its id is
    not the placeholder 'Unknown'.
    """
    speakers, counts = np.unique(spk_ids, return_counts=True)
    kept = speakers[counts >= MIN_SAMPLE]
    kept = kept[kept != 'Unknown']
    return kept, counts


train_emb = _load_embeddings(train_df['emb_xvec'])
valid_emb = _load_embeddings(valid_df['emb_xvec'])
test_emb = _load_embeddings(test_df['emb_xvec'])

train_v, train_c = _eligible_speakers(train_df['spk_id'])
selected_train_emb = train_emb[np.isin(train_df['spk_id'].to_numpy(), train_v)]

valid_v, c = _eligible_speakers(valid_df['spk_id'])
selected_valid_emb = valid_emb[np.isin(valid_df['spk_id'].to_numpy(), valid_v)]

test_v, c = _eligible_speakers(test_df['spk_id'])
selected_test_emb = test_emb[np.isin(test_df['spk_id'].to_numpy(), test_v)]

In [None]:
# Dimensionality reduction: fit PCA on the selected training embeddings only
# (a fractional n_components keeps enough components to explain that share of the
# variance), then project all three splits with the same fitted transform.
pca = PCA(n_components=.80).fit(selected_train_emb)
selected_pca_train_emb, selected_pca_valid_emb, selected_pca_test_emb = (
    pca.transform(split_emb)
    for split_emb in (selected_train_emb, selected_valid_emb, selected_test_emb)
)

In [None]:
# Cluster each split independently with HDBSCAN on the PCA-reduced embeddings.
# Every split gets its own clusterer, so cluster ids are not comparable across splits.
train_clusterer = hdbscan.HDBSCAN(min_cluster_size=32, min_samples=4).fit(selected_pca_train_emb)
valid_clusterer = hdbscan.HDBSCAN(min_cluster_size=32, min_samples=4).fit(selected_pca_valid_emb)
test_clusterer = hdbscan.HDBSCAN(min_cluster_size=32, min_samples=4).fit(selected_pca_test_emb)

In [None]:
# Re-assign HDBSCAN noise points (label -1) to the cluster whose mean embedding is
# most cosine-similar to them.
def _fit_noise_to_clusters(embeddings, cluster_labels):
    """Assign every noise point to its most similar cluster centroid.

    Parameters
    ----------
    embeddings : array indexed along its first axis in the same order as `cluster_labels`.
    cluster_labels : integer array of cluster ids, with -1 marking noise.

    Returns
    -------
    (centroids, similarity, fitted_labels)
        centroids     : mean embedding of each cluster 0..max label (inclusive).
        similarity    : cosine similarity of each noise point to each centroid.
        fitted_labels : copy of `cluster_labels` with noise replaced by the best cluster.
                        If there are no clusters at all, noise points keep -1.
    """
    # Labels run from 0 to max() inclusive, so there are max() + 1 clusters.
    # (range(max()) silently dropped the last cluster.)
    n_clusters = int(cluster_labels.max()) + 1 if cluster_labels.size else 0
    centroids = np.array([
        np.mean(embeddings[cluster_labels == k], axis=0, keepdims=False)
        for k in range(n_clusters)
    ])
    noise_mask = cluster_labels == -1
    fitted_labels = np.copy(cluster_labels)
    if n_clusters == 0 or not noise_mask.any():
        # Nothing to re-assign (or nothing to assign to); avoid calling
        # cosine_similarity on an empty array, which raises.
        similarity = np.empty((int(noise_mask.sum()), n_clusters))
        return centroids, similarity, fitted_labels
    similarity = cosine_similarity(embeddings[noise_mask], centroids)
    # Row j of `similarity` belongs to the j-th noise point in original order,
    # which is exactly the order boolean-mask assignment uses.
    fitted_labels[noise_mask] = np.argmax(similarity, axis=1)
    return centroids, similarity, fitted_labels


train_cluster_avg_emb, train_noise_similarity, train_noise_fitted_label = _fit_noise_to_clusters(
    selected_pca_train_emb, train_clusterer.labels_)
valid_cluster_avg_emb, valid_noise_similarity, valid_noise_fitted_label = _fit_noise_to_clusters(
    selected_pca_valid_emb, valid_clusterer.labels_)
test_cluster_avg_emb, test_noise_similarity, test_noise_fitted_label = _fit_noise_to_clusters(
    selected_pca_test_emb, test_clusterer.labels_)

In [None]:
# Write the cluster labels back into each split's metadata.
# Utterances that were left out of the experiment (speaker filtered out before
# clustering) get the label -2.
# `assign` returns a new frame, so this never writes into a slice of `df`
# (avoids SettingWithCopyWarning).
hdb_labels = np.full(len(train_df), -2, dtype=int)
hdb_labels[np.isin(train_df['spk_id'].to_numpy(), train_v)] = train_noise_fitted_label
train_df = train_df.assign(hdb_label=hdb_labels)

hdb_labels = np.full(len(valid_df), -2, dtype=int)
hdb_labels[np.isin(valid_df['spk_id'].to_numpy(), valid_v)] = valid_noise_fitted_label
valid_df = valid_df.assign(hdb_label=hdb_labels)

hdb_labels = np.full(len(test_df), -2, dtype=int)
hdb_labels[np.isin(test_df['spk_id'].to_numpy(), test_v)] = test_noise_fitted_label
test_df = test_df.assign(hdb_label=hdb_labels)

### Save result meta file

In [None]:
# Persist the per-split metadata (including the cluster labels) as CSV.
# to_csv does not create missing directories, so create the target folder first.
Path('./feature_extract/MSP').mkdir(parents=True, exist_ok=True)
train_df.to_csv('./feature_extract/MSP/train_meta_hdb.csv', index=True)
valid_df.to_csv('./feature_extract/MSP/valid_meta_hdb.csv', index=True)
test_df.to_csv('./feature_extract/MSP/test_meta_hdb.csv', index=True)