In [1]:
import numpy as np
import librosa
import soundfile as sf
import sounddevice as sd
import soundfile
import wave
import os
from numpy.fft import fft, ifft
from IPython.display import Audio, Markdown
from collections import defaultdict
import speech_recognition
from io import BytesIO
import jiwer
from tqdm import tqdm

In [2]:
def extract_mfcc(file_path, n_mfcc=13):
    """
    Extract MFCC features from an audio file.
    
    Parameters:
    file_path (str): Path to the audio file.
    n_mfcc (int): Number of MFCC features to extract.
    
    Returns:
    np.ndarray: Extracted MFCC features.
    """
    y, sr = librosa.load(file_path, sr=None)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    return np.mean(mfccs.T, axis=0)  # Average across time frames

def aggregate_speaker_features(file_paths):
    """
    Aggregate MFCC features for all recordings of a speaker.
    
    Parameters:
    file_paths (list): List of file paths for a single speaker's recordings.
    
    Returns:
    np.ndarray: Aggregated feature vector for the speaker.
    """
    features = [extract_mfcc(file) for file in file_paths]
    return np.mean(features, axis=0)

def get_speaker_files(base_path, split_token='-'):
    """
    Get a dictionary of speaker IDs and their corresponding file paths.
    
    Parameters:
    base_path (str): Base directory containing the audio files.
    
    Returns:
    dict: Dictionary with speaker IDs as keys and lists of file paths as values.
    """
    speaker_files = defaultdict(list)
    for file in os.listdir(base_path):
        if file.endswith('.wav'):
            speaker_id = file.split(split_token)[0]
            speaker_files[speaker_id].append(os.path.join(base_path, file))
    return speaker_files

def calculate_similarity_matrix(utterances):
    """
    Compute the voice similarity matrix for given utterances.
    
    Parameters:
    utterances (list): List of feature vectors representing each speaker.
    
    Returns:
    np.ndarray: Similarity matrix.
    """
    num_speakers = len(utterances)
    similarity_matrix = np.zeros((num_speakers, num_speakers))
    
    for i in range(num_speakers):
        for j in range(num_speakers):
            similarity_matrix[i, j] = np.dot(utterances[i], utterances[j]) / (np.linalg.norm(utterances[i]) * np.linalg.norm(utterances[j]))
    
    return similarity_matrix

def calculate_diagonal_dominance(matrix):
    """
    Calculate the diagonal dominance of a given similarity matrix.
    
    Parameters:
    matrix (np.ndarray): Similarity matrix.
    
    Returns:
    float: Diagonal dominance value.
    """
    N = matrix.shape[0]
    diag_avg = np.mean(np.diag(matrix))
    off_diag_mask = np.ones(matrix.shape, dtype=bool)
    np.fill_diagonal(off_diag_mask, 0)
    off_diag_avg = np.mean(matrix[off_diag_mask])
    
    return abs(diag_avg - off_diag_avg)

def calculate_deid(matrix_oo, matrix_op):
    """
    Calculate the DeID metric given the original and original-protected similarity matrices.
    
    Parameters:
    matrix_oo (np.ndarray): Original similarity matrix.
    matrix_op (np.ndarray): Original-protected similarity matrix.
    
    Returns:
    float: DeID metric.
    """
    D_diag_OO = calculate_diagonal_dominance(matrix_oo)
    D_diag_OP = calculate_diagonal_dominance(matrix_op)
    
    DeID = 1 - (D_diag_OP / D_diag_OO)
    return DeID

def calculate_gvd(matrix_oo, matrix_pp):
    """
    Calculate the GVD metric given the original and pseudonymised similarity matrices.
    
    Parameters:
    matrix_oo (np.ndarray): Original similarity matrix.
    matrix_pp (np.ndarray): Pseudonymised similarity matrix.
    
    Returns:
    float: GVD metric.
    """
    D_diag_OO = calculate_diagonal_dominance(matrix_oo)
    D_diag_PP = calculate_diagonal_dominance(matrix_pp)
    
    GVD = 10 * np.log10(D_diag_PP / D_diag_OO)
    return GVD

In [3]:
def get_wav_files(base_path, split_token='.'):
    """
    Get a dictionary of speaker IDs and their corresponding file paths.
    
    Parameters:
    base_path (str): Base directory containing the audio files.
    
    Returns:
    dict: Dictionary with audio IDs as keys and file path as values.
    """
    wav_files = {}
    for file in os.listdir(base_path):
        if file.endswith('.wav'):
            audio_id = file.split(split_token)[0]
            wav_files[audio_id] = os.path.join(base_path, file)
    return wav_files




def transcribe_audio(file_path, recognizer, model):
    print(file_path)
    
    audio_file = speech_recognition.AudioFile(file_path)
    with audio_file as source:
            audio_data = recognizer.record(source)
    
    text = recognizer.recognize_whisper(audio_data, model)
    if text == "":
        text = "empty"
    return text  

In [4]:
def transcribe_audio2(file_path, recognizer, model):
    y, sr = librosa.load(file_path, sr=None)
    print(file_path)

    wav_io = BytesIO()
    sf.write(wav_io, y, 16000, format='WAV')
    wav_io.seek(0)

    audio_file = speech_recognition.AudioFile(wav_io)
    with audio_file as source:
            audio_data = recognizer.record(source)

    text = recognizer.recognize_whisper(audio_data, 'tiny')
    wav_io.close()
    if text == "":
        text = "empty"
    return text  


# VCTK
## SSL-SAS

SSL-SAS: GVD + DeID

In [5]:
print("hello world")

hello world


In [6]:
# Get speaker files and aggregate features
base_dir = 'data/vctk_dev'
speaker_files = get_speaker_files(base_dir)
utterances_original = [aggregate_speaker_features(paths) for speaker_id, paths in speaker_files.items()]

# Get speaker files and aggregate features for pseudonymised data
base_dir_pseudo = 'data/anonymized/ssl_sas/ssl_sas_vctk_dev'
speaker_files_pseudo = get_speaker_files(base_dir_pseudo)
utterances_pseudonymised = [aggregate_speaker_features(paths) for speaker_id, paths in speaker_files_pseudo.items()]

# Calculate similarity matrices
matrix_oo = calculate_similarity_matrix(utterances_original)
matrix_op = calculate_similarity_matrix(utterances_original + utterances_pseudonymised)[len(utterances_original):, :len(utterances_original)]
matrix_pp = calculate_similarity_matrix(utterances_pseudonymised)

# Calculate DeID
deid = calculate_deid(matrix_oo, matrix_op)
print(f"DeID, SSL-SAS: {deid:.2f}")

# Calculate GVD
gvd = calculate_gvd(matrix_oo, matrix_pp)
print(f"GVD, SSL-SAS: {gvd:.2f}")

DeID, SSL-SAS: 0.98
GVD, SSL-SAS: 0.17


SSL-SAS WER

In [7]:
transcripts_path = "data/anonymized/ssl_sas/ssl_sas_vctk_dev/transcript.txt"

transcrips_dict = {}
with open(transcripts_path, 'r') as f:
    for line in f:
        key, sentence = line.split(' ', 1)  # Split on the first space
        key = key.strip()
        sentence = sentence.strip()
        transcrips_dict[key] = sentence
        
vctk_paths = get_wav_files('data/anonymized/ssl_sas/ssl_sas_vctk_dev')
print("vctk_paths = ", vctk_paths)
vctk_wer = []


recognizer = speech_recognition.Recognizer()

for key, t in transcrips_dict.items():
    # clean
    transcript = t.lower()
    
    
    # vctk
    if key not in vctk_paths:
        print(f"{key} is not in vctk_paths")
        continue
    text = transcribe_audio2(vctk_paths[key], recognizer, 'tiny')
    if text != "empty":
        vctk_wer.append(jiwer.wer(text.lower(), transcript))
    

print("WER, SSL-SAS: ", sum(vctk_wer) / len(vctk_wer))


vctk_paths =  {'p283_013_mic2_gen': 'data/anonymized/ssl_sas/ssl_sas_vctk_dev/p283_013_mic2_gen.wav', 'p272_003_mic2_gen': 'data/anonymized/ssl_sas/ssl_sas_vctk_dev/p272_003_mic2_gen.wav', 'p293_030_mic2_gen': 'data/anonymized/ssl_sas/ssl_sas_vctk_dev/p293_030_mic2_gen.wav', 'p264_014_mic2_gen': 'data/anonymized/ssl_sas/ssl_sas_vctk_dev/p264_014_mic2_gen.wav', 'p260_014_mic2_gen': 'data/anonymized/ssl_sas/ssl_sas_vctk_dev/p260_014_mic2_gen.wav', 'p281_007_mic2_gen': 'data/anonymized/ssl_sas/ssl_sas_vctk_dev/p281_007_mic2_gen.wav', 'p283_007_mic2_gen': 'data/anonymized/ssl_sas/ssl_sas_vctk_dev/p283_007_mic2_gen.wav', 'p272_031_mic2_gen': 'data/anonymized/ssl_sas/ssl_sas_vctk_dev/p272_031_mic2_gen.wav', 'p247_023_mic2_gen': 'data/anonymized/ssl_sas/ssl_sas_vctk_dev/p247_023_mic2_gen.wav', 'p288_008_mic2_gen': 'data/anonymized/ssl_sas/ssl_sas_vctk_dev/p288_008_mic2_gen.wav', 'p234_031_mic2_gen': 'data/anonymized/ssl_sas/ssl_sas_vctk_dev/p234_031_mic2_gen.wav', 'p266_019_mic2_gen': 'data/a

## MCADAMS, NAC, ASRBN
GVD + DeID

In [8]:
# Get speaker files and aggregate features
base_dir = 'data/vctk_dev'
speaker_files = get_speaker_files(base_dir)
utterances_original = [aggregate_speaker_features(paths) for speaker_id, paths in speaker_files.items()]


#MCADAMS
# Get speaker files and aggregate features for pseudonymised data
base_dir_pseudo = 'data/anonymized/vpc/vctk/mcadams_vctk'
speaker_files_pseudo = get_speaker_files(base_dir_pseudo)
utterances_pseudonymised = [aggregate_speaker_features(paths) for speaker_id, paths in speaker_files_pseudo.items()]

# Calculate similarity matrices
matrix_oo = calculate_similarity_matrix(utterances_original)
matrix_op = calculate_similarity_matrix(utterances_original + utterances_pseudonymised)[len(utterances_original):, :len(utterances_original)]
matrix_pp = calculate_similarity_matrix(utterances_pseudonymised)

# Calculate DeID
deid = calculate_deid(matrix_oo, matrix_op)
print(f"DeID, MCADAMS: {deid:.2f}")

# Calculate GVD
gvd = calculate_gvd(matrix_oo, matrix_pp)
print(f"GVD, MCADAMS: {gvd:.2f}")


#NAC
# Get speaker files and aggregate features for pseudonymised data
base_dir_pseudo = 'data/anonymized/vpc/vctk/nac_vctk'
speaker_files_pseudo = get_speaker_files(base_dir_pseudo)
utterances_pseudonymised = [aggregate_speaker_features(paths) for speaker_id, paths in speaker_files_pseudo.items()]

# Calculate similarity matrices
matrix_oo = calculate_similarity_matrix(utterances_original)
matrix_op = calculate_similarity_matrix(utterances_original + utterances_pseudonymised)[len(utterances_original):, :len(utterances_original)]
matrix_pp = calculate_similarity_matrix(utterances_pseudonymised)

# Calculate DeID
deid = calculate_deid(matrix_oo, matrix_op)
print(f"DeID, NAC: {deid:.2f}")

# Calculate GVD
gvd = calculate_gvd(matrix_oo, matrix_pp)
print(f"GVD, NAC: {gvd:.2f}")


#ASRBN
# Get speaker files and aggregate features for pseudonymised data
base_dir_pseudo = 'data/anonymized/vpc/vctk/asrbn_vctk'
speaker_files_pseudo = get_speaker_files(base_dir_pseudo)
utterances_pseudonymised = [aggregate_speaker_features(paths) for speaker_id, paths in speaker_files_pseudo.items()]

# Calculate similarity matrices
matrix_oo = calculate_similarity_matrix(utterances_original)
matrix_op = calculate_similarity_matrix(utterances_original + utterances_pseudonymised)[len(utterances_original):, :len(utterances_original)]
matrix_pp = calculate_similarity_matrix(utterances_pseudonymised)

# Calculate DeID
deid = calculate_deid(matrix_oo, matrix_op)
print(f"DeID, ASRBN: {deid:.2f}")

# Calculate GVD
gvd = calculate_gvd(matrix_oo, matrix_pp)
print(f"GVD, ASRBN: {gvd:.2f}")

DeID, MCADAMS: 0.14
GVD, MCADAMS: 7.30
DeID, NAC: 0.46
GVD, NAC: 3.72
DeID, ASRBN: 0.50
GVD, ASRBN: 3.89


WER

In [13]:
transcripts_path = "data/anonymized/ssl_sas/ssl_sas_vctk_dev/transcript.txt" #Strange filepath, but this is where the general vctk_dev transcript is.

transcrips_dict = {}
with open(transcripts_path, 'r') as f:
    for line in f:
        key, sentence = line.split(' ', 1)  # Split on the first space
        key = key.strip().replace("_gen", '')
        sentence = sentence.strip()
        transcrips_dict[key] = sentence
        
clean_paths = get_wav_files('data/vctk_dev')
asrbn_paths = get_wav_files('data/anonymized/vpc/vctk/asrbn_vctk')
mcadams_paths = get_wav_files('data/anonymized/vpc/vctk/mcadams_vctk')
nac_paths = get_wav_files('data/anonymized/vpc/vctk/nac_vctk')

clean_wer = []
asrbn_wer = []
mcadams_wer = []
nac_wer = []

recognizer = speech_recognition.Recognizer()

for key, t in transcrips_dict.items():
    # clean
    transcript = t.lower()
    
    text = transcribe_audio(clean_paths[key], recognizer, 'tiny')
    if text != "empty":
        clean_wer.append(jiwer.wer(text.lower(), transcript))
    
    # asrbn
    text = transcribe_audio(asrbn_paths[key], recognizer, 'tiny')
    if text != "empty":
        asrbn_wer.append(jiwer.wer(text.lower(), transcript))
    
    # mcadams
    text = transcribe_audio(mcadams_paths[key], recognizer, 'tiny')
    if text != "empty":
        mcadams_wer.append(jiwer.wer(text.lower(), transcript))

    # nac
    text = transcribe_audio(nac_paths[key], recognizer, 'tiny')
    if nac_wer != "empty":
        nac_wer.append(jiwer.wer(text.lower(), transcript))
    

print("WER, clean: ", sum(clean_wer) / len(clean_wer))
print("WER, asrbn: ", sum(asrbn_wer) / len(asrbn_wer))
print("WER, mcadams: ", sum(mcadams_wer) / len(mcadams_wer))
print("WER, nac: ", sum(nac_wer) / len(nac_wer))

data/vctk_dev/p234_001_mic2.wav
data/anonymized/vpc/vctk/asrbn_vctk/p234_001_mic2.wav
data/anonymized/vpc/vctk/mcadams_vctk/p234_001_mic2.wav
data/anonymized/vpc/vctk/nac_vctk/p234_001_mic2.wav
data/vctk_dev/p234_002_mic2.wav
data/anonymized/vpc/vctk/asrbn_vctk/p234_002_mic2.wav
data/anonymized/vpc/vctk/mcadams_vctk/p234_002_mic2.wav
data/anonymized/vpc/vctk/nac_vctk/p234_002_mic2.wav
data/vctk_dev/p234_003_mic2.wav
data/anonymized/vpc/vctk/asrbn_vctk/p234_003_mic2.wav
data/anonymized/vpc/vctk/mcadams_vctk/p234_003_mic2.wav
data/anonymized/vpc/vctk/nac_vctk/p234_003_mic2.wav
data/vctk_dev/p234_004_mic2.wav
data/anonymized/vpc/vctk/asrbn_vctk/p234_004_mic2.wav
data/anonymized/vpc/vctk/mcadams_vctk/p234_004_mic2.wav
data/anonymized/vpc/vctk/nac_vctk/p234_004_mic2.wav
data/vctk_dev/p234_005_mic2.wav
data/anonymized/vpc/vctk/asrbn_vctk/p234_005_mic2.wav
data/anonymized/vpc/vctk/mcadams_vctk/p234_005_mic2.wav
data/anonymized/vpc/vctk/nac_vctk/p234_005_mic2.wav
data/vctk_dev/p234_006_mic2.wa