In [29]:
import numpy as np
import librosa
import soundfile as sf
import sounddevice as sd
import soundfile
import numpy as np
import wave
import librosa
import os
from numpy.fft import fft, ifft
from IPython.display import Audio, Markdown
from collections import defaultdict

In [30]:
def extract_mfcc(file_path, n_mfcc=13):
    """
    Summarise one audio file as a single time-averaged MFCC vector.

    The file is loaded at its native sampling rate (``sr=None``), MFCCs are
    computed for it, and each coefficient is averaged over all frames.

    Parameters:
    file_path (str): Path to the audio file.
    n_mfcc (int): Number of MFCC coefficients to compute.

    Returns:
    np.ndarray: One mean value per MFCC coefficient.
    """
    signal, sample_rate = librosa.load(file_path, sr=None)
    coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)
    # Transpose so frames run along axis 0, then collapse that axis.
    frames_first = coefficients.T
    return np.mean(frames_first, axis=0)

def aggregate_speaker_features(file_paths):
    """
    Build one feature vector for a speaker from all of their recordings.

    Each recording is reduced to a feature vector with ``extract_mfcc`` and
    the resulting vectors are averaged element-wise.

    Parameters:
    file_paths (list): Paths of the recordings belonging to one speaker.

    Returns:
    np.ndarray: Element-wise mean of the per-recording feature vectors.
    """
    per_recording = []
    for path in file_paths:
        per_recording.append(extract_mfcc(path))
    # Axis 0 indexes recordings, so this averages across recordings.
    return np.mean(per_recording, axis=0)

def get_speaker_files(base_path):
    """
    Get a dictionary of speaker IDs and their corresponding file paths.

    Only entries directly inside ``base_path`` whose name ends with '.wav'
    are considered. The speaker ID is the part of the file name before the
    first '-'.

    The directory listing is processed in sorted file-name order, so both
    the order of the speaker keys and the order of the paths within each
    speaker are deterministic across runs and platforms.

    Parameters:
    base_path (str): Base directory containing the audio files.

    Returns:
    dict: Dictionary (a ``defaultdict(list)``) with speaker IDs as keys and
    lists of file paths as values.
    """
    speaker_files = defaultdict(list)
    # os.listdir() returns names in arbitrary order; sort so that callers
    # which iterate the result always see speakers in the same order.
    for file in sorted(os.listdir(base_path)):
        if file.endswith('.wav'):
            speaker_id = file.split('-')[0]
            speaker_files[speaker_id].append(os.path.join(base_path, file))
    return speaker_files

def calculate_similarity_matrix(utterances):
    """
    Compute the voice similarity matrix for given utterances.

    Entry ``[i, j]`` is the cosine similarity between feature vectors ``i``
    and ``j``: their dot product divided by the product of their Euclidean
    norms. A zero-norm vector yields NaN in its row and column, because the
    cosine similarity is undefined there.

    Parameters:
    utterances (list): List of equal-length 1-D feature vectors, one per
        speaker.

    Returns:
    np.ndarray: Similarity matrix of shape (num_speakers, num_speakers).
        An empty input gives an empty (0, 0) matrix.

    Raises:
    ValueError: If the vectors cannot be stacked into a 2-D array (for
        example, they differ in length or are not 1-D).
    """
    num_speakers = len(utterances)
    if num_speakers == 0:
        return np.zeros((0, 0))

    features = np.asarray(utterances, dtype=float)
    if features.ndim != 2:
        raise ValueError("utterances must be equal-length 1-D feature vectors")

    # Each norm is computed once instead of once per matrix cell.
    norms = np.linalg.norm(features, axis=1)
    # One matrix product yields every pairwise dot product.
    dot_products = features @ features.T
    return dot_products / np.outer(norms, norms)

def calculate_diagonal_dominance(matrix):
    """
    Calculate the diagonal dominance of a given similarity matrix.

    The value is the absolute difference between the mean of the main
    diagonal and the mean of all off-diagonal entries. If the matrix has no
    off-diagonal entries (for example 1x1), that mean is undefined and the
    result is NaN.

    Parameters:
    matrix (np.ndarray): Similarity matrix. Any 2-D array-like is accepted,
        including nested lists.

    Returns:
    float: Diagonal dominance value.
    """
    # Coerce so that nested lists work as well as arrays.
    matrix = np.asarray(matrix)
    diag_avg = np.mean(np.diag(matrix))
    # True everywhere except on the main diagonal.
    off_diag_mask = np.ones(matrix.shape, dtype=bool)
    np.fill_diagonal(off_diag_mask, False)
    off_diag_avg = np.mean(matrix[off_diag_mask])

    return abs(diag_avg - off_diag_avg)

def calculate_deid(matrix_oo, matrix_op):
    """
    Compute the DeID metric from two similarity matrices.

    DeID is one minus the ratio of the diagonal dominance of the
    original-protected matrix to that of the original matrix.

    Parameters:
    matrix_oo (np.ndarray): Original similarity matrix.
    matrix_op (np.ndarray): Original-protected similarity matrix.

    Returns:
    float: DeID metric.
    """
    dominance_original = calculate_diagonal_dominance(matrix_oo)
    dominance_protected = calculate_diagonal_dominance(matrix_op)

    dominance_ratio = dominance_protected / dominance_original
    return 1 - dominance_ratio

In [31]:
# Define base directories containing the original and pseudonymised audio files
base_dir = 'data/dev-clean-audio'
base_dir_pseudo = 'data/voice-privacy-challenge/asr_project_asrbn'

# Get speaker files for both sets
speaker_files = get_speaker_files(base_dir)
speaker_files_pseudo = get_speaker_files(base_dir_pseudo)

# Entry i of both feature lists must describe the same speaker, otherwise the
# diagonal of matrix_op would compare unrelated speakers. Directory listing
# order is not guaranteed, so align both sets explicitly on the sorted speaker
# IDs they share. Speakers present in only one set are left out.
common_speakers = sorted(set(speaker_files) & set(speaker_files_pseudo))
if not common_speakers:
    raise ValueError(
        f"No common speaker IDs between '{base_dir}' and '{base_dir_pseudo}'"
    )

# Aggregate features for each speaker, in the same speaker order for both sets
utterances_original = [
    aggregate_speaker_features(speaker_files[speaker_id])
    for speaker_id in common_speakers
]
utterances_pseudonymised = [
    aggregate_speaker_features(speaker_files_pseudo[speaker_id])
    for speaker_id in common_speakers
]

# Calculate similarity matrices
matrix_oo = calculate_similarity_matrix(utterances_original)
# The joint matrix covers originals followed by pseudonymised speakers; the
# lower-left block holds pseudonymised (rows) versus original (columns).
n_original = len(utterances_original)
matrix_op = calculate_similarity_matrix(
    utterances_original + utterances_pseudonymised
)[n_original:, :n_original]

# Calculate DeID
deid = calculate_deid(matrix_oo, matrix_op)
print(f"DeID: {deid:.2f}")

DeID: 0.94
