In [52]:
import numpy as np
import librosa
import soundfile as sf
import sounddevice as sd
import soundfile
import numpy as np
import wave
import librosa
from numpy.fft import fft, ifft
from IPython.display import Audio, Markdown

In [53]:
def calculate_similarity_matrix(utterances):
    """
    Compute the voice similarity matrix for given utterances.
    Here, a placeholder function to compute similarity scores is used.
    In practice, this would involve more sophisticated signal processing.
    """
    num_speakers = len(utterances)
    similarity_matrix = np.zeros((num_speakers, num_speakers))
    
    # Placeholder similarity computation (e.g., cosine similarity of feature vectors)
    for i in range(num_speakers):
        for j in range(num_speakers):
            similarity_matrix[i, j] = np.dot(utterances[i], utterances[j]) / (np.linalg.norm(utterances[i]) * np.linalg.norm(utterances[j]))
    
    return similarity_matrix

def calculate_diagonal_dominance(matrix):
    """
    Calculate the diagonal dominance of a given similarity matrix.
    """
    N = matrix.shape[0]
    diag_avg = np.mean(np.diag(matrix))
    off_diag_avg = np.sum(matrix) / (N * (N - 1)) - diag_avg / (N - 1)
    return abs(diag_avg - off_diag_avg)

def calculate_deid(matrix_oo, matrix_op):
    """
    Calculate the DeID metric given the original and original-protected similarity matrices.
    """
    D_diag_OO = calculate_diagonal_dominance(matrix_oo)
    D_diag_OP = calculate_diagonal_dominance(matrix_op)
    
    DeID = 1 - (D_diag_OP / D_diag_OO)
    return DeID

In [54]:
# Example usage:
# Assuming utterances_original and utterances_pseudonymised are lists of feature vectors for each speaker's utterances
utterances_original = [np.random.rand(10) for _ in range(7)]  # Replace with actual feature vectors
utterances_pseudonymised = [np.random.rand(10) for _ in range(7)]  # Replace with actual feature vectors

# Calculate similarity matrices
matrix_oo = calculate_similarity_matrix(utterances_original)
matrix_op = calculate_similarity_matrix(utterances_original + utterances_pseudonymised)[len(utterances_original):, :len(utterances_original)]

# Calculate DeID
deid = calculate_deid(matrix_oo, matrix_op)
print(f"DeID: {deid:.2f}")

DeID: 0.80


In [55]:
print(utterances_original)
print(utterances_pseudonymised)

[array([0.86300385, 0.49664825, 0.34254031, 0.50664216, 0.61600229,
       0.31962294, 0.77859733, 0.85478051, 0.58500757, 0.36875667]), array([0.59362313, 0.15863392, 0.0327007 , 0.62629433, 0.51459695,
       0.66953638, 0.57666206, 0.94751394, 0.43372053, 0.58849876]), array([0.02630141, 0.37048278, 0.06060924, 0.21745853, 0.40341658,
       0.30384773, 0.41287547, 0.66203681, 0.88378035, 0.00882888]), array([0.76760253, 0.47634752, 0.78092778, 0.32448453, 0.17202781,
       0.35041646, 0.21388058, 0.42415907, 0.50396953, 0.26123198]), array([0.76093181, 0.93533426, 0.22803676, 0.97280311, 0.6884773 ,
       0.24550228, 0.38311875, 0.98911096, 0.22320754, 0.9209401 ]), array([0.00709774, 0.23047372, 0.75671617, 0.15221856, 0.37953828,
       0.84067746, 0.68651152, 0.33312033, 0.13362028, 0.35308286]), array([0.25277164, 0.46372664, 0.65470194, 0.69907428, 0.68074457,
       0.89789829, 0.38451436, 0.32678854, 0.8129246 , 0.76324025])]
[array([0.96524289, 0.20238118, 0.97399241, 0.0

In [56]:
# implemetation from https://github.com/sannawag/TD-PSOLA/blob/master/td_psola.py
def psola(signal, peaks, f_ratio):
    """
    Time-Domain Pitch Synchronous Overlap and Add
    :param signal: original time-domain signal
    :param peaks: time-domain signal peak indices
    :param f_ratio: pitch shift ratio
    :return: pitch-shifted signal
    """
    N = len(signal)
    # Interpolate
    new_signal = np.zeros(N)
    new_peaks_ref = np.linspace(0, len(peaks) - 1, int(len(peaks) * f_ratio))
    new_peaks = np.zeros(len(new_peaks_ref)).astype(int)
    
    for i in range(len(new_peaks)):
        weight = new_peaks_ref[i] % 1
        left = np.floor(new_peaks_ref[i]).astype(int)
        right = np.ceil(new_peaks_ref[i]).astype(int)
        new_peaks[i] = int(peaks[left] * (1 - weight) + peaks[right] * weight)

    # PSOLA
    for j in range(len(new_peaks)):
        # find the corresponding old peak index
        i = np.argmin(np.abs(peaks - new_peaks[j]))
        # get the distances to adjacent peaks
        P1 = [new_peaks[j] if j == 0 else new_peaks[j] - new_peaks[j-1],
              N - 1 - new_peaks[j] if j == len(new_peaks) - 1 else new_peaks[j+1] - new_peaks[j]]
        # edge case truncation
        if peaks[i] - P1[0] < 0:
            P1[0] = peaks[i]
        if peaks[i] + P1[1] > N - 1:
            P1[1] = N - 1 - peaks[i]
        # linear OLA window
        window = list(np.linspace(0, 1, P1[0] + 1)[1:]) + list(np.linspace(1, 0, P1[1] + 1)[1:])
        # center window from original signal at the new peak
        new_signal[new_peaks[j] - P1[0]: new_peaks[j] + P1[1]] += window * signal[peaks[i] - P1[0]: peaks[i] + P1[1]]
    return new_signal

def compute_periods_per_sequence(signal, sequence, min_period, max_period):
    """
    Computes periodicity of a time-domain signal using autocorrelation
    :param sequence: analysis window length in samples. Computes one periodicity value per window
    :param min_period: smallest allowed periodicity
    :param max_period: largest allowed periodicity
    :return: list of measured periods in windows across the signal
    """
    offset = 0  # current sample offset
    periods = []  # period length of each analysis sequence
    N = len(signal)
    while offset < N:
        fourier = fft(signal[offset: offset + sequence])
        fourier[0] = 0  # remove DC component
        autoc = ifft(fourier * np.conj(fourier)).real
        autoc_peak = min_period + np.argmax(autoc[min_period: max_period])
        periods.append(autoc_peak)
        offset += sequence
    return periods

def find_peaks(signal, fs, max_hz=950, min_hz=75, analysis_win_ms=40, max_change=1.005, min_change=0.995):
    """
    Find sample indices of peaks in time-domain signal
    :param max_hz: maximum measured fundamental frequency
    :param min_hz: minimum measured fundamental frequency
    :param analysis_win_ms: window size used for autocorrelation analysis
    :param max_change: restrict periodicity to not increase by more than this ratio from the mean
    :param min_change: restrict periodicity to not decrease by more than this ratio from the mean
    :return: peak indices
    """
    N = len(signal)
    min_period = fs // max_hz
    max_period = fs // min_hz

    # compute pitch periodicity
    sequence = int(analysis_win_ms / 1000 * fs)  # analysis sequence length in samples
    periods = compute_periods_per_sequence(signal, sequence, min_period, max_period)

    # simple hack to avoid octave error: assume that the pitch should not vary much, restrict range
    mean_period = np.mean(periods)
    max_period = int(mean_period * 1.1)
    min_period = int(mean_period * 0.9)
    periods = compute_periods_per_sequence(signal, sequence, min_period, max_period)

    # find the peaks
    peaks = [np.argmax(signal[:int(periods[0]*1.1)])]
    while True:
        prev = peaks[-1]
        idx = prev // sequence  # current autocorrelation analysis window
        if prev + int(periods[idx] * max_change) >= N:
            break
        # find maximum near expected location
        peaks.append(prev + int(periods[idx] * min_change) +
                np.argmax(signal[prev + int(periods[idx] * min_change): prev + int(periods[idx] * max_change)]))
    return np.array(peaks)

def shift_pitch(signal, fs, f_ratio):
    """
    Calls psola pitch shifting algorithm
    :param signal: original signal in the time-domain
    :param fs: sample rate
    :param f_ratio: ratio by which the frequency will be shifted
    :return: pitch-shifted signal
    """
    peaks = find_peaks(signal, fs)
    new_signal = psola(signal, peaks, f_ratio)
    return new_signal

In [59]:
file_path = "data/dev-clean/3752/4944/3752-4944-0001.flac"
file_path2 = "data/dev-clean/5338/24615/5338-24615-0001.flac"
file_path3 = "data/dev-clean/6241/61943/6241-61943-0001.flac"
original, rate = librosa.load(file_path, sr=16000)
# Displaying Original Audio
display(Markdown('## Original (0 to 22050 Hz)'))
display(Audio(original, rate=rate))

original2, rate = librosa.load(file_path2, sr=16000)
# Displaying Original Audio
display(Markdown('## Original 2 (0 to 22050 Hz)'))
display(Audio(original2, rate=rate))

original3, rate = librosa.load(file_path3, sr=16000)
# Displaying Original Audio
display(Markdown('## Original 3 (0 to 22050 Hz)'))
display(Audio(original3, rate=rate))

f_ratio = 1.4
p_rate = int(rate/0.8)
psola_signal = shift_pitch(original, p_rate, f_ratio)

# Display superwideband audio
display(Markdown('## TD-PSOLA (10) ** (-2 / 12))'))
display(Audio(psola_signal, rate=rate))

psola_signal2 = shift_pitch(original2, p_rate, f_ratio)

# Display superwideband audio
display(Markdown('## TD-PSOLA 2(10) ** (-2 / 12))'))
display(Audio(psola_signal2, rate=rate))

psola_signal3 = shift_pitch(original3, p_rate, f_ratio)

# Display superwideband audio
display(Markdown('## TD-PSOLA (10) ** (-2 / 12))'))
display(Audio(psola_signal3, rate=rate))



## Original (0 to 22050 Hz)

## Original 2 (0 to 22050 Hz)

## Original 3 (0 to 22050 Hz)

## TD-PSOLA (10) ** (-2 / 12))

## TD-PSOLA 2(10) ** (-2 / 12))

## TD-PSOLA (10) ** (-2 / 12))

In [60]:
def extract_mfcc(audio, n_mfcc=13):
    """
    Extract MFCC features from an audio file.
    
    Parameters:
    file_path (str): Path to the audio file.
    n_mfcc (int): Number of MFCC features to extract.
    
    Returns:
    np.ndarray: Extracted MFCC features.
    """
    #y, sr = librosa.load(file_path, sr=None)
    mfccs = librosa.feature.mfcc(y=audio, sr=16000, n_mfcc=n_mfcc)
    return np.mean(mfccs.T, axis=0)  # Average across time frames

# Example usage:
file_paths = [original, original2, original3]  # Replace with actual file paths
utterances_original = [extract_mfcc(file) for file in file_paths]

# Assuming pseudonymized files have the same naming convention with a suffix "_pseudo"
file_paths_pseudo = [psola_signal, psola_signal2, psola_signal3]  # Replace with actual file paths
utterances_pseudonymised = [extract_mfcc(file) for file in file_paths_pseudo]

# Calculate similarity matrices
matrix_oo = calculate_similarity_matrix(utterances_original)
matrix_op = calculate_similarity_matrix(utterances_original + utterances_pseudonymised)[len(utterances_original):, :len(utterances_original)]

# Calculate DeID
deid = calculate_deid(matrix_oo, matrix_op)
print(f"DeID: {deid:.2f}")


DeID: 0.16
