In [1]:
import os
import numpy as np
from numpy.fft import fft, ifft
import librosa
import IPython.display as ipd
import speech_recognition
import soundfile as sf
from io import BytesIO
import jiwer
import soundfile as sf
import sounddevice as sd
import soundfile
import wave
from IPython.display import Audio, Markdown
from collections import defaultdict

# Implement Psola

In [2]:
# implemetation from https://github.com/sannawag/TD-PSOLA/blob/master/td_psola.py
def psola(signal, peaks, f_ratio):
    """
    Time-Domain Pitch Synchronous Overlap and Add
    :param signal: original time-domain signal
    :param peaks: time-domain signal peak indices
    :param f_ratio: pitch shift ratio
    :return: pitch-shifted signal
    """
    N = len(signal)
    # Interpolate
    new_signal = np.zeros(N)
    new_peaks_ref = np.linspace(0, len(peaks) - 1, int(len(peaks) * f_ratio))
    new_peaks = np.zeros(len(new_peaks_ref)).astype(int)
    
    for i in range(len(new_peaks)):
        weight = new_peaks_ref[i] % 1
        left = np.floor(new_peaks_ref[i]).astype(int)
        right = np.ceil(new_peaks_ref[i]).astype(int)
        new_peaks[i] = int(peaks[left] * (1 - weight) + peaks[right] * weight)

    # PSOLA
    for j in range(len(new_peaks)):
        # find the corresponding old peak index
        i = np.argmin(np.abs(peaks - new_peaks[j]))
        # get the distances to adjacent peaks
        P1 = [new_peaks[j] if j == 0 else new_peaks[j] - new_peaks[j-1],
              N - 1 - new_peaks[j] if j == len(new_peaks) - 1 else new_peaks[j+1] - new_peaks[j]]
        # edge case truncation
        if peaks[i] - P1[0] < 0:
            P1[0] = peaks[i]
        if peaks[i] + P1[1] > N - 1:
            P1[1] = N - 1 - peaks[i]
        # linear OLA window
        window = list(np.linspace(0, 1, P1[0] + 1)[1:]) + list(np.linspace(1, 0, P1[1] + 1)[1:])
        # center window from original signal at the new peak
        new_signal[new_peaks[j] - P1[0]: new_peaks[j] + P1[1]] += window * signal[peaks[i] - P1[0]: peaks[i] + P1[1]]
    return new_signal

def compute_periods_per_sequence(signal, sequence, min_period, max_period):
    """
    Computes periodicity of a time-domain signal using autocorrelation
    :param sequence: analysis window length in samples. Computes one periodicity value per window
    :param min_period: smallest allowed periodicity
    :param max_period: largest allowed periodicity
    :return: list of measured periods in windows across the signal
    """
    offset = 0  # current sample offset
    periods = []  # period length of each analysis sequence
    N = len(signal)
    while offset < N:
        fourier = fft(signal[offset: offset + sequence])
        fourier[0] = 0  # remove DC component
        autoc = ifft(fourier * np.conj(fourier)).real
        if len(autoc) <= min_period:
            autoc_peak = min_period + autoc[-1]
        else:    
            autoc_peak = min_period + np.argmax(autoc[min_period: max_period])
        periods.append(autoc_peak)
        offset += sequence
    return periods

def find_peaks(signal, fs, max_hz=950, min_hz=75, analysis_win_ms=40, max_change=1.005, min_change=0.995):
    """
    Find sample indices of peaks in time-domain signal
    :param max_hz: maximum measured fundamental frequency
    :param min_hz: minimum measured fundamental frequency
    :param analysis_win_ms: window size used for autocorrelation analysis
    :param max_change: restrict periodicity to not increase by more than this ratio from the mean
    :param min_change: restrict periodicity to not decrease by more than this ratio from the mean
    :return: peak indices
    """
    N = len(signal)
    min_period = fs // max_hz
    max_period = fs // min_hz

    # compute pitch periodicity
    sequence = int(analysis_win_ms / 1000 * fs)  # analysis sequence length in samples

    periods = compute_periods_per_sequence(signal, sequence, min_period, max_period)

    # simple hack to avoid octave error: assume that the pitch should not vary much, restrict range
    mean_period = np.mean(periods)
    max_period = int(mean_period * 1.1)
    min_period = int(mean_period * 0.9)
    periods = compute_periods_per_sequence(signal, sequence, min_period, max_period)

    # find the peaks
    peaks = [np.argmax(signal[:int(periods[0]*1.1)])]
    while True:
        prev = peaks[-1]
        idx = prev // sequence  # current autocorrelation analysis window
        if prev + int(periods[idx] * max_change) >= N:
            break
        # find maximum near expected location
        peaks.append(prev + int(periods[idx] * min_change) +
                np.argmax(signal[prev + int(periods[idx] * min_change): prev + int(periods[idx] * max_change)]))
    return np.array(peaks)

def shift_pitch(signal, fs, f_ratio):
    """
    Calls psola pitch shifting algorithm
    :param signal: original signal in the time-domain
    :param fs: sample rate
    :param f_ratio: ratio by which the frequency will be shifted
    :return: pitch-shifted signal
    """
    peaks = find_peaks(signal, fs)
    new_signal = psola(signal, peaks, f_ratio)
    return new_signal

In [3]:
def add_noise(input_signal, noise_level):
    white_noise = np.random.randn(len(input_signal))
    mixed_audio = input_signal + noise_level * white_noise[:len(input_signal)]
    return mixed_audio


# Code for calculating metrics

In [4]:
def transcribe_audio(file, recognizer, language):
    # Language Code for Dutch = "nl-NL", for english use "en-US"
    
    # try:
    wav_io = BytesIO()
    sf.write(wav_io, file, 16000, format='WAV')
    wav_io.seek(0)
    
    audio_file = speech_recognition.AudioFile(wav_io)
    with audio_file as source:
            audio_data = recognizer.record(source)
    
    text = recognizer.recognize_whisper(audio_data, language)
    wav_io.close()
    if text == "":
        text = "empty"
    return text  
    # except:
    #     return "empty"
    # finally:
    #    wav_io.close()
      
       

In [5]:
def extract_mfcc(y, n_mfcc=13, rate=16000):
    """
    Extract MFCC features from an audio file.
    
    Parameters:
    file_path (str): Path to the audio file.
    n_mfcc (int): Number of MFCC features to extract.
    
    Returns:
    np.ndarray: Extracted MFCC features.
    """

    mfccs = librosa.feature.mfcc(y=y, sr=rate, n_mfcc=n_mfcc)
    return np.mean(mfccs.T, axis=0)  # Average across time frames

def aggregate_speaker_features(files):
    """
    Aggregate MFCC features for all recordings of a speaker.
    
    Parameters:
    file_paths (list): List of file paths for a single speaker's recordings.
    
    Returns:
    np.ndarray: Aggregated feature vector for the speaker.
    """
    features = [extract_mfcc(file) for file in files]
    return np.mean(features, axis=0)

def get_speaker_files(base_path):
    """
    Get a dictionary of speaker IDs and their corresponding file paths.
    
    Parameters:
    base_path (str): Base directory containing the audio files.
    
    Returns:
    dict: Dictionary with speaker IDs as keys and lists of file paths as values.
    """
    speaker_files = defaultdict(list)
    for file in os.listdir(base_path):
        if file.endswith('.wav'):
            speaker_id = file.split('-')[0]
            speaker_files[speaker_id].append(os.path.join(base_path, file))
    return speaker_files

def calculate_similarity_matrix(utterances):
    """
    Compute the voice similarity matrix for given utterances.
    
    Parameters:
    utterances (list): List of feature vectors representing each speaker.
    
    Returns:
    np.ndarray: Similarity matrix.
    """
    num_speakers = len(utterances)
    similarity_matrix = np.zeros((num_speakers, num_speakers))
    
    for i in range(num_speakers):
        for j in range(num_speakers):
            similarity_matrix[i, j] = np.dot(utterances[i], utterances[j]) / (np.linalg.norm(utterances[i]) * np.linalg.norm(utterances[j]))
    
    return similarity_matrix

def calculate_diagonal_dominance(matrix):
    """
    Calculate the diagonal dominance of a given similarity matrix.
    
    Parameters:
    matrix (np.ndarray): Similarity matrix.
    
    Returns:
    float: Diagonal dominance value.
    """
    N = matrix.shape[0]
    diag_avg = np.mean(np.diag(matrix))
    off_diag_mask = np.ones(matrix.shape, dtype=bool)
    np.fill_diagonal(off_diag_mask, 0)
    off_diag_avg = np.mean(matrix[off_diag_mask])
    
    return abs(diag_avg - off_diag_avg)

def calculate_deid(matrix_oo, matrix_op):
    """
    Calculate the DeID metric given the original and original-protected similarity matrices.
    
    Parameters:
    matrix_oo (np.ndarray): Original similarity matrix.
    matrix_op (np.ndarray): Original-protected similarity matrix.
    
    Returns:
    float: DeID metric.
    """
    D_diag_OO = calculate_diagonal_dominance(matrix_oo)
    D_diag_OP = calculate_diagonal_dominance(matrix_op)
    
    DeID = 1 - (D_diag_OP / D_diag_OO)
    return DeID

def calculate_gvd(matrix_oo, matrix_pp):
    """
    Calculate the GVD metric given the original and pseudonymised similarity matrices.
    
    Parameters:
    matrix_oo (np.ndarray): Original similarity matrix.
    matrix_pp (np.ndarray): Pseudonymised similarity matrix.
    
    Returns:
    float: GVD metric.
    """
    D_diag_OO = calculate_diagonal_dominance(matrix_oo)
    D_diag_PP = calculate_diagonal_dominance(matrix_pp)
    
    GVD = 10 * np.log10(D_diag_PP / D_diag_OO)
    return GVD


fuck


# On dev-clean dataset

In [6]:
data_folder = 'data/dev-clean-audio'
audio_dict = {}
for root, dirs, files in os.walk(data_folder):
    for file in files:
        file_path = os.path.join(root, file)
        file_path = file_path.replace('\\', '/')
        if file.endswith('.wav'):
            temp_dict = {}
            original, rate = librosa.load(file_path, sr=16000)
            temp_dict['original'] = original
            f_ratio = 1.3
            psola_audio = shift_pitch(original, rate, f_ratio)
            temp_dict['psola'] = psola_audio
            noise_audio = add_noise(psola_audio, 0.001)
            temp_dict['noise'] = noise_audio
            key = file.split('.')[0]
            audio_dict[key] = temp_dict
        elif file.endswith('.txt'):
            with open(file_path, 'r') as f:
                for line in f:
                    key, sentence = line.split(' ', 1)  # Split on the first space
                    key = key.strip()
                    sentence = sentence.strip()
                    audio_dict[key]['transcript'] = sentence

            

In [7]:
clean_wer = []
psola_wer = []
noise_wer = []

recognizer = speech_recognition.Recognizer()


for key in audio_dict.keys():
    transcript = audio_dict[key]['transcript'].lower()
    
    # clean
    text = transcribe_audio(audio_dict[key]['original'], recognizer, 'tiny')
    clean_wer.append(jiwer.wer(text.lower(), transcript))
    
    # psola
    text = transcribe_audio(audio_dict[key]['psola'], recognizer, 'tiny')
    psola_wer.append(jiwer.wer(text.lower(), transcript))
    
    # noise
    text = transcribe_audio(audio_dict[key]['noise'], recognizer, 'tiny')
    noise_wer.append(jiwer.wer(text.lower(), transcript))

In [8]:
print("Before: ", sum(clean_wer) / len(clean_wer))
print("PSOLA: ", sum(psola_wer) / len(psola_wer))
print("PSOLA + noise: ", sum(noise_wer) / len(noise_wer))

Before:  0.23365612442399394
PSOLA:  0.2508304506804439
PSOLA + noise:  0.25760155178128086


In [9]:
speaker_files = defaultdict(list)
speaker_files_psola = defaultdict(list)
speaker_files_noise = defaultdict(list)
for key, items in audio_dict.items():
   speaker_id = key.split('-')[0] 
   speaker_files[speaker_id].append(items['original'])
   speaker_files_psola[speaker_id].append(items['psola'])
   speaker_files_noise[speaker_id].append(items['noise'])

utterances_original = [aggregate_speaker_features(paths) for speaker_id, paths in speaker_files.items()]
utterances_psola = [aggregate_speaker_features(paths) for speaker_id, paths in speaker_files_psola.items()]
utterances_noise = [aggregate_speaker_features(paths) for speaker_id, paths in speaker_files_noise.items()]

# Psola
matrix_oo_psola = calculate_similarity_matrix(utterances_original)
matrix_op_psola = calculate_similarity_matrix(utterances_original + utterances_psola)[len(utterances_original):, :len(utterances_original)]
matrix_pp_psola = calculate_similarity_matrix(utterances_psola)

# Calculate DeID
deid_psola = calculate_deid(matrix_oo_psola, matrix_op_psola)
print(f"DeID psola: {deid_psola:.2f}")

# Calculate GVD
gvd_psola = calculate_gvd(matrix_oo_psola, matrix_pp_psola)
print(f"GVD psola: {gvd_psola:.2f}")


# Psola + Noise
matrix_oo_noise = calculate_similarity_matrix(utterances_original)
matrix_op_noise = calculate_similarity_matrix(utterances_original + utterances_noise)[len(utterances_original):, :len(utterances_original)]
matrix_pp_noise = calculate_similarity_matrix(utterances_noise)

# Calculate DeID
deid_noise = calculate_deid(matrix_oo_noise, matrix_op_noise)
print(f"DeID noise: {deid_noise:.2f}")

# Calculate GVD
gvd_noise = calculate_gvd(matrix_oo_noise, matrix_pp_noise)
print(f"GVD noise: {gvd_noise:.2f}")

DeID psola: 0.13
GVD psola: -0.93
DeID noise: 0.18
GVD noise: -1.14


# On dutch dataset

In [10]:
data_folder = 'data/nl'
audio_dict_NL = {}
for root, dirs, files in os.walk(data_folder):
    for file in files:
        file_path = os.path.join(root, file)
        file_path = file_path.replace('\\', '/')
        if file.endswith('.wav'):
            temp_dict = {}
            original, rate = librosa.load(file_path, sr=16000)
            temp_dict['original'] = original
            f_ratio = 1.3
            psola_audio = shift_pitch(original, rate, f_ratio)
            temp_dict['psola'] = psola_audio
            noise_audio = add_noise(psola_audio, 0.001)
            temp_dict['noise'] = noise_audio
            key = file.split('.')[0]
            audio_dict_NL[key] = temp_dict
        elif file.endswith('.txt'):
            with open(file_path, 'r') as f:
                for line in f:
                    key, sentence = line.split(' ', 1)  # Split on the first space
                    key = key.strip()
                    sentence = sentence.strip()
                    audio_dict_NL[key]['transcript'] = sentence

In [11]:
clean_wer_NL = []
psola_wer_NL = []
noise_wer_NL = []

recognizer = speech_recognition.Recognizer()


for key in audio_dict_NL.keys():
    transcript = audio_dict_NL[key]['transcript'].lower()
    
    # clean
    text = transcribe_audio(audio_dict_NL[key]['original'], recognizer, 'tiny')
    clean_wer_NL.append(jiwer.wer(text.lower(), transcript))
    
    # psola
    text = transcribe_audio(audio_dict_NL[key]['psola'], recognizer, 'tiny')
    psola_wer_NL.append(jiwer.wer(text.lower(), transcript))
    
    # noise
    text = transcribe_audio(audio_dict_NL[key]['noise'], recognizer, 'tiny')
    noise_wer_NL.append(jiwer.wer(text.lower(), transcript))

In [12]:
print("Before: ", sum(clean_wer_NL) / len(clean_wer_NL))
print("PSOLA: ", sum(psola_wer_NL) / len(psola_wer_NL))
print("PSOLA + noise: ", sum(noise_wer_NL) / len(noise_wer_NL))

Before:  0.49291687579217325
PSOLA:  0.5338737449221851
PSOLA + noise:  0.5431410591421738


In [13]:
speaker_files_NL = defaultdict(list)
speaker_files_psola_NL = defaultdict(list)
speaker_files_noise_NL = defaultdict(list)

with open('files_for_harm/spk2utt.txt', 'r') as f:
    for line in f:
        parts = line.split(' ')
        audios_clean = []
        audios_psola = []
        audios_noise = []
        for i in parts[1:]:
            i = i.strip()
            audios_clean.append(audio_dict_NL[i]['original'])
            audios_psola.append(audio_dict_NL[i]['psola'])
            audios_noise.append(audio_dict_NL[i]['noise'])
        speaker_files_NL[parts[0]] = audios_clean
        speaker_files_psola_NL[parts[0]] = audios_psola
        speaker_files_noise_NL[parts[0]] = audios_noise
        
utterances_original_NL = [aggregate_speaker_features(paths) for speaker_id, paths in speaker_files_NL.items()]
utterances_psola_NL = [aggregate_speaker_features(paths) for speaker_id, paths in speaker_files_psola_NL.items()]
utterances_noise_NL = [aggregate_speaker_features(paths) for speaker_id, paths in speaker_files_noise_NL.items()]

# Psola
matrix_oo_psola_NL = calculate_similarity_matrix(utterances_original_NL)
matrix_op_psola_NL = calculate_similarity_matrix(utterances_original_NL + utterances_psola_NL)[len(utterances_original_NL):, :len(utterances_original_NL)]
matrix_pp_psola_NL = calculate_similarity_matrix(utterances_psola_NL)

# Calculate DeID
deid_psola_NL = calculate_deid(matrix_oo_psola_NL, matrix_op_psola_NL)
print(f"DeID psola: {deid_psola_NL:.2f}")

# Calculate GVD
gvd_psola_NL = calculate_gvd(matrix_oo_psola_NL, matrix_pp_psola_NL)
print(f"GVD psola: {gvd_psola_NL:.2f}")


# Psola + Noise
matrix_oo_noise_NL = calculate_similarity_matrix(utterances_original_NL)
matrix_op_noise_NL = calculate_similarity_matrix(utterances_original_NL + utterances_noise_NL)[len(utterances_original_NL):, :len(utterances_original_NL)]
matrix_pp_noise_NL = calculate_similarity_matrix(utterances_noise_NL)

# Calculate DeID
deid_noise_NL = calculate_deid(matrix_oo_noise_NL, matrix_op_noise_NL)
print(f"DeID noise: {deid_noise_NL:.2f}")

# Calculate GVD
gvd_noise_NL = calculate_gvd(matrix_oo_noise_NL, matrix_pp_noise_NL)
print(f"GVD noise: {gvd_noise_NL:.2f}")

DeID psola: 0.11
GVD psola: -0.84
DeID noise: 0.13
GVD noise: -0.67


# on VCTK dataset

In [14]:
data_folder = 'data/wav_vctk'
audio_dict_VCTK = {}
for root, dirs, files in os.walk(data_folder):
    for file in files:
        file_path = os.path.join(root, file)
        file_path = file_path.replace('\\', '/')
        if file.endswith('.wav'):
            temp_dict = {}
            original, rate = librosa.load(file_path, sr=16000)
            temp_dict['original'] = original
            f_ratio = 1.3
            psola_audio = shift_pitch(original, rate, f_ratio)
            temp_dict['psola'] = psola_audio
            noise_audio = add_noise(psola_audio, 0.001)
            temp_dict['noise'] = noise_audio
            parts = file.split('_')
            key = '_'.join(parts[:2])
            audio_dict_VCTK[key] = temp_dict

transcripts_path = "data/wav_vctk/transcripts.txt"
with open(transcripts_path, 'r') as f:
    for line in f:
        key, sentence = line.split(' ', 1)  # Split on the first space
        key = key.strip()
        sentence = sentence.strip()
        audio_dict_VCTK[key]['transcript'] = sentence
            

In [15]:
clean_wer_VCTK = []
psola_wer_VCTK = []
noise_wer_VCTK = []

recognizer = speech_recognition.Recognizer()


for key in audio_dict_VCTK.keys():
    transcript = audio_dict_VCTK[key]['transcript'].lower()
    
    # clean
    text = transcribe_audio(audio_dict_VCTK[key]['original'], recognizer, 'tiny')
    if text != "empty":
        clean_wer_VCTK.append(jiwer.wer(text.lower(), transcript))
    
    # psola
    
    text = transcribe_audio(audio_dict_VCTK[key]['psola'], recognizer, 'tiny')
    
    if text != "empty":
        psola_wer_VCTK.append(jiwer.wer(text.lower(), transcript))
    
    # noise
    text = transcribe_audio(audio_dict_VCTK[key]['noise'], recognizer, 'tiny')
    
    if text != "empty":
        noise_wer_VCTK.append(jiwer.wer(text.lower(), transcript))

In [16]:
print("Before: ", sum(clean_wer_VCTK) / len(clean_wer_VCTK))
print("PSOLA: ", sum(psola_wer_VCTK) / len(psola_wer_VCTK))
print("PSOLA + noise: ", sum(noise_wer_VCTK) / len(noise_wer_VCTK))

Before:  0.14842702731034715
PSOLA:  0.17863867799899005
PSOLA + noise:  0.18156435380337635


In [17]:
speaker_files_VCTK = defaultdict(list)
speaker_files_psola_VCTK = defaultdict(list)
speaker_files_noise_VCTK = defaultdict(list)
for key, items in audio_dict_VCTK.items():
   speaker_id = key.split('_')[0] 
   speaker_files_VCTK[speaker_id].append(items['original'])
   speaker_files_psola_VCTK[speaker_id].append(items['psola'])
   speaker_files_noise_VCTK[speaker_id].append(items['noise'])

utterances_original_VCTK = [aggregate_speaker_features(paths) for speaker_id, paths in speaker_files_VCTK.items()]
utterances_psola_VCTK = [aggregate_speaker_features(paths) for speaker_id, paths in speaker_files_psola_VCTK.items()]
utterances_noise_VCTK = [aggregate_speaker_features(paths) for speaker_id, paths in speaker_files_noise_VCTK.items()]

# Psola
matrix_oo_psola_VCTK = calculate_similarity_matrix(utterances_original_VCTK)
matrix_op_psola_VCTK = calculate_similarity_matrix(utterances_original_VCTK + utterances_psola_VCTK)[len(utterances_original_VCTK):, :len(utterances_original_VCTK)]
matrix_pp_psola_VCTK = calculate_similarity_matrix(utterances_psola_VCTK)

# Calculate DeID
deid_psola_VCTK = calculate_deid(matrix_oo_psola_VCTK, matrix_op_psola_VCTK)
print(f"DeID psola: {deid_psola_VCTK:.2f}")

# Calculate GVD
gvd_psola_VCTK = calculate_gvd(matrix_oo_psola_VCTK, matrix_pp_psola_VCTK)
print(f"GVD psola: {gvd_psola_VCTK:.2f}")


# Psola + Noise
matrix_oo_noise_VCTK = calculate_similarity_matrix(utterances_original_VCTK)
matrix_op_noise_VCTK = calculate_similarity_matrix(utterances_original_VCTK + utterances_noise_VCTK)[len(utterances_original_VCTK):, :len(utterances_original_VCTK)]
matrix_pp_noise_VCTK = calculate_similarity_matrix(utterances_noise_VCTK)

# Calculate DeID
deid_noise_VCTK = calculate_deid(matrix_oo_noise_VCTK, matrix_op_noise_VCTK)
print(f"DeID noise: {deid_noise_VCTK:.2f}")

# Calculate GVD
gvd_noise_VCTK = calculate_gvd(matrix_oo_noise_VCTK, matrix_pp_noise_VCTK)
print(f"GVD noise: {gvd_noise_VCTK:.2f}")

DeID psola: 0.21
GVD psola: -1.31
DeID noise: 0.25
GVD noise: -1.15
