In [None]:
import os
import numpy as np
from numpy.fft import fft, ifft
import librosa
import IPython.display as ipd

import speech_recognition as sr
import soundfile as sf
from io import BytesIO
import jiwer

# Implement PSOLA

In [None]:
# Implementation from https://github.com/sannawag/TD-PSOLA/blob/master/td_psola.py
def psola(signal, peaks, f_ratio):
    """
    Pitch-shift a signal with Time-Domain Pitch Synchronous Overlap and Add.

    The peak train is resampled to ``int(len(peaks) * f_ratio)`` positions, then
    a linearly windowed grain centred on the nearest original peak is
    overlap-added at every resampled position.
    :param signal: original time-domain signal
    :param peaks: array of peak indices into ``signal``
    :param f_ratio: pitch shift ratio
    :return: pitch-shifted signal with the same number of samples as ``signal``
    """
    n_samples = len(signal)
    last_sample = n_samples - 1
    output = np.zeros(n_samples)

    # Resample the peak positions: each new peak sits at a fractional index
    # into the original peak list and is linearly interpolated between the
    # two neighbouring original peaks.
    n_new = int(len(peaks) * f_ratio)
    fractional_idx = np.linspace(0, len(peaks) - 1, n_new)
    new_peaks = np.zeros(n_new).astype(int)
    for k, ref in enumerate(fractional_idx):
        frac = ref % 1
        lower = int(np.floor(ref))
        upper = int(np.ceil(ref))
        new_peaks[k] = int(peaks[lower] * (1 - frac) + peaks[upper] * frac)

    # Overlap-add one grain per new peak.
    count = len(new_peaks)
    for j in range(count):
        target = new_peaks[j]
        # original peak closest to the new peak position
        source = peaks[np.argmin(np.abs(peaks - target))]

        # grain half-widths: distance to the previous / next new peak, or to
        # the signal boundary for the first / last grain
        if j == 0:
            left = target
        else:
            left = target - new_peaks[j - 1]
        if j == count - 1:
            right = last_sample - target
        else:
            right = new_peaks[j + 1] - target

        # shrink the grain so the read from `signal` stays inside the signal
        if source - left < 0:
            left = source
        if source + right > last_sample:
            right = last_sample - source

        # linear ramp up to 1 over `left` samples, then down to 0 over `right`
        ramp_up = np.linspace(0, 1, left + 1)[1:]
        ramp_down = np.linspace(1, 0, right + 1)[1:]
        window = [*ramp_up, *ramp_down]

        # place the windowed original grain around the new peak
        grain = signal[source - left: source + right]
        output[target - left: target + right] += window * grain
    return output

def compute_periods_per_sequence(signal, sequence, min_period, max_period):
    """
    Computes periodicity of a time-domain signal using autocorrelation
    :param signal: time-domain signal to analyse
    :param sequence: analysis window length in samples. Computes one periodicity value per window
    :param min_period: smallest allowed periodicity
    :param max_period: largest allowed periodicity (exclusive)
    :return: list of measured periods in windows across the signal
    :raises ValueError: if ``sequence`` is not positive
    """
    if sequence <= 0:
        # A non-positive step would never advance through the signal.
        raise ValueError("sequence must be a positive number of samples")

    periods = []  # period length of each analysis sequence
    for offset in range(0, len(signal), sequence):
        spectrum = fft(signal[offset: offset + sequence])
        spectrum[0] = 0  # remove DC component
        # circular autocorrelation via the power spectrum
        autoc = ifft(spectrum * np.conj(spectrum)).real

        candidates = autoc[min_period: max_period]
        if candidates.size:
            period = min_period + np.argmax(candidates)
        else:
            # The window (typically the short trailing one) has no lag inside
            # [min_period, max_period). Reuse the previous window's period so
            # the result stays a valid lag; fall back to min_period when there
            # is no previous window.
            period = periods[-1] if periods else min_period
        periods.append(period)
    return periods

def find_peaks(signal, fs, max_hz=950, min_hz=75, analysis_win_ms=40, max_change=1.005, min_change=0.995):
    """
    Locate the sample indices of successive pitch peaks in a time-domain signal.

    The period is estimated per analysis window in two passes: first over the
    full ``[min_hz, max_hz]`` range, then restricted to +/-10% of the mean
    period found in the first pass. Peaks are then tracked from the start of
    the signal, each one searched for roughly one period after the previous.
    :param signal: time-domain signal
    :param fs: sample rate in Hz
    :param max_hz: maximum measured fundamental frequency
    :param min_hz: minimum measured fundamental frequency
    :param analysis_win_ms: window size used for autocorrelation analysis
    :param max_change: upper bound of the search range, as a ratio of the window's period
    :param min_change: lower bound of the search range, as a ratio of the window's period
    :return: array of peak indices
    """
    total = len(signal)
    # analysis sequence length in samples
    sequence = int(analysis_win_ms / 1000 * fs)

    # first pass: any period allowed by the frequency limits
    coarse_periods = compute_periods_per_sequence(signal, sequence, fs // max_hz, fs // min_hz)

    # second pass: simple hack against octave errors - assume the pitch does
    # not vary much and only accept periods close to the first-pass mean
    mean_period = np.mean(coarse_periods)
    periods = compute_periods_per_sequence(
        signal, sequence, int(mean_period * 0.9), int(mean_period * 1.1))

    # the first peak is the maximum within slightly more than one period
    peaks = [np.argmax(signal[:int(periods[0] * 1.1)])]
    while True:
        last = peaks[-1]
        # period of the analysis window the last peak falls in
        period = periods[last // sequence]
        search_end = last + int(period * max_change)
        if search_end >= total:
            break
        # next peak: maximum near the expected location
        search_start = last + int(period * min_change)
        peaks.append(search_start + np.argmax(signal[search_start:search_end]))
    return np.array(peaks)

def shift_pitch(signal, fs, f_ratio):
    """
    Pitch-shift a signal: detect its pitch peaks, then run PSOLA on them.
    :param signal: original signal in the time-domain
    :param fs: sample rate
    :param f_ratio: ratio by which the frequency will be shifted
    :return: pitch-shifted signal
    """
    pitch_marks = find_peaks(signal, fs)
    return psola(signal, pitch_marks, f_ratio)

In [None]:
def add_noise(input_signal, noise_level, rng=None):
    """
    Mix standard-normal white noise into a signal.
    :param input_signal: one-dimensional signal to perturb
    :param noise_level: factor the unit-variance noise is multiplied by before mixing
    :param rng: optional NumPy random generator (anything with
        ``standard_normal(size)``) for reproducible noise; when omitted the
        global ``np.random`` state is used
    :return: ``input_signal`` plus the scaled noise
    """
    n_samples = len(input_signal)
    if rng is None:
        white_noise = np.random.randn(n_samples)
    else:
        white_noise = rng.standard_normal(n_samples)
    return input_signal + noise_level * white_noise


# Code for calculating metrics

In [None]:
def transcribe_audio(file, recognizer, language, sample_rate=16000):
    """
    Transcribe an in-memory audio signal with Whisper through ``speech_recognition``.

    The samples are written to an in-memory WAV, read back as
    ``speech_recognition`` audio data and passed to ``recognize_whisper``.
    :param file: audio samples to transcribe (the signal itself, not a path)
    :param recognizer: ``sr.Recognizer`` instance used to read and recognise the audio
    :param language: forwarded unchanged as the second positional argument of
        ``recognize_whisper``. NOTE(review): the calls in this notebook pass
        ``'small'``, and in SpeechRecognition that positional slot is the
        Whisper model name, so this value presumably selects the model rather
        than the spoken language - confirm against the installed version.
    :param sample_rate: sample rate, in Hz, written into the WAV header
    :return: recognised text
    """
    # The context manager closes the buffer even if writing, reading or
    # recognition raises.
    with BytesIO() as wav_io:
        sf.write(wav_io, file, sample_rate, format='WAV')
        wav_io.seek(0)

        with sr.AudioFile(wav_io) as source:
            audio_data = recognizer.record(source)

        return recognizer.recognize_whisper(audio_data, language)
      
       

# On dev-clean dataset

In [None]:
# Build audio_dict: utterance id -> {'original', 'psola', 'noise', 'transcript'}.
data_folder = 'data/dev-clean-audio'
audio_dict = {}
for root, dirs, files in os.walk(data_folder):
    for file in files:
        file_path = os.path.join(root, file).replace('\\', '/')
        if file.endswith('.flac'):
            original, rate = librosa.load(file_path, sr=16000)
            f_ratio = 1.3
            psola_audio = shift_pitch(original, rate, f_ratio)
            noise_audio = add_noise(psola_audio, 0.001)
            key = file.split('.')[0]
            # setdefault keeps a transcript that was read before the audio
            audio_dict.setdefault(key, {}).update(
                original=original, psola=psola_audio, noise=noise_audio)
        elif file.endswith('.txt'):
            with open(file_path, 'r') as f:
                for line in f:
                    if not line.strip():
                        continue  # skip blank lines
                    # "<utterance id> <sentence>": split on the first space
                    key, _, sentence = line.strip().partition(' ')
                    # os.walk gives no ordering guarantee, so the transcript
                    # file may be visited before the audio it describes
                    audio_dict.setdefault(key.strip(), {})['transcript'] = sentence.strip()

            

In [None]:
# Listen to one utterance in its three variants and show its reference text.
sample = audio_dict['1272-128104-0013']
original = sample['original']
# Deliberately not named `psola`: that would rebind the global psola()
# function that shift_pitch() calls, breaking every later pitch shift.
psola_sample = sample['psola']
noise = sample['noise']

ipd.display(ipd.HTML('Original'))
ipd.display(ipd.Audio(original, rate=rate))

ipd.display(ipd.HTML('Anonymized'))
ipd.display(ipd.Audio(psola_sample, rate=rate))

ipd.display(ipd.HTML('with noise'))
ipd.display(ipd.Audio(noise, rate=rate))


print(sample['transcript'])

In [None]:
# Word error rate per utterance for the original, pitch-shifted and
# pitch-shifted + noise audio of the dev-clean set.
clean_wer = []
psola_wer = []
noise_wer = []

recognizer = sr.Recognizer()

# audio variant -> list collecting its per-utterance WER
wer_by_variant = {'original': clean_wer, 'psola': psola_wer, 'noise': noise_wer}

for key, entry in audio_dict.items():
    transcript = entry['transcript'].lower()

    for variant, scores in wer_by_variant.items():
        text = transcribe_audio(entry[variant], recognizer, 'small')
        # jiwer.wer takes the reference first, then the hypothesis; the error
        # count is normalised by the reference length, so the order matters
        scores.append(jiwer.wer(transcript, text.lower()))

In [None]:
# Mean word error rate of each audio variant.
for label, scores in (("Before: ", clean_wer),
                      ("PSOLA: ", psola_wer),
                      ("PSOLA + noise: ", noise_wer)):
    print(label, sum(scores) / len(scores))

# On Dutch dataset

In [None]:
# Build audio_dict_NL: utterance id -> {'original', 'psola', 'noise', 'transcript'}.
data_folder = 'data/nl'
audio_dict_NL = {}
for root, dirs, files in os.walk(data_folder):
    for file in files:
        file_path = os.path.join(root, file).replace('\\', '/')
        if file.endswith('.wav'):
            original, rate = librosa.load(file_path, sr=16000)
            f_ratio = 1.3
            psola_audio = shift_pitch(original, rate, f_ratio)
            noise_audio = add_noise(psola_audio, 0.001)
            key = file.split('.')[0]
            # setdefault keeps a transcript that was read before the audio
            audio_dict_NL.setdefault(key, {}).update(
                original=original, psola=psola_audio, noise=noise_audio)
        elif file.endswith('.txt'):
            with open(file_path, 'r') as f:
                for line in f:
                    if not line.strip():
                        continue  # skip blank lines
                    # "<utterance id> <sentence>": split on the first space
                    key, _, sentence = line.strip().partition(' ')
                    # os.walk gives no ordering guarantee, so the transcript
                    # file may be visited before the audio it describes
                    audio_dict_NL.setdefault(key.strip(), {})['transcript'] = sentence.strip()

In [None]:
# Listen to one Dutch utterance in its three variants and show its reference text.
sample = audio_dict_NL['fn001601']
original = sample['original']
# Deliberately not named `psola`: that would rebind the global psola()
# function that shift_pitch() calls.
psola_sample = sample['psola']
noise = sample['noise']

ipd.display(ipd.HTML('Original'))
ipd.display(ipd.Audio(original, rate=rate))

ipd.display(ipd.HTML('Anonymized'))
ipd.display(ipd.Audio(psola_sample, rate=rate))

ipd.display(ipd.HTML('with noise'))
ipd.display(ipd.Audio(noise, rate=rate))


print(sample['transcript'])

In [None]:
# Transcribe the three variants of a single Dutch utterance for a quick
# side-by-side check against its reference transcript.
key = 'fn001601'
recognizer = sr.Recognizer()
sample = audio_dict_NL[key]

ipd.display(ipd.HTML('Original'))
# Read the audio through `key` instead of a variable left over from an
# earlier cell, so the player always matches the utterance being transcribed.
ipd.display(ipd.Audio(sample['original'], rate=rate))
transcript = sample['transcript'].lower()
print("reference = ", transcript)

for label, variant in (("clean", 'original'), ("psola", 'psola'), ("noise", 'noise')):
    text = transcribe_audio(sample[variant], recognizer, 'small')
    print(label + " = ", text)

In [None]:
# Word error rate per utterance for the original, pitch-shifted and
# pitch-shifted + noise audio of the Dutch set.
clean_wer = []
psola_wer = []
noise_wer = []

recognizer = sr.Recognizer()

# audio variant -> list collecting its per-utterance WER
wer_by_variant = {'original': clean_wer, 'psola': psola_wer, 'noise': noise_wer}

# Iterate the Dutch dictionary itself: its keys and transcripts, not those of
# the English audio_dict.
for key, entry in audio_dict_NL.items():
    transcript = entry['transcript'].lower()

    for variant, scores in wer_by_variant.items():
        text = transcribe_audio(entry[variant], recognizer, 'small')
        # jiwer.wer takes the reference first, then the hypothesis; the error
        # count is normalised by the reference length, so the order matters
        scores.append(jiwer.wer(transcript, text.lower()))