In [2]:
pip install fastdtw pesq

Collecting fastdtw
  Downloading fastdtw-0.3.4.tar.gz (133 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.4/133.4 KB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m[31m1.4 MB/s[0m eta [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: fastdtw
  Building wheel for fastdtw (setup.py) ... [?25ldone
[?25h  Created wheel for fastdtw: filename=fastdtw-0.3.4-py3-none-any.whl size=3582 sha256=9fac012c9286f7d2a44dee1654ad1f80ca591576fc418f5ccdad9fc1592b24a2
  Stored in directory: /home/lindel/.cache/pip/wheels/73/c8/f7/c25448dab74c3acf4848bc25d513c736bb93910277e1528ef4
Successfully built fastdtw
Installing collected packages: fastdtw
Successfully installed fastdtw-0.3.4
Note: you may need to restart the kernel to use updated packages.


In [4]:
import sys
sys.path.append('/home/lindel/diploma')

In [5]:
import numpy as np
from copy import deepcopy
import librosa
import pandas as pd
import os
import librosa
import soundfile as sf
import pesq
from tqdm import trange
from IPython.display import Audio
from fastdtw import fastdtw


from utils.audio_funcs import audioread, audiowrite, snr_mixer, audio_normalization, align_audio

In [6]:
class SpeechTranslatedDataset:
    def __init__(self, speech_root, translated_root, sample_rate=48000):
        self.speech_root = speech_root
        self.translated_root = translated_root
        self.len_speech_dir = self.__get_len_dir(self.speech_root)
        self.sample_rate = sample_rate

    def __getitem__(self, index):
        if index >= self.len_speech_dir:
            raise IndexError('Index out of range')

        # reading speech wav by index
        speech_path = f'{self.speech_root}/speech_{index}.wav'
        speech, sr = audioread(speech_path, sr=self.sample_rate)
        
        translated_path = f'{self.translated_root}/speech_{index}.wav'
        translated, sr = audioread(translated_path, sr=self.sample_rate)
        
        return speech, translated, speech_path, translated_path
        
    def __len__(self):
        return self.len_speech_dir

    def __get_len_dir(self, dir_path):
        return len(
            [name for name in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, name))]
        )

In [7]:
speech_root = "/kaggle/input/speech-translated/speech"
translated_root = "/kaggle/input/speech-translated/translated_speech"
sample_rate = 48000

speech_translated_ds = SpeechTranslatedDataset(speech_root, translated_root, sample_rate)

In [8]:
def compute_mcc(audio, n_mfcc=13, sr=48000):
    # Load audio file

    # Extract MFCCs
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)

    return mfcc

def compute_mcd(reference_mcc, degraded_mcc):
    # Align the MFCCs using dynamic time warping
    _, path = fastdtw(reference_mcc.T, degraded_mcc.T)
    
    # Extract the aligned frames
    aligned_reference_mcc = reference_mcc[:, [idx[0] for idx in path]]
    aligned_degraded_mcc = degraded_mcc[:, [idx[1] for idx in path]]

    # Compute the Euclidean distance between aligned frames
    mcd = np.mean(np.sqrt(np.sum((aligned_reference_mcc - aligned_degraded_mcc) ** 2, axis=0)))

    return mcd

# Stretching

In [9]:
orig_longer = 0
columns = ["translated_pesq", "modified_translated_pesq", "translated_mfcc", "modified_translated_mfcc"]
stretchin_data = []
compressing_data = []

for i in trange(len(speech_translated_ds)):
    speech_audio, translated_audio, speech_path, translated_path = speech_translated_ds[i]
    orig_len = len(speech_audio) / sample_rate
    trans_len = len(translated_audio) / sample_rate
    need_to_stretch = translated_audio
    time_stretch_factor = trans_len / orig_len
    modified_translated_audio = librosa.effects.time_stretch(need_to_stretch, rate=time_stretch_factor)
    
    # computing for original translation pesq
    original_pesq_score = pesq.pesq(16000, speech_audio, translated_audio)
    
    # computing for original translation mfcc
    reference_mfcc = compute_mcc(speech_audio)
    degraded_mfcc = compute_mcc(translated_audio)
    # Compute MCD
    original_mcd_score = compute_mcd(reference_mfcc, degraded_mfcc)
    
    # computing for modified translation pesq
    modified_pesq_score = pesq.pesq(16000, speech_audio, modified_translated_audio)
    
    # computing for modified translation mfcc
    reference_mfcc = compute_mcc(speech_audio)
    degraded_mfcc = compute_mcc(modified_translated_audio)
    # Compute MCD
    modified_mcd_score = compute_mcd(reference_mfcc, degraded_mfcc)
    
    metrics = dict(zip(columns, [original_pesq_score, modified_pesq_score, original_mcd_score, modified_mcd_score]))
    
    if time_stretch_factor < 1:
        stretchin_data.append(metrics)
    else:
        compressing_data.append(metrics)

100%|██████████| 201/201 [08:18<00:00,  2.48s/it]


In [12]:
compressing_df = pd.DataFrame(data=compressing_data, columns=columns)
stretching_df = pd.DataFrame(data=stretchin_data, columns=columns)

In [14]:
compressing_df.to_csv("compressing.csv", index=False)
stretching_df.to_csv("stretching.csv", index=False)

In [24]:
grouping_columns = ["avg_trans_pesq", "avg_modified_translated_pesq", "avg_translated_mfcc", "modified_translated_mfcc"]
avg_metrics = {}
for df_column, grouping_column in zip(stretching_df.columns, grouping_columns):
    avg_metrics[grouping_column] = stretching_df[df_column].mean()
avg_stretching_metrics_list = [avg_metrics]
grouping_stretching_df = pd.DataFrame(data=avg_stretching_metrics_list, columns=grouping_columns)
grouping_stretching_df.to_csv("grouping_stretching.csv", index=False)

In [23]:
grouping_columns = ["avg_trans_pesq", "avg_modified_translated_pesq", "avg_translated_mfcc", "modified_translated_mfcc"]
avg_metrics = {}
for df_column, grouping_column in zip(compressing_df.columns, grouping_columns):
    avg_metrics[grouping_column] = compressing_df[df_column].mean()
# grouping_stretching_df = pd.DataFrame(data=avg_metrics, columns=grouping_columns, index=0)
avg_compressing_metrics_list = [avg_metrics]
grouping_compressing_df = pd.DataFrame(data=avg_compressing_metrics_list, columns=grouping_columns)
grouping_compressing_df.to_csv("grouping_compressing.csv", index=False)

In [45]:
Audio(speech_audio, rate=sample_rate)

In [85]:
time_stretch_factor = trans_len / orig_len

stretched_audio = librosa.effects.time_stretch(translated_audio, rate=time_stretch_factor)

Audio(stretched_audio, rate=sample_rate)

In [73]:


pesq_score = pesq.pesq(16000, speech_audio, stretched_audio)

print("PESQ Score:", pesq_score)

PESQ Score: 1.9112250804901123


In [75]:
time_stretch_factor = orig_len / trans_len 

stretched_audio = librosa.effects.time_stretch(speech_audio, rate=time_stretch_factor)

Audio(stretched_audio, rate=sample_rate)

In [76]:
import pesq

pesq_score = pesq.pesq(16000, translated_audio, stretched_audio)

print("PESQ Score:", pesq_score)

PESQ Score: 1.0212228298187256


MCD Score: 125.277885


# Silence

In [112]:
silence_duration = round(trans_len - orig_len, 5) / 2  # 2 seconds of silence

# Calculate the number of samples for the silence
silence_samples = int(silence_duration * sample_rate)

silence = np.zeros(silence_samples)

audio_with_silence = np.concatenate((silence, speech_audio, silence))

In [113]:
len(audio_with_silence) / sample_rate

3.573958333333333

In [114]:
audiowrite(audio_with_silence, sample_rate, "/kaggle/working/silence/test.wav")

'done'