In [1]:
%load_ext autoreload

In [16]:
import sys
sys.path.append('/home/lindel/diploma')

In [25]:
%aimport utils.audio_funcs

In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys
import io
import librosa
import select
from pathlib import Path
from shutil import rmtree
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.decomposition import FastICA
from tqdm import trange
from IPython.display import Audio
import subprocess
from typing import Optional, Union, Dict, Tuple, IO


from utils.audio_funcs import audioread, audiowrite, snr_mixer, audio_normalization, align_audio

In [23]:
# Directory the kernel was started in; the data and metrics paths below are built relative to it.
BASEPATH = os.getcwd()
BASEPATH

'/home/lindel/diploma/signal_noise'

In [None]:
class SpeechNoiseDataset:
    """Index-addressable pairing of speech clips with randomly chosen noise clips.

    Files are looked up as ``speech_<i>.wav`` under ``speech_root`` and
    ``noise_<j>.wav`` under ``noise_root``. The noise index drawn for a speech
    index is remembered, so repeated access to the same index yields the same
    speech/noise pair.
    """

    def __init__(self, speech_root, noise_root, sample_rate=48000):
        """Count the files in both directories and store the read settings.

        Args:
            speech_root: directory containing the ``speech_<i>.wav`` files.
            noise_root: directory containing the ``noise_<j>.wav`` files.
            sample_rate: value forwarded to ``audioread`` as ``sr``.
        """
        self.noise_root = noise_root
        self.speech_root = speech_root
        self.len_speech_dir = self.__get_len_dir(self.speech_root)
        self.len_noise_dir = self.__get_len_dir(self.noise_root)
        # speech index -> noise index chosen on first access
        self.signal_noise = {}
        self.sample_rate = sample_rate

    def __getitem__(self, index):
        """Return ``(speech, noise, sr)`` for the speech clip at ``index``.

        Negative indices count from the end, as with built-in sequences.

        Raises:
            IndexError: if ``index`` falls outside ``[-len(self), len(self))``.
        """
        if index < 0:
            # Without this, -1 would slip past the upper-bound check and the
            # code would look for a file literally named "speech_-1.wav".
            index += self.len_speech_dir
        if not 0 <= index < self.len_speech_dir:
            raise IndexError('Index out of range')

        noise_idx = self.signal_noise.get(index)
        if noise_idx is None:
            # First access: draw a noise file at random and remember the choice.
            noise_idx = np.random.randint(0, self.len_noise_dir)
            self.signal_noise[index] = noise_idx

        # reading noise wav by index
        rand_noise_path = f'{self.noise_root}/noise_{noise_idx}.wav'
        noise, _ = audioread(rand_noise_path, sr=self.sample_rate)

        # reading speech wav by index
        speech_path = f'{self.speech_root}/speech_{index}.wav'
        speech, sr = audioread(speech_path, sr=self.sample_rate)

        return speech, noise, sr

    def __len__(self):
        """Number of regular files found in ``speech_root`` at construction time."""
        return self.len_speech_dir

    def __get_len_dir(self, dir_path):
        """Count the regular files (sub-directories excluded) directly inside ``dir_path``."""
        return sum(
            1 for name in os.listdir(dir_path)
            if os.path.isfile(os.path.join(dir_path, name))
        )

In [None]:
def calculate_sdr_metric(clean_signal, splitted_signal):
    """Signal-to-distortion ratio, in dB, of an estimate against its clean reference.

    Computed as ``10 * log10(mean(clean**2) / mean((clean - estimate)**2))``.
    """
    distortion = clean_signal - splitted_signal
    reference_power = np.mean(np.square(clean_signal))
    distortion_power = np.mean(np.square(distortion))
    return 10 * np.log10(reference_power / distortion_power)


def calculate_r2_metric(clean_signal, splitted_signal):
    """Coefficient of determination (R²) of an estimate against its clean reference.

    ``R² = 1 - SSR / TSS`` where ``SSR = sum((clean - estimate)**2)`` and
    ``TSS = sum((clean - mean(clean))**2)``. The result is 1 for a perfect
    estimate, 0 for an estimate no better than the reference mean, and
    negative for anything worse.

    Args:
        clean_signal: reference signal.
        splitted_signal: estimated signal, same length as the reference.

    Returns:
        R² as a float. For a constant reference (TSS == 0) the result is 1.0
        when the estimate is exact and 0.0 otherwise, instead of dividing by zero.
    """
    clean = np.asarray(clean_signal, dtype=float)
    estimate = np.asarray(splitted_signal, dtype=float)

    # Sum of squared residuals between reference and estimate.
    ssr = np.sum(np.square(clean - estimate))

    # Total sum of squares of the *reference* around its own mean. Using the
    # estimate here (SSR + sum((estimate - mean)**2)) would pin the score to
    # [0, 1] and hide estimates that are worse than the mean.
    tss = np.sum(np.square(clean - np.mean(clean)))

    if tss == 0:
        return 1.0 if ssr == 0 else 0.0

    return float(1 - ssr / tss)

def rmse_metric(clean_signal, splitted_signal):
    """Root-mean-square error between a clean reference and its estimate.

    Computed directly with NumPy as ``sqrt(mean((clean - estimate)**2))`` over
    all samples, so it does not depend on which scikit-learn RMSE helper is
    available in the installed version.

    Args:
        clean_signal: reference signal.
        splitted_signal: estimated signal, same shape as the reference.

    Returns:
        The RMSE as a float.
    """
    residual = np.asarray(clean_signal, dtype=float) - np.asarray(splitted_signal, dtype=float)
    return float(np.sqrt(np.mean(np.square(residual))))

In [None]:
def score_metrics(clean_speech_signal, splitted_speech_signal, clean_noise_signal=None, splitted_noise_signal=None):
    """Score separated speech (and optionally separated noise) against clean references.

    When both noise signals are given, the clean noise is first brought to the
    length of the separated noise: truncated if longer, otherwise repeated
    end-to-end and cut to size.

    Returns:
        ``{'speech': {'sdr': ..., 'r2': ..., 'rmse': ...}}`` plus a ``'noise'``
        entry with the same keys when both noise signals were supplied.
    """
    scorers = {'sdr': calculate_sdr_metric, 'r2': calculate_r2_metric, 'rmse': rmse_metric}
    score_noise = clean_noise_signal is not None and splitted_noise_signal is not None

    if score_noise:
        target_len = len(splitted_noise_signal)
        if len(clean_noise_signal) > target_len:
            clean_noise_signal = clean_noise_signal[:target_len]
        else:
            repeats = target_len // len(clean_noise_signal) + 1
            clean_noise_signal = np.tile(clean_noise_signal, repeats)[:target_len]

    json_result = {
        'speech': {
            name: scorer(clean_speech_signal, splitted_speech_signal)
            for name, scorer in scorers.items()
        }
    }
    if score_noise:
        json_result['noise'] = {
            name: scorer(clean_noise_signal, splitted_noise_signal)
            for name, scorer in scorers.items()
        }

    return json_result

In [11]:
def get_mean_avg_metrics(type_of_audio: str, metric_list):
    """Aggregate per-track metrics into mean and median values per SNR level.

    Args:
        type_of_audio: which entry of every record to aggregate, ``'speech'``
            or ``'noise'``.
        metric_list: iterable of mappings of the form
            ``{snr_level: {type_of_audio: {'sdr': ..., 'r2': ..., 'rmse': ...}}}``.
            Metric values are converted with ``float()``, so numeric strings
            are accepted as well as numbers.

    Returns:
        DataFrame indexed by ``snr_level`` with columns ``avg_sdr``, ``avg_r2``,
        ``avg_rmse``, ``median_sdr``, ``median_r2``, ``median_rmse``.
    """
    metrics = ['sdr', 'r2', 'rmse']

    # One row per (track, SNR level): the three metric values followed by the level.
    rows = [
        [float(snr_metrics[type_of_audio].get(metric)) for metric in metrics] + [snr_level]
        for track_metrics in metric_list
        for snr_level, snr_metrics in track_metrics.items()
    ]

    # Build the frame in one go; growing an empty object-dtype frame via concat
    # is deprecated in pandas and can leave the metric columns as `object`.
    method_metrics_df = pd.DataFrame(rows, columns=metrics + ['snr_level']).astype(
        {metric: float for metric in metrics}
    )

    grouped = method_metrics_df.groupby(by='snr_level')

    method_groupby_mean = grouped.mean()
    method_groupby_mean.columns = ['avg_sdr', 'avg_r2', 'avg_rmse']

    method_groupby_median = grouped.median()
    method_groupby_median.columns = ['median_sdr', 'median_r2', 'median_rmse']

    return pd.concat([method_groupby_mean, method_groupby_median], axis=1)

In [None]:
# SNR sweep in dB; np.arange(snr_lower, snr_higher, snr_step) stops before snr_higher.
snr_lower, snr_higher, snr_step = -20, 30, 10

snd = SpeechNoiseDataset(
    speech_root=f'{BASEPATH}/speech_data/',
    noise_root=f'{BASEPATH}/noise_data/',
)

## Experiment №1. ICA

In [None]:
def ica_separation(noised_speech, speech):
    """Split a set of mixtures into two sources with FastICA.

    Args:
        noised_speech: 2-D array handed to ``FastICA.fit_transform``
            (assumed to be ``(n_samples, n_mixtures)`` — TODO confirm with the caller).
        speech: clean speech reference, used only to score the recovered voice.

    Returns:
        ``(noise, voice, r2)``: the normalised first and second separated
        components, and the R² of ``voice`` against ``speech``.
    """
    ica = FastICA(n_components=2)  # Assuming 2 components: speech and noise
    separated_sources = ica.fit_transform(noised_speech)

    # ICA does not fix the order of the recovered sources; column 1 is taken as
    # the voice and the returned R² lets the caller judge whether that held.
    noise = audio_normalization(separated_sources[:, 0])
    voice = audio_normalization(separated_sources[:, 1])

    # Reference first, estimate second, matching
    # calculate_r2_metric(clean_signal, splitted_signal).
    return noise, voice, calculate_r2_metric(speech, voice)

In [None]:
ica_metrics_list = []

# FastICA starts from a random initialisation, so a re-run can give a different
# result; cap the re-runs so one stubborn track cannot stall the whole sweep.
max_ica_retries = 10

for idx_speech_track in trange(len(snd)):
    speech, noise, sr = snd[idx_speech_track]

    # One mixture per SNR level; together they form the observations given to ICA.
    noised_speech_list = []
    for current_snr in np.arange(snr_lower, snr_higher, snr_step):
        _, _, noised_speech = snr_mixer(speech, noise, current_snr)
        noised_speech_list.append(noised_speech)
    noised_speech_array = np.array(noised_speech_list)

    splitted_noise, splitted_speech, r2_speech = ica_separation(noised_speech_array.T, speech)

    # Retry while the recovered voice scores a negative R² against the clean speech.
    retries_left = max_ica_retries
    while r2_speech < 0 and retries_left > 0:
        splitted_noise, splitted_speech, r2_speech = ica_separation(noised_speech_array.T, speech)
        retries_left -= 1

    ica_metric_res = score_metrics(speech, splitted_speech, noise, splitted_noise)
    ica_metrics_list.append(ica_metric_res)

In [None]:
# One row per track with the speech metrics of the ICA experiment.
speech_rows = [track_metrics['speech'] for track_metrics in ica_metrics_list]
ica_metrics_raw = pd.DataFrame(speech_rows)
ica_metrics_raw.to_csv('./metrics/ica_metrics/raw_speech.csv')

In [None]:
# One row per track with the noise metrics of the ICA experiment.
noise_rows = [track_metrics['noise'] for track_metrics in ica_metrics_list]
ica_metrics_raw = pd.DataFrame(noise_rows)
ica_metrics_raw.to_csv('./metrics/ica_metrics/raw_noise.csv')

In [None]:
splitted_part_list = ['speech', 'noise']

# NOTE(review): 'mean_rmse' breaks the avg_* naming of the other columns; it is
# kept unchanged so the exported CSV keeps its existing header.
agg_columns = ['avg_sdr', 'median_sdr', 'avg_r2', 'median_r2', 'mean_rmse', 'median_rmse']

# Collect one summary record per separated part, then build the frame in one
# step instead of assigning rows into a pre-allocated object-dtype frame.
agg_records = []
for splitted_part in splitted_part_list:
    isa_metrics_part_df = pd.read_csv(f'./metrics/ica_metrics/raw_{splitted_part}.csv')
    agg_records.append({
        'avg_sdr': isa_metrics_part_df['sdr'].mean(),
        'median_sdr': isa_metrics_part_df['sdr'].median(),
        'avg_r2': isa_metrics_part_df['r2'].mean(),
        'median_r2': isa_metrics_part_df['r2'].median(),
        'mean_rmse': isa_metrics_part_df['rmse'].mean(),
        'median_rmse': isa_metrics_part_df['rmse'].median(),
    })

isa_metrics_agg_df = pd.DataFrame(agg_records, index=splitted_part_list, columns=agg_columns)

isa_metrics_agg_df

In [None]:
# Persist the aggregated speech/noise summary of the ICA experiment.
isa_metrics_agg_df.to_csv('./metrics/ica_metrics/grouped_speech_noise.csv')

## Experiment №2. Spectral subtraction

In [None]:
# SNR sweep in dB; np.arange(snr_lower, snr_higher, snr_step) stops before snr_higher.
snr_lower, snr_higher, snr_step = -20, 30, 10

snd = SpeechNoiseDataset(
    speech_root=f'{BASEPATH}/speech_data/',
    noise_root=f'{BASEPATH}/noise_data/',
)

In [None]:
def spectral_subtraction(input_signal, noise_signal, signal_speech):
    """Denoise a signal by subtracting the average noise magnitude spectrum.

    Args:
        input_signal: noisy time-domain signal to clean.
        noise_signal: time-domain noise sample used to estimate the noise spectrum.
        signal_speech: unused; kept so existing call sites keep working.

    Returns:
        The reconstructed signal, the same length as ``input_signal``, passed
        through ``audio_normalization``.
    """
    # Short-time Fourier transform (STFT) of the input signal
    input_stft = librosa.stft(input_signal)
    input_magnitude = np.abs(input_stft)
    input_phase = np.angle(input_stft)

    # Per-frequency-bin noise magnitude averaged over all frames, kept as a
    # column so it broadcasts across the frames of the input.
    noise_mean = np.mean(np.abs(librosa.stft(noise_signal)), axis=1, keepdims=True)

    # A magnitude cannot be negative: clamp at zero (half-wave rectification).
    # Leaving negative values in would re-inject energy with inverted phase.
    subtracted_magnitude = np.maximum(input_magnitude - noise_mean, 0.0)

    # Reconstruct the signal using the modified magnitude and original phase
    reconstructed_stft = subtracted_magnitude * np.exp(1.0j * input_phase)
    reconstructed_signal = librosa.istft(reconstructed_stft, length=len(input_signal))
    return audio_normalization(reconstructed_signal)

In [None]:
substraction_metrics_list = []

# The SNR grid is the same for every track, so build it once.
snr_levels = np.arange(snr_lower, snr_higher, snr_step)

for idx_speech_track in trange(len(snd)):
    speech, noise, sr = snd[idx_speech_track]
    # SNR level -> metrics of the spectral-subtraction estimate at that level
    current_track_metrics = {}
    for current_snr in snr_levels:
        _, _, noised_speech = snr_mixer(speech, noise, current_snr)
        splitted_speech = spectral_subtraction(noised_speech, noise, speech)
        current_track_metrics[current_snr] = score_metrics(speech, splitted_speech)
    substraction_metrics_list.append(current_track_metrics)

In [None]:
# Resume cell for an interrupted sweep. Every finished track appends exactly one
# entry, so the list length is the index of the first track still to be scored.
# After a complete run this loop is empty and adds no duplicate entries.
for idx_speech_track in trange(len(substraction_metrics_list), len(snd)):
    speech, noise, sr = snd[idx_speech_track]
    # SNR level -> metrics of the spectral-subtraction estimate at that level
    current_track_metrics = {}
    for current_snr in np.arange(snr_lower, snr_higher, snr_step):
        _, _, noised_speech = snr_mixer(speech, noise, current_snr)
        splitted_speech = spectral_subtraction(noised_speech, noise, speech)
        current_track_metrics[current_snr] = score_metrics(speech, splitted_speech)
    substraction_metrics_list.append(current_track_metrics)

In [None]:
# Number of tracks scored so far (the loops above append one entry per track).
len(substraction_metrics_list)

In [None]:
# Write one CSV per SNR level holding the speech metrics of every track.
for current_snr in np.arange(snr_lower, snr_higher, snr_step):
    curr_snr_list = [
        track_metrics[current_snr]['speech']
        for track_metrics in substraction_metrics_list
    ]
    substraction_metrics_raw_speech_curr_snr = pd.DataFrame(curr_snr_list, columns=['sdr', 'r2', 'rmse'])
    substraction_metrics_raw_speech_curr_snr.to_csv(f'./metrics/substraction_metrics/raw_speech_{current_snr}.csv')

In [None]:
# Mean and median of each speech metric per SNR level for spectral subtraction.
substraction_metrics_agg_df = get_mean_avg_metrics('speech', substraction_metrics_list)

In [None]:
# Persist the per-SNR summary of the spectral-subtraction experiment.
substraction_metrics_agg_df.to_csv('./metrics/substraction_metrics/grouped_speech.csv')

In [None]:
# Display the per-SNR summary table.
substraction_metrics_agg_df

## Experiment №3. Spleeter

In [None]:
def splitter_separation(files):
    """Run Spleeter's 2-stem separation on a batch of audio files.

    Results are written under ``output/`` relative to the current working
    directory. The command is passed as an argument list (no shell), so paths
    containing spaces or shell metacharacters are handled as single arguments.

    Args:
        files: iterable of paths to the audio files to separate. An empty
            batch is a no-op; Spleeter is not launched at all.
    """
    files = [str(path) for path in files]
    if not files:
        return

    splitting_command = ['spleeter', 'separate', '-o', 'output/', '-p', 'spleeter:2stems', *files]
    completed = subprocess.run(splitting_command)
    if completed.returncode != 0:
        # Report instead of raising so one bad batch does not abort a long sweep.
        print("Command failed, something went wrong.")

In [None]:
# SNR sweep in dB; np.arange(snr_lower, snr_higher, snr_step) stops before snr_higher.
snr_lower, snr_higher, snr_step = -20, 30, 10

snd = SpeechNoiseDataset(
    speech_root=f'{BASEPATH}/speech_data/',
    noise_root=f'{BASEPATH}/noise_data/',
)

In [None]:
# Bounds of the chunk of speech tracks handled in this session; both values are
# also used to name the metrics JSON written further down.
start = 16000
end = len(snd)

In [None]:
files = []

# NOTE(review): the loop starts at a hard-coded 16290 rather than `start`;
# presumably a resume point inside the current chunk — confirm before re-running.
# `end` is exclusive: valid dataset indices stop at len(snd) - 1, so running up
# to `end + 1` would raise IndexError on the final iteration.
for idx_speech_track in trange(16290, end):
    speech, noise, sr = snd[idx_speech_track]
    for current_snr in np.arange(snr_lower, snr_higher, snr_step):
        _, _, noised_speech = snr_mixer(speech, noise, current_snr)
        filename = f'speech_{current_snr}'
        filepath = f'/kaggle/working/noised_speech/{idx_speech_track}_{filename}.wav'
        audiowrite(noised_speech, sr, filepath)
        files.append(filepath)

In [None]:
# Rebuild the list of noisy input files from disk. os.walk yields `root`
# without a trailing separator, so the path must be joined, not concatenated.
for root, dirs, ai_files in os.walk(f'{BASEPATH}/noised_speech'):
    files = [os.path.join(root, file) for file in sorted(ai_files)]

In [None]:
# Feed Spleeter in fixed-size batches. Stopping the range at len(files) avoids
# a trailing empty batch when the file count is a multiple of the batch size.
batch_size = 50
for i in trange(0, len(files), batch_size):
    splitter_separation(files[i: i + batch_size])

In [None]:
spleeter_metrics_list = []

for root, dirs, _files in os.walk('/kaggle/working/output/'):
    for sub_dir in dirs:
        # Directory names are parsed as "<track index>_..._<snr>"; presumably
        # they mirror the "<idx>_speech_<snr>.wav" inputs — verify on disk.
        name_parts = sub_dir.split('_')
        idx_speech_track = int(name_parts[0])
        current_snr = name_parts[-1]
        path_to_track_dir = os.path.join(root, sub_dir)

        # Load the references first: this call supplies the `sr` used to read
        # the separated stems, so the cell no longer relies on an `sr` left
        # over from an earlier cell.
        speech, noise, sr = snd[idx_speech_track]
        splitted_speech, _ = audioread(os.path.join(path_to_track_dir, 'vocals.wav'), norm=False, sr=sr)
        splitted_noise, _ = audioread(os.path.join(path_to_track_dir, 'accompaniment.wav'), norm=False, sr=sr)

        current_snr_spleeter_metrics = score_metrics(speech, splitted_speech[:len(speech)], noise, splitted_noise)
        spleeter_metrics_list.append({current_snr: current_snr_spleeter_metrics})

In [None]:
# Peek at the first record and the number of records collected.
spleeter_metrics_list[0], len(spleeter_metrics_list)

In [None]:
import json

# Values json cannot encode natively are stored through str().
spleeter_json_path = f'/kaggle/working/spleeter_{start}_{end}.json'
with open(spleeter_json_path, 'w') as fp:
    json.dump(spleeter_metrics_list, fp, default=str)

In [None]:
# Archive the metrics JSON for download.
# NOTE(review): the file name is hard-coded; confirm it matches the JSON written
# for the current `start`/`end` values.
!zip -r spleeter_16000_16289.zip /kaggle/working/spleeter_16000_16289.json

In [15]:
import os
import json

spleeter_metrics_list = []

# Merge every metrics file found under the data directory. Joining with the
# `root` yielded by os.walk keeps the path valid for files in sub-directories.
for root, dirs, files in os.walk(f'{BASEPATH}/metrics/spleeter_metrics/data'):
    for file in files:
        with open(os.path.join(root, file)) as f:
            spleeter_metrics_list.extend(json.load(f))

In [20]:
# Per-SNR mean/median of the Spleeter speech metrics, written to CSV.
get_mean_avg_metrics('speech', spleeter_metrics_list).to_csv(f'{BASEPATH}/metrics/spleeter_metrics/grouped_speech.csv')

In [21]:
# Per-SNR mean/median of the Spleeter noise metrics, written to CSV.
get_mean_avg_metrics('noise', spleeter_metrics_list).to_csv(f'{BASEPATH}/metrics/spleeter_metrics/grouped_noise.csv')

## Experiment №4. Demucs

In [None]:
pip install demucs -q

In [None]:
# Options read as module globals by the Demucs helpers defined below.
model = "hdemucs_mmi"
mp3 = False
mp3_rate = 320  # bitrate passed as --mp3-bitrate; only read when 'mp3' is True.
float32 = False  # output as float 32 wavs, unused if 'mp3' is True.
int24 = False
two_stems = 'vocals'
gpu = True

In [None]:
def copy_process_streams(process: subprocess.Popen) -> None:
    """Forward a child process's stdout and stderr to this process's stdout and stderr.

    Reads from both pipes as data becomes available and returns once both
    have reached end-of-file.

    Args:
        process: a ``Popen`` object whose ``stdout`` and ``stderr`` are both
            set (the helper asserts that neither is ``None``).
    """
    def raw(stream: Optional[IO[bytes]]) -> IO[bytes]:
        # Unwrap a buffered stream to its underlying raw stream; presumably so a
        # read right after select() returns what is available instead of
        # waiting to fill a buffer.
        assert stream is not None
        if isinstance(stream, io.BufferedIOBase):
            stream = stream.raw
        return stream

    p_stdout, p_stderr = raw(process.stdout), raw(process.stderr)
    # File descriptor -> (source pipe, destination text stream).
    # NOTE(review): the annotation lists three tuple members but each value is a pair.
    stream_by_fd: Dict[int, Tuple[IO[bytes], io.StringIO, IO[str]]] = {
        p_stdout.fileno(): (p_stdout, sys.stdout),
        p_stderr.fileno(): (p_stderr, sys.stderr),
    }
    fds = list(stream_by_fd.keys())

    while fds:
        # `select` syscall will wait until one of the file descriptors has content.
        ready, _, _ = select.select(fds, [], [])
        for fd in ready:
            p_stream, std = stream_by_fd[fd]
            raw_buf = p_stream.read(2 ** 16)
            if not raw_buf:
                # An empty read means end-of-file: stop watching this descriptor.
                fds.remove(fd)
                continue
            buf = raw_buf.decode()
            std.write(buf)
            std.flush()

def separate(noised_speech, output_folder, sr, snr):
    """Separate one noisy signal with Demucs and return the recovered stems.

    The signal is written to ``/kaggle/working/speech_<snr>.wav``, separated by
    ``python3 -m demucs.separate``, and the resulting ``vocals.wav`` and
    ``no_vocals.wav`` are read back. The temporary input file is removed afterwards.

    The module globals ``model``, ``mp3``, ``float32``, ``int24``, ``two_stems``
    and ``gpu`` select the command-line options; ``mp3_rate`` is read only when
    ``mp3`` is true.

    Args:
        noised_speech: signal to separate.
        output_folder: Demucs output directory; a relative value is resolved
            against ``/kaggle/working``.
        sr: sample rate used to write the input and to read the stems back.
        snr: value embedded in the temporary file name.

    Returns:
        ``(speech, noise)`` as read from ``vocals.wav`` and ``no_vocals.wav``.
    """
    filename = f'speech_{snr}'
    input_path = f'/kaggle/working/{filename}.wav'
    # Resolve the output folder once so the command and the read-back agree
    # regardless of the current working directory.
    output_root = os.path.join('/kaggle/working', str(output_folder))

    audiowrite(noised_speech, sr, input_path)
    cmd = ["python3", "-m", "demucs.separate", "-o", output_root, "-n", model]
    if mp3:
        cmd += ["--mp3", f"--mp3-bitrate={mp3_rate}"]
    if float32:
        cmd += ["--float32"]
    if int24:
        cmd += ["--int24"]
    if two_stems is not None:
        cmd += [f"--two-stems={two_stems}"]
    if gpu:
        cmd += ["-d", "cuda", "--segment", "176", '-j', '2']
    # Only the real input file is passed as a track; a literal 'tracks'
    # argument would be treated as a (non-existent) input file.
    cmd += [input_path]

    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    copy_process_streams(p)
    p.wait()
    if p.returncode != 0:
        print("Command failed, something went wrong.")

    stems_dir = os.path.join(output_root, model, filename)
    speech, _ = audioread(os.path.join(stems_dir, 'vocals.wav'), norm=False, sr=sr)
    noise, _ = audioread(os.path.join(stems_dir, 'no_vocals.wav'), norm=False, sr=sr)

    # Remove the temporary input without spawning a shell.
    if os.path.exists(input_path):
        os.remove(input_path)

    return speech, noise


def separate_new(filenames, output_folder):
    """Separate a batch of files with Demucs, then delete the noisy inputs.

    The module globals ``model``, ``mp3``, ``float32``, ``int24``, ``two_stems``
    and ``gpu`` select the command-line options; ``mp3_rate`` is read only when
    ``mp3`` is true.

    Args:
        filenames: iterable of paths to the audio files to separate.
        output_folder: directory handed to Demucs via ``-o``.

    Side effects:
        Removes ``/kaggle/working/noised_speech/`` after the Demucs process has
        finished, whether or not it succeeded.
    """
    cmd = ["python3", "-m", "demucs.separate", "-o", str(output_folder), "-n", model]
    if mp3:
        cmd += ["--mp3", f"--mp3-bitrate={mp3_rate}"]
    if float32:
        cmd += ["--float32"]
    if int24:
        cmd += ["--int24"]
    if two_stems is not None:
        cmd += [f"--two-stems={two_stems}"]
    if gpu:
        cmd += ["-d", "cuda", "--segment", "352", '-j', '5']
    cmd += [str(name) for name in filenames]

    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    copy_process_streams(p)
    p.wait()
    if p.returncode != 0:
        print("Command failed, something went wrong.")

    # Delete the generated inputs without spawning a shell; as with the former
    # `rm -r`, a missing directory does not raise.
    rmtree('/kaggle/working/noised_speech/', ignore_errors=True)
    

In [None]:
# SNR sweep in dB; np.arange(snr_lower, snr_higher, snr_step) stops before snr_higher.
snr_lower = -20
snr_higher = 30
snr_step = 10
# NOTE(review): 41000 Hz is an unusual rate — confirm it is not a typo for 44100.
sample_rate = 41000

snd = SpeechNoiseDataset(f'{BASEPATH}/speech-data/speech_data/', f'{BASEPATH}/noise-data-audio/noise_data/', sample_rate=sample_rate)

In [None]:
# Half-open range [start, end) of speech-track indices processed in this session.
start = 0
end = 1000

In [None]:
files = []

# Write one noisy mixture per (track, SNR level), then separate them all in a
# single Demucs run.
for idx_speech_track in trange(start, end):
    speech, noise, sr = snd[idx_speech_track]
    for current_snr in np.arange(snr_lower, snr_higher, snr_step):
        _, _, noised_speech = snr_mixer(speech, noise, current_snr)
        filename = f'speech_{current_snr}'
        filepath = f'/kaggle/working/noised_speech/{idx_speech_track}_{filename}.wav'
        audiowrite(noised_speech, sr, filepath)
        files.append(filepath)
separate_new(files, 'demucs_new')

In [None]:
demucs_metrics_list = []

for root, dirs, _files in os.walk('/kaggle/working/demucs_new/hdemucs_mmi/'):
    for sub_dir in dirs:
        # Directory names are parsed as "<track index>_..._<snr>"; presumably
        # they mirror the "<idx>_speech_<snr>.wav" inputs — verify on disk.
        name_parts = sub_dir.split('_')
        idx_speech_track = int(name_parts[0])
        current_snr = name_parts[-1]
        path_to_track_dir = os.path.join(root, sub_dir)

        # Load the references first: this call supplies the `sr` used to read
        # the separated stems, so the cell no longer relies on an `sr` left
        # over from an earlier cell.
        speech, noise, sr = snd[idx_speech_track]
        splitted_speech, _ = audioread(os.path.join(path_to_track_dir, 'vocals.wav'), norm=False, sr=sr)
        splitted_noise, _ = audioread(os.path.join(path_to_track_dir, 'no_vocals.wav'), norm=False, sr=sr)

        current_snr_demucs_metrics = score_metrics(speech, splitted_speech[:len(speech)], noise, splitted_noise)
        demucs_metrics_list.append({current_snr: current_snr_demucs_metrics})

In [None]:
# Peek at the first four records and the number of records collected.
demucs_metrics_list[:4], len(demucs_metrics_list)

In [None]:
# Archive the Demucs metrics JSON for download.
# NOTE(review): no visible cell writes demucs_0_1000.json — confirm it is
# produced before this archive step runs.
!zip -r demucs_0_1000.zip /kaggle/working/demucs_0_1000.json

In [None]:
import os
import json

demucs_metrics_list = []

# Merge every Demucs metrics file found under the data directory. The records
# go into the Demucs list itself, so the aggregation cells below see them.
for root, dirs, files in os.walk(f'{BASEPATH}/metrics/demucs_metrics/data'):
    for file in files:
        with open(os.path.join(root, file)) as f:
            demucs_metrics_list.extend(json.load(f))

In [None]:
# Per-SNR mean/median of the Demucs speech metrics, written to CSV.
get_mean_avg_metrics('speech', demucs_metrics_list).to_csv(f'{BASEPATH}/metrics/demucs_metrics/grouped_speech.csv')

In [None]:
# Per-SNR mean/median of the Demucs noise metrics, written to CSV.
get_mean_avg_metrics('noise', demucs_metrics_list).to_csv(f'{BASEPATH}/metrics/demucs_metrics/grouped_noise.csv')