In [10]:
import os
import pandas as pd
from pydub import AudioSegment
import random
import numpy as np

# Load the TSV file
tsv_path = '/media/external_2T/datasets/cv-corpus-16.1-2023-12-06/fa/validated.tsv'
df = pd.read_csv(tsv_path, sep='\t')

# Directory paths
normal_audio_dir = '/media/external_2T/datasets/cv-corpus-16.1-2023-12-06/fa/clips'
crowd_audio_dir = '/media/external_2T/datasets/crowd_talking'
output_dir = '/media/external_2T/fetrat/speech-enhancemnet/persian_data/train/noisy'
test_output_dir = '/media/external_2T/fetrat/speech-enhancemnet/persian_data/test/noisy'
original_output_dir = '/media/external_2T/fetrat/speech-enhancemnet/persian_data/train/clean'
original_test_output_dir = '/media/external_2T/fetrat/speech-enhancemnet/persian_data/test/clean'

# Ensure the output directories exist
os.makedirs(output_dir, exist_ok=True)
os.makedirs(test_output_dir, exist_ok=True)
os.makedirs(original_output_dir, exist_ok=True)
os.makedirs(original_test_output_dir, exist_ok=True)

# Load all crowd talking audios into memory
crowd_audio_files = [os.path.join(crowd_audio_dir, f'crowd_{i}.mp3') for i in range(1, 11)]
crowd_audios = [AudioSegment.from_mp3(file) for file in crowd_audio_files]

# Function to normalize the loudness of the audio
def normalize_audio(audio, target_dBFS):
    change_in_dBFS = target_dBFS - audio.dBFS
    return audio.apply_gain(change_in_dBFS)

# Function to overlay crowd noise on normal speech
def add_crowd_noise(normal_audio, crowd_audios, target_dBFS=-30):
    crowd_audio = random.choice(crowd_audios)
    
    # Loop crowd audio if it's shorter than normal audio
    while len(crowd_audio) < len(normal_audio):
        crowd_audio += random.choice(crowd_audios)
    
    # Trim crowd audio to the length of normal audio
    crowd_audio = crowd_audio[:len(normal_audio)]
    
    # Normalize loudness
    crowd_audio = normalize_audio(crowd_audio, target_dBFS)
    
    # Overlay audios
    combined_audio = normal_audio.overlay(crowd_audio)
    return combined_audio

# Function to add Gaussian noise with adjustable strength
def add_gaussian_noise(normal_audio, target_dBFS=-30, noise_std_dev=20):
    samples = np.array(normal_audio.get_array_of_samples())
    noise = np.random.normal(0, noise_std_dev, samples.shape)
    noisy_samples = samples + noise
    noisy_samples = noisy_samples.astype(samples.dtype)
    noisy_audio = normal_audio._spawn(noisy_samples)
    
    # Normalize loudness
    noisy_audio = normalize_audio(noisy_audio, target_dBFS)
    return noisy_audio

# Function to process and save audio
def process_and_save_audio(normal_audio_path, crowd_audios, output_dir, original_output_dir, min_speech_dBFS=-10, target_noise_dBFS=-30, noise_std_dev=20):
    normal_audio = AudioSegment.from_file(normal_audio_path)
    
    # Normalize the speech signal to the minimum loudness
    normal_audio = normalize_audio(normal_audio, min_speech_dBFS)
    
    # Save the original audio
    original_audio_output_path = os.path.join(original_output_dir, os.path.basename(normal_audio_path))
    normal_audio.export(original_audio_output_path, format='mp3')
    
    # Decide whether to add Gaussian noise or crowd noise
    if random.random() < 0.5:
        # Add Gaussian noise with the specified standard deviation
        processed_audio = add_gaussian_noise(normal_audio, target_noise_dBFS, noise_std_dev)
    else:
        # Add crowd noise
        processed_audio = add_crowd_noise(normal_audio, crowd_audios, target_noise_dBFS)
    
    # Save the processed audio with the same name in the output directory
    processed_audio_output_path = os.path.join(output_dir, os.path.basename(normal_audio_path))
    processed_audio.export(processed_audio_output_path, format='mp3')

    print(original_audio_output_path)
    print(processed_audio_output_path)

# Randomly select 100 normal audios for test data and remove them from the main DataFrame
test_sample_df = df.sample(n=1000)
df = df.drop(test_sample_df.index)

j = 0
# Process and save all normal audios for main data
for index, row in df.iterrows():
    normal_audio_path = os.path.join(normal_audio_dir, row['path'])
    process_and_save_audio(normal_audio_path, crowd_audios, output_dir, original_output_dir)
    j += 1
    if j == 6000:
        break

# Process and save the selected audios for test data
for index, row in test_sample_df.iterrows():
    test_audio_path = os.path.join(normal_audio_dir, row['path'])
    process_and_save_audio(test_audio_path, crowd_audios, test_output_dir, original_test_output_dir)

print("Processing complete.")


  df = pd.read_csv(tsv_path, sep='\t')


/media/external_2T/fetrat/speech-enhancemnet/persian_data/train/clean/common_voice_fa_18325365.mp3
/media/external_2T/fetrat/speech-enhancemnet/persian_data/train/noisy/common_voice_fa_18325365.mp3
/media/external_2T/fetrat/speech-enhancemnet/persian_data/train/clean/common_voice_fa_18960256.mp3
/media/external_2T/fetrat/speech-enhancemnet/persian_data/train/noisy/common_voice_fa_18960256.mp3
/media/external_2T/fetrat/speech-enhancemnet/persian_data/train/clean/common_voice_fa_37417377.mp3
/media/external_2T/fetrat/speech-enhancemnet/persian_data/train/noisy/common_voice_fa_37417377.mp3
/media/external_2T/fetrat/speech-enhancemnet/persian_data/train/clean/common_voice_fa_19446941.mp3
/media/external_2T/fetrat/speech-enhancemnet/persian_data/train/noisy/common_voice_fa_19446941.mp3
/media/external_2T/fetrat/speech-enhancemnet/persian_data/train/clean/common_voice_fa_37416976.mp3
/media/external_2T/fetrat/speech-enhancemnet/persian_data/train/noisy/common_voice_fa_37416976.mp3
/media/ext

KeyboardInterrupt: 