In [None]:
import numpy
import os
import pandas as pd
import torch
import pyloudnorm as pyln
import random
import soundfile as sf
from tqdm import tqdm
import librosa
from config import DATASET_PERCUSSION_PATH, DATASET_NOISE_PATH, DATASET_MIX_AUDIO_PATH
from utils import dynamic_loudnorm
# ---- Global setup ----
# Fix the seed of Python's `random` module; create_dataset draws the number
# of noise clips and the mixing weight k from it.
random.seed(42)

# Sample rate (Hz) used when writing the generated audio, and the
# module-level loudness meter built for that rate.
sample_rate = 7812
meter = pyln.Meter(sample_rate)

# Noise class labels.
noise_class_list = [
    'air_conditioner', 'car_horn',
    'children_playing', 'dog_bark',
    'drilling', 'engine_idling',
    'siren', 'jackhammer',
]

def pad_audio_center(audio_path, sample_rate=7812, target_length=31248):
    """Load an audio file and force it to exactly ``target_length`` samples.

    The file is read with ``librosa.load`` at ``sample_rate``. Clips shorter
    than ``target_length`` are zero-padded on both sides so the signal sits in
    the middle (when the missing amount is odd, the extra sample goes on the
    right). Clips longer than ``target_length`` keep only their first
    ``target_length`` samples.

    Args:
        audio_path: Path of the audio file to load.
        sample_rate: Rate passed to ``librosa.load`` as ``sr``.
        target_length: Number of samples in the returned array.

    Returns:
        The loaded samples, padded or truncated to ``target_length``.
    """
    samples, _ = librosa.load(audio_path, sr=sample_rate)

    missing = target_length - len(samples)
    if missing > 0:
        left = missing // 2
        right = missing - left
        samples = numpy.pad(samples, (left, right), 'constant')

    # Truncation is a no-op for clips that were just padded.
    return samples[:target_length]

# Helper functions for normalization and mixing
def normalize_loudness(audio, target_loudness=-3, sample_rate=7812):
    """Loudness-normalize a signal, then peak-normalize it into [-1, 1].

    Steps:
      1. Measure integrated loudness with a ``pyln.Meter`` at ``sample_rate``.
      2. If the measurement is ``-inf``, scale the signal to unit peak and
         measure again.
      3. If the loudness is finite, apply ``pyln.normalize.loudness`` towards
         ``target_loudness``.
      4. Divide by the absolute peak so the result lies in [-1, 1].

    The input array is never modified in place.

    NOTE(review): assuming ``pyln.normalize.loudness`` applies one scalar gain
    to the whole signal, step 4 cancels that gain, so the returned signal is
    effectively just the peak-normalized input. Confirm that this is intended.

    Args:
        audio: Array-like of samples.
        target_loudness: Target passed to ``pyln.normalize.loudness``.
        sample_rate: Rate used to build the loudness meter (default 7812, the
            value that was previously hard-coded).

    Returns:
        ``torch.Tensor`` of dtype ``float32`` with the same shape as ``audio``.
        An all-zero input returns an all-zero tensor instead of NaNs.
    """
    audio = numpy.asarray(audio)
    peak = numpy.max(numpy.abs(audio))
    if peak == 0:
        # Silence cannot be normalized: every division below would be 0/0.
        return torch.zeros(audio.shape, dtype=torch.float32)

    meter = pyln.Meter(sample_rate)
    loudness = meter.integrated_loudness(audio)
    if loudness == -float('inf'):
        # Out-of-place division so the caller's array is left untouched.
        audio = audio / peak
        loudness = meter.integrated_loudness(audio)

    if numpy.isfinite(loudness):
        audio = pyln.normalize.loudness(audio, loudness, target_loudness)
    # else: a non-finite loudness would give a non-finite gain; skip the gain
    # and rely on the peak normalization below.

    # Ensure audio is within [-1, 1]
    audio = audio / numpy.max(numpy.abs(audio))

    return torch.tensor(audio, dtype=torch.float32)


def create_mixture(percussion_audio, noise_audio, k):
    """Blend percussion and noise with weight ``k`` and peak-normalize the mix.

    The mixture is ``k * percussion_audio + (1 - k) * noise_audio``, divided
    by its absolute peak. The inputs are not modified.

    NOTE(review): the returned noise is scaled by ``(1 - k)`` but NOT divided
    by the mixture peak, so in general the returned mixture is not equal to
    ``k * percussion + returned_noise``. Confirm that this is intended for
    downstream use.

    Args:
        percussion_audio: ``torch.Tensor`` with the percussion signal.
        noise_audio: ``torch.Tensor`` with the noise signal, same shape.
        k: Weight of the percussion signal; the noise gets ``1 - k``.

    Returns:
        Tuple ``(mixture_audio, noise_audio)``: the peak-normalized mixture and
        the ``(1 - k)``-scaled noise. An all-zero mixture is returned as zeros
        rather than NaNs.
    """
    percussion_audio = k * percussion_audio
    noise_audio = (1 - k) * noise_audio
    mixture_audio = percussion_audio + noise_audio

    peak = torch.max(torch.abs(mixture_audio))
    if peak > 0:
        # Skip the division for a silent mixture (0 / 0 would produce NaN).
        mixture_audio = mixture_audio / peak
    return mixture_audio, noise_audio

def create_dataset(metadata_noise, output_dir, num_mixes=7358, target_loudness=-3, max_noise_classes=2, k_values=(0.5, 0.6, 0.7, 0.8, 0.9)):
    """Generate percussion + noise mixtures and write them with a metadata CSV.

    Every ``.wav`` file in ``DATASET_PERCUSSION_PATH`` (sorted by name) gets
    an equal share of ``num_mixes``; the first ``num_mixes % n_files`` files
    get one extra. Each mix draws 1..``max_noise_classes`` noise clips from a
    shuffled copy of ``metadata_noise`` (consumed cyclically) and a weight
    ``k`` from ``k_values``, then writes two files into ``output_dir``: the
    mixture and the combined noise. ``metadata.csv`` is written there too.

    File names end with a running mix index. Without it, two mixes of the same
    percussion file with the same noise classes and ``k`` got identical names
    and the later one silently overwrote the earlier one.

    Args:
        metadata_noise: DataFrame with at least the columns ``fold``,
            ``slice_file_name`` and ``class``.
        output_dir: Destination directory; created if missing.
        num_mixes: Total number of mixtures to generate.
        target_loudness: Forwarded to ``normalize_loudness``.
        max_noise_classes: Inclusive upper bound on the number of noise clips
            combined into one mix (clips may share a class).
        k_values: Sequence of candidate percussion weights.

    Raises:
        ValueError: If no percussion ``.wav`` file is found or
            ``metadata_noise`` is empty.
    """
    os.makedirs(output_dir, exist_ok=True)
    metadata = []
    percussion_files = sorted(
        f for f in os.listdir(DATASET_PERCUSSION_PATH) if f.endswith('.wav')
    )
    total_percussion = len(percussion_files)
    if total_percussion == 0:
        raise ValueError(f"No .wav percussion files found in '{DATASET_PERCUSSION_PATH}'")
    if len(metadata_noise) == 0:
        raise ValueError("metadata_noise is empty; no noise clips to mix")

    # Shuffle once (fixed seed); clips are then consumed in order, wrapping around.
    noise_files = metadata_noise.sample(frac=1, random_state=42).reset_index(drop=True)

    # Even split of the mixes, with the remainder going to the first files.
    mixes_per_percussion = num_mixes // total_percussion
    remainder_mixes = num_mixes % total_percussion

    noise_idx = 0
    for perc_idx, perc_file in enumerate(tqdm(percussion_files, desc="Creating Mixtures")):
        current_num_mixes = mixes_per_percussion + (1 if perc_idx < remainder_mixes else 0)
        if current_num_mixes == 0:
            # Nothing to generate for this file; avoid loading it at all.
            continue

        percussion_path = os.path.join(DATASET_PERCUSSION_PATH, perc_file)
        percussion_audio = pad_audio_center(percussion_path)
        percussion_audio = normalize_loudness(percussion_audio, target_loudness)

        for _ in range(current_num_mixes):
            # Order of the two random draws is kept so seeded runs stay comparable.
            num_noise_clips = random.randint(1, max_noise_classes)
            k = random.choice(k_values)
            noise_audio_combined = torch.zeros_like(percussion_audio)

            selected_noise_files = []
            selected_noise_classes = []

            for _ in range(num_noise_clips):
                noise_row = noise_files.iloc[noise_idx]
                noise_idx = (noise_idx + 1) % len(noise_files)

                noise_file = os.path.join(DATASET_NOISE_PATH, f"fold{noise_row['fold']}", noise_row['slice_file_name'])
                noise_audio = pad_audio_center(noise_file)
                noise_audio = normalize_loudness(noise_audio, target_loudness)

                # dynamic_loudnorm comes from utils; presumably it matches the
                # noise level to the percussion reference -- TODO confirm.
                noise_audio_combined += dynamic_loudnorm(noise_audio, percussion_audio)
                selected_noise_files.append(noise_row['slice_file_name'])
                selected_noise_classes.append(noise_row['class'])

            noise_audio_combined = dynamic_loudnorm(noise_audio_combined, percussion_audio)
            noise_peak = torch.max(torch.abs(noise_audio_combined))
            if noise_peak > 0:
                # Guard against 0 / 0 when the combined noise is silent.
                noise_audio_combined = noise_audio_combined / noise_peak
            mixture_audio, noise_audio_combined = create_mixture(percussion_audio, noise_audio_combined, k)

            # Running index over all mixes written so far: guarantees unique names.
            mix_id = len(metadata)
            name_suffix = f"{perc_idx}_noise_{'_'.join(selected_noise_classes)}_k_{k:.2f}_{mix_id:05d}.wav"

            mix_file_name = f"mixture_{name_suffix}"
            mix_file_path = os.path.join(output_dir, mix_file_name)
            sf.write(mix_file_path, mixture_audio, sample_rate)

            noise_file_name = f"noise_{name_suffix}"
            noise_file_path = os.path.join(output_dir, noise_file_name)
            sf.write(noise_file_path, noise_audio_combined, sample_rate)

            metadata.append({
                'percussion_file': perc_file,
                'mix_file': mix_file_name,
                'noise_files': ','.join(selected_noise_files),
                'noise_file': noise_file_name,
                'noise_classes': ','.join(selected_noise_classes),
                'k': k
            })

    metadata_df = pd.DataFrame(metadata)
    metadata_df.to_csv(os.path.join(output_dir, "metadata.csv"), index=False)
    print(f"Dataset and metadata saved in '{output_dir}'")


# Read the noise metadata table, relabel classID 8 as 6, then order the rows
# by class ID and fold with a fresh 0..n-1 index.
metadata_noise_path = os.path.join(DATASET_NOISE_PATH, "UrbanSound8k.csv")
metadata_noise = (
    pd.read_csv(metadata_noise_path)
    .assign(classID=lambda frame: frame['classID'].replace(8, 6))
    .sort_values(by=['classID', 'fold'])
    .reset_index(drop=True)
)

# Generate the mixtures and their metadata in the configured output folder.
output_dir = DATASET_MIX_AUDIO_PATH
create_dataset(metadata_noise, output_dir, num_mixes=7358)

#%%
# Reload the metadata that create_dataset just wrote.
metadata_path = os.path.join(output_dir, "metadata.csv")
metadata = pd.read_csv(metadata_path)

Creating Mixtures: 100%|██████████| 387/387 [02:18<00:00,  2.80it/s]

Dataset and metadata saved in 'C:\Users\jejep\Desktop\STAGE\data\mixture_audio'





In [None]:
#%%

# 'noise_classes' and 'noise_files' store one comma-joined string per mix, so
# nunique()/value_counts() on them count *combinations* of classes/files, not
# individual classes or files. The labels below say so explicitly.
print(f"The number of unique noise class combinations: {metadata['noise_classes'].nunique()}")
print(f"The count of each k value: {metadata['k'].value_counts()}")
print(f"The number of unique percussion files: {metadata['percussion_file'].nunique()}")
print(f"The number of unique mix files: {metadata['mix_file'].nunique()}")
print(f"The number of unique noise file combinations: {metadata['noise_files'].nunique()}")
print(f"The count of each noise file combination: {metadata['noise_files'].value_counts()}")
# Distribution of how many mixes share the same noise file combination.
print(metadata.groupby("noise_files").size().describe())

The number of unique noise classes: 72
The count of each k value: k
0.8    1522
0.6    1509
0.7    1459
0.5    1443
0.9    1425
Name: count, dtype: int64
The number of unique percussion files: 387
The number of unique mix files: 6823
The number of unique noise files: 6562
The count of each noise file: noise_files
46918-5-0-1.wav,34050-7-5-0.wav       2
189023-0-0-1.wav,167464-0-0-2.wav     2
195969-0-0-14.wav,184623-8-0-2.wav    2
107357-8-1-13.wav                     2
169098-7-4-5.wav,39856-5-0-14.wav     2
                                     ..
121286-0-0-5.wav                      1
74726-8-0-2.wav,103357-4-0-0.wav      1
123688-8-0-14.wav                     1
50668-5-5-2.wav,162134-7-11-1.wav     1
89724-5-0-0.wav,103076-3-1-0.wav      1
Name: count, Length: 6562, dtype: int64
count    6562.000000
mean        1.121304
std         0.326506
min         1.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         2.000000
dtype: float64


In [None]:
# %%

# Split the comma-joined 'noise_files' entries into one item per individual
# noise file, so files are counted rather than per-mix combinations.
noise_file_lists = metadata['noise_files'].str.split(',')
noise_files = [item for sublist in noise_file_lists for item in sublist]
print(f"The number of unique noise files: {len(set(noise_files))}")
# Summary of how many times each individual noise file was used. This line
# previously printed the unique count a second time under a "count of each
# noise file" label.
print("Summary of the usage count of each noise file:")
print(pd.Series(noise_files).value_counts().describe())

The number of unique noise files: 7358
The count of each noise file: 7358
The count of each noise file: 7358


In [None]:
# %%

sample_rate = 7812

# Check the length (in samples, loaded at `sample_rate`) of every percussion
# file referenced in the metadata, and report any file longer than 31248 samples.
perc_files = metadata['percussion_file'].unique()

perc_files_len = []

for perc_file in perc_files:
    perc_path = os.path.join(DATASET_PERCUSSION_PATH, perc_file)
    audio, _ = librosa.load(perc_path, sr=sample_rate)
    n_samples = len(audio)
    perc_files_len.append(n_samples)

    if n_samples > 31248:
        print(f"percussion files length: {perc_file}")

# Set of distinct lengths found across all percussion files.
print(f"lengt: {set(perc_files_len)}")

lengt: {31248}
