In [49]:
import os
import numpy as np
import librosa
import soundfile as sf
from imblearn.over_sampling import ADASYN

In [50]:
# Input directories (relative to the notebook's working directory), one per dataset
noise_folder, unknown_folder, kha_folder = './noise', './unknown', './kha'

# Load .wav files into dataset
def load_wav_files(folder, sr=16000):
    """Load every ``.wav`` file found directly inside ``folder``.

    Args:
        folder: Directory to scan. Sub-directories are not searched and the
            extension match is case-sensitive (``.wav`` only).
        sr: Sample rate passed to ``librosa.load``. Defaults to 16000 because
            the augmentation and ``sf.write`` calls in this notebook all
            assume 16 kHz audio; loading at the native rate and then saving
            at 16 kHz would silently change speed and pitch for any file
            recorded at a different rate. Pass ``None`` to keep each file's
            native rate.

    Returns:
        list: One waveform per ``.wav`` file, ordered by filename.
    """
    dataset = []
    # os.listdir() order is arbitrary; sorting keeps the index used in the
    # saved file names (e.g. augmented_<i>.wav) stable between runs.
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith('.wav'):
            continue
        audio, _ = librosa.load(os.path.join(folder, filename), sr=sr)
        dataset.append(audio)
    return dataset

# Read each dataset folder into memory (noise, unknown, kha - loaded in that order)
noise_dataset, unknown_dataset, kha_dataset = map(
    load_wav_files, (noise_folder, unknown_folder, kha_folder)
)

# Data augmentation functions
def time_stretch(audio, rate):
    """Change the duration of ``audio`` by the factor ``rate``.

    Thin wrapper around ``librosa.effects.time_stretch``; librosa documents
    ``rate > 1`` as speeding the signal up and ``rate < 1`` as slowing it down.
    """
    stretched = librosa.effects.time_stretch(audio, rate=rate)
    return stretched

def pitch_shift(audio, sr, n_steps):
    """Transpose ``audio`` by ``n_steps`` semitones.

    Thin wrapper around ``librosa.effects.pitch_shift``; ``sr`` is the sample
    rate of ``audio`` and is forwarded unchanged.
    """
    shifted = librosa.effects.pitch_shift(audio, sr=sr, n_steps=n_steps)
    return shifted

def add_noise(audio, noise_factor=0.005):
    """Return ``audio`` plus scaled standard-normal noise.

    One value per element of ``audio`` is drawn from NumPy's global random
    state, multiplied by ``noise_factor`` and added to the signal. The input
    is not modified.
    """
    n_samples = len(audio)
    return audio + np.random.randn(n_samples) * noise_factor

# Augment Kha dataset
def augment_kha_data(kha_data, stretch_rates=(0.8, 0.9, 1.0, 1.1, 1.2)):
    """Expand the Kha recordings with time-stretched and noisy variants.

    For every input clip the result contains, in order: the clip itself, one
    ``time_stretch`` copy per entry of ``stretch_rates``, and one ``add_noise``
    copy.

    Args:
        kha_data: Iterable of waveforms (``add_noise`` calls ``len()`` on each
            one, so 1-D arrays are presumably expected).
        stretch_rates: Rates handed to ``time_stretch``. The default lists the
            five values the former ``np.arange(0.8, 1.3, 0.1)`` sweep produced,
            written out explicitly because a float-step ``arange`` can gain or
            lose its last element through rounding. Rate 1.0 is kept only for
            parity with that sweep.

    Returns:
        list: ``len(stretch_rates) + 2`` waveforms per input clip.
    """
    augmented_data = []

    for audio in kha_data:
        # Original audio
        augmented_data.append(audio)

        # Time stretching
        augmented_data.extend(time_stretch(audio, rate) for rate in stretch_rates)

        # Pitch shifting is currently disabled. To re-enable:
        #     for n_steps in range(-3, 4):  # -3 to +3 semitones
        #         augmented_data.append(pitch_shift(audio, 16000, n_steps))

        # Adding noise
        augmented_data.append(add_noise(audio))

    return augmented_data

# Function to save augmented audio data into subfolders
def save_augmented_data(augmented_kha, noise_data, unknown_data, folder_kha, folder_noise, folder_unknown, base_name='augmented', sample_rate=16000):
    """Write the three datasets to disk as numbered ``.wav`` files.

    Files are named ``<base_name>_<i>.wav`` in ``folder_kha``,
    ``noise_<i>.wav`` in ``folder_noise`` and ``unknown_<i>.wav`` in
    ``folder_unknown``, where ``i`` is the clip's position in its list.
    Existing files with the same name are overwritten by ``sf.write``.

    Args:
        augmented_kha: Waveforms to store in ``folder_kha``.
        noise_data: Waveforms to store in ``folder_noise``.
        unknown_data: Waveforms to store in ``folder_unknown``.
        folder_kha: Output directory for the Kha clips (created if missing).
        folder_noise: Output directory for the noise clips (created if missing).
        folder_unknown: Output directory for the unknown clips (created if missing).
        base_name: File-name prefix for the Kha clips.
        sample_rate: Sample rate written into every file header. Must match
            the rate the waveforms were loaded/processed at; defaults to the
            16000 Hz used throughout this notebook.

    Returns:
        None
    """
    outputs = (
        (folder_kha, base_name, augmented_kha),
        (folder_noise, 'noise', noise_data),
        (folder_unknown, 'unknown', unknown_data),
    )

    # Create all target folders up front so a bad path fails before any audio is written.
    for folder, _, _ in outputs:
        os.makedirs(folder, exist_ok=True)

    for folder, prefix, clips in outputs:
        for i, audio in enumerate(clips):
            file_path = os.path.join(folder, f'{prefix}_{i}.wav')
            sf.write(file_path, audio, sample_rate)  # Save the audio file

# Build the augmented Kha set (originals plus their generated variants)
augmented_kha_dataset = augment_kha_data(kha_data=kha_dataset)

# # Save augmented audio files to respective subfolders
# save_augmented_data(augmented_kha_dataset, noise_dataset, unknown_dataset, './balanced_dataset/kha', './balanced_dataset/noise', './balanced_dataset/unknown')

# # Convert Kha dataset to a numerical format for ADASYN
# X_kha = [librosa.feature.mfcc(y=audio, sr=16000, n_mfcc=13) for audio in augmented_kha_dataset]

# # Define a fixed length for MFCC features
# fixed_length = 100  # Adjust this based on your requirements

# # Pad or truncate MFCC features to ensure consistent shapes for Kha
# X_kha_flat = np.array([np.pad(x, ((0, 0), (0, fixed_length - x.shape[1])), mode='constant') if x.shape[1] < fixed_length else x[:, :fixed_length] for x in X_kha])

# # Flatten the MFCC features into 2D array for Kha
# X_kha_flat = np.array([x.flatten() for x in X_kha_flat])

# # Prepare labels for Kha dataset
# y_kha = ['Kha'] * len(X_kha_flat)

# # Extract MFCC features for noise and unknown datasets
# X_noise = [librosa.feature.mfcc(y=audio, sr=16000, n_mfcc=13) for audio in noise_dataset]
# X_unknown = [librosa.feature.mfcc(y=audio, sr=16000, n_mfcc=13) for audio in unknown_dataset]

# # Pad or truncate MFCC features for noise dataset
# X_noise_flat = np.array([np.pad(x, ((0, 0), (0, fixed_length - x.shape[1])), mode='constant') if x.shape[1] < fixed_length else x[:, :fixed_length] for x in X_noise])

# # Pad or truncate MFCC features for unknown dataset
# X_unknown_flat = np.array([np.pad(x, ((0, 0), (0, fixed_length - x.shape[1])), mode='constant') if x.shape[1] < fixed_length else x[:, :fixed_length] for x in X_unknown])

# # Flatten the MFCC features for noise and unknown datasets
# X_noise_flat = np.array([x.flatten() for x in X_noise_flat])
# X_unknown_flat = np.array([x.flatten() for x in X_unknown_flat])

# # Combine the datasets
# X_combined = np.concatenate((X_kha_flat, X_noise_flat, X_unknown_flat), axis=0)
# y_combined = y_kha + ['noise'] * len(X_noise_flat) + ['unknown'] * len(X_unknown_flat)

# # Implement ADASYN for oversampling the minority class
# adasyn = ADASYN(sampling_strategy='minority', n_neighbors=5)
# X_resampled, y_resampled = adasyn.fit_resample(X_combined, y_combined)

# # Output shape verification
# print(f"Kha resampled dataset size: {len(y_resampled)}")


In [51]:
import os
import numpy as np
import librosa
import soundfile as sf

# Input directories (relative to the notebook's working directory), one per dataset
noise_folder, unknown_folder, kha_folder = './noise', './unknown', './kha'

# Load .wav files into dataset
def load_wav_files(folder, sr=16000):
    """Load every ``.wav`` file found directly inside ``folder``.

    Args:
        folder: Directory to scan. Sub-directories are not searched and the
            extension match is case-sensitive (``.wav`` only).
        sr: Sample rate passed to ``librosa.load``. Defaults to 16000 because
            this cell crops clips to a length computed from 16 kHz and saves
            them with ``sf.write(..., 16000)``; loading at the native rate
            would silently change duration, speed and pitch for any file
            recorded at a different rate. Pass ``None`` to keep each file's
            native rate.

    Returns:
        list: One waveform per ``.wav`` file, ordered by filename.
    """
    dataset = []
    # os.listdir() order is arbitrary; sorting keeps the index used in the
    # saved file names (augmented_<i>.wav) stable between runs.
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith('.wav'):
            continue
        audio, _ = librosa.load(os.path.join(folder, filename), sr=sr)
        dataset.append(audio)
    return dataset

# Read each dataset folder into memory (noise, unknown, kha - loaded in that order)
noise_dataset, unknown_dataset, kha_dataset = map(
    load_wav_files, (noise_folder, unknown_folder, kha_folder)
)

# Smart truncate or pad function
def truncate_or_pad_smart(audio, sr, target_length_ms=1000):
    """Force ``audio`` to exactly ``target_length_ms`` worth of samples.

    Clips shorter than the target are zero-padded at the end. All other clips
    are cropped to a target-length window positioned around the start of the
    highest-energy segment (segments are ``sr // 10`` samples, i.e. 100 ms),
    with the window shifted as needed so it stays inside the clip.

    Args:
        audio: 1-D waveform (array-like).
        sr: Sample rate of ``audio`` in Hz.
        target_length_ms: Desired clip length in milliseconds.

    Returns:
        numpy.ndarray: Waveform with ``int(target_length_ms / 1000 * sr)`` samples.
    """
    audio = np.asarray(audio)
    target_length_samples = int((target_length_ms / 1000) * sr)  # Convert ms to samples
    n_samples = len(audio)

    if n_samples < target_length_samples:
        # Pad audio with zeros if it's shorter than target length
        return np.pad(audio, (0, target_length_samples - n_samples), 'constant')

    # Short-term energy per non-overlapping segment; trailing samples that do
    # not fill a whole segment are ignored.
    segment_length = max(1, sr // 10)  # guard against sr < 10 giving a zero-length segment
    num_segments = n_samples // segment_length
    if num_segments == 0:
        # Clip is shorter than one segment (only possible for very small
        # targets): there is no energy profile to search, keep the head.
        start = 0
    else:
        short_time_energy = np.array([
            np.sum(np.square(audio[i * segment_length:(i + 1) * segment_length]))
            for i in range(num_segments)
        ])
        peak_index = int(np.argmax(short_time_energy))
        start = max(0, peak_index * segment_length - target_length_samples // 2)

    # Keep the window inside the clip. Without this clamp a peak near the end
    # (or a clip already at the target length) produced a result shorter than
    # target_length_samples.
    start = min(start, n_samples - target_length_samples)
    return audio[start:start + target_length_samples]

# Advanced time stretching using Phase Vocoder
def advanced_time_stretch(audio, rate, sr=16000):
    """Time-stretch ``audio`` by ``rate`` and fit the result to the fixed clip length.

    The stretch is done by ``librosa.effects.time_stretch``; its output is
    then passed through ``truncate_or_pad_smart`` with sample rate ``sr`` and
    that function's default target length.
    """
    return truncate_or_pad_smart(librosa.effects.time_stretch(audio, rate=rate), sr)

# Augment Kha dataset
def augment_kha_data(kha_data, stretch_rates=(0.75, 0.85, 0.95, 1.05, 1.15), sr=16000):
    """Build fixed-length Kha clips: each original plus time-stretched variants.

    For every input clip the result contains, in order: the clip passed
    through ``truncate_or_pad_smart``, followed by one
    ``advanced_time_stretch`` copy per entry of ``stretch_rates``.

    Args:
        kha_data: Iterable of waveforms.
        stretch_rates: Rates handed to ``advanced_time_stretch``. The default
            lists the five values the former ``np.arange(0.75, 1.2, 0.1)``
            sweep produced (0.75 to 1.15), written out explicitly because a
            float-step ``arange`` can gain or lose its last element through
            rounding.
        sr: Sample rate of the clips in Hz, used to size the output length.

    Returns:
        list: ``len(stretch_rates) + 1`` waveforms per input clip.
    """
    augmented_data = []

    for audio in kha_data:
        # Original audio, cropped/padded to the common length
        augmented_data.append(truncate_or_pad_smart(audio, sr))

        # Time stretching (default rates span 0.75 to 1.15)
        augmented_data.extend(
            advanced_time_stretch(audio, rate, sr) for rate in stretch_rates
        )

    return augmented_data

# Output directory for the augmented Kha clips; created on first run, reused afterwards
new_folder = './kha_augmented'
os.makedirs(new_folder, exist_ok=True)

# Generate the augmented clips from the loaded Kha recordings
augmented_kha_dataset = augment_kha_data(kha_dataset)

# Write each clip as augmented_<index>.wav with a 16 kHz header
for i, audio in enumerate(augmented_kha_dataset):
    file_path = os.path.join(new_folder, f'augmented_{i}.wav')
    sf.write(file_path, audio, samplerate=16000)

# Report how many clips were produced and where they were written
print(f"Augmented Kha dataset size: {len(augmented_kha_dataset)}")
print("Augmented audio files saved successfully in 'kha_augmented' folder.")

Augmented Kha dataset size: 360
Augmented audio files saved successfully in 'kha_augmented' folder.
