In [1]:
from pydub import AudioSegment
from pydub.silence import split_on_silence
import os


def preprocess_audio(file_path, output_dir, min_silence_len=500, silence_thresh=-40):
    """
    Normalize an audio file, split it on silence, and save each chunk as WAV.

    Chunks are written to ``output_dir`` as ``<stem>_chunk<i>.wav``, where
    ``<stem>`` is the input file name without its final extension and ``i``
    is the zero-based position of the chunk.

    Parameters:
    - file_path: Path to the audio file
    - output_dir: Directory to save the processed chunks (created if missing)
    - min_silence_len: Minimum silence length to consider a split
      (forwarded unchanged to pydub's ``split_on_silence``)
    - silence_thresh: Silence threshold (in dB), evaluated on the
      normalized audio (forwarded unchanged to ``split_on_silence``)

    Returns:
    - List of paths to processed audio chunks, in chunk order. The list is
      empty when ``split_on_silence`` yields no chunks.
    """
    # Make sure the destination exists so export() does not fail on a fresh
    # directory. An empty output_dir means "current directory" for
    # os.path.join, and os.makedirs("") would raise, so skip it in that case.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Load and normalize before splitting so the silence threshold is applied
    # to the normalized signal.
    audio = AudioSegment.from_file(file_path).normalize()

    audio_chunks = split_on_silence(
        audio, min_silence_len=min_silence_len, silence_thresh=silence_thresh
    )

    # splitext removes only the final extension; splitting on the first "."
    # would truncate names such as "talk.v2.mp3" to "talk" and make chunks
    # from different inputs overwrite each other.
    stem = os.path.splitext(os.path.basename(file_path))[0]

    chunk_paths = []
    for i, chunk in enumerate(audio_chunks):
        chunk_path = os.path.join(output_dir, f"{stem}_chunk{i}.wav")
        # export() hands back the open output file; close it right away so
        # one handle per chunk is not left dangling.
        exported = chunk.export(chunk_path, format="wav")
        if exported is not None:
            exported.close()
        chunk_paths.append(chunk_path)

    return chunk_paths


# Example usage: split one sample recording and show the files it produced
audio_file = "../Dataset/SandalWoodNewsStories_1.mp3"
output_dir = "processed_audio_chunks"

# The chunk directory has to be in place before anything is exported into it
os.makedirs(output_dir, exist_ok=True)

processed_files = preprocess_audio(
    file_path=audio_file,
    output_dir=output_dir,
)
print("Processed audio chunks:", processed_files)

Processed audio chunks: ['processed_audio_chunks\\SandalWoodNewsStories_1_chunk0.wav', 'processed_audio_chunks\\SandalWoodNewsStories_1_chunk1.wav', 'processed_audio_chunks\\SandalWoodNewsStories_1_chunk2.wav', 'processed_audio_chunks\\SandalWoodNewsStories_1_chunk3.wav', 'processed_audio_chunks\\SandalWoodNewsStories_1_chunk4.wav', 'processed_audio_chunks\\SandalWoodNewsStories_1_chunk5.wav', 'processed_audio_chunks\\SandalWoodNewsStories_1_chunk6.wav', 'processed_audio_chunks\\SandalWoodNewsStories_1_chunk7.wav', 'processed_audio_chunks\\SandalWoodNewsStories_1_chunk8.wav', 'processed_audio_chunks\\SandalWoodNewsStories_1_chunk9.wav', 'processed_audio_chunks\\SandalWoodNewsStories_1_chunk10.wav', 'processed_audio_chunks\\SandalWoodNewsStories_1_chunk11.wav', 'processed_audio_chunks\\SandalWoodNewsStories_1_chunk12.wav', 'processed_audio_chunks\\SandalWoodNewsStories_1_chunk13.wav', 'processed_audio_chunks\\SandalWoodNewsStories_1_chunk14.wav', 'processed_audio_chunks\\SandalWoodNewsS