In [8]:
import os
from pydub import AudioSegment
from pydub.silence import split_on_silence


def _split_evenly(segment, max_duration_ms):
    """Cut ``segment`` into the fewest near-equal pieces that each fit ``max_duration_ms``.

    A segment that already fits is returned unchanged as a one-element list.
    Piece lengths differ by at most 1 ms, so no tiny remainder is produced.
    """
    total_ms = len(segment)
    if total_ms <= max_duration_ms:
        return [segment]
    piece_count = -(-total_ms // max_duration_ms)  # ceiling division
    bounds = [total_ms * i // piece_count for i in range(piece_count + 1)]
    return [segment[start:end] for start, end in zip(bounds, bounds[1:])]


def _merge_chunks(chunks, min_duration_ms, max_duration_ms):
    """Greedily join consecutive chunks into groups of at most ``max_duration_ms`` without dropping audio."""
    merged = []
    pending = None  # group still being accumulated; None means "nothing yet"

    for chunk in chunks:
        # Close the pending group at this pause only if it is already long
        # enough; a too-short group is carried forward instead of discarded.
        if (
            pending is not None
            and len(pending) + len(chunk) > max_duration_ms
            and len(pending) >= min_duration_ms
        ):
            merged.append(pending)
            pending = None

        candidate = chunk if pending is None else pending + chunk
        # An over-long candidate (one long utterance, or a short group plus a
        # chunk that does not fit) has no usable pause, so it is hard-split.
        pieces = _split_evenly(candidate, max_duration_ms)
        merged.extend(pieces[:-1])
        pending = pieces[-1]

    if pending is not None and len(pending) > 0:
        # Fold a too-short tail into the previous group rather than losing it.
        if len(pending) < min_duration_ms and merged:
            pending = merged.pop() + pending
        merged.extend(_split_evenly(pending, max_duration_ms))

    return merged


def split_audio_on_pauses(
    input_audio_path: str,
    output_dir: str,
    min_duration_ms: int = 5000,
    max_duration_ms: int = 15000,
    silence_thresh_db: int = -40,
    min_silence_len_ms: int = 700,
    keep_silence_ms: int = 500,
) -> None:
    """
    Splits a single audio file into chunks based on silence and exports them as
    ``chunk_<n>.wav`` files, keeping every chunk at or below the maximum duration
    without discarding any non-silent audio.

    Chunks are cut at detected pauses and consecutive chunks are joined until
    adding the next one would exceed ``max_duration_ms``. Audio with no usable
    pause (a single utterance longer than the maximum, or a too-short group that
    cannot absorb the next chunk) is split into near-equal pieces instead. Such
    pieces are longer than roughly half of ``max_duration_ms``, so the minimum
    is met whenever ``min_duration_ms <= max_duration_ms / 2``. A chunk shorter
    than the minimum can otherwise only occur when the whole recording is
    shorter than ``min_duration_ms``; this is reported in the console output.

    Load failures are reported with a printed message and the function returns
    without exporting anything.

    Args:
        input_audio_path (str): The file path of the single, long audio file.
        output_dir (str): The directory to save the output audio chunks (created if missing).
        min_duration_ms (int): The minimum desired duration for a chunk in milliseconds (default: 5s).
        max_duration_ms (int): The maximum allowed duration for a chunk in milliseconds (default: 15s).
        silence_thresh_db (int): The threshold in dBFS below which audio is considered silence (default: -40dBFS).
        min_silence_len_ms (int): The minimum length of silence, in milliseconds, that counts as a pause (default: 0.7s).
        keep_silence_ms (int): Silence kept at the start/end of each detected chunk to avoid abrupt cuts (default: 0.5s).

    Raises:
        ValueError: If the duration bounds are not ``0 <= min_duration_ms <= max_duration_ms``
            with ``max_duration_ms > 0``.
    """
    # Validate before touching the filesystem so bad arguments have no side effects.
    if max_duration_ms <= 0 or min_duration_ms < 0 or min_duration_ms > max_duration_ms:
        raise ValueError(
            "Expected 0 <= min_duration_ms <= max_duration_ms and max_duration_ms > 0, "
            f"got min_duration_ms={min_duration_ms}, max_duration_ms={max_duration_ms}"
        )

    # Create output directory if it doesn't exist; exist_ok avoids failing when
    # another process creates it between the check and the call.
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir, exist_ok=True)
        print(f"Created directory: {output_dir}")

    # Load the audio file. Decoding goes through ffmpeg/ffprobe, which can fail
    # in many ways, so any failure is reported and treated as "nothing to do".
    try:
        audio = AudioSegment.from_file(input_audio_path)
    except Exception as e:
        print(f"Error loading {input_audio_path}: {e}")
        return

    # Split the audio at pauses that are likely to fall between phrases.
    print("Splitting audio on silence...")
    chunks = split_on_silence(
        audio,
        min_silence_len=min_silence_len_ms,
        silence_thresh=silence_thresh_db,  # Adjust based on your audio's noise level
        keep_silence=keep_silence_ms,
    )

    print(f"Initial split resulted in {len(chunks)} chunks.")

    # Join/split the raw chunks so they meet the desired duration.
    processed_chunks = _merge_chunks(chunks, min_duration_ms, max_duration_ms)

    print(f"Final processing resulted in {len(processed_chunks)} chunks within the desired duration.")

    short_count = sum(1 for chunk in processed_chunks if len(chunk) < min_duration_ms)
    if short_count:
        print(f"Note: {short_count} chunk(s) are shorter than {min_duration_ms} ms; they are kept so no audio is lost.")

    # Export each processed chunk
    for i, chunk in enumerate(processed_chunks, start=1):
        output_path = os.path.join(output_dir, f"chunk_{i}.wav")
        chunk.export(output_path, format="wav")
        print(f"Exported {output_path} (Duration: {len(chunk)/1000:.2f}s)")

# --- Usage Example ---
if __name__ == "__main__":
    # --- Configuration ---
    # Source recording to split. The path contains spaces, so it must stay a
    # single quoted string.
    long_audio_file = "/Users/admin/University/Samsung Prism/Novel Dataset/female 1 audio/2m audio/Harleen-1-2m.wav"
    # Folder that receives the exported chunk_<n>.wav files (also a single
    # quoted string because of the spaces).
    output_directory_name = "/Users/admin/University/Samsung Prism/Novel Dataset/female 1 audio/2m audio"

    # Tuning note: the silence-detection settings (min_silence_len and
    # silence_thresh) depend on the recording and usually need adjusting.
    #   - very quiet recordings: a threshold around -40 dBFS may work
    #   - noisy recordings: a higher threshold (e.g., -25 dBFS) may be needed

    split_audio_on_pauses(
        input_audio_path=long_audio_file,
        output_dir=output_directory_name,
    )

Error loading /Users/admin/University/Samsung Prism/Novel Dataset/female 1 audio/2m audio/Harleen-1-2m.wav: [Errno 2] No such file or directory: 'ffprobe'




Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.8/1.8 MB 4.4 MB/s eta 0:00:00
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.2
    Uninstalling pip-24.2:
      Successfully uninstalled pip-24.2
Successfully installed pip-25.2
Note: you may need to restart the kernel to use updated packages.
