# Split up a wav file into multiple smaller files
and do so attempting to use silence as good breaking points

Probably, there is a much better and more consistent way to do this.  Some fine tuning might be 
necessary to adjust for the bacground noise of your source.  Overall, though, this is a basic
structure for splitting big files into multiple smaller ones for easier, more consistent, more debuggable
feeding into Kaldi or other speech to text engines that might choke on files too large or too long

In [1]:
# Import the AudioSegment class for processing audio and the 
# split_on_silence function for separating out silent chunks.
from pydub import AudioSegment
from pydub.silence import split_on_silence
from pydub.utils import mediainfo

In [2]:
# Define a function to normalize a chunk to a target amplitude.
def signal_process_for_kaldi(aChunk, target_dBFS=-20):
    """normalize a chunk to a target amplitude"""
    
    # normalize in dB
    change_in_dBFS = target_dBFS - aChunk.dBFS
    aChunk = aChunk.apply_gain(change_in_dBFS)

    # apply simple filters for noise
    aChunk = aChunk.low_pass_filter(1600)
    aChunk = aChunk.high_pass_filter(200)

    # return AudioSegment
    return aChunk

In [3]:
# Function to measure and adjust the length of an array of chunks 
# Do this after splitting so that you don't have chunks
# that are too small to be useful; default min size is 3 seconds
def check_min_chunk_length(chunks, minimum_seconds=3):
    """measure and adjust the length of chunks in an array of chunks to be minimum_seconds long"""

    # now recombine the chunks so that the parts are at least "min_seconds" sec long
    target_length_minimum = minimum_seconds * 1000  # converst from s to ms

    output_chunks = [chunks[0]]

    # this was really hard to write and understand but it works
    for chunk in chunks[1:]:
        if len(output_chunks[-1]) < target_length_minimum:
            output_chunks[-1] += chunk
        else:
            # if the last output chunk is longer than the target length,
            # we can start a new one
            output_chunks.append(chunk)

    return output_chunks

# Run code
let's try some samples

In [4]:
# Load your audio file
input_sound_file_name = "The Sound of the Swahili language (Numbers, Greetings, Words & Sample Text).mp3" 
# from https://www.youtube.com/watch?v=h74ZKoXiL0E


In [5]:
input_sound_file = AudioSegment.from_mp3(input_sound_file_name)
input_sound_file = input_sound_file.set_channels(1)
input_sound_file = input_sound_file.set_frame_rate(16000)

print("input file name: {}".format(input_sound_file_name))
print("input average dB: {}".format(input_sound_file.dBFS) )
print("new frame rate (khz): {}".format(input_sound_file.frame_rate/1000 ))
print("new channels: {}".format(input_sound_file.channels))


input file name: The Sound of the Swahili language (Numbers, Greetings, Words & Sample Text).mp3
input average dB: -15.133787820994764
new frame rate (khz): 16.0
new channels: 1


In [6]:
# Split track where the silence is 2 seconds or more and get chunks using 
# the imported function.
chunks = split_on_silence (
    # Use the loaded audio.
    input_sound_file, 
    # Specify that a silent chunk must be at least 1 seconds or 1000 ms long.
    min_silence_len = 1000,
    # Consider a chunk silent if it's quieter than -16 dBFS, where 0 is max dB of
    # (You may want to adjust this parameter.)
    silence_thresh = -26,
    # amount of silence to leave at the beginning
    # and end of the chunks. Keeps the sound from sounding like it is
    # abruptly cut off
    keep_silence = 500
)
print("number of chunks: {}".format(len(chunks)))


number of chunks: 62


In [7]:
# let's adjust the chunks to make sure the min chunk size of 3 seconds
chunks = check_min_chunk_length(chunks, 3)

In [8]:
# Process and export each chunk 
for i, chunk in enumerate(chunks):
    # Create a silence chunk that's 0.5 seconds (or 500 ms) long for padding.
    silence_padding = AudioSegment.silent(duration=500)
    
    # Add the padding chunk to beginning and end of the entire chunk.
    audio_chunk = silence_padding + chunk + silence_padding

    # Normalize the entire chunk.
    normalized_chunk = signal_process_for_kaldi(audio_chunk, -16.0)

    # Export the audio chunk with new bitrate.
    print("Exporting: chunk{0}.wav.".format(i))
    normalized_chunk.export(
        ".//soundchunks//chunk{0}.wav".format(i),
        bitrate = "192k",
        format = "wav"
    )

   #if i > 7 :
   #     break

Exporting: chunk0.wav.
Exporting: chunk1.wav.
Exporting: chunk2.wav.
Exporting: chunk3.wav.
Exporting: chunk4.wav.
Exporting: chunk5.wav.
Exporting: chunk6.wav.
Exporting: chunk7.wav.
Exporting: chunk8.wav.
Exporting: chunk9.wav.
Exporting: chunk10.wav.
Exporting: chunk11.wav.
Exporting: chunk12.wav.
Exporting: chunk13.wav.
Exporting: chunk14.wav.
Exporting: chunk15.wav.
Exporting: chunk16.wav.
Exporting: chunk17.wav.
Exporting: chunk18.wav.
Exporting: chunk19.wav.
Exporting: chunk20.wav.
Exporting: chunk21.wav.
Exporting: chunk22.wav.
Exporting: chunk23.wav.
Exporting: chunk24.wav.
Exporting: chunk25.wav.
Exporting: chunk26.wav.
Exporting: chunk27.wav.
Exporting: chunk28.wav.
Exporting: chunk29.wav.
Exporting: chunk30.wav.
Exporting: chunk31.wav.
Exporting: chunk32.wav.
Exporting: chunk33.wav.
Exporting: chunk34.wav.
Exporting: chunk35.wav.
Exporting: chunk36.wav.
Exporting: chunk37.wav.
Exporting: chunk38.wav.
Exporting: chunk39.wav.
Exporting: chunk40.wav.
Exporting: chunk41.wav.
Ex