# Audio segmentation

Automatic audio segmentation using vosk toolbox

In [3]:
from vosk import Model, KaldiRecognizer
import wave
import os

First define a function to cut the wav file, understanding when a sentence end

In [71]:
def trim_wav_vosk(file_path, model):
    l = []
    with wave.open(file_path, 'rb') as wf:
        rec = KaldiRecognizer(model, wf.getframerate())
        rec.SetWords(True)
        rec.SetPartialWords(True)
        fname = os.path.splitext(os.path.basename(file_path))[0]
        new_folder = os.path.join(os.path.dirname(file_path),'Vosk', fname)
        if not os.path.exists(new_folder):
            os.makedirs(new_folder)
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                l.append(rec.Result())
        if len(l)>0:
            for i in range(len(l)):
                x=l[i].split()
                if len(x)>11:
                    indices = [position for position, phrase in enumerate(x) if 'text' in phrase]
                    params = wf.getparams()
                    frame_rate = params[2]
                    start = int(float(x[12].rstrip(','))*frame_rate)
                    end=int(float(x[indices[0]-8].rstrip(','))*frame_rate)
                    wf.setpos(start)
                    frame_to_trim = end-start
                    frame = wf.readframes(frame_to_trim)
                    with wave.open(os.path.join(new_folder,fname+ str(i) +".wav"), "wb") as output_wav:
                        output_wav.setparams(params)
                        output_wav.writeframes(frame)

Define the different paths

In [4]:
path_code=os.getcwd()
path_SLA=os.path.join(path_code, "../Data/SLA/Normal")
path_Stroke=os.path.join(path_code, "../Data/Stroke/Normal")
path_HC=os.path.join(path_code, "../Data/Healthy control/Normal")

In [5]:
model = Model(lang="en-us")

In [72]:
for element in os.listdir(path_HC):
    file_path = os.path.join(path_HC, element)
    if os.path.isfile(file_path):
        trim_wav_vosk(file_path, model)

In [63]:
for element in os.listdir(path_SLA):
    file_path = os.path.join(path_SLA, element)
    if os.path.isfile(file_path):
        trim_wav_vosk(file_path, model)

In [64]:
for element in os.listdir(path_Stroke):
    file_path = os.path.join(path_Stroke, element)
    if os.path.isfile(file_path):
        trim_wav_vosk(file_path, model)

In [5]:
path_HC_PA=os.path.join(path_code, "../Data/Healthy control/PA")
path_SLA_PA=os.path.join(path_code, "../Data/SLA/PA")
path_Stroke_PA=os.path.join(path_code, "../Data/Stroke/PA")

In [68]:
for element in os.listdir(path_HC_PA):
    file_path = os.path.join(path_HC_PA, element)
    if os.path.isfile(file_path):
        trim_wav_vosk(file_path, model)


for element in os.listdir(path_SLA_PA):
    file_path = os.path.join(path_SLA_PA, element)
    if os.path.isfile(file_path):
        trim_wav_vosk(file_path, model)


for element in os.listdir(path_Stroke_PA):
    file_path = os.path.join(path_Stroke_PA, element)
    if os.path.isfile(file_path):
        trim_wav_vosk(file_path, model)


In [6]:
path_HC_PATAKA=os.path.join(path_code, "../Data/Healthy control/PATAKA")
path_SLA_PATAKA=os.path.join(path_code, "../Data/SLA/PATAKA")
path_Stroke_PATAKA=os.path.join(path_code, "../Data/Stroke/PATAKA")

In [73]:
for element in os.listdir(path_HC_PATAKA):
    file_path = os.path.join(path_HC_PATAKA, element)
    if os.path.isfile(file_path):
        trim_wav_vosk(file_path, model)

for element in os.listdir(path_SLA_PATAKA):
    file_path = os.path.join(path_SLA_PATAKA, element)
    if os.path.isfile(file_path):
        trim_wav_vosk(file_path, model)
       
for element in os.listdir(path_Stroke_PATAKA):
    file_path = os.path.join(path_Stroke_PATAKA, element)
    if os.path.isfile(file_path):
        trim_wav_vosk(file_path, model)

# Speech recognition approach

In [74]:
import speech_recognition as sr
from pydub import AudioSegment



In [80]:
def trim_audio_sr(input_file, output_file, keyword):
    # Load the audio file
    audio = AudioSegment.from_file(input_file)
    
    # Use speech recognition to get timestamps of the keyword
    recognizer = sr.Recognizer()
    with sr.AudioFile(input_file) as source:
        audio_data = recognizer.record(source)  # Record the entire audio file
        try:
            # Recognize the speech and get the timestamps
            result = recognizer.recognize_google(audio_data, show_all=True)
            if 'alternative' in result:
                alternative = result['alternative'][0]
                if 'timestamps' in alternative:
                    timestamps = alternative['timestamps']
                    start_time = timestamps[result.lower().index(keyword)][1]
                    end_time = timestamps[result.lower().index(keyword) + len(keyword) - 1][2]
                else:
                    print("Timestamps not found in the recognition result.")
                    return
            else:
                print("No alternative found in the recognition result.")
                return
        except sr.UnknownValueError:
            print("Could not understand audio")
            return
        except sr.RequestError as e:
            print(f"Could not request results: {e}")
            return
    
    # Trim the audio
    trimmed_audio = audio[start_time * 1000:end_time * 1000]  # Convert seconds to milliseconds
    
    # Export the trimmed audio
    trimmed_audio.export(output_file, format="wav")
    # Export the trimmed audio
    trimmed_audio.export(output_file, format="wav")

In [79]:
input_file = "N001_02_BBP_NORMAL.wav"
output_file = "output_audio.wav"
keyword = "pataka"

trim_audio_sr(input_file, output_file, keyword)

Timestamps not found in the recognition result.


Does not found the words

# Silence removal

In [4]:
from pydub import AudioSegment, silence
from pyAudioAnalysis import audioBasicIO, audioSegmentation



In [7]:
def trim_audio_pydub(file_path):
    # Load the audio file
   
    audio = AudioSegment.from_file(file_path)
    fname = os.path.splitext(os.path.basename(file_path))[0]
    new_folder = os.path.join(os.path.dirname(file_path),'Pocketsphinx', fname)
    if not os.path.exists(new_folder):
        os.makedirs(new_folder)
    non_silence_ranges = silence.detect_nonsilent(audio, min_silence_len=200, silence_thresh=-50)

    
    # Export each voice segment to a separate file
    for i, (start, end) in enumerate(non_silence_ranges):
        
        # Trim the audio to the current voice segment
        trimmed_audio = audio[start:end]
        
        # Create a unique filename for the segment
        filename = os.path.join(new_folder, f"segment_{i}.wav")
        
        # Export the trimmed audio
        trimmed_audio.export(filename, format="wav")


In [9]:
for element in os.listdir(path_HC):
    file_path = os.path.join(path_HC, element)
    if os.path.isfile(file_path) and os.path.splitext(file_path)[1].lower() == ".wav":
        trim_audio_pydub(file_path)

for element in os.listdir(path_SLA):
    file_path = os.path.join(path_SLA, element)
    if os.path.isfile(file_path) and os.path.splitext(file_path)[1].lower() == ".wav":
        trim_audio_pydub(file_path)

for element in os.listdir(path_Stroke):
    file_path = os.path.join(path_Stroke, element)
    if os.path.isfile(file_path) and os.path.splitext(file_path)[1].lower() == ".wav":
        trim_audio_pydub(file_path)

In [12]:
for element in os.listdir(path_HC_PA):
    file_path = os.path.join(path_HC_PA, element)
    if os.path.isfile(file_path) and os.path.splitext(file_path)[1].lower() == ".wav":
        trim_audio_pydub(file_path)


for element in os.listdir(path_SLA_PA):
    file_path = os.path.join(path_SLA_PA, element)
    if os.path.isfile(file_path) and os.path.splitext(file_path)[1].lower() == ".wav":
        trim_audio_pydub(file_path)


for element in os.listdir(path_Stroke_PA):
    file_path = os.path.join(path_Stroke_PA, element)
    if os.path.isfile(file_path) and os.path.splitext(file_path)[1].lower() == ".wav":
        trim_audio_pydub(file_path)

In [13]:
for element in os.listdir(path_HC_PATAKA):
    file_path = os.path.join(path_HC_PATAKA, element)
    if os.path.isfile(file_path) and os.path.splitext(file_path)[1].lower() == ".wav":
        trim_audio_pydub(file_path)

for element in os.listdir(path_SLA_PATAKA):
    file_path = os.path.join(path_SLA_PATAKA, element)
    if os.path.isfile(file_path) and os.path.splitext(file_path)[1].lower() == ".wav":
        trim_audio_pydub(file_path)
       
for element in os.listdir(path_Stroke_PATAKA):
    file_path = os.path.join(path_Stroke_PATAKA, element)
    if os.path.isfile(file_path) and os.path.splitext(file_path)[1].lower() == ".wav":
        trim_audio_pydub(file_path)

# Pocketsphinx

In [7]:
import pocketsphinx
import librosa

In [20]:
def trim_audio_pocket(file_path, config):
    audio_p = wave.open(file_path, "rb")
    audio, fs = librosa.load(file_path, sr=None)
    params = audio_p.getparams()
    audio_file = open(file_path, 'rb').read()
    fname = os.path.splitext(os.path.basename(file_path))[0]
    new_folder = os.path.join(os.path.dirname(file_path),'Pocketsphinx', fname)
    if not os.path.exists(new_folder):
        os.makedirs(new_folder)
        
    decoder = pocketsphinx.Decoder(config)
    decoder.start_utt()
    decoder.process_raw(audio_file, False, True)
    decoder.end_utt()

    phoneme_segments = [(seg.word, seg.start_frame, seg.end_frame) for seg in decoder.seg()]

    i=0
    for label, start_frame, end_frame in phoneme_segments:
        start_sample = int(start_frame * fs)
        end_sample = int(end_frame * fs)
        segment = audio[start_sample:end_sample]
        if label == '[SPEECH]':
            with wave.open(os.path.join(new_folder,fname+ str(i) +".wav"), 'wb') as wf:
                wf.setparams(params)
                wf.writeframes(segment.tobytes())
            i=i+1

In [21]:
config = pocketsphinx.Decoder.default_config()
config.set_string('-hmm', os.path.join(path_code, '../Acoustic model'))
config.set_string('-dict', os.path.join(path_code,'../cmudict.dict'))
config.set_string('-lm', os.path.join(path_code,'../en-70k-0.1.lm'))

for element in os.listdir(path_HC):
    file_path = os.path.join(path_HC, element)
    if os.path.isfile(file_path) and os.path.splitext(file_path)[1].lower() == ".wav":
        trim_audio_pocket(file_path, config)

  config = pocketsphinx.Decoder.default_config()


In [None]:
for element in os.listdir(path_SLA):
    file_path = os.path.join(path_SLA, element)
    if os.path.isfile(file_path) and os.path.splitext(file_path)[1].lower() == ".wav":
        trim_audio_pocket(file_path, config)

for element in os.listdir(path_Stroke):
    file_path = os.path.join(path_Stroke, element)
    if os.path.isfile(file_path) and os.path.splitext(file_path)[1].lower() == ".wav":
        trim_audio_pocket(file_path, config)

for element in os.listdir(path_HC_PA):
    file_path = os.path.join(path_HC_PA, element)
    if os.path.isfile(file_path) and os.path.splitext(file_path)[1].lower() == ".wav":
        trim_audio_pocket(file_path,config)


for element in os.listdir(path_SLA_PA):
    file_path = os.path.join(path_SLA_PA, element)
    if os.path.isfile(file_path) and os.path.splitext(file_path)[1].lower() == ".wav":
        trim_audio_pocket(file_path, config)


for element in os.listdir(path_Stroke_PA):
    file_path = os.path.join(path_Stroke_PA, element)
    if os.path.isfile(file_path) and os.path.splitext(file_path)[1].lower() == ".wav":
        trim_audio_pocket(file_path, config)

for element in os.listdir(path_HC_PATAKA):
    file_path = os.path.join(path_HC_PATAKA, element)
    if os.path.isfile(file_path) and os.path.splitext(file_path)[1].lower() == ".wav":
        trim_audio_pydub(file_path)

for element in os.listdir(path_SLA_PATAKA):
    file_path = os.path.join(path_SLA_PATAKA, element)
    if os.path.isfile(file_path) and os.path.splitext(file_path)[1].lower() == ".wav":
        trim_audio_pydub(file_path)
       
for element in os.listdir(path_Stroke_PATAKA):
    file_path = os.path.join(path_Stroke_PATAKA, element)
    if os.path.isfile(file_path) and os.path.splitext(file_path)[1].lower() == ".wav":
        trim_audio_pydub(file_path)