In [1]:
#!pip3 install vosk

In [2]:
from vosk import Model, KaldiRecognizer, SetLogLevel
from pydub import AudioSegment
from imutils import paths

import pandas as pd
import wave
import json
import os

In [3]:
# Maps every supported voice command to the number of clips that have been
# detected and trimmed for it so far (all counters start at zero).
# Edit the tuple below to work with a custom list of commands.
cmdCounter = {
    command: 0
    for command in (
        'артқа', 'алға', 'оңға', 'солға', 'төмен', 'жоғары', 'жүр',
        'тоқта', 'қос', 'өшір', 'иә', 'жоқ', 'үйрен', 'орында',
        'нөл', 'бір', 'екі', 'үш', 'төрт', 'бес', 'алты',
        'жеті', 'сегіз', 'тоғыз', 'төсек', 'құс', 'мысық', 'ит',
        'бақытты', 'үй', 'оқы', 'жаз', 'ағаш', 'көрнекі', 'мәссаған',
    )
}

In [4]:
# dataset_path: directory that is searched for the source .wav/.txt files
# command_path: directory that receives the trimmed command clips
dataset_path = 'speech_corpus/'
command_path = 'speech_commands/'

# create the output directory; exist_ok=True makes the cell safe to re-run
# and avoids the race between checking for the directory and creating it
os.makedirs(command_path, exist_ok=True)

In [5]:
# collect the audio (.wav) and transcript (.txt) files found under
# dataset_path, each list in sorted order
wavPaths = sorted(paths.list_files(dataset_path, validExts="wav"))
txtPaths = sorted(paths.list_files(dataset_path, validExts="txt"))

# report how many files of each kind were found
print(f"Number of wav files (.wav): {len(wavPaths)}")
print(f"Number of txt files (.txt): {len(txtPaths)}")

Number of wav files (.wav): 2
Number of txt files (.txt): 2


In [12]:
# load Vosk Speech Recognizer model for Kazakh
# you can find models for other languages here:
# https://alphacephei.com/vosk/models
model = Model('vosk-model-kz-0.15')
sr = 16000  # sample rate (Hz) given to the recognizer and applied to the loaded audio
conf_thr = 0.5  # confidence threshold for Vosk model detections
rec = KaldiRecognizer(model, sr)
rec.SetWords(True)  # the loop below reads per-word 'start', 'end' and 'conf' entries


def _file_stem(path):
    """Return the file name of `path` without its directory and extension."""
    return os.path.splitext(os.path.basename(path))[0]


def _read_transcript_words(txt_path):
    """Return the set of whitespace-separated words of a transcript file."""
    # NOTE(review): transcripts are assumed to be UTF-8 encoded - confirm
    with open(txt_path, 'r', encoding='utf-8') as txt_file:
        return set(txt_file.read().split())


def _recognize(recognizer, wav_path, chunk_frames=4000):
    """Feed a wav file to `recognizer` in chunks and return the parsed JSON results."""
    results = []
    # the context manager closes the wav handle even if recognition fails
    with wave.open(wav_path, "rb") as wav_file:
        while True:
            data = wav_file.readframes(chunk_frames)
            if len(data) == 0:
                break
            if recognizer.AcceptWaveform(data):
                results.append(json.loads(recognizer.Result()))
    results.append(json.loads(recognizer.FinalResult()))
    return results


# Total duration (ms) of the files processed so far. It is subtracted from every
# word timestamp before slicing.
# NOTE(review): this presumably compensates for the shared recognizer reporting
# times that keep growing across files - confirm for the Vosk version in use.
delta_t = 0
for i, (wavPath, txtPath) in enumerate(zip(wavPaths, txtPaths), start=1):
    print("[INFO] Processing file: {}/{}".format(i, len(wavPaths)))

    # verify the correspondence of file names (compare the base names,
    # not the leading directory component)
    wav_name = _file_stem(wavPath)
    txt_name = _file_stem(txtPath)
    assert wav_name == txt_name, f"File name mismatch: {wavPath} vs {txtPath}"

    # words of the transcript; a set gives fast membership checks
    words = _read_transcript_words(txtPath)

    # get speech recognition results as JSON dictionaries
    vosk_results = _recognize(rec, wavPath)

    # load the original wav file via AudioSegment for segmentation
    org_wav = AudioSegment.from_wav(wavPath)
    wav_duration = len(org_wav)  # duration in milliseconds
    # set_frame_rate returns a new segment, so the result has to be kept
    org_wav = org_wav.set_frame_rate(sr)

    for sentence in vosk_results:
        # sometimes recognition returns only {'text': ''} with no
        # 'result' list; such entries contribute no words
        for word in sentence.get('result', []):
            cmd = word['word']
            # keep only confident detections of known commands that
            # also occur in the transcript
            if cmd not in words or cmd not in cmdCounter or word['conf'] <= conf_thr:
                continue

            # convert seconds to milliseconds relative to this file
            t1 = word['start'] * 1000 - delta_t
            t2 = word['end'] * 1000 - delta_t
            cmd_wav = org_wav[t1:t2]

            save_path = os.path.join(command_path, cmd)
            os.makedirs(save_path, exist_ok=True)

            # number the clips so that repeated detections of the same
            # command do not overwrite each other; cmdCounter is not reset
            # here, so re-running only this cell continues the numbering
            cmdCounter[cmd] += 1
            clip_name = f"{cmd}_{cmdCounter[cmd]}.wav"
            cmd_wav.export(os.path.join(save_path, clip_name), format="wav")

            print(f"Conf: {word['conf']:.2f} | Start-End time (s): {word['start']:.3f}-{word['end']:.3f} | Cmd: {word['word']}")

    delta_t += wav_duration

LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=13 max-active=7000 lattice-beam=6
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:6:7:8:9:10
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 0 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 0 orphan components.
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from vosk-model-kz-0.15/ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:279) Loading HCLG from vosk-model-kz-0.15/graph/HCLG.fst
LOG (VoskAPI:ReadDataFiles():model.cc:294) Loading words from vosk-model-kz-0.15/graph/words.txt
LOG (VoskAPI:ReadDataFiles():model.cc:303) Loading winfo vosk-model-kz-0.15/graph/phones/word_boundary.int
LOG (VoskAPI:ReadDataFiles():model.cc:310) Loading subtract G.fst model from

[INFO] Processing file: 1/2
Conf: 1.00 | Start-End time: 0.750-0.960 | Cmd: бес
Conf: 0.78 | Start-End time: 1.290-1.498 | Cmd: жүр
Conf: 0.99 | Start-End time: 1.498-2.040 | Cmd: тоқта
Conf: 1.00 | Start-End time: 2.790-3.150 | Cmd: солға
Conf: 1.00 | Start-End time: 3.630-3.900 | Cmd: екі
Conf: 0.99 | Start-End time: 4.170-4.500 | Cmd: жүр
[INFO] Processing file: 1/2
Conf: 0.94 | Start-End time: 6.509-6.869 | Cmd: оқы


In [7]:
# build a one-column dataframe of the per-command counts, indexed by
# command, for further analysis
df = pd.DataFrame(list(cmdCounter.values()), index=list(cmdCounter.keys()))
df

Unnamed: 0,0
артқа,0
алға,0
оңға,0
солға,1
төмен,0
жоғары,0
жүр,2
тоқта,1
қос,0
өшір,0
