### SpeechRecognition

In [4]:
import os
from pydub import AudioSegment
import speech_recognition as sr
recognizer = sr.Recognizer()

def speech_recog_stt(source_file: str):
    file = sr.AudioFile(source_file)
    with file as source:
        audio = recognizer.record(source)
    return recognizer.recognize_google(audio)


converter

In [2]:
import os

def create_folder(path):
    if not os.path.exists(path):
        os.mkdir(path)

def to_wav(source_file): 

    if os.path.isfile(source_file):
        file_name, file_suff = os.path.splitext(source_file)
        dir_name = os.path.dirname(source_file)
        base_name = os.path.basename(source_file)[:-4]
        destination = f'{dir_name}/to_wavs/'

        if file_suff == '.mp3':
            try:
                sound = AudioSegment.from_file(source_file, "mp3")
            except:
                sound = AudioSegment.from_file(source_file, format="mp4")

            create_folder(destination)
            sound.export(f"{destination}/{base_name}.wav", format="wav")
            print('sound file successfully converted wav')

            return f'{destination}/{base_name}.wav'

In [60]:
import moviepy.editor as ed

path = "C:\\Users\\mickl\\Videos"
destination = 'C:\\Users\\mickl\\OneDrive - Lexcode\\My Research\\STT Engines\\audios\\phonetic_lines'


for file in os.listdir(path):

    if os.path.isfile(file) and '.mp4' in file:
        mp4_file = f'{path}\{file}'
        mp3_file = f'{destination}\{file[:-4]}.mp3'

        video = ed.VideoFileClip(mp4_file)
        video.audio.write_audiofile(mp3_file)

### Faster Whisper

In [5]:
from faster_whisper import WhisperModel

def load_whisper_model(model_size : str):
    return WhisperModel(model_size, device="cpu", compute_type="int8")

def faster_whisper_stt(whisper_model, source_file: str):

    try:
        model = whisper_model
        segments, info = model.transcribe(source_file, beam_size=5)

        for segment in segments:
            return segment.text.strip()

    except Exception as e:
        print(f"Exception: {e}")

  from .autonotebook import tqdm as notebook_tqdm


### Vosk

In [6]:
from vosk import Model, KaldiRecognizer
from pydub import AudioSegment
import json

def load_vosk_model(modelName: str):
    return Model(model_name = modelName)

def vosk_stt(vosk_model, source_file : str):
    FRAME_RATE = 16000
    CHANNELS = 1

    try:
        mp3 = AudioSegment.from_file(source_file, "mp3")
    except:
        mp3 = AudioSegment.from_file(source_file, format="mp4")
    mp3 = mp3.set_channels(CHANNELS)
    mp3 = mp3.set_frame_rate(FRAME_RATE)
    
    rec = KaldiRecognizer(vosk_model, FRAME_RATE)
    rec.SetWords(True)
    rec.AcceptWaveform(mp3.raw_data)

    result = rec.Result()   
    return json.loads(result)['text']

### Load All Models

In [7]:
whisper_model = load_whisper_model('base.en')
vosk_model = load_vosk_model('vosk-model-en-us-0.22')

config.json: 100%|██████████| 2.23k/2.23k [00:00<00:00, 557kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development

tokenizer.json: 100%|██████████| 2.13M/2.13M [00:00<00:00, 5.55MB/s]

vocabulary.txt: 100%|██████████| 422k/422k [00:00<00:00, 773kB/s]
model.bin: 100%|██████████| 145M/145M [00:12<00:00, 11.3MB/s] 
vosk-model-en-us-0.22.zip: 100%|██████████| 1.78G/1.78G [04:31<00:00, 7.05MB/s]   


### Download Test Audios

In [5]:
from pytube import YouTube 
import os 

In [18]:
urls = ['https://www.youtube.com/watch?v=HS7YZhsjRAo',
        'https://www.youtube.com/watch?v=CWgAOFM3HN0',
        'https://www.youtube.com/watch?v=IKmQW7JTb6s']

In [19]:
# url input from user 

for url in urls:

    yt = YouTube(url)
    video = yt.streams.filter(only_audio=True).first() 
    out_file = video.download('audios/pre_recorded_vid_to_sound_clips') 

    base, ext = os.path.splitext(out_file) 
    new_file = base + '.mp3'
    os.rename(out_file, new_file) 
    
    print(yt.title + " has been successfully downloaded.")

"Bazinga Punk!" - Sheldon Cooper - The Big Bang Theory has been successfully downloaded.
Homer Simpson - I'm drawing a line down, a la "I love Lucy" has been successfully downloaded.
Spider Man - With Great Power Comes Great Responsibility has been successfully downloaded.


### Test Area

#### Pre-Process

In [None]:
import moviepy.editor as ed

path = "C:\\Users\\mickl\\Videos"
destination = 'C:\\Users\\mickl\\OneDrive - Lexcode\\My Research\\STT Engines\\audios\\phonetic_lines'


for file in os.listdir(path):

    if os.path.isfile(file) and '.mp4' in file:
        mp4_file = f'{path}\{file}'
        mp3_file = f'{destination}\{file[:-4]}.mp3'

        video = ed.VideoFileClip(mp4_file)
        video.audio.write_audiofile(mp3_file)

In [61]:
# parent_dir = "audios/famous_monologues"
parent_dir = "audios/phonetic_lines"

for file in os.listdir(parent_dir):
    
    if ".mp3" in file:
        AUDIO_FILE = f"{parent_dir}/{file}"
        try:
            to_wav(AUDIO_FILE)
        except Exception as e:
            print(e)

sound file successfully converted wav
sound file successfully converted wav
sound file successfully converted wav
sound file successfully converted wav
sound file successfully converted wav


#### Test

In [62]:
speech_recog_results = []
faster_whisper_results = []
vosk_results = []

In [63]:
# parent_dir = "audios/famous_monologues/to_wavs"
parent_dir = "audios/phonetic_lines/to_wavs"

for file in os.listdir(parent_dir):

    if ".wav" in file:
        source_file = f'{parent_dir}/{file}'

        try:
            result = speech_recog_stt(source_file)
            print(result)
            speech_recog_results.append(result)
        except Exception as e:
            print(e)
            speech_recog_results.append('"Error Data"')

how much wood would a woodchuck chuck if a woodchuck could chuck wood
peter piper picked a peck of pickled peppers
sally sells seashells by the seashore
she sells seashells on the seashore the shells that she sells are the seashells i'm sure
the quick brown fox jumps over the lazy dog


In [64]:
# parent_dir = "audios/famous_monologues"
parent_dir = "audios/phonetic_lines"

for file in os.listdir(parent_dir):

    if ".mp3" in file:
        source_file = f'{parent_dir}/{file}'

        try:
            result = faster_whisper_stt(whisper_model, source_file)
            print(result)
            faster_whisper_results.append(result)

        except Exception as e:
            print(e)
            faster_whisper_results.append('"Error Data"')

How much wood would a wood chop if a wood chop could chop wood?
Peter Piper Pig, a pack of pickled peppers.
Sally sells seashells my DC short.
She sells seashells on the seashore. The shells that she sells are the seashells, I'm sure.
The weak brown fox jumps over the lazy dog.


In [65]:
# parent_dir = "audios/famous_monologues"
parent_dir = "audios/phonetic_lines"

for file in os.listdir(parent_dir):

    if ".mp3" in file:
        source_file = f'{parent_dir}/{file}'

        try:
            result = vosk_stt(vosk_model, source_file)
            print(result)
            vosk_results.append(result)

        except Exception as e:
            print(e)
            vosk_results.append('"Error Data"')

how much would would a woodchuck chuck if a woodchuck could chuck wood
peter piper picked a peck of pickled peppers
sally sells seashells by the seashore
she says seashells on the seashore the shells that she says are the seashells i'm sure
the quick brown fox jumps over the lazy


### View Results

In [66]:
import pandas as pd

In [67]:
df = pd.DataFrame({'Speech Recognition': speech_recog_results, 'Faster-Whisper': faster_whisper_results, 'Vosk': vosk_results}, index=None)
df

Unnamed: 0,Speech Recognition,Faster-Whisper,Vosk
0,how much wood would a woodchuck chuck if a woo...,How much wood would a wood chop if a wood chop...,how much would would a woodchuck chuck if a wo...
1,peter piper picked a peck of pickled peppers,"Peter Piper Pig, a pack of pickled peppers.",peter piper picked a peck of pickled peppers
2,sally sells seashells by the seashore,Sally sells seashells my DC short.,sally sells seashells by the seashore
3,she sells seashells on the seashore the shells...,She sells seashells on the seashore. The shell...,she says seashells on the seashore the shells ...
4,the quick brown fox jumps over the lazy dog,The weak brown fox jumps over the lazy dog.,the quick brown fox jumps over the lazy


In [72]:
print(' =Speech Recognition=')
for line in df['Speech Recognition']:
    print(line)

print('\n','=Faster-Whisper=')
for line in df['Faster-Whisper']:
    print(line)

print('\n','==Vosk==')
for line in df['Vosk']:
    print(line)

 =Speech Recognition=
how much wood would a woodchuck chuck if a woodchuck could chuck wood
peter piper picked a peck of pickled peppers
sally sells seashells by the seashore
she sells seashells on the seashore the shells that she sells are the seashells i'm sure
the quick brown fox jumps over the lazy dog

 =Faster-Whisper=
How much wood would a wood chop if a wood chop could chop wood?
Peter Piper Pig, a pack of pickled peppers.
Sally sells seashells my DC short.
She sells seashells on the seashore. The shells that she sells are the seashells, I'm sure.
The weak brown fox jumps over the lazy dog.

 ==Vosk==
how much would would a woodchuck chuck if a woodchuck could chuck wood
peter piper picked a peck of pickled peppers
sally sells seashells by the seashore
she says seashells on the seashore the shells that she says are the seashells i'm sure
the quick brown fox jumps over the lazy
