In [None]:
# This notebook tests the speech_recognition library on Russian speech (microphone input and audio files).
# The main part transcribes long .m4a recordings.

In [None]:
# speech_recognition could only be installed from a wheel here (a plain `pip install` failed).
# It also requires PyAudio, which in turn requires portaudio (`brew install portaudio`).

In [None]:
# test mic input

import speech_recognition as sr

r = sr.Recognizer()

# listen() raises sr.WaitTimeoutError when no phrase starts within `timeout`
# seconds, so the capture is guarded and `audio` stays None in that case
# instead of leaving the cell with an unhandled exception.
audio = None
with sr.Microphone() as source:
    print("Скажите что-нибудь:")
    try:
        audio = r.listen(source, timeout=1, phrase_time_limit=5)
    except sr.WaitTimeoutError:
        print("Не услышал речи за отведённое время")

# Only call the recognition service when something was actually recorded.
if audio is not None:
    try:
        print(r.recognize_google(audio, language="ru-RU"))
    except sr.UnknownValueError:
        # The service answered but could not make out any words.
        print("Не расслышал фразу")
    except sr.RequestError as e:
        # The service could not be reached or rejected the request.
        print("Ошибка сервиса; {0}".format(e))

In [None]:
# Convert m4a extension files to wav extension files
      
import os
import argparse

from pydub import AudioSegment

# Extensions (with the leading dot) of the source files that get converted to WAV.
formats_to_convert = ['.m4a']

# Root folder that is walked recursively; converted files are written next to
# their sources and the sources are deleted afterwards.
m4a_folder = "recordings_m4a/"

for (dirpath, dirnames, filenames) in os.walk(m4a_folder):
    for filename in filenames:
        if not filename.endswith(tuple(formats_to_convert)):
            continue

        filepath = os.path.join(dirpath, filename)
        (stem, file_extension) = os.path.splitext(filename)
        file_extension_final = file_extension.replace('.', '')
        # Swap only the extension: a plain str.replace() would also rewrite
        # "m4a" wherever else it occurs in the file name.
        wav_path = os.path.join(dirpath, stem + '.wav')
        try:
            track = AudioSegment.from_file(filepath, file_extension_final)
            print('CONVERTING: ' + str(filepath))
            # export() hands back the open output file; close it so handles
            # do not pile up while walking a large folder.
            track.export(wav_path, format='wav').close()
            # The source is removed only after the export succeeded.
            os.remove(filepath)
        except Exception as e:
            # Best effort: report the failing file and carry on with the rest.
            # `Exception` (not a bare except) lets Ctrl-C / kernel interrupt through.
            print("ERROR CONVERTING " + str(filepath) + ": " + str(e))

# Rename the recordings_m4a folder to wav_files (it now holds .wav files):
# !mv recordings_m4a wav_files

In [None]:
# test one audio file

r = sr.Recognizer()

file = "42/0_New Recording 42.wav"

with sr.AudioFile(file) as source:      # AudioFile is the current name of the WavFile alias
    audio = r.record(source)            # read the whole file into an AudioData object

try:
    # recognize_google() returns only the most likely transcription as a string;
    # the alternatives with confidences are available via show_all=True, which
    # returns the raw API response instead.
    transcription = r.recognize_google(audio, language="ru-RU")
    print("Possible transcriptions:")
    print(transcription)
except sr.UnknownValueError:            # speech is unintelligible
    print("Could not understand audio")
except sr.RequestError as e:            # service unreachable or request rejected
    print(f"Service error; {e}")

In [None]:
# for short files - all in one folder
from tqdm import tqdm
# Collected results as [filename, transcription] pairs.
transcriptions = []

folder = 'recordings_wav'
print(f'Folder: {folder}')

# One recognizer is reused for every file.
r = sr.Recognizer()

for (dirpath, dirnames, filenames) in tqdm(os.walk(f"{folder}/")):
    # Sorted so the progress log and the result order are deterministic.
    for filename in sorted(filenames):
        if not filename.endswith('wav'):
            continue

        print(f'Transcribing {filename}...')
        # Build the path from dirpath (not the root folder) so files found in
        # sub-directories by os.walk() are opened from where they really are.
        with sr.AudioFile(os.path.join(dirpath, filename)) as source:
            audio = r.record(source)    # read the whole file into an AudioData object

        try:
            transcription = r.recognize_google(audio, language="ru-RU")
        except sr.UnknownValueError:
            # Nothing recognizable in this file; skip it.
            print("No sound")
        except sr.RequestError as e:
            # A failed request must not abort the whole batch.
            print(f"Could not get a transcription for {filename}: {e}")
        else:
            transcriptions.append([filename, transcription])
            print(f'{filename} saved!')

print('')


# filename -> transcription lookup for the files that were recognized.
text_dict = {name: text for (name, text) in transcriptions}

text_dict

In [None]:
# Google API fails ('Bad error') for large files
# split large file into 2-min files (need to test what is the largest size possible)

from pydub import AudioSegment
import math

class SplitWavAudioMubin():
    """Split one WAV file into consecutive fixed-length chunks.

    The chunks are written into the same folder as the source file and are
    named ``<start_minute>_<original filename>``.
    """

    def __init__(self, folder, filename):
        """Load ``folder/filename`` as a WAV file.

        Args:
            folder: directory that contains the file (chunks are written here too).
            filename: name of the WAV file inside ``folder``.
        """
        self.folder = folder
        self.filename = filename
        self.filepath = folder + '/' + filename

        self.audio = AudioSegment.from_wav(self.filepath)

    def get_duration(self):
        """Return the duration of the loaded audio in seconds."""
        return self.audio.duration_seconds

    def single_split(self, from_min, to_min, split_filename):
        """Export the audio between two minute marks as one WAV file.

        Args:
            from_min: start of the chunk, in minutes.
            to_min: end of the chunk, in minutes.
            split_filename: name of the output file, created inside ``self.folder``.
        """
        # The audio object is sliced in milliseconds.
        t1 = from_min * 60 * 1000
        t2 = to_min * 60 * 1000
        split_audio = self.audio[t1:t2]
        # export() returns the open output file; close it so that splitting a
        # long recording does not leave one open handle per chunk.
        out_f = split_audio.export(self.folder + '/' + split_filename, format="wav")
        out_f.close()

    def multiple_split(self, min_per_split):
        """Split the whole file into chunks of ``min_per_split`` minutes.

        Args:
            min_per_split: chunk length in whole minutes; must be at least 1.

        Returns:
            The list of chunk file names that were written, in order.

        Raises:
            ValueError: if ``min_per_split`` is smaller than 1.
        """
        if min_per_split < 1:
            raise ValueError('min_per_split must be at least 1')

        total_mins = math.ceil(self.get_duration() / 60)
        split_filenames = []
        for i in range(0, total_mins, min_per_split):
            split_fn = str(i) + '_' + self.filename
            self.single_split(i, i + min_per_split, split_fn)
            print(str(i) + ' Done')
            split_filenames.append(split_fn)

        # Reported after the loop so it is printed for every duration, not only
        # when the length is an exact multiple of min_per_split.
        print('All splited successfully')
        return split_filenames


In [None]:
from tqdm import tqdm
# Collected results as [recording folder, chunk start minute (str), transcription].
transcriptions = []

folders = [39, 40, 41, 42] # each long audio file in separate folder

# One recognizer is reused for every chunk.
r = sr.Recognizer()

for folder in folders:
    print(f'Folder: {folder}')
    folder = str(folder)
    file = f'New Recording {folder}.wav'
    split_wav = SplitWavAudioMubin(folder, file)
    split_wav.multiple_split(min_per_split=2)

    for (dirpath, dirnames, filenames) in tqdm(os.walk(f"{folder}/")):
        # Keep only the split chunks, which are named "<start minute>_<name>.wav".
        # The whole numeric prefix is parsed: taking the first two characters
        # would turn "100_..." into "10" and scramble long recordings.
        chunk_files = []
        for filename in filenames:
            (chunk_id, sep, _rest) = filename.partition('_')
            if filename.endswith('wav') and sep and chunk_id.isdecimal():
                chunk_files.append((int(chunk_id), filename))

        # Process chunks in chronological order.
        for (chunk_no, filename) in sorted(chunk_files):
            print(f'Transcribing {filename}...')
            # Build the path from dirpath so files are opened where os.walk() found them.
            with sr.AudioFile(os.path.join(dirpath, filename)) as source:
                audio = r.record(source)    # read the whole chunk into an AudioData object

            try:
                transcription = r.recognize_google(audio, language="ru-RU")
            except sr.UnknownValueError:
                # Nothing recognizable in this chunk; skip it.
                print("No sound")
            except sr.RequestError as e:
                # A failed request must not abort the remaining chunks.
                print(f"Could not get a transcription for {folder} - {chunk_no}: {e}")
            else:
                # The chunk number is stored as a string, as before.
                transcriptions.append([folder, str(chunk_no), transcription])
                print(f'{folder} - {chunk_no} saved!')

    print('')
        

In [None]:
import pandas as pd
# One row per transcribed chunk: recording id, chunk number, recognized text.
# The chunk number arrives as a string (possibly with a trailing "_") and is
# converted to int so that chunks sort numerically within each recording.
df = (
    pd.DataFrame(transcriptions, columns=['rec', 'N', 'text'])
    .assign(N=lambda frame: frame['N'].map(lambda chunk_id: int(chunk_id.split('_')[0])))
    .sort_values(['rec', 'N'])
    .reset_index(drop=True)
)

In [None]:
# Display the per-chunk transcription table (columns: rec, N, text).
df

In [None]:
# Concatenate the transcriptions of the chunks into one string per recording.
# Chunks are joined in the row order of `df`, with a space between them so the
# last word of one chunk is not glued to the first word of the next.

rec_dict = {}
for rec in df.rec.unique():
    print(rec)
    print('-')
    df_rec = df[df.rec == rec]
    # Log which chunks go into this recording's text.
    for chunk_no in df_rec['N']:
        print(chunk_no)
    rec_dict[rec] = ' '.join(df_rec['text'])
    print('')

In [None]:
# Display the mapping from recording id to its full concatenated transcription.
rec_dict