# Auto subtitles

In [2]:
from google.cloud import speech_v1p1beta1
from google.cloud.speech_v1p1beta1 import enums
from google.cloud import speech

from pydub.utils import mediainfo

from webvtt.structures import Caption
from webvtt import WebVTT
from path import Path

import os
import io
import datetime

In [2]:
# Audio and text taken from https://www.voiptroubleshooter.com/open_speech/american.html
path = Path(os.path.abspath(os.getcwd()) + '\\speech59.wav')

In [3]:
# Information about audio
def audio_info(audio_filepath):

    audio_data = mediainfo(audio_filepath)
    channels = int(audio_data["channels"])
    sample_rate = float(audio_data["sample_rate"])

    return channels, sample_rate

In [4]:
channels, sample_rate = audio_info(path)

In [5]:
def recognize(audio_path, channels, sample_rate):
    
    client = speech_v1p1beta1.SpeechClient()

    config = {
        "language_code": "en-US",
        "sample_rate_hertz": int(sample_rate),
        "encoding": enums.RecognitionConfig.AudioEncoding.LINEAR16,
        "audio_channel_count": int(channels),
        "enable_word_time_offsets": True,
        "enable_automatic_punctuation":True
    }
    
    with io.open(audio_path, "rb") as f:
        content = f.read()
    audio = {"content": content}

    operation = client.long_running_recognize(config, audio)
    response = operation.result() # Responce from google cloud
    return response

In [6]:
responce = recognize(path, channels, sample_rate)

In [15]:
def generate_vtt(responce):   
    vtt = WebVTT() # Subtitle Creation
    hypothesis = '' # Text to check word error rate
    
    for result in responce.results:
        words = result.alternatives[0].words
        first_word = True # Is this the first word in the sentence.
        sentence = ''
        for word_info in words:
            sentence += word_info.word + ' '
            
            if first_word:
                start_time = datetime.timedelta(seconds=word_info.start_time.seconds)
                microseconds = int(word_info.start_time.nanos/(10**6))  # Nanoseconds to microseconds
                start_time = str(start_time) + '.' + str(microseconds).zfill(3)
                first_word = False

            if word_info.word[-1] == '.': # If this is the last word in the sentence.
                end_time = datetime.timedelta(seconds=word_info.end_time.seconds)
                microseconds = int(word_info.end_time.nanos/(10**6))
                end_time = str(end_time) + '.' + str(microseconds).zfill(3)
                
                hypothesis += sentence
                
                caption = Caption(str(start_time), str(end_time), sentence) # Title generation
                vtt.captions.append(caption)
                
                print(f'Added sentence: \n"{sentence}" \n Start time:{start_time}; end time: {end_time}\n')
                
                sentence = ''
                first_word = True
                
    return vtt, hypothesis

In [16]:
vtt, hypothesis = generate_vtt(responce)
vtt.save('result.vtt') # Saving subtitles to file

Added sentence: 
"Every word and phrase he speaks is true. " 
 Start time:0:00:00.900; end time: 0:00:04.200

Added sentence: 
"He put his last cartridge into the gun and fired. " 
 Start time:0:00:05.200; end time: 0:00:08.900

Added sentence: 
"They took their kids from the public school. " 
 Start time:0:00:09.700; end time: 0:00:12.800

Added sentence: 
"Dr. " 
 Start time:0:00:13.800; end time: 0:00:14.500

Added sentence: 
"keep the hatch tight and the watch constant. " 
 Start time:0:00:17.300; end time: 0:00:20.500

Added sentence: 
"Sever the twine with a quick tip of the knife. " 
 Start time:0:00:21.400; end time: 0:00:24.200

Added sentence: 
"Paper will dry out when went. " 
 Start time:0:00:25.400; end time: 0:00:27.900

Added sentence: 
"Slide the cash back and open the desk. " 
 Start time:0:00:28.800; end time: 0:00:31.900

Added sentence: 
"Help the week to preserve their strength. " 
 Start time:0:00:33.000; end time: 0:00:35.800

Added sentence: 
"Hey Solon, smile g

# WER

In [3]:
from jiwer import wer

In [10]:
truth_text = '''Every word and phrase he speaks is true.
He put his last cartridge into the gun and fired.
They took their kids from the public school.
Drive the screw straight into the wood.
Keep the hatch tight and the watch constant.
Sever the twine with a quick snip of the knife.
Paper will dry out when wet.
Slide the catch back and open the desk.
Help the weak to preserve their strength.
A sullen smile gets few friends.'''

In [11]:
print(hypothesis)

Every word and phrase he speaks is true. He put his last cartridge into the gun and fired. They took their kids from the public school. Dr. keep the hatch tight and the watch constant. Sever the twine with a quick tip of the knife. Paper will dry out when went. Slide the cash back and open the desk. Help the week to preserve their strength. Hey Solon, smile gets few friends. 


In [12]:
wer_result = wer(truth_text.replace('\n',''), hypothesis)
print(f'Quality of recognition: {1 - wer_result}')

Quality of recognition: 0.6376811594202898
