### Watson STT

[Demo](https://speech-to-text-demo.ng.bluemix.net/)

[API Docs](https://cloud.ibm.com/apidocs/speech-to-text?code=python#recognize)

[GitHub](https://github.com/watson-developer-cloud/python-sdk/blob/master/examples/speech_to_text_v1.py)

In [1]:
# authentication

import json
from ibm_watson import SpeechToTextV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

# Load the service credentials from the local JSON key file; it must provide
# the 'apikey' and 'url' entries that are read below.
with open('key.json') as key_file:
    cred = json.load(key_file)

# Build the Speech to Text client with IAM authentication, then point it at
# the service endpoint named in the credentials.
authenticator = IAMAuthenticator(cred['apikey'])
STT = SpeechToTextV1(authenticator=authenticator)
STT.set_service_url(cred['url'])

### Small test

In [2]:
# Look up the description of the US English broadband model and pretty-print it
model_response = STT.get_model('en-US_BroadbandModel')
model = model_response.get_result()
print(json.dumps(model, indent=4))

{
    "name": "en-US_BroadbandModel",
    "rate": 16000,
    "language": "en-US",
    "description": "US English broadband model.",
    "supported_features": {
        "custom_language_model": true,
        "speaker_labels": true
    },
    "url": "https://api.us-south.speech-to-text.watson.cloud.ibm.com/instances/4021736b-f05b-49c8-b274-bae949b55613/v1/models/en-US_BroadbandModel"
}


In [2]:
# speech recognition

# Request settings for the short test clip, kept together so they are easy
# to read and tweak.
test_options = {
    'content_type': 'audio/mp3',
    'word_alternatives_threshold': 0.9,
    'keywords': ['she', 'related', 'him'],
    'keywords_threshold': 0.5,
}

# The audio file only needs to stay open while the request is made.
with open('audio/test.mp3', 'rb') as audio_file:
    response = STT.recognize(audio=audio_file, **test_options)
    STT_result = response.get_result()

print(json.dumps(STT_result, indent=2))

{
  "result_index": 0,
  "results": [
    {
      "final": true,
      "alternatives": [
        {
          "transcript": "she is related to him ",
          "confidence": 0.96
        }
      ],
      "word_alternatives": [
        {
          "start_time": 0.07,
          "end_time": 0.37,
          "alternatives": [
            {
              "word": "she",
              "confidence": 0.99
            }
          ]
        },
        {
          "start_time": 0.37,
          "end_time": 0.55,
          "alternatives": [
            {
              "word": "is",
              "confidence": 0.99
            }
          ]
        },
        {
          "start_time": 0.55,
          "end_time": 1.13,
          "alternatives": [
            {
              "word": "related",
              "confidence": 0.99
            }
          ]
        },
        {
          "start_time": 1.13,
          "end_time": 1.28,
          "alternatives": [
            {
              "word": "to",
      

### Cut audio based on timestamps

In [4]:
from pydub import AudioSegment

In [5]:
# Load the full recording as a pydub AudioSegment; later cells measure and slice it.
sound = AudioSegment.from_mp3("audio/covid.mp3")

In [13]:
# len(sound) is in milliseconds: convert to seconds, then to minutes
duration_minutes = len(sound) / 1000 / 60
print("The audio is {0:.2f} minutes.".format(duration_minutes))

The audio is 1.95 minutes.


In [15]:
# speech recognition

# Phrases to spot in the recording (Dr. Tam segment).
# Previously tried pair: ['Health Canada','their own']
keywords = ['I think the company', 'not plentiful']

# Request settings for the full recording.
recognize_options = dict(
    content_type='audio/mp3',
    word_alternatives_threshold=0.9,
    keywords=keywords,
    keywords_threshold=0.5,
)

# The audio file only needs to stay open while the request is made.
with open('audio/covid.mp3', 'rb') as audio_file:
    response = STT.recognize(audio=audio_file, **recognize_options)
    STT_result = response.get_result()

print(json.dumps(STT_result, indent=2))

{
  "result_index": 0,
  "results": [
    {
      "final": true,
      "alternatives": [
        {
          "transcript": "Renda severe won't protect anyone from covert nineteen but it can speed up a sick patients recovery time ",
          "confidence": 0.81
        }
      ],
      "word_alternatives": [
        {
          "start_time": 2.16,
          "end_time": 2.66,
          "alternatives": [
            {
              "word": "anyone",
              "confidence": 0.97
            }
          ]
        },
        {
          "start_time": 2.66,
          "end_time": 2.87,
          "alternatives": [
            {
              "word": "from",
              "confidence": 0.99
            }
          ]
        },
        {
          "start_time": 3.28,
          "end_time": 3.89,
          "alternatives": [
            {
              "word": "nineteen",
              "confidence": 0.98
            }
          ]
        },
        {
          "start_time": 3.89,
          "end_

In [29]:
# Collect the keyword matches from every result segment into one dict that
# maps each keyword to the list of all of its matches, in segment order.
#
# A plain dict merge ({**a, **b}) would replace a keyword's earlier matches
# whenever the same keyword is spotted again in a later segment, so the match
# lists are extended instead.
keywords_result = {}

for result in STT_result['results']:
    # NOTE(review): .get() guards against a segment that carries no
    # 'keywords_result' entry; confirm against the service response schema.
    for keyword, matches in result.get('keywords_result', {}).items():
        keywords_result.setdefault(keyword, []).extend(matches)

In [31]:
# Display the keyword matches collected from all result segments
keywords_result

{'I think the company': [{'start_time': 86.49,
   'end_time': 87.3,
   'confidence': 0.98,
   'normalized_text': 'I think the company'}],
 'not plentiful': [{'start_time': 93.67,
   'end_time': 94.58,
   'confidence': 1.0,
   'normalized_text': 'not plentiful'}]}

In [32]:
# For each requested keyword, show the phrase followed by the start and end
# time of its first match, one value per line.
for keyword in keywords:
    first_match = keywords_result[keyword][0]
    print(keyword, first_match['start_time'], first_match['end_time'], sep='\n')

I think the company
86.49
87.3
not plentiful
93.67
94.58


In [36]:
# check confidence level
#
# Every requested keyword must have been spotted, and its first match must
# have a confidence above MIN_CONFIDENCE; otherwise stop before cutting audio.
# (Previously the `else` was attached to the membership test, so a
# low-confidence match was silently accepted and only a missing keyword raised.)

MIN_CONFIDENCE = 0.7

for keyword in keywords:
    matches = keywords_result.get(keyword)

    if not matches:
        # The keyword never appeared in the recognition results.
        raise Exception("keyword not found: {!r}".format(keyword))

    confidence = matches[0]['confidence']
    if not confidence > MIN_CONFIDENCE:
        raise Exception(
            "low confidence level for {!r}: {}".format(keyword, confidence)
        )

In [19]:
# cutting and exporting audio

# The clip runs from the start of the first keyword's first match to the end
# of the last keyword's first match. The recognition results give times in
# seconds while the AudioSegment is sliced in milliseconds, hence the * 1000.
clip_start_ms = keywords_result[keywords[0]][0]['start_time'] * 1000
clip_end_ms = keywords_result[keywords[-1]][0]['end_time'] * 1000

test = sound[clip_start_ms:clip_end_ms]

# export() returns the open output file; close it so the handle is not leaked.
exported_file = test.export("audio/drTam.mp3", format="mp3")
exported_file.close()

<_io.BufferedRandom name='audio/drTam.mp3'>