## Importing and Installing Necessary Libraries

In [1]:
# Install the third-party packages used below: SpeechRecognition (imported as
# `speech_recognition`) and pydub (audio loading, slicing and export).
# NOTE(review): ffmpeg-python and ffprobe are presumably installed to give pydub MP3 decoding
# support - confirm that they are actually needed on this runtime.
!pip install SpeechRecognition
!pip install ffmpeg-python
!pip install ffprobe
!pip install pydub



In [0]:
## Upload files from the local computer into the Colab session.
## The files are saved under their own names in the working directory.
from google.colab import files

uploaded = files.upload()

# Report the name and size (in bytes) of every file that was uploaded
for fn, content in uploaded.items():
  message = 'User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(content))
  print(message)

Saving api-key.json to api-key.json
Saving InboundSampleRecording.mp3 to InboundSampleRecording.mp3
User uploaded file "api-key.json" with length 2330 bytes
User uploaded file "InboundSampleRecording.mp3" with length 1132992 bytes


## Converting from MP3 to WAV audio format

In [2]:
from pydub import AudioSegment
import os

# Input (MP3) and output (WAV) file names
src = "InboundSampleRecording.mp3"
dst = "InboundSampleRecording.wav"

# Convert MP3 to WAV: decode the MP3 file, then write it back out in WAV format
sound = AudioSegment.from_mp3(file=src)

# export() hands back the still-open output file object; close it right away so
# the handle is not leaked (this also keeps the cell from echoing its repr)
sound.export(dst, format="wav").close()

<_io.BufferedRandom name='InboundSampleRecording.wav'>

## Breaking the Audio into Chunks and Transcribing

In [0]:
# Libraries for audio slicing (pydub) and transcription (speech_recognition)
from pydub import AudioSegment
import speech_recognition as sr

# Load the WAV recording that is going to be cut into chunks
audio = AudioSegment.from_wav("InboundSampleRecording.wav")

''' 
Step #1 - Slicing the audio file into smaller chunks. 
'''
# Total length of the recording in milliseconds
n = len(audio)

# Chunk length, and how far each chunk reaches back into the previous one
# (both in milliseconds; overlap is a float, so later chunk bounds are floats)
interval = 1000 * 10
overlap = 1000 * 1.5

# Text file that collects the recognized speech
fh = open("recognized.txt", "w+")

# Number used in the file name of the next chunk
counter = 1

# Chunk boundaries start at 0; flag switches to 1 once the end of the
# recording has been reached, which tells the loop to stop
start = end = flag = 0

In [10]:
# Step #2 - Cut the recording into overlapping chunks, save each chunk as
# chunk<N>.wav, transcribe it with the Google Web Speech API and append the
# recognized text to the output file.

# Reset the per-run state so that re-running this cell starts again at chunk 1
counter = 1
flag = 0

# This cell closes the output file when it finishes; reopen it (truncating the
# previous transcript) if the cell is executed again
if fh.closed:
    fh = open(fh.name, "w+")

# A single recognizer is enough for all chunks
r = sr.Recognizer()

try:
    # The range only bounds the number of passes (i is used solely to detect
    # the first one); the loop normally ends through the break at the bottom
    # once the end of the recording has been reached.
    for i in range(0, 2 * n, interval):

        # First pass: the chunk covers [0, interval]
        if i == 0:
            start = 0
            end = interval

        # Later passes: step back by `overlap` from the previous end so that
        # consecutive chunks share some audio, then extend by `interval`
        else:
            start = end - overlap
            end = start + interval

        # Clamp the last chunk to the length of the recording and remember
        # that this is the final pass
        if end >= n:
            end = n
            flag = 1

        # Slice the audio from start to end (milliseconds)
        chunk = audio[start:end]

        # File name / path under which the sliced audio is stored
        filename = 'chunk'+str(counter)+'.wav'

        # export() returns the open output file object; close it so that no
        # handle is leaked per chunk before the file is read back
        chunk.export(filename, format="wav").close()

        # Print information about the current chunk
        print("Processing chunk "+str(counter)+". Start = "
                            +str(start)+" end = "+str(end))

        # Increment counter for the next chunk
        counter = counter + 1

        AUDIO_FILE = filename

        # Read the complete chunk. record() consumes the whole file, whereas
        # listen() is meant for live phrase detection and may stop at the
        # first pause, leaving part of the chunk untranscribed.
        with sr.AudioFile(AUDIO_FILE) as source:
            audio_listened = r.record(source)

        # Try to recognize the audio and handle the expected exceptions
        try:
            rec = r.recognize_google(audio_listened)

            # If recognized, append the text to the output file
            fh.write(rec+" ")

        # Google could not understand the audio
        except sr.UnknownValueError:
            print("Could not understand audio")

        # The results could not be requested from Google
        # (probably an internet connection error)
        except sr.RequestError:
            print("Could not request results.")

        # End of the whole recording reached: stop
        if flag == 1:
            break
finally:
    # Always close the output file, including on errors or empty audio
    fh.close()

Processing chunk 1. Start = 0 end = 10000
Processing chunk 2. Start = 8500.0 end = 18500.0
Processing chunk 3. Start = 17000.0 end = 27000.0
Processing chunk 4. Start = 25500.0 end = 35500.0
Processing chunk 5. Start = 34000.0 end = 44000.0
Processing chunk 6. Start = 42500.0 end = 52500.0
Processing chunk 7. Start = 51000.0 end = 61000.0
Processing chunk 8. Start = 59500.0 end = 69500.0
Processing chunk 9. Start = 68000.0 end = 78000.0
Processing chunk 10. Start = 76500.0 end = 86500.0
Could not understand audio
Processing chunk 11. Start = 85000.0 end = 95000.0
Processing chunk 12. Start = 93500.0 end = 103500.0
Processing chunk 13. Start = 102000.0 end = 112000.0
Processing chunk 14. Start = 110500.0 end = 120500.0
Processing chunk 15. Start = 119000.0 end = 129000.0
Processing chunk 16. Start = 127500.0 end = 137500.0
Processing chunk 17. Start = 136000.0 end = 146000.0
Processing chunk 18. Start = 144500.0 end = 154500.0
Processing chunk 19. Start = 153000.0 end = 163000.0
Process