## Installations

In [None]:
pip install SpeechRecognition

In [None]:
pip install pyaudio

In [None]:
pip install pydub

In [None]:
conda install -c conda-forge speechrecognition

In [None]:
conda install -c "conda-forge/label/cf201901" speechrecognition

In [None]:
conda install -c "conda-forge/label/cf202003" speechrecognition

In [None]:
pip install librosa


## Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt 
import seaborn as sns
from glob import glob
from itertools import cycle

import librosa
import librosa.display
import IPython.display as ipd

# Apply seaborn's "white" style to every figure drawn below.
sns.set_theme(style="white", palette=None)

# Colours of the active matplotlib property cycle, exposed two ways:
# an indexable list (color_pal) and an endless iterator (color_cycle).
_prop_cycle_colors = plt.rcParams["axes.prop_cycle"].by_key()["color"]
color_pal = _prop_cycle_colors
color_cycle = cycle(list(_prop_cycle_colors))

In [None]:
import speech_recognition as sr

In [None]:
sr.__version__  # Display the installed SpeechRecognition version; this notebook expects 3.8.1

## Instantiating recognizer class

In [None]:
# Recognizer object; the speech-recognition cell below calls its record() and
# recognize_google() methods.
r = sr.Recognizer()

## Load audio file

In [None]:

# Path (or glob pattern) of the captcha audio to process. Point it at the folder on the
# local machine that contains the captcha and include the file name and extension.
# It is intentionally empty here and must be filled in before running the notebook.
audio_files_ = r''

# glob() returns matches in arbitrary order; sort them so that audio_files[0], which the
# cells below rely on, is the same file on every run.
audio_files = sorted(glob(audio_files_))

# Surface the problem here rather than as a bare IndexError on audio_files[0] further down.
if not audio_files:
    print(f"No audio file matches {audio_files_!r}; set audio_files_ before running the cells below.")

## Play audio files

In [None]:

# Embed an audio player for the first matched file so the captcha can be listened to.
ipd.Audio(audio_files[0])

## Speech Recognition

In [None]:
# Ordered (heard, meant) substitutions applied to the upper-cased transcript.
# Order matters: every rule sees the output of the rules before it, so a longer
# spelling must come before any shorter spelling it contains ("NIGHTS" before "NIGHT").
# NOTE(review): a few targets are lower case ("n", "u", "v") exactly as in the original
# rules - confirm whether that casing is intended.
CAPTCHA_CORRECTIONS = (
    # Separators and punctuation inserted by the recognizer.
    (".", ""),
    ("&", ""),
    (" ", ""),
    ("-", ""),
    # Words that were heard in place of a single captcha character.
    ("THREE", "3"),
    ("THOR", "4"),
    ("PORT", "4"),
    ("FOR", "4"),
    ("GORE", "4"),
    ("BY", "5"),
    ("THAI", "5"),
    ("SEX", "6"),
    ("HEY", "8"),
    ("FEET", "8"),
    ("NIGHTS", "9"),
    ("NIGHT", "9"),
    ("SI", "C"),
    ("CI", "C"),
    ("SEA", "C"),
    ("JAY", "J"),
    ("YEAH", "J"),
    ("KAY", "K"),
    ("TE", "T"),
    ("TY", "T"),
    ("OUR", "R"),
    ("EM", "M"),
    ("AM", "M"),
    ("FIGHTS", "5"),
    ("AND", "n"),
    ("ARE", "R"),
    ("RI", "R"),
    ("YOU", "U"),
    ("NEW", "u"),
    ("MOVIE", "v"),
    ("WI", "Y"),
    ("YES", "S"),
    ("EGGS", "X"),
    ("WHY", "Y"),
)


def normalize_captcha_text(transcript):
    """Turn a raw transcript into captcha characters.

    The transcript is upper-cased and every rule in CAPTCHA_CORRECTIONS is applied
    in order with str.replace (a rule whose left-hand side is absent is a no-op).

    Parameters
    ----------
    transcript : str
        Text returned by the speech recognizer.

    Returns
    -------
    str
        The corrected text, e.g. "nights for you" -> "94U".
    """
    cleaned = transcript.upper()
    for heard, meant in CAPTCHA_CORRECTIONS:
        cleaned = cleaned.replace(heard, meant)
    return cleaned


# Open the resolved file (the same audio_files[0] that is played and plotted in the other
# cells) rather than the raw pattern string, so a wildcard pattern also works here.
output = sr.AudioFile(audio_files[0])

with output as source:
    # Read the whole file into memory; pass duration=<seconds> to record() to read only
    # part of it. r.adjust_for_ambient_noise(source) can be called first if needed.
    audio = r.record(source)

# Only the recognizer call is guarded, and only for the two errors it is documented to
# raise; anything else (including bugs in the correction rules) is left to surface normally.
try:
    text = normalize_captcha_text(r.recognize_google(audio))
except sr.UnknownValueError:
    # The speech was unintelligible to the recognizer.
    print("Could not understand audio")
except sr.RequestError as err:
    # The request itself failed (e.g. no connection), which is not an audio-quality problem.
    print(f"Speech recognition request failed: {err}")
else:
    print(text)
    

## Visual Analysis

In [None]:
# Decode the first matched file into a waveform array `y` plus its sampling rate.
# The rate is deliberately NOT bound to `sr`: that name is the speech_recognition module
# imported earlier, and rebinding it to a number breaks the recognition cell whenever it
# is re-run after this one.
y, sample_rate = librosa.load(audio_files[0])
print(f'y: {y[:10]}')
print(f'shape y: {y.shape}')
print(f'sample rate:{sample_rate}')

In [None]:
# Draw the whole waveform as a single line (x: position in the array, y: sample value).
raw_wave = pd.Series(y)
raw_wave.plot(title="Raw audio file", lw=1, figsize=(10, 5))
plt.show()

In [None]:
# Optional tuning, left disabled: set the recognizer's energy threshold so that it only
# attempts to recognize sound above this level.
#r.energy_threshold = 20


### Trimming leading and trailing silence (dead air)

In [None]:

# Strip the quiet head and tail of the signal; top_db=20 is the threshold handed to trim().
# The second return value is not used here.
y_trimmed, _ = librosa.effects.trim(y, top_db=20)

# Same line plot as for the raw signal, drawn in the third colour of the palette.
trimmed_plot_options = dict(
    figsize=(10, 5),
    lw=1,
    title='Raw Audio Trimmed',
    color=color_pal[2],
)
pd.Series(y_trimmed).plot(**trimmed_plot_options)

plt.show()

## Zoom in on specific section

In [None]:
# Zoom in on a 500-sample window of the raw signal.
# Change the two bounds below to inspect a different part of the recording.
zoom_start, zoom_stop = 20000, 20500

zoomed_wave = pd.Series(y[zoom_start:zoom_stop])
zoomed_wave.plot(title='Raw Audio Zoomed', color=color_pal[2], lw=1, figsize=(10, 5))
plt.show()

## Slowing the file down

In [None]:
# Time-stretch the raw waveform with rate=0.5 to obtain the slowed-down version of the clip.
y_slowed = librosa.effects.time_stretch(y, rate=0.5)

In [None]:
# Inspect what kind of object time_stretch returned.
type(y_slowed)

In [None]:
# Short-time Fourier transform of the original (not the slowed) signal.
D = librosa.stft(y)

# Magnitude of every STFT entry, expressed in decibels relative to the largest magnitude.
stft_magnitude = np.abs(D)
S_db = librosa.amplitude_to_db(stft_magnitude, ref=np.max)

# Show the dimensions of the decibel matrix.
S_db.shape

## Spectrogram
#### Shows which frequencies are sounding at which time.
#### Can be used to analyse specific sections of the track.

In [None]:
# Render the decibel matrix as an image: time on the x axis, log-scaled frequency on the y axis.
fig, ax = plt.subplots(figsize=(10, 5))
img = librosa.display.specshow(S_db, ax=ax, x_axis='time', y_axis='log')

# NOTE(review): the title is spelled 'Spectogram' - kept as-is here; consider 'Spectrogram'.
ax.set_title('Spectogram', fontsize=20)

# Colour bar for the image, tick labels formatted with two decimals.
fig.colorbar(img, ax=ax, format='%0.2f')
plt.show()