# Automatic Speech Recognition System

```
Audio Processing and synthesis
Prof. Jose Rieta
```



In [None]:
#importing libraries
import pandas as pd
import librosa as lr
import numpy as np
!pip install SpeechRecognition --quiet
import speech_recognition as sr

Database: Audio Corpora, Fala Brasil Group (Universidade Federal do Pará - UFPA)

- https://gitlab.com/fb-audio-corpora/alcaim16k-DVD1de4/-/tree/master/CarolinaMagalhaes_F050?ref_type=heads

In [None]:
# Mounting google drive in the folder containing the portion of the dataset used
from google.colab import drive
drive.mount('/content/gdrive')
Path = "gdrive/MyDrive/WAVES/UPV/APS/Database/" # You may save the database in you google drive and change this path accordingly
#Path = "gdrive/MyDrive/APS/Database/"

Mounted at /content/gdrive


In [None]:
Nrec = 500 #number of recordings
df = pd.DataFrame(columns=['Fs', 'Data', 'Sentence']) #creating a dataframe

#loading all files to the dataframe
for k in range (Nrec):
  file_number = str(k).rjust(4, '0') #transforming the integer number into a 4 digit string with 0s to the left
  df.loc[k,'Fs'], df.loc[k,'Data'] = lr.load(Path+'F050-'+file_number+'.wav', sr=None) #assembling the final string and uploading the .wav file
  S = open(Path+'F050-'+file_number+'.txt') #assembling the final string and opening the .txt file
  df.loc[k,'Sentence'] = S.read().replace("\n", "").split(" ") #removing the \n from the string and adding the sentence to the dataframe (word by word)

display(df)

Unnamed: 0,Fs,Data,Sentence
0,"[0.00021362305, 0.0007324219, 0.00039672852, 0...",16000,"[pesquisa, é, uma, coisa, que, muda, a, toda, ..."
1,"[0.0015869141, 0.001373291, 0.0014343262, 0.00...",16000,"[no, total, serão, chamados, vinte, e, seis, m..."
2,"[0.0007324219, 0.00064086914, 0.0006713867, 0....",16000,"[o, número, de, convocados, por, vaga, é, de, ..."
3,"[0.0014953613, 0.0017700195, 0.0014038086, 0.0...",16000,"[atualmente, esse, abatimento, é, limitado, a,..."
4,"[0.0008239746, 0.0009765625, 0.00079345703, 0....",16000,"[sandra, regina, machado, acho, que, ela, enfi..."
...,...,...,...
495,"[-0.0008239746, -0.0010070801, -0.0005187988, ...",16000,"[passava, o, dia, vendendo, comprando, barganh..."
496,"[0.0009460449, 0.0012817383, 0.0010375977, 0.0...",16000,"[a, alta, foi, de, três, vírgula, vinte, e, se..."
497,"[-0.0013122559, -0.0017089844, -0.0012817383, ...",16000,"[o, time, só, caiu, nos, pênaltis, para, a, fr..."
498,"[-0.00048828125, -0.0008239746, -0.0008239746,...",16000,"[empresa, que, serve, ao, ministério, da, faze..."


- Converting Digit Numbers into Words

In [None]:
def number2words_ptbr(number):
  # A dictionary to map digits to words in Portuguese for simple numbers
  comma = ',' in str(number)
  if (comma==False):
    number = int(number)
    number_words_ptbr = {
      0: "zero", 1: "um", 2: "dois", 3: "três", 4: "quatro",
      5: "cinco", 6: "seis", 7: "sete", 8: "oito", 9: "nove",
      10: "dez", 11: "onze", 12: "doze", 13: "treze", 14: "quatorze",
      15: "quinze", 16: "dezesseis", 17: "dezessete", 18: "dezoito", 19: "dezenove",
      20: "vinte", 30: "trinta", 40: "quarenta", 50: "cinquenta",
      60: "sessenta", 70: "setenta", 80: "oitenta", 90: "noventa",
      100: "cem", 200: "duzentos", 300: "trezentos", 400: "quatrocentos",
      500: "quinhentos", 600: "seiscentos", 700: "setecentos", 800: "oitocentos", 900: "novecentos"
    }
    if number < 20: # in portuguese, if the number is lower than 20, its name comes directly
        return number_words_ptbr[number]
    elif number < 100: # for hundreds it needs to be  composed by the amount of hundreds plus "e" and the tens and units
        tens, ones = divmod(number, 10)
        return f"{number_words_ptbr[tens * 10]} e {number_words_ptbr[ones]}" if ones else f"{number_words_ptbr[tens * 10]}"
    elif number < 1000: # for thousands, the placement of the "e" will depend if it has hundreds or not
        hundreds, remainder = divmod(number, 100)
        if remainder == 0:
            return number_words_ptbr[hundreds * 100]
        elif hundreds==1: # there is a name exception for the case of thousands and ONE HUNDRED which is called "cento"
            return f"cento e {number2words_ptbr(remainder)}"
        else:
            return f"{number_words_ptbr[hundreds * 100]} e {number2words_ptbr(remainder)}"
    elif number < 1000000: # in the case of millions (which will be the last addressed), it will depend on the format of all the lower magnitude digits
        thousands, remainder = divmod(number, 1000)
        thousands_word = "mil" if thousands == 1 else f"{number2words_ptbr(thousands)} mil"
        if remainder == 0:
            return thousands_word
        elif remainder%100==0 or remainder<100:
            return f"{thousands_word} e {number2words_ptbr(remainder)}"
        else:
            return f"{thousands_word} {number2words_ptbr(remainder)}"
    else:
        millions, remainder = divmod(number, 1000000)
        millions_word = "um milhão" if millions == 1 else f"{number2words_ptbr(millions)} milhões"
        if remainder == 0:
            return millions_word
        else:
            return f"{millions_word} {number2words_ptbr(remainder)}"
  else:
    inte = int(number.split(',')[0])
    dec = int(number.split(',')[1])
    return f"{number2words_ptbr(inte)} vírgula {number2words_ptbr(dec)}"

- Google Speech Recognition Engine

In [None]:
def transcribe_google(audio_path):
    # Initialize recognizer
    recognizer = sr.Recognizer()

    # Convert audio to audio source for recognition
    with sr.AudioFile(audio_path) as source:
        audio_data = recognizer.record(source)

    # Transcribe audio to text
    try:
        transcription = recognizer.recognize_google(audio_data,language="pt-BR")
        transcription = transcription.lower()

        # Convert numbers to words
        words = transcription.split()
        final_sentence=[]
        for word in words:
          if word.isdigit() or (',' in word and not '%' in word): # handle number convention for
            final_sentence.append(number2words_ptbr(word))
          elif word.endswith("%"):
            # Handle percentage, "50%" -> "cinquenta por cento"
            number_part = word.rstrip("%")
            final_sentence.append(f"{number2words_ptbr(number_part)} por cento")
          elif '.' in word:
            number_split = word.split(".")
            number_part = ''.join(number_split)
            final_sentence.append(f"{number2words_ptbr(number_part)} por cento")
          else:
            final_sentence.append(word)
          transcription_corrected = ' '.join(final_sentence)
        return transcription_corrected

    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand audio")
        return
    except sr.RequestError as e:
        print("Could not request results from Google Speech Recognition service; {0}".format(e))
        return

In [None]:
# Transcribing sentence by sentence
gsent = []
for k in range (Nrec):
  file_number = str(k).rjust(4, '0') #transforming the integer number into a 4 digit string with 0s to the left
  gsent.append(transcribe_google(Path+'F050-'+file_number+'.wav'))
display(gsent)

['pesquisa é uma coisa que muda a toda hora',
 'no total serão chamados vinte e seis mil por cento candidatos',
 'o número de convocados por vaga é de doze candidatos',
 'atualmente esse abatimento é limitado a setenta por cento dos gastos',
 'sandra regina machado acho que ela enfim criou o juízo',
 'eles estão colocando armadilhas nas fazendas onde já ocorreram os ataques',
 'dessas somente umas trezentos e vinte foram inauguradas em território americano',
 'no total sete mísseis foram disparados contra o encrave',
 'em florianópolis foi registrado dois graus celsus na manhã de domingo',
 'as situações ditas embaraçosas são resolvidas com os dados',
 'itamar tem razão de estar exultante como nunca desde que virou presidente',
 'a mãe de todas as reformas é a reforma política',
 'conseguiram eliminar áreas supérfluas ou que antes eram desperdiçadas',
 'uma lata de leite em pó integral vale o ingresso',
 'a maioria dos passageiros do barco naufragado era de crianças',
 'a provável caus

- Error Analysis

In [None]:
wErrorCount = []
aux_list = []
for h in range (Nrec):
  aux = 0
  for o in df["Sentence"][h]:
    if o not in gsent[h]:
      aux=aux+1
  l = len(df["Sentence"][h])
  wErrorCount.append([aux, l, round(aux/l,3)]) # for comparison between the amount of words wront in each sentence and its corresponding percentage
display(wErrorCount)

[[0, 9, 0.0],
 [0, 9, 0.0],
 [0, 10, 0.0],
 [0, 11, 0.0],
 [0, 9, 0.0],
 [0, 11, 0.0],
 [1, 11, 0.091],
 [0, 9, 0.0],
 [1, 11, 0.091],
 [0, 9, 0.0],
 [0, 12, 0.0],
 [0, 10, 0.0],
 [0, 9, 0.0],
 [0, 10, 0.0],
 [0, 10, 0.0],
 [0, 11, 0.0],
 [0, 11, 0.0],
 [0, 9, 0.0],
 [0, 10, 0.0],
 [0, 11, 0.0],
 [0, 12, 0.0],
 [1, 12, 0.083],
 [1, 9, 0.111],
 [0, 10, 0.0],
 [0, 9, 0.0],
 [0, 11, 0.0],
 [0, 11, 0.0],
 [0, 9, 0.0],
 [0, 10, 0.0],
 [0, 9, 0.0],
 [0, 9, 0.0],
 [0, 11, 0.0],
 [0, 12, 0.0],
 [1, 10, 0.1],
 [0, 9, 0.0],
 [0, 12, 0.0],
 [1, 9, 0.111],
 [0, 11, 0.0],
 [0, 9, 0.0],
 [0, 11, 0.0],
 [0, 10, 0.0],
 [0, 12, 0.0],
 [0, 12, 0.0],
 [0, 11, 0.0],
 [1, 10, 0.1],
 [0, 9, 0.0],
 [1, 9, 0.111],
 [0, 11, 0.0],
 [0, 12, 0.0],
 [0, 9, 0.0],
 [0, 9, 0.0],
 [0, 9, 0.0],
 [0, 12, 0.0],
 [0, 10, 0.0],
 [0, 9, 0.0],
 [1, 9, 0.111],
 [1, 11, 0.091],
 [0, 11, 0.0],
 [1, 11, 0.091],
 [1, 12, 0.083],
 [1, 12, 0.083],
 [1, 10, 0.1],
 [1, 12, 0.083],
 [0, 9, 0.0],
 [0, 9, 0.0],
 [0, 10, 0.0],
 [0, 9, 0.

In [None]:
auxerror = 0
auxerrort = 0
tErrors=[]
for k in range (len(wErrorCount)):
  auxerror = auxerror+wErrorCount[k][0]
  auxerrort = auxerrort+wErrorCount[k][1]
print(f'Total Error: {round(auxerror/auxerrort,3)*100} %')

Total Error: 2.9000000000000004 %


In [None]:
mydictionary = []
for j in range(Nrec):
  for g in df.loc[j, "Sentence"]:
    if (len(mydictionary)==0):
      mydictionary.append(g)
    elif (not g in mydictionary):
      mydictionary.append(g)

print(mydictionary)
print(len(mydictionary))

['pesquisa', 'é', 'uma', 'coisa', 'que', 'muda', 'a', 'toda', 'hora', 'no', 'total', 'serão', 'chamados', 'vinte', 'e', 'seis', 'mil', 'candidatos', 'o', 'número', 'de', 'convocados', 'por', 'vaga', 'doze', 'atualmente', 'esse', 'abatimento', 'limitado', 'setenta', 'cento', 'dos', 'gastos', 'sandra', 'regina', 'machado', 'acho', 'ela', 'enfim', 'criou', 'juízo', 'eles', 'estão', 'colocando', 'armadilhas', 'nas', 'fazendas', 'onde', 'já', 'ocorreram', 'os', 'ataques', 'dessas', 'somente', 'umas', 'trezentas', 'foram', 'inauguradas', 'em', 'território', 'americano', 'sete', 'mísseis', 'disparados', 'contra', 'encrave', 'florianópolis', 'foi', 'registrado', 'dois', 'graus', 'celsius', 'na', 'manhã', 'domingo', 'as', 'situações', 'ditas', 'embaraçosas', 'são', 'resolvidas', 'com', 'dados', 'itamar', 'tem', 'razão', 'estar', 'exultante', 'como', 'nunca', 'desde', 'virou', 'presidente', 'mãe', 'todas', 'reformas', 'reforma', 'política', 'conseguiram', 'eliminar', 'áreas', 'supérfluas', 'ou',