# Using the pre-trained Mozilla DeepSpeech speech recognition model to transcribe .wav files: original (lm=0.75), lm=0, lm=1, lm=1.5, lm=2, and noisy (SNR=3, 6, 9)


In [1]:
from deepspeech import Model
import numpy as np
import os
import wave
import json
from IPython.display import Audio
import glob
import re
import pandas as pd

In [2]:
!pip install deepspeech



In [3]:
# Pre-trained DeepSpeech 0.9.3 release files, given as paths relative to the
# notebook's working directory.
model_file_path = 'deepspeech-0.9.3-models.pbmm'
lm_file_path = 'deepspeech-0.9.3-models.scorer'

# Decoder settings; they are only stored here and applied to the model in a
# later cell. lm_alpha = 0.75 is the baseline ("original") run; the other
# transcript files were created by re-running with lm_alpha set to
# 0, 1, 1.5 and 2.
beam_width = 100
lm_alpha, lm_beta = 0.75, 1.85

# Load the acoustic model first, then attach the external scorer
# (language model) to it.
model = Model(model_file_path)
model.enableExternalScorer(lm_file_path)

RuntimeError: CreateModel failed with 'Failed to initialize memory mapped model.' (0x3000)

# Effect of the weight of the language model

I chose to experiment with the following weights for lm_alpha: 

lm_alpha=0: We assign no weight to the language model, disabling its influence during decoding. The model relies solely on the acoustic model predictions.

lm_alpha=1: We assign equal weight to the language model and the acoustic model predictions. This weight implies that we have a high level of confidence in the language model and want it to have a significant impact on the decoding process.

lm_alpha=1.5, lm_alpha=2: The next two weights we experiment with imply that we overtrust the language model; we use them to check how biased the results become.

In [5]:
# Apply the decoder settings chosen in the setup cell: lm_alpha / lm_beta go
# to the external scorer, beam_width to the decoder's beam search.
# To reproduce the other transcript files, change lm_alpha and re-run this
# cell before transcribing.
model.setScorerAlphaBeta(lm_alpha, lm_beta)
# Last expression of the cell, so its return value is what the notebook
# displays as the cell output.
model.setBeamWidth(beam_width)

0

In [6]:
def read_wav_file(filename):
    """Read all audio frames of a WAV file and report its basic properties.

    The sample rate, the frame count and the byte length of the audio data
    are printed to stdout as a quick sanity check.

    Args:
        filename: Path of the WAV file to read.

    Returns:
        A ``(buffer, rate)`` tuple: the raw frame bytes of the whole file and
        the sample rate reported by the WAV header.
    """
    with wave.open(filename, 'rb') as wav_reader:
        sample_rate = wav_reader.getframerate()
        frame_count = wav_reader.getnframes()
        # Read the complete audio payload in one go.
        raw_audio = wav_reader.readframes(frame_count)

    report = (
        ("Rate:", sample_rate),
        ("Frames:", frame_count),
        ("Buffer Len:", len(raw_audio)),
    )
    for label, value in report:
        print(label, value)

    return raw_audio, sample_rate

In [7]:
def transcribe_batch(audio_file):
    """Transcribe a single WAV file with the notebook's global ``model``.

    Args:
        audio_file: Path of the WAV file to transcribe.

    Returns:
        Whatever ``model.stt`` returns for the file's audio samples.
    """
    # The sample rate returned by read_wav_file is not used here.
    raw_audio = read_wav_file(audio_file)[0]
    # Reinterpret the raw frame bytes as 16-bit integer samples.
    samples = np.frombuffer(raw_audio, dtype=np.int16)
    return model.stt(samples)


In [48]:
# Transcribe every .wav file in the `Outputs` folder and append one
# "<utterance-id> <transcript>" line per file to the transcript file.
# (Originally noted as the variant that worked better for the noisy files.)
output_path ="C:/Users/kleop/Documents/repos/Exercises/ASR_assignment/LibriSpeech/dev-clean-2/8842/transcripts/trans_clean_lm_0.txt"
directory = 'C:/Users/kleop/Documents/repos/Exercises/ASR_assignment/LibriSpeech/dev-clean-2/8842/Outputs'
# Open the transcript once for the whole batch instead of once per file.
# Append mode: re-running the cell adds to the existing file.
with open(output_path, "a") as output_file:
    for filename in glob.glob(os.path.join(directory, '*.wav')):
        t = transcribe_batch(filename)
        # Utterance id = file name without folder and extension.
        # The earlier re.sub('.wav', '', ...) treated '.' as "any character"
        # and was not anchored, so it could delete text inside the name; the
        # split('/') + strip-'Outputs' step only worked for this exact folder
        # name and Windows-style separators. os.path handles both safely.
        line_transcr = os.path.splitext(os.path.basename(filename))[0]
        output_file.write(line_transcr + " " + t + "\n")

            

Rate: 16000
Frames: 43200
Buffer Len: 86400
C:/Users/kleop/Documents/repos/Exercises/ASR_assignment/LibriSpeech/dev-clean-2/8842/304647\8842-304647-0001.wav
Rate: 16000
Frames: 507200
Buffer Len: 1014400
C:/Users/kleop/Documents/repos/Exercises/ASR_assignment/LibriSpeech/dev-clean-2/8842/304647\8842-304647-0002.wav
Rate: 16000
Frames: 367360
Buffer Len: 734720
C:/Users/kleop/Documents/repos/Exercises/ASR_assignment/LibriSpeech/dev-clean-2/8842/304647\8842-304647-0006.wav
Rate: 16000
Frames: 195120
Buffer Len: 390240
C:/Users/kleop/Documents/repos/Exercises/ASR_assignment/LibriSpeech/dev-clean-2/8842/304647\8842-304647-0010.wav
Rate: 16000
Frames: 140080
Buffer Len: 280160
C:/Users/kleop/Documents/repos/Exercises/ASR_assignment/LibriSpeech/dev-clean-2/8842/304647\8842-304647-0011.wav


In [129]:
# Batch transcription of the .wav files in speaker folder 304647
# (originally noted as the variant that works better for the clean files).
# One "<utterance-id> <transcript>" line per file is appended to the
# transcript file; append mode means re-running the cell adds more lines.
output_path ="C:/Users/kleop/Documents/repos/Exercises/ASR_assignment/LibriSpeech/dev-clean-2/8842/transcripts/trans_clean_lm_1.txt"
directory = 'C:/Users/kleop/Documents/repos/Exercises/ASR_assignment/LibriSpeech/dev-clean-2/8842/304647'
wav_pattern = os.path.join(directory, '*.wav')
with open(output_path, "a") as output_file:
    for filename in glob.glob(wav_pattern):
        t = transcribe_batch(filename)
        # The file name without folder or extension is the utterance id.
        stem, _ext = os.path.splitext(os.path.basename(filename))
        # Drop any backslash that may remain in the name.
        line_transcr = stem.replace('\\', '')
        output_file.write(" ".join((line_transcr, t)) + "\n")

Rate: 16000
Frames: 43200
Buffer Len: 86400
Rate: 16000
Frames: 507200
Buffer Len: 1014400
Rate: 16000
Frames: 367360
Buffer Len: 734720
Rate: 16000
Frames: 195120
Buffer Len: 390240
Rate: 16000
Frames: 140080
Buffer Len: 280160
