In [1]:
!pip install webrtcvad
!pip install deepspeech

Collecting webrtcvad
[?25l  Downloading https://files.pythonhosted.org/packages/89/34/e2de2d97f3288512b9ea56f92e7452f8207eb5a0096500badf9dfd48f5e6/webrtcvad-2.0.10.tar.gz (66kB)
[K     |█████                           | 10kB 5.9MB/s eta 0:00:01[K     |██████████                      | 20kB 8.6MB/s eta 0:00:01[K     |██████████████▉                 | 30kB 6.0MB/s eta 0:00:01[K     |███████████████████▉            | 40kB 2.9MB/s eta 0:00:01[K     |████████████████████████▊       | 51kB 3.5MB/s eta 0:00:01[K     |█████████████████████████████▊  | 61kB 4.1MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 3.1MB/s 
[?25hBuilding wheels for collected packages: webrtcvad
  Building wheel for webrtcvad (setup.py) ... [?25l[?25hdone
  Created wheel for webrtcvad: filename=webrtcvad-2.0.10-cp37-cp37m-linux_x86_64.whl size=72334 sha256=801953264b7e650555c275b81366adc6e0ca0aa6086b20e37cfacd104e9af070
  Stored in directory: /root/.cache/pip/wheels/44/2a/18/bd1aec41cac7

In [2]:
!wget https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.pbmm
!wget https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.scorer

--2021-03-03 19:05:28--  https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.pbmm
Resolving github.com (github.com)... 52.69.186.44
Connecting to github.com (github.com)|52.69.186.44|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github-releases.githubusercontent.com/60273704/8b25f180-3b0f-11eb-8fc1-de4f4ec3b5a3?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20210303%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20210303T190528Z&X-Amz-Expires=300&X-Amz-Signature=72fbdc7d3ad216504db2a63540b4c2d9c19ea0a88cb5003563f5f38c897ec182&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=60273704&response-content-disposition=attachment%3B%20filename%3Ddeepspeech-0.9.3-models.pbmm&response-content-type=application%2Foctet-stream [following]
--2021-03-03 19:05:28--  https://github-releases.githubusercontent.com/60273704/8b25f180-3b0f-11eb-8fc1-de4f4ec3b5a3?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Creden

In [3]:
import numpy as np
import sys
import os
import IPython
import wave
import json
import webrtcvad
import collections
from deepspeech import Model

In [4]:
# Paths (relative to the working directory) of the model file and the
# external scorer file handed to DeepSpeech below.
model_file_path = "deepspeech-0.9.3-models.pbmm"
sc_file_path = "deepspeech-0.9.3-models.scorer"
# Decoder settings applied to the model below.
beam_width = 500
sc_alpha = 0.93
sc_beta = 1.18

# Build the DeepSpeech model and attach the external scorer to it.
# `model` is a notebook-level global that transcription() uses later on.
model = Model(model_file_path)
model.enableExternalScorer(sc_file_path)

# Apply the scorer alpha/beta values and the beam width configured above.
# NOTE(review): the last call's return value becomes the cell output.
model.setScorerAlphaBeta(sc_alpha, sc_beta)
model.setBeamWidth(beam_width)

0

In [5]:
# Load the input recording.
# The file must be mono, 16-bit and sampled at 16000 Hz; anything else
# trips an assertion before any audio is read.
def input_read(filename):
    """Read a WAV file and return ``(audio_data, rate, duration)``.

    Args:
        filename: Path of the WAV file to open.

    Returns:
        A tuple of the raw frame bytes, the sample rate in Hz, and the
        duration in seconds (frame count divided by the rate).

    Raises:
        AssertionError: If the file is not single-channel, does not use
            2-byte samples, or is not sampled at 16000 Hz.
    """
    with wave.open(filename, 'r') as wav:
        assert wav.getnchannels() == 1
        assert wav.getsampwidth() == 2
        rate = wav.getframerate()
        assert rate == 16000
        frame_count = wav.getnframes()
        pcm = wav.readframes(frame_count)
    return pcm, rate, frame_count / rate

In [6]:
# A Frame is one fixed-length slice of the recording.
# segment_generator (below) creates one instance per slice.
class Frame(object):
    """One chunk of raw audio together with its position in the recording."""

    def __init__(self, bytes, timestamp, duration):
        self.bytes = bytes          # raw audio bytes of this chunk
        self.timestamp = timestamp  # start of the chunk, in seconds
        self.duration = duration    # length of the chunk, in seconds


# The later stages of this notebook consume the audio in small,
# fixed-length chunks. This generator cuts the raw byte string into them.
def segment_generator(frame_time, audio, rate):
    """Yield consecutive, equally sized Frames cut from ``audio``.

    Args:
        frame_time: Length of each frame in milliseconds.
        audio: Raw audio bytes; the size computation assumes 2 bytes per
            sample.
        rate: Sample rate in Hz.

    Yields:
        Frame objects covering ``audio`` from the start. A trailing piece
        shorter than one full frame is dropped.

    Raises:
        ValueError: On first iteration, if ``frame_time`` and ``rate`` give
            a frame size that is not positive (the loop could never advance).
    """
    # Frame size in bytes: samples per frame times 2 bytes per sample.
    n = int(rate * (frame_time / 1000.0) * 2)
    if n <= 0:
        raise ValueError(
            "frame_time and rate must yield a positive frame size, got %d bytes" % n
        )
    duration = (float(n) / rate) / 2.0
    timestamp = 0.0
    # The "+ 1" makes the bound inclusive, so the final full frame is emitted
    # even when len(audio) is an exact multiple of n.
    for offset in range(0, len(audio) - n + 1, n):
        yield Frame(audio[offset:offset + n], timestamp, duration)
        timestamp += duration

In [7]:
def vad_and_buffer(rate, frame_duration_ms, padding_duration_ms, vad, frames):
    """Group frames into voiced segments using a sliding-window vote.

    Args:
        rate: Sample rate, forwarded to ``vad.is_speech``.
        frame_duration_ms: Duration of one frame in milliseconds.
        padding_duration_ms: Duration of the sliding window in milliseconds;
            the window holds ``int(padding_duration_ms / frame_duration_ms)``
            frames.
        vad: Object exposing ``is_speech(frame_bytes, rate)``.
        frames: Iterable of objects with a ``bytes`` attribute.

    Yields:
        One ``bytes`` object per detected segment: the concatenated bytes of
        every frame collected while the detector was in the "in speech" state.
    """
    window_size = int(padding_duration_ms / frame_duration_ms)
    # Bounded deque used as a ring buffer of (frame, is_speech) pairs.
    window = collections.deque(maxlen=window_size)
    # More than 90% of the window capacity must agree to flip the state.
    threshold = 0.9 * window.maxlen

    in_speech = False  # start outside of speech
    collected = []

    for frame in frames:
        voiced = vad.is_speech(frame.bytes, rate)

        if in_speech:
            # Inside a segment: keep every frame and watch for enough
            # unvoiced frames in the window to close the segment.
            collected.append(frame)
            window.append((frame, voiced))
            silent_count = sum(1 for _, flag in window if not flag)
            if silent_count > threshold:
                in_speech = False
                yield b''.join(item.bytes for item in collected)
                window.clear()
                collected = []
            continue

        # Outside a segment: wait until the window is mostly voiced.
        window.append((frame, voiced))
        voiced_count = sum(1 for _, flag in window if flag)
        if voiced_count > threshold:
            in_speech = True
            # The segment starts with the frames already held in the window.
            collected.extend(item for item, _ in window)
            window.clear()

    # Input exhausted while still inside a segment: emit what was gathered.
    if collected:
        yield b''.join(item.bytes for item in collected)


In [8]:
def the_final_segment_generator(filename, aggressiveness,
                                frame_duration_ms=30, padding_duration_ms=300):
    """Read a WAV file and split it into voiced segments.

    Args:
        filename: Path of the WAV file, read with ``input_read``.
        aggressiveness: Value converted with ``int()`` and passed to
            ``webrtcvad.Vad``.
        frame_duration_ms: Length of each frame in milliseconds. The same
            value is used both to cut the frames and to size the VAD window,
            which is why it is a single parameter. Defaults to 30.
        padding_duration_ms: Length of the VAD sliding window in
            milliseconds. Defaults to 300.

    Returns:
        A tuple ``(segments, rate, audio_length)``. ``segments`` is the
        generator returned by ``vad_and_buffer``, so voice detection runs
        lazily as it is iterated; the file itself is read up front.
    """
    audio, rate, audio_length = input_read(filename)
    vad = webrtcvad.Vad(int(aggressiveness))
    frames = list(segment_generator(frame_duration_ms, audio, rate))
    segments = vad_and_buffer(rate, frame_duration_ms, padding_duration_ms, vad, frames)

    return segments, rate, audio_length


In [9]:
def transcription(filename):
    """Transcribe every voiced segment of a WAV file.

    The file is split with ``the_final_segment_generator`` (aggressiveness
    1); each segment is viewed as int16 samples and passed to the global
    ``model``'s ``stt`` method.

    Args:
        filename: Path of the WAV file to transcribe.

    Returns:
        A list with one ``model.stt`` result per segment, in segment order.
    """
    # Only the segments are needed here; rate and total length are ignored.
    speech_segments, _rate, _length = the_final_segment_generator(filename, 1)
    return [
        model.stt(np.frombuffer(chunk, dtype=np.int16))
        for chunk in speech_segments
    ]

In [10]:
# Input recordings (one per diarization cluster) and their transcripts.
file_name_1 = "/content/diarization_cluster_0.wav"
file_name_2 = "/content/diarization_cluster_1.wav"

text_1 = transcription(file_name_1)
text_2 = transcription(file_name_2)

# Each transcript is saved as a JSON list of per-segment strings. The `with`
# block closes the file on exit, so no explicit close call is needed.
for transcript_path, transcript in (('transcript_1.txt', text_1),
                                    ('transcript_2.txt', text_2)):
    with open(transcript_path, 'w') as f:
        f.write(json.dumps(transcript))

<function TextIOWrapper.close>