# Setup

In [1]:
import pandas as pd
import numpy as np
from IPython import display

from jiwer import wer, cer
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
import torchaudio

from torchnlp.encoders import LabelEncoder

import os
import csv

# Custom Dataset

In [2]:
class SilentSpeech(torch.utils.data.Dataset):
    def __init__(self, metadata_path, dataset_type=None):
        with open(metadata_path) as metadata:
            flist = csv.reader(metadata, delimiter="|", quotechar="'", quoting=csv.QUOTE_MINIMAL)
            self._flist = list(flist)
            fis = []
            if dataset_type:
                for fi in self._flist:
                    line = fi
                    _, _, cur_dataset_type, modality = line
                    if cur_dataset_type == dataset_type:
                        fis.append(fi)
            print("LIST OF FILES:", self._flist[0])
            self._flist = fis

    def __getitem__(self, n):
        line = self._flist[n]
        cur_path, text, dataset_type, _ = line
        waveform, sr = torchaudio.load(cur_path)
        return (waveform, sr, text, dataset_type)

    def __len__(self):
        return len(self._flist)

In [3]:
class SilentSpeechPred(torch.utils.data.Dataset):
    def __init__(self, metadata_path, dataset_type=None, silent_only=False, voiced_only=False):
        with open(metadata_path) as metadata:
            flist = csv.reader(metadata, delimiter="|", quotechar="'", quoting=csv.QUOTE_MINIMAL)
            self._flist = list(flist)
            fis = []
            if dataset_type:
                for fi in self._flist:
                    line = fi
                    _, _, cur_dataset_type, modality = line
                    if cur_dataset_type == dataset_type:
                        if silent_only and modality == "silent":
                            fis.append(fi)
                        elif voiced_only and modality == "voiced":
                            fis.append(fi)
                        elif not silent_only and not voiced_only:
                            fis.append(fi)
                        else:
                            Exception("You've selected silent only and voiced only.")

            self._flist = fis
            print("(1) LIST OF FILES:", len(self._flist))
            print("(2) LIST OF FILES:", self._flist[0])

    def __getitem__(self, n):
        line = self._flist[n]
        cur_path, text, dataset_type, _ = line
        # waveform, sr = torchaudio.load(cur_path)
        mel_spectrogram = torch.load(cur_path)
        return (mel_spectrogram, text, dataset_type)

    def __len__(self):
        return len(self._flist)

# Preprocessing

In [4]:
# characters = [x for x in " abcdefghijklmnopqrstuvwxyz0123456789-"]
characters = [x for x in " abcdefghijklmnopqrstuvwxyz-"]
encoder = LabelEncoder(characters)

print(encoder.vocab, len(encoder.vocab))

['<unk>', ' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '-'] 29


In [5]:
from torchaudio.prototype.ctc_decoder import download_pretrained_files
files = download_pretrained_files("librispeech-4-gram")
print(files)

PretrainedFiles(lexicon='/home/joe/.cache/torch/hub/torchaudio/decoder-assets/librispeech-4-gram/lexicon.txt', tokens='/home/joe/.cache/torch/hub/torchaudio/decoder-assets/librispeech-4-gram/tokens.txt', lm='/home/joe/.cache/torch/hub/torchaudio/decoder-assets/librispeech-4-gram/lm.bin')


In [5]:
from torchaudio.prototype.ctc_decoder import ctc_decoder

CUR_DATASET = "SILENT_SPEECH" # "LJSPEECH", "SILENT_SPEECH"
if CUR_DATASET == "SILENT_SPEECH":
    SR = 16000 # Silent Speech 22_050 # LJSpeech
else:
    SR = 22_050

train_audio_transforms = nn.Sequential(
    torchaudio.transforms.MelSpectrogram(
        sample_rate=SR,
        n_mels=128,
        hop_length=160,
        win_length=432,
        n_fft=512,
        center=False),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=15),
    torchaudio.transforms.TimeMasking(time_mask_param=35)
)

valid_audio_transforms = torchaudio.transforms.MelSpectrogram()

import jiwer
transformation = jiwer.Compose(\
        [jiwer.RemovePunctuation(), jiwer.ToLowerCase()])

def data_processing(data, data_type="train"):
    spectrograms = []
    labels = []
    input_lengths = []
    label_lengths = []

    # for (waveform, _, utterance) in data:
    for cur in data:
        if CUR_DATASET == "SILENT_SPEECH":
            waveform, _, utterance, dataset_type = cur
        else:
            waveform, _, _, utterance = cur

        if data_type == 'train':
            spec = train_audio_transforms(waveform).squeeze(0).transpose(0, 1)
        elif data_type == "valid":
            spec = valid_audio_transforms(waveform).squeeze(0).transpose(0, 1)
        else:
            raise Exception('data_type should be train or valid')
        spectrograms.append(spec)

        label = transformation(utterance)
        label = encoder.batch_encode(utterance.lower())
        labels.append(label)
        input_lengths.append(spec.shape[0]//2)
        label_lengths.append(len(label))

    spectrograms = nn.utils.rnn.pad_sequence(spectrograms, batch_first=True).unsqueeze(1).transpose(2, 3)
    labels = nn.utils.rnn.pad_sequence(labels, batch_first=True)

    return spectrograms, labels, input_lengths, label_lengths

def data_processing_preds(data, data_type="train"):
    """THIS IS ONLY FOR THE PREDICTED MEL_SPECTROGRAMS FOR THE SEMG SILENT SPEECH MODEL!"""
    spectrograms = []
    labels = []
    input_lengths = []
    label_lengths = []

    # for (waveform, _, utterance) in data:
    for cur in data:
        mel_spectrogram, utterance, dataset_type = cur

        #print("pre-mel shape:", mel_spectrogram.shape)
        # mel_spectrogram = mel_spectrogram.transpose(0, 1)
        #print("post-mel shape:", mel_spectrogram.shape)
        spectrograms.append(mel_spectrogram)

        label = transformation(utterance)
        label = encoder.batch_encode(utterance.lower())
        labels.append(label)
        input_lengths.append(mel_spectrogram.shape[0]//2)
        label_lengths.append(len(label))

    spectrograms = nn.utils.rnn.pad_sequence(spectrograms, batch_first=True).unsqueeze(1).transpose(2, 3)
    labels = nn.utils.rnn.pad_sequence(labels, batch_first=True)

    return spectrograms, labels, input_lengths, label_lengths

def BeamDecoder(beam_search_decoder,
                output,
                labels,
                label_lengths):
    print("beam output shape:", output.shape)

    decodes = []
    targets = []

    for i, pred in enumerate(output):
        pred = pred.cpu()
        pred = pred.unsqueeze(0)
        decode = []
        cur_target = labels[i][:label_lengths[i]]
        if len(cur_target) > 0:
            cur_target = \
                "".join(encoder.batch_decode(torch.tensor(cur_target)))
        else:
            cur_target = ""
        targets.append(cur_target)

        beam_search_result     = beam_search_decoder(pred)
        beam_search_transcript = " ".join(beam_search_result[0][0].words).strip()

        # cur_decode = decode
        cur_decode = beam_search_transcript

        """
        if len(cur_decode) > 0:
            cur_decode = \
                "".join(encoder.batch_decode(torch.tensor(cur_decode)))
        else:
            cur_decode = ""
        """
        decodes.append(cur_decode)

    return decodes, targets

def GreedyDecoder(output, labels, label_lengths, blank_label=28, collapse_repeated=True):
    arg_maxes = torch.argmax(output, dim=2)
    print("greedy output shape:", output.shape, arg_maxes.shape)
    decodes = []
    targets = []
    for i, args in enumerate(arg_maxes):
        decode = []
        cur_target = labels[i][:label_lengths[i]]
        if len(cur_target) > 0:
            cur_target = \
                "".join(encoder.batch_decode(torch.tensor(cur_target)))
        else:
            cur_target = ""
        targets.append(cur_target)
        for j, index in enumerate(args):
            if index != blank_label:
                if collapse_repeated and j != 0 and index == args[j -1]:
                    continue
                decode.append(index.item())
        cur_decode = decode
        if len(cur_decode) > 0:
            cur_decode = \
                "".join(encoder.batch_decode(torch.tensor(cur_decode)))
        else:
            cur_decode = ""
        decodes.append(cur_decode)

    return decodes, targets



# Model

## DeepSpeech2 Model

In [6]:
class CNNLayerNorm(nn.Module):
    """Layer normalization built for cnns input"""
    def __init__(self, n_feats):
        super(CNNLayerNorm, self).__init__()
        self.layer_norm = nn.LayerNorm(n_feats)

    def forward(self, x):
        # x (batch, channel, feature, time)
        x = x.transpose(2, 3).contiguous() # (batch, channel, time, feature)
        x = self.layer_norm(x)
        return x.transpose(2, 3).contiguous() # (batch, channel, feature, time) 

class ResidualCNN(nn.Module):
    """Residual CNN inspired by https://arxiv.org/pdf/1603.05027.pdf
        except with layer norm instead of batch norm
    """
    def __init__(self, in_channels, out_channels, kernel, stride, dropout, n_feats):
        super(ResidualCNN, self).__init__()

        self.cnn1 = nn.Conv2d(in_channels, out_channels, kernel, stride, padding=kernel//2)
        self.cnn2 = nn.Conv2d(out_channels, out_channels, kernel, stride, padding=kernel//2)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.layer_norm1 = CNNLayerNorm(n_feats)
        self.layer_norm2 = CNNLayerNorm(n_feats)

    def forward(self, x):
        residual = x  # (batch, channel, feature, time)
        x = self.layer_norm1(x)
        x = F.gelu(x)
        x = self.dropout1(x)
        x = self.cnn1(x)
        x = self.layer_norm2(x)
        x = F.gelu(x)
        x = self.dropout2(x)
        x = self.cnn2(x)
        x += residual
        return x # (batch, channel, feature, time)

class BidirectionalGRU(nn.Module):
    def __init__(self, rnn_dim, hidden_size, dropout, batch_first):
        super(BidirectionalGRU, self).__init__()

        self.BiGRU = nn.GRU(
            input_size=rnn_dim, hidden_size=hidden_size,
            num_layers=1, batch_first=batch_first, bidirectional=True)
        self.layer_norm = nn.LayerNorm(rnn_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.layer_norm(x)
        x = F.gelu(x)
        x, _ = self.BiGRU(x)
        x = self.dropout(x)
        return x

class SpeechRecognitionModel(nn.Module):
    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats, stride=2, dropout=0.1):
        super(SpeechRecognitionModel, self).__init__()
        n_feats = n_feats//2
        self.cnn = nn.Conv2d(1, 32, 3, stride=stride, padding=3//2)  # cnn for extracting heirachal features

        # n residual cnn layers with filter size of 32
        self.rescnn_layers = nn.Sequential(*[
            ResidualCNN(32, 32, kernel=3, stride=1, dropout=dropout, n_feats=n_feats) 
            for _ in range(n_cnn_layers)
        ])
        self.fully_connected = nn.Linear(n_feats*32, rnn_dim)
        self.birnn_layers = nn.Sequential(*[
            BidirectionalGRU(rnn_dim=rnn_dim if i==0 else rnn_dim*2,
                             hidden_size=rnn_dim, dropout=dropout, batch_first=i==0)
            for i in range(n_rnn_layers)
        ])
        self.classifier = nn.Sequential(
            nn.Linear(rnn_dim*2, rnn_dim),  # birnn returns rnn_dim*2
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(rnn_dim, n_class)
        )

    def forward(self, x):
        x = self.cnn(x)
        x = self.rescnn_layers(x)
        sizes = x.size()
        x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3])  # (batch, feature, time)
        x = x.transpose(1, 2) # (batch, time, feature)
        x = self.fully_connected(x)
        x = self.birnn_layers(x)
        x = self.classifier(x)
        return x

# Train / Test Code

In [7]:
from torch.cuda.amp.grad_scaler import GradScaler
import random

amp_enabled = True

class IterMeter(object):
    """keeps track of total iterations"""
    def __init__(self):
        self.val = 0

    def step(self):
        self.val += 1

    def get(self):
        return self.val

def train(model, device, train_loader, criterion, optimizer, scheduler, epoch, iter_meter, run):
    model.train()
    data_len = len(train_loader.dataset)

    # AMP
    scaler = GradScaler()

    for batch_idx, _data in enumerate(train_loader):
        spectrograms, labels, input_lengths, label_lengths = _data
        spectrograms, labels = spectrograms.to(device), labels.to(device)

        # optimizer.zero_grad()
        
        with torch.autocast(
            enabled=amp_enabled,
            dtype=torch.bfloat16,
            device_type="cuda"):

            output = model(spectrograms)  # (batch, time, n_class)
            output = F.log_softmax(output, dim=2)
            output = output.transpose(0, 1) # (time, batch, n_class)

            loss = criterion(output, labels, input_lengths, label_lengths)

        # loss.backward()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        # scaler.step(scheduler)
        scaler.update()

        if run:
            run["train_loss"].log(loss.item())
            run["learning_rate"].log(scheduler.get_last_lr())

        #optimizer.step()
        scheduler.step()
        
        iter_meter.step()
        if batch_idx % 100 == 0 or batch_idx == data_len:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(spectrograms), data_len,
                100. * batch_idx / len(train_loader), loss.item()))

def test(model, device, test_loader, criterion, run, beam=False, beam_decoder=None):
    print('\nevaluating...')
    model.eval()
    test_loss = 0
    test_cer, test_wer = [], []

    if beam:
        beam_test_cer, beam_test_wer = [], []

    """
    starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
    repetitions = 300
    timings=np.zeros((repetitions,1))

    N = 12
    torch.set_num_threads(N)

    # GPU warmup
    DUMMY = [ x[0] for x in iter(test_loader).next() ]
    print("DUMMY:", len(DUMMY))
    spectrograms, labels, input_lengths, label_lengths = DUMMY
    dummy_input = spectrograms.unsqueeze(0).to(device)
    for _ in range(10):
        _ = model(dummy_input)

    """
    with torch.no_grad():
        for i, _data in enumerate(test_loader):
            spectrograms, labels, input_lengths, label_lengths = _data 
            spectrograms, labels = spectrograms.to(device), labels.to(device)
            
            output = model(spectrograms)  # (batch, time, n_class)

            output = F.log_softmax(output, dim=2)
            output = output.transpose(0, 1) # (time, batch, n_class)

            loss = criterion(output, labels, input_lengths, label_lengths)
            test_loss += loss.item() / len(test_loader)

            decoded_preds, decoded_targets = GreedyDecoder(output.transpose(0, 1), labels, label_lengths)
            if beam:
                beam_preds, beam_targets = \
                    BeamDecoder(beam_decoder, output.transpose(0, 1), labels, label_lengths)

            print(f"greedy TEST {i}", decoded_preds[0:3], decoded_targets[0:3])
            if beam:
                print(f"beam TEST {i}", beam_preds, beam_targets)

            #print("Targets:", decoded_targets[0:2])
            #print("Preds:", decoded_preds[0:2])
                
            for j in range(len(decoded_preds)):
                test_cer.append(cer(decoded_targets[j], decoded_preds[j]))
                test_wer.append(wer(decoded_targets[j], decoded_preds[j]))

                if beam:
                    beam_test_cer.append(cer(beam_targets[j], beam_preds[j]))
                    beam_test_wer.append(cer(beam_targets[j], beam_preds[j]))

    """
    # inference
    with torch.no_grad():
        for rep in range(repetitions):
            starter.record()
            _ = model(dummy_input)
            ender.record()
            torch.cuda.synchronize()
            curr_time = starter.elapsed_time(ender)
            run["infer_log"].log(curr_time)
            timings[rep] = curr_time
    mean_syn = np.sum(timings) / repetitions
    std_syn = np.std(timings)

    repetitions = 100

    # throughput
    total_time = 0
    optimal_batch_size = 1
    with torch.no_grad():
        for rep in range(repetitions):
            starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
            starter.record()
            _ = model(dummy_input)
            ender.record()
            torch.cuda.synchronize()
            curr_time = starter.elapsed_time(ender)/1000
            run["throughput_log"].log(curr_time)
            total_time += curr_time
    throughput = (repetitions*optimal_batch_size) / total_time
    """

    avg_cer = sum(test_cer) / len(test_cer)
    avg_wer = sum(test_wer) / len(test_wer)

    if beam:
        avg_beam_cer = sum(beam_test_cer) / len(beam_test_cer)
        avg_beam_wer = sum(beam_test_wer) / len(beam_test_wer)

    if run:
        run["test_loss"].log(test_loss)
        run["cer"].log(avg_cer)
        run["wer"].log(avg_wer)
        if beam:
            run["beam_cer"].log(avg_beam_cer)
            run["beam_wer"].log(avg_beam_wer)
        #run["mean_syn"].log(mean_syn)
        #run["throughput"].log(throughput)
    
    print('Test set: Average loss: {:.4f}, Average CER: {:4f} Average WER: {:.4f}\n'.format(test_loss, avg_cer, avg_wer))
    return test_loss, avg_wer

def main(dataset_path, learning_rate=5e-4, batch_size=20, \
    epochs=10, checkpoint_path="", run=None, device=None, quantize=False, semg_eval=False, \
    semg_train=False, silent_only=False, voiced_only=False, beam=False):
    hparams = {
        "n_cnn_layers":  3,
        "n_rnn_layers":  5,
        "rnn_dim":       512,
        "n_class":       len(encoder.vocab),
        "n_feats":       128,
        "stride":        2,
        "dropout":       0.1,
        "learning_rate": learning_rate,
        "batch_size":    batch_size,
        "epochs":        epochs
    }

    if run:
        run["hparams"] = hparams
        run["quantize"] = quantize

    print("hparams:", hparams)

    use_cuda = torch.cuda.is_available()
    seed = 7
    random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    
    if not device:
        device = torch.device("cuda" if use_cuda else "cpu")
        print("device:", device)
    else:
        device = torch.device(device)

    if CUR_DATASET == "SILENT_SPEECH":
        if semg_eval:
            train_dataset = None
            test_dataset  = SilentSpeechPred(\
                "./utils/metadata_dgaddy_preds.csv", dataset_type="test", silent_only=True)
        else:
            if semg_train:
                train_dataset = SilentSpeechPred(\
                    "./utils/metadata_dgaddy_preds.csv",
                    dataset_type="train",
                    silent_only=silent_only,
                    voiced_only=voiced_only)
                test_dataset  = SilentSpeechPred(\
                    "./utils/metadata_dgaddy_preds.csv",
                    dataset_type="test",
                    silent_only=True)
                print("LEN TRAIN TEST:", len(train_dataset), len(test_dataset))
            else:
                train_dataset = SilentSpeech("./utils/metadata_dgaddy.csv", dataset_type="train")
                test_dataset  = SilentSpeech("./utils/metadata_dgaddy.csv", dataset_type="test")
                print("LEN TRAIN TEST:", len(train_dataset), len(test_dataset))
            
    else:
        dataset = torchaudio.datasets.LJSPEECH(dataset_path, download=False)

    kwargs = {'num_workers': 4, 'pin_memory': True} if use_cuda else {}

    """
    dataset_len = 500 # int(len(dataset) * 1.0)
    train_split = int(dataset_len * 0.9)
    test_split  = dataset_len - train_split

    dataset = torch.utils.data.Subset(dataset, range(0, dataset_len))
    train_dataset, test_dataset = \
        torch.utils.data.random_split(dataset, [train_split, test_split])
    """
    
    if train_dataset:
        if semg_train:
            train_loader = data.DataLoader(dataset=train_dataset,
                                        batch_size=hparams['batch_size'],
                                        shuffle=True,
                                        collate_fn=lambda x: data_processing_preds(x, 'train'),
                                        **kwargs)
        else:
            train_loader = data.DataLoader(dataset=train_dataset,
                                        batch_size=hparams['batch_size'],
                                        shuffle=True,
                                        collate_fn=lambda x: data_processing(x, 'train'),
                                        **kwargs)

    if semg_eval or semg_train:
        test_loader = data.DataLoader(dataset=test_dataset,
                                    batch_size=hparams['batch_size'],
                                    shuffle=False,
                                    collate_fn=lambda x: data_processing_preds(x, 'valid'),
                                    **kwargs)
    else:
        test_loader = data.DataLoader(dataset=test_dataset,
                                    batch_size=hparams['batch_size'],
                                    shuffle=False,
                                    collate_fn=lambda x: data_processing(x, 'valid'),
                                    **kwargs)

    if train_dataset:                       
        print("(BATCHES) TRAIN LEN, TEST LEN:", len(train_loader), len(test_loader))

    model = SpeechRecognitionModel(
        hparams['n_cnn_layers'], hparams['n_rnn_layers'], hparams['rnn_dim'],
        hparams['n_class'], hparams['n_feats'], hparams['stride'], hparams['dropout']
    ).to(device)

    if checkpoint_path:
        model.load_state_dict(torch.load(checkpoint_path))
    
    if quantize:
        model = torch.quantization.quantize_dynamic(
            model,  # the original model
            {torch.nn.GRU, torch.nn.Linear},  # a set of layers to dynamically quantize
            dtype=torch.qint8)

    # print("QUANTIZE:", quantize)

    print(model)
    print('Num Model Parameters', sum([param.nelement() for param in model.parameters()]))

    if beam:
        tokens_path  = "beam_decoder/tokens.txt"
        lm_weight    = 0 # 6.00 # 3.23
        word_score   = 0 # -0.26
        lexicon_path = "beam_decoder/lexicon.txt"
        use_lm = True
        beam_size    = 1
        nbest        = 1

        # beam params
        run["beam_size"] = beam_size
        run["beam_use_lm"] = use_lm
        run["beam_lm_weight"] = lm_weight
        run["beam_word_score"] = word_score
        run["beam_nbest"] = nbest

        lm = files.lm if use_lm else None

        beam_search_decoder = ctc_decoder(
            lexicon=lexicon_path,
            tokens=tokens_path,
            lm=lm,
            nbest=nbest,
            beam_size=beam_size,
            lm_weight=lm_weight,
            word_score=word_score,
            sil_token="<unk>")
    else:
        beam_search_decoder = None

    optimizer = optim.AdamW(model.parameters(), hparams['learning_rate'])
    criterion = nn.CTCLoss(blank=28).to(device)
    # criterion = nn.CTCLoss(blank=0).to(device)
    if train_dataset:
        scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=hparams['learning_rate'], 
                                                steps_per_epoch=int(len(train_loader)),
                                                epochs=hparams['epochs'],
                                                anneal_strategy='linear')
    # scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', 0.5, patience=5)

    best_test_loss = float("inf")
    best_avg_wer = float("inf")

    iter_meter = IterMeter()
    for epoch in range(1, epochs + 1):
        if train_dataset:
            train(model, device, train_loader, criterion, optimizer, scheduler, epoch, iter_meter, run)
        test_loss, avg_wer = test(model, device, test_loader, criterion, run, beam, beam_search_decoder)

        if train_dataset:
            # if test_loss < best_test_loss:
            if avg_wer < best_avg_wer:
                torch.save(model.state_dict(), f"./models/ds2_DATASET_{CUR_DATASET}_EPOCHS_{epoch}_TEST_LOSS_{test_loss}_WER_{avg_wer}")
                best_avg_wer = avg_wer
    
    return best_avg_wer

# GPU Runtime

In [9]:
!nvidia-smi

Fri Apr 29 22:36:26 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:2B:00.0  On |                  N/A |
| 94%   44C    P8    27W / 200W |    507MiB /  8192MiB |      4%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Train

In [8]:
def go(device=None,
       quantize=False,
       semg_eval=False,
       semg_train=False,
       silent_only=False,
       voiced_only=False,
       epochs=200,
       checkpoint_path="",
       beam=False):
    from dotenv import dotenv_values
    import neptune.new as neptune
    config = dotenv_values(".env")

    learning_rate = 5e-4
    # learning_rate = 5e-5
    batch_size = 5 # 20
    epochs = epochs # 200 # 50 # 10
    dataset_path = "/mnt/datasets/ljspeech/"
    # checkpoint_path = "/home/joe/projects/asr/models/silent_speech_asr_5_pc/1/ds2_DATASET_SILENT_SPEECH_EPOCHS_255_TEST_LOSS_-0.4287225107351939"
    checkpoint_path = checkpoint_path

    neptune_project = config["NEPTUNE_PROJECT"]
    neptune_token   = config["NEPTUNE_TOKEN"]

    run = neptune.init(project=neptune_project,
                    api_token=neptune_token)
    
    # run = None

    if run:
        run["dataset"] = CUR_DATASET
        run["checkpoint_path"] = checkpoint_path
        run["device"] = device
        run["beam"] = beam
        
    final_wer = main(
        dataset_path,
        learning_rate,
        batch_size,
        epochs,
        checkpoint_path,
        run,
        device,
        quantize,
        semg_eval,
        semg_train,
        silent_only,
        voiced_only,
        beam)
        
    if run:
        run.stop()
    return final_wer

# go(device="cuda", quantize=False)

#final_wer = go()
#print(final_wer)

# Train on Ground Truth Audio

In [None]:
checkpoint_path = \
    "/home/joe/projects/asr/models/silent_speech_asr_100_pc/1/"\
    "ds2_DATASET_SILENT_SPEECH_EPOCHS_23_TEST_LOSS_0.5270409451460275_45_WER"

final_wer = go(
    semg_eval=False,
    semg_train=False,
    epochs=100, # open vocab parallel := 100, closed vocab := 200
    checkpoint_path=checkpoint_path)
print(final_wer)

# Train on Transduction Silent Speech Predictions (E<sub>s</sub> Preds)

In [9]:
"""
checkpoint_path = \
    "/home/joe/projects/asr/models/silent_speech_open_parallel/silent_speech_open_parallel(silent_and_vocal_preds_and_full_ground)/"\
    "ds2_DATASET_SILENT_SPEECH_EPOCHS_9_TEST_LOSS_2.351101124286652_WER_0.798235669013118"
# checkpoint_path = ""
"""

checkpoint_path = \
    "/home/joe/projects/asr/models/silent_speech_asr_100_pc/1/"\
    "ds2_DATASET_SILENT_SPEECH_EPOCHS_23_TEST_LOSS_0.5270409451460275_45_WER"

final_wer = go(
    semg_eval=False,
    semg_train=True,
    silent_only=False,
    voiced_only=False,
    beam=False,
    epochs=100, # open vocab parallel := 100, closed vocab := 200
    checkpoint_path=checkpoint_path)
    
print(final_wer)

https://app.neptune.ai/miscellaneousstuff/asr-initial-experiments/e/AS-223
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#.stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
hparams: {'n_cnn_layers': 3, 'n_rnn_layers': 5, 'rnn_dim': 512, 'n_class': 29, 'n_feats': 128, 'stride': 2, 'dropout': 0.1, 'learning_rate': 0.0005, 'batch_size': 5, 'epochs': 100}
device: cuda
(1) LIST OF FILES: 2778
(2) LIST OF FILES: ['/home/joe/projects/silent_speech/pred_audio/open_vocab_parallel/voiced/991', 'Death!" and leaving him to digest that if he could, I hurried on after the artillery-man.', 'train', 'voiced']
(1) LIST OF FILES: 99
(2) LIST OF FILES: ['/home/joe/projects/silent_speech/pred_audio/open_vocab_parallel/silent/616', 'They seemed very helpless in that pit of theirs.', 'test', 'silent']
LEN TRAIN TEST: 2778 99
(BATCHES) TRAIN LEN, TEST LEN: 556 20




SpeechRecognitionModel(
  (cnn): Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (rescnn_layers): Sequential(
    (0): ResidualCNN(
      (cnn1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (cnn2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
      (layer_norm1): CNNLayerNorm(
        (layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      )
      (layer_norm2): CNNLayerNorm(
        (layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      )
    )
    (1): ResidualCNN(
      (cnn1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (cnn2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
      (layer_norm1): CNNLayerNorm(
        (layer_norm): LayerNorm((64,),

  "".join(encoder.batch_decode(torch.tensor(cur_target)))


greedy TEST 1 ['he cundes sexciemen taf the avbets had to lout thow mie for sateng paers annd set of i riths', 'the tran  i vecke of ctroks the sho wassllolty engens hr the juthon manggeo wit their sels s of er mens nors<unk> excad the me came ito the sation about na oclock with h credible signgs<unk> and yast do were stirmens the tdrieters migh a don<unk>', 'he tot staird<unk> all something about crolling hout an th think lik vish comver<unk> and ran on to the gad of the house at the grist<unk>'] ['the intense excitement of the events had no doubt left my perceptive powers in a state of erethism<unk>', 'the ringing impact of trucks<unk> the sharp whistle of the engines from the junction<unk> mingled with their shouts of <unk>men from mars<unk><unk> excited men came into the station about nine o<unk>clock with incredible tidings<unk> and caused no more disturbance than drunkards might have done<unk>', 'he turned<unk> stared<unk> bawled something about <unk>crawling out in a thing like 




evaluating...
greedy output shape: torch.Size([5, 848, 29]) torch.Size([5, 848])
greedy TEST 0 ['they simpet felling upphis ind that pid of thers<unk>', 'hat i news fwom thei copom<unk><unk> said i<unk>', 'and tives i so from the stragens osess of the thes from myself n the world apoutmy<unk> sat woghes all from my houtsi<unk> from somemar is to seem bly rboted<unk> out of the hime out ofs pace<unk> out of the strece and drashiy of a ar<unk>'] ['they seemed very helpless in that pit of theirs<unk>', '<unk>what news from the common<unk><unk> said i<unk>', 'at times i suffer from the strangest sense of detachment from myself and the world about me<unk> i seem to watch it all from the outside<unk> from somewhere inconceivably remote<unk> out of time<unk> out of space<unk> out of the stress and tragedy of it all<unk>']
greedy output shape: torch.Size([5, 916, 29]) torch.Size([5, 916])
greedy TEST 1 ['he gunde sexciemin of the avets had to lout to mime por save paers annd set of iriths<unk

# Eval on Silent Speech Predictions

In [10]:
checkpoint_path = \
        "/home/joe/projects/asr/models/silent_speech_open_parallel/silent_speech_open_parallel(silent_and_vocal_preds_and_ground)/"\
        "ds2_DATASET_SILENT_SPEECH_EPOCHS_9_TEST_LOSS_2.351101124286652_WER_0.798235669013118"

"""
checkpoint_path = \
        "/home/joe/projects/asr/models/silent_speech_asr_100_pc/2/"\
        "ds2_DATASET_SILENT_SPEECH_EPOCHS_41_TEST_LOSS_0.6224472553241907_51_WER"
"""

final_wer = go(
    semg_eval=True,
    epochs=1,
    checkpoint_path = checkpoint_path,
    beam=False)
print(final_wer)

https://app.neptune.ai/miscellaneousstuff/asr-initial-experiments/e/AS-209
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#.stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
hparams: {'n_cnn_layers': 3, 'n_rnn_layers': 5, 'rnn_dim': 512, 'n_class': 29, 'n_feats': 128, 'stride': 2, 'dropout': 0.1, 'learning_rate': 0.0005, 'batch_size': 5, 'epochs': 1}
device: cuda
(1) LIST OF FILES: 99
(2) LIST OF FILES: ['/home/joe/projects/silent_speech/pred_audio/open_vocab_parallel/silent/616', 'They seemed very helpless in that pit of theirs.', 'test', 'silent']


AttributeError: 'tuple' object has no attribute 'seek'. You can only torch.load from a file that is seekable. Please pre-load the data into a buffer like io.BytesIO and try to load from it instead.

# Post-Training Quantization

## Inference Time, Throughput (CPU)

In [16]:
final_wer = go(device="cpu", quantize=True)

https://app.neptune.ai/miscellaneousstuff/asr-initial-experiments/e/AS-93
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#.stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
hparams: {'n_cnn_layers': 3, 'n_rnn_layers': 5, 'rnn_dim': 512, 'n_class': 39, 'n_feats': 128, 'stride': 2, 'dropout': 0.1, 'learning_rate': 0.0005, 'batch_size': 20, 'epochs': 1}
(BATCHES) TRAIN LEN, TEST LEN: 23 3




QUANTIZE: True
SpeechRecognitionModel(
  (cnn): Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (rescnn_layers): Sequential(
    (0): ResidualCNN(
      (cnn1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (cnn2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
      (layer_norm1): CNNLayerNorm(
        (layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      )
      (layer_norm2): CNNLayerNorm(
        (layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      )
    )
    (1): ResidualCNN(
      (cnn1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (cnn2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
      (layer_norm1): CNNLayerNorm(
        (layer_norm): L

  "".join(encoder.batch_decode(torch.tensor(cur_target)))


Targets: ['11<unk>50 pm', 'july 31 1999']
Preds: ['11<unk>5 pm', 'july 31 199']
Targets: ['october 23 2006', 'monday july 09']
Preds: ['october 23 26', 'monday july 9']
Targets: ['monday february 28', 'march 31 1893']
Preds: ['monday february 28', 'march 1 1893']
Test set: Average loss: -0.3829, Average CER: 0.081869 Average WER: 0.3567

Shutting down background jobs, please wait a moment...
Done!


Waiting for the remaining 13 operations to synchronize with Neptune. Do not kill this process.


All 13 operations synced, thanks for waiting!
