In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
#! pip install git+https://github.com/beroguedou/SpecAugment.git
#!pip install torchaudio

In [3]:
import os
import csv
import time
import librosa

import warnings
import pandas as pd
from models import *
from utils import *
from decode import *
import torch
import torchaudio
import numpy as np
import torch.optim as optim
from transformers import BertTokenizer

warnings.filterwarnings('ignore')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

seed_value = 2020
np.random.seed(seed_value)
torch.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value)

In [4]:

limit = 80
params = {'batch_size': None,
          'shuffle': True,
          'num_workers': 10,
          'drop_last': True}


In [5]:
%%time 

training_set = LibriSpeechDataset(limit=limit, n_frames=600, version='train-clean-360')
dev_set = LibriSpeechDataset(limit=limit, n_frames=600, version='dev-clean')


CPU times: user 375 ms, sys: 107 ms, total: 482 ms
Wall time: 1.46 s


In [6]:
# Let's see a single utterance

LibriSpeechDataset(limit=limit, n_frames=600, version='train-clean-360')[0]

(tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]]),
 tensor([  101,   101,  2045,  2001,  2625,  1997,  1037,  4306,  2055,  2085,
          1998, 14163,  6894,  5289,  2106,  2025,  2031,  2000,  2562,  2067,
          1037,  5481,  2004,  2002,  2441,  1996,  9445,  3460,  6031,  2988,
          1996, 14460,  4214,  8721,  2000, 10767,  1998, 27468,  1996,  2221]))

In [7]:
encoder = EncoderCONV2DRNN(device=device, hidden_size=64).to(device)

decoder = DecoderATTRNN(30000, dec_units=64, hidden_size=64).to(device)


encoder_optimizer = optim.Adam(encoder.parameters())
decoder_optimizer = optim.Adam(decoder.parameters())

criterion = nn.NLLLoss()

In [None]:
# Train the model

global_trainer(5, training_set, dev_set, params, encoder, decoder, encoder_optimizer, decoder_optimizer,
                                    criterion, device)

      The model has 4954139 parameters


Epoch        1: 100%|████████████████████| 5/5 [01:12<00:00, 14.58s/it, Train loss 5.3144 Eval loss 5.8326]
Epoch        2: 100%|████████████████████| 5/5 [01:14<00:00, 14.84s/it, Train loss 5.3247 Eval loss 5.9415]
Epoch        3: 100%|████████████████████| 5/5 [01:13<00:00, 14.74s/it, Train loss 5.2689 Eval loss 5.9547]
Epoch        4: 100%|████████████████████| 5/5 [01:12<00:00, 14.50s/it, Train loss 5.1985 Eval loss 5.7526]
Epoch        5: 100%|████████████████████| 5/5 [01:08<00:00,  8.85s/it, Train loss 5.2172 Eval loss 5.8079]

In [None]:
mfccs, references = training_set[1]
tokenizer =  BertTokenizer.from_pretrained('bert-base-uncased')
references = [tokenizer.convert_ids_to_tokens(ind) for ind in references.numpy().tolist()]
evaluate(mfccs.unsqueeze(0), references, 40, encoder, decoder, targ_lang_tokenizer=tokenizer, 
          device=device, beam_search=True)

In [None]:
# Vérifications

# 1 - Data Augmentation
# 2 - Encoder
# 3 - Attention Mechanism Bahdanau Audio
# 4 - Smoothing and Topk to the attention
# 5 - Decoder 
# 6 - Métrique BLEU

In [13]:
encoder.named_parameters

<bound method Module.named_parameters of EncoderCONV2DRNN(
  (conv_base): ConvBase(
    (avg_pool_1): AvgPool2d(kernel_size=3, stride=(2, 3), padding=0)
    (batchnorm2d_1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv2d_1): Conv2d(1, 256, kernel_size=(3, 3), stride=(1, 1))
    (conv2d_2): Conv2d(256, 64, kernel_size=(3, 3), stride=(1, 1))
  )
  (norm_layer_1): LayerNorm((198, 64), eps=1e-05, elementwise_affine=True)
  (norm_layer_2): LayerNorm((198, 64), eps=1e-05, elementwise_affine=True)
  (norm_layer_3): LayerNorm((198, 64), eps=1e-05, elementwise_affine=True)
  (norm_layer_4): LayerNorm((198, 64), eps=1e-05, elementwise_affine=True)
  (norm_layer_5): LayerNorm((198, 64), eps=1e-05, elementwise_affine=True)
  (norm_layer_6): LayerNorm((198, 64), eps=1e-05, elementwise_affine=True)
  (rnn_base_1): RnnBase(
    (gru): GRU(1088, 64, batch_first=True, dropout=0.2, bidirectional=True)
  )
  (rnn_base_2): RnnBase(
    (gru): GRU(64, 64, batch_

In [None]:
encoder.conv_base.conv2d_1.bias

In [None]:
encoder.norm_layer_3.weight

In [None]:
encoder.norm_layer_3.bias

In [9]:
encoder.rnn_base_1.gru.weight_hh_l0

Parameter containing:
tensor([[-0.0674, -0.1302,  0.0384,  ..., -0.0591, -0.1039,  0.0268],
        [-0.0764,  0.0268, -0.1037,  ...,  0.0132,  0.0557,  0.1091],
        [ 0.0118,  0.0105, -0.0064,  ..., -0.0182, -0.0036, -0.0241],
        ...,
        [ 0.0903,  0.0023,  0.0194,  ..., -0.0641,  0.0586, -0.1119],
        [-0.0462,  0.1185,  0.0371,  ...,  0.0341, -0.0499,  0.1254],
        [-0.1108, -0.0882, -0.0694,  ...,  0.1346,  0.0861, -0.0641]],
       requires_grad=True)

In [12]:
encoder.rnn_base_1.gru.bias_hh_l0

Parameter containing:
tensor([ 0.0711,  0.0114,  0.0669, -0.0574, -0.0121, -0.0404, -0.0870,  0.0906,
         0.0896, -0.0941, -0.0627,  0.0708,  0.0439,  0.0731,  0.0125,  0.1143,
         0.0114, -0.0362, -0.0148, -0.1334,  0.0269,  0.0966, -0.0006,  0.0693,
        -0.0164, -0.0917,  0.0635, -0.0467,  0.0485, -0.0766,  0.0766,  0.0086,
        -0.0265,  0.0303,  0.0703, -0.0791, -0.0572, -0.0941,  0.0705,  0.0156,
        -0.0063, -0.1090,  0.0431,  0.0985, -0.0457,  0.0080,  0.0751,  0.0311,
        -0.0500,  0.0871, -0.1000, -0.0376,  0.0398, -0.1056, -0.0243,  0.0167,
         0.1059, -0.0190, -0.0105, -0.1084, -0.1201,  0.0273,  0.1197, -0.1171,
         0.0333,  0.0950,  0.0810, -0.1045, -0.0057,  0.1010, -0.1195, -0.0595,
        -0.0480, -0.0428,  0.0579, -0.0701,  0.0709,  0.0011,  0.1195, -0.0903,
         0.1042, -0.0676, -0.0142,  0.0617,  0.0562, -0.1038,  0.0304,  0.0476,
         0.0258,  0.0646,  0.1247,  0.0680, -0.0428,  0.0224, -0.0486, -0.0267,
        -0.1085, -

In [None]:
encoder.rnn_base_.gru.weight_ih_l0_reverse