In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
#! pip install git+https://github.com/beroguedou/SpecAugment.git
#!pip install torchaudio
#!pip install torchsummary

In [3]:
import os
import csv
import time
import librosa

import warnings
import pandas as pd
from models import *
from utils import *
from decode import *
import torch
import torchaudio
import numpy as np
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer

warnings.filterwarnings('ignore')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

seed_value = 2020
np.random.seed(seed_value)
torch.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value)

In [4]:

limit = 80
params = {'batch_size': None,
          'shuffle': True,
          'num_workers': 10,
          'drop_last': True}


In [5]:
%%time 

training_set = LibriSpeechDataset(limit=limit, n_frames=800, version='train-clean-360')
dev_set = LibriSpeechDataset(limit=limit, n_frames=800, version='dev-clean')


CPU times: user 347 ms, sys: 32.8 ms, total: 380 ms
Wall time: 1.17 s


In [6]:
# Let's see a single utterance

LibriSpeechDataset(limit=limit, n_frames=800, version='train-clean-360')[0]

(tensor([[[2.9673e-05, 2.9673e-05, 2.9673e-05,  ..., 8.6926e-04,
           9.1071e-04, 9.5212e-04],
          [4.4498e-06, 4.4498e-06, 4.4498e-06,  ..., 2.5194e+00,
           2.4819e+00, 2.4445e+00],
          [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          ...,
          [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00]]]),
 tensor([  101,   101,  2045,  2001,  2625,  1997,  1037,  4306,  2055,  2085,
          1998, 14163,  6894,  5289,  2106,  2025,  2031,  2000,  2562,  2067,
          1037,  5481,  2004,  2002,  2441,  1996,  9445,  3460,  6031,  2988,
          1996, 14460,  4214,  8721,  2000, 10767,  1998, 27468,  1996,  2221]))

In [7]:
units = 64

encoder = EncoderCONV2DRNN(device=device, hidden_size=units).to(device)
decoder = DecoderATTRNN(vocab_size=30000, dec_units=units, hidden_size=units, 
                        encoder_timestamp=encoder.encoder_timestamp).to(device)

encoder_optimizer = optim.Adam(encoder.parameters())
decoder_optimizer = optim.Adam(decoder.parameters())

criterion = nn.NLLLoss()

In [8]:
# Train the model

global_trainer(80, training_set, dev_set, params, encoder, decoder, encoder_optimizer,
               decoder_optimizer, criterion, device)

      The model has 6029288 parameters


Epoch        1: 100%|████████████████████| 5/5 [01:20<00:00, 16.01s/it, Train loss 9.9365 Eval loss 9.7899]
Epoch        2: 100%|████████████████████| 5/5 [01:21<00:00, 16.23s/it, Train loss 9.2872 Eval loss 8.7636]
Epoch        3: 100%|████████████████████| 5/5 [01:22<00:00, 16.47s/it, Train loss 8.7807 Eval loss 8.2838]
Epoch        4: 100%|████████████████████| 5/5 [01:21<00:00, 16.35s/it, Train loss 8.2911 Eval loss 7.8296]
Epoch        5: 100%|████████████████████| 5/5 [01:21<00:00, 16.29s/it, Train loss 7.8049 Eval loss 7.4408]
Epoch        6: 100%|████████████████████| 4/4 [01:16<00:00, 19.24s/it, Train loss 7.3704 Eval loss 7.1450]
Epoch        7: 100%|████████████████████| 4/4 [01:17<00:00, 19.49s/it, Train loss 6.9989 Eval loss 6.8231]
Epoch        8: 100%|████████████████████| 4/4 [01:16<00:00, 19.23s/it, Train loss 6.6392 Eval loss 6.5842]
Epoch        9: 100%|████████████████████| 4/4 [01:16<00:00, 19.25s/it, Train loss 6.3321 Eval 

Epoch       75: 100%|████████████████████| 4/4 [01:17<00:00, 19.42s/it, Train loss 5.2196 Eval loss 6.5121]
Epoch       76: 100%|████████████████████| 4/4 [01:18<00:00, 19.54s/it, Train loss 5.2108 Eval loss 6.7603]
Epoch       77: 100%|████████████████████| 4/4 [01:17<00:00, 19.39s/it, Train loss 4.9524 Eval loss 6.0750]
Epoch       78: 100%|████████████████████| 4/4 [01:17<00:00, 19.39s/it, Train loss 4.7605 Eval loss 6.6600]
Epoch       79: 100%|████████████████████| 4/4 [01:17<00:00, 19.34s/it, Train loss 4.7309 Eval loss 6.0234]
Epoch       80: 100%|████████████████████| 4/4 [01:17<00:00, 19.39s/it, Train loss 4.6999 Eval loss 5.8988]

Time taken for the training 1.7451 hours



In [None]:
mfccs, references = training_set[1]
tokenizer =  BertTokenizer.from_pretrained('bert-base-uncased')
references = [tokenizer.convert_ids_to_tokens(ind) for ind in references.numpy().tolist()]
evaluate(mfccs.unsqueeze(0), references, 40, encoder, decoder, targ_lang_tokenizer=tokenizer, 
          device=device, beam_search=True)

In [None]:
# Vérifications

# 1 - Data Augmentation
# 2 - Encoder
# 3 - Attention Mechanism Bahdanau Audio
# 4 - Smoothing and Topk to the attention
# 5 - Decoder 
# 6 - Métrique BLEU

 

In [None]:
layer = nn.AvgPool2d(kernel_size=(3,3) , stride=(2,3), padding=(0,0))

def dim_calcul_avg_pool2d(N, C_in, H_in, W_in, layer):

    padding = layer.padding
    kernel_size = layer.kernel_size
    stride = layer.stride
    C_out = C_in
    H_out = 1 + (H_in + 2 * padding[0] - kernel_size[0]) // stride[0]
    W_out = 1 + (W_in + 2 * padding[1] - kernel_size[1]) // stride[1]
        
    return (N, C_out, H_out, W_out)

        
    
dim_calcul_avg_pool2d(16, 32, 35, 596, layer)