In [232]:
from common import load_train_index
from pathlib import Path
from hw_asr.base.base_text_encoder import BaseTextEncoder
from string import ascii_lowercase

# Size of the SentencePiece vocabulary to train; also used in the model file name.
VOCAB_SIZE = 100

index_directory = Path('pretrained_model/index/')

# All tokenizer artefacts (training texts, trained model) live in this directory.
tokenizer_directory = Path('pretrained_model/tokenizer')
# parents=True so the cell also works when `pretrained_model/` does not exist yet.
tokenizer_directory.mkdir(parents=True, exist_ok=True)
texts_path = tokenizer_directory / 'texts.txt'
model_directory = tokenizer_directory / f'sentence_piece_vocab_{VOCAB_SIZE}'

datasets = load_train_index(index_directory)

# Flatten every observation of every dataset into one list of normalized transcripts.
sentences = [
    BaseTextEncoder.normalize_text(observation['text'])
    for dataset in datasets
    for observation in dataset
]

# Write the corpus explicitly as UTF-8: the platform default encoding (e.g.
# cp1252 on Windows) is not what SentencePiece expects when it reads the file.
with open(texts_path, 'w', encoding='utf-8') as f:
    print(*sentences, sep='\n', file=f)
    # One extra "a b c ... z" line per sentence.
    # NOTE(review): presumably this makes every single letter frequent enough
    # to end up in the vocabulary — confirm the intent.
    print(*([' '.join(ascii_lowercase)] * len(sentences)), sep='\n', file=f)

In [233]:
import sentencepiece as spm

# Path prefix of the trained tokenizer; the model file is `<prefix>.model`.
model_prefix = tokenizer_directory / f'sentence_piece_vocab_{VOCAB_SIZE}'

# Options forwarded to the SentencePiece trainer.
trainer_options = dict(
    input=texts_path,
    model_prefix=model_prefix,
    vocab_size=VOCAB_SIZE,
    model_type='bpe',
)

# Training is skipped when a model for this vocabulary size is already on disk.
model_already_trained = model_prefix.with_suffix('.model').exists()
if not model_already_trained:
    spm.SentencePieceTrainer.Train(**trainer_options)

sp_model = spm.SentencePieceProcessor(model_file=f'{model_prefix}.model')

In [234]:
# Display the first normalized training sentence as a quick sanity check.
sentences[0]

'it had no ornamentation being exceedingly plain in appearance'

In [235]:
# Tokenize a sample sentence and show the resulting pieces separated by '|'.
# The SentencePiece word-boundary marker '▁' is rendered as a plain space.
encoded = sp_model.Encode(
    'it had no ornamentation being exceedingly plain in appearance'
)
print('|'.join(
    sp_model.IdToPiece(piece_id).replace('▁', ' ')
    for piece_id in encoded
))

 it| ha|d| n|o| o|r|n|a|m|en|t|at|i|on| be|ing| e|x|c|e|ed|ing|ly| p|l|a|in| in| a|p|p|e|ar|an|c|e


In [236]:
from string import ascii_lowercase

# Print every lowercase letter that does not survive an encode -> decode
# round trip through the tokenizer (no output means all letters are fine).
for letter in ascii_lowercase:
    round_trip = sp_model.Decode(sp_model.Encode(letter))
    if round_trip != letter:
        print(letter)

In [237]:
# List the whole vocabulary as "<id> <piece>" lines.
vocabulary_size = sp_model.vocab_size()
for piece_id in range(vocabulary_size):
    print(piece_id, sp_model.IdToPiece(piece_id))

0 <unk>
1 <s>
2 </s>
3 ▁t
4 ▁a
5 he
6 ▁s
7 ▁w
8 ▁i
9 ▁o
10 ▁the
11 ▁b
12 ▁h
13 ▁m
14 ▁c
15 ▁f
16 ▁d
17 ▁p
18 re
19 ▁l
20 nd
21 ▁n
22 ▁e
23 ▁g
24 in
25 ▁y
26 er
27 ▁u
28 ou
29 ▁r
30 at
31 ▁k
32 ed
33 ▁v
34 ▁j
35 ▁and
36 ▁q
37 ▁to
38 ▁of
39 on
40 en
41 ▁z
42 ▁x
43 is
44 ing
45 ▁th
46 ▁he
47 or
48 es
49 as
50 ll
51 it
52 ar
53 an
54 ▁in
55 om
56 ▁be
57 ▁ha
58 le
59 ot
60 ow
61 ic
62 ut
63 ▁wh
64 ▁it
65 ld
66 ▁that
67 ly
68 ve
69 ▁was
70 st
71 id
72 se
73 ▁
74 e
75 t
76 a
77 o
78 n
79 i
80 h
81 s
82 r
83 d
84 l
85 u
86 m
87 c
88 w
89 f
90 g
91 y
92 p
93 b
94 v
95 k
96 x
97 j
98 q
99 z


In [238]:
# Re-import the project package before importing the encoders from it
# (the reload must stay ahead of the two imports below).
from utils import reload
reload('hw_asr')
from hw_asr.text_encoder.ctc_char_bpe_encoder import CTCCharBpeEncoder  # noqa
from hw_asr.text_encoder.ctc_char_bpe_encoder import CTCCharTextEncoder  # noqa

# Sample sentence encoded by both encoders for a side-by-side comparison.
sentence = 'hello world every day'

# BPE encoder built from the trained SentencePiece model prefix, next to the
# plain character-level encoder.
text_encoder = CTCCharTextEncoder()
encoder = CTCCharBpeEncoder(f'pretrained_model/tokenizer/sentence_piece_vocab_{VOCAB_SIZE}')

encoded = encoder.encode(sentence)
print(encoded)
print(text_encoder.encode(sentence))

# Decode the BPE ids back to text; the cell displays the decoded string.
first_row_ids = encoded[0].numpy()
encoder.ctc_decode_enhanced(first_row_ids)

tensor([[46., 50., 77.,  7., 47., 65., 22., 94., 26., 91., 16., 76., 91.]])
tensor([[ 8.,  5., 12., 12., 15., 27., 23., 15., 18., 12.,  4., 27.,  5., 22.,
          5., 18., 25., 27.,  4.,  1., 25.]])


'hello world every day'

In [103]:
# Display the character-level encoding of `sentence` on its own.
text_encoder.encode(sentence)

tensor([[ 8.,  5., 12., 12., 15., 27., 23., 15., 18., 12.,  4., 27.,  5., 22.,
          5., 18., 25., 27.,  4.,  1., 25.]])

In [63]:
import os
import torch
import pathlib

# NOTE(review): the checkpoint presumably contains pickled `PosixPath` objects
# (saved on a POSIX machine), which cannot be instantiated on Windows. While
# loading on Windows, `PosixPath` is temporarily aliased to `WindowsPath`.
temp = pathlib.PosixPath
if os.name == 'nt':
    # Only patch on Windows: on POSIX hosts the alias would itself break unpickling.
    pathlib.PosixPath = pathlib.WindowsPath
try:
    checkpoint = torch.load('pretrained_model/model_checkpoint.pth')
finally:
    # Always undo the monkeypatch so the rest of the session sees the real class.
    pathlib.PosixPath = temp

In [64]:
from hw_asr.model.deep_speech import DeepSpeech2


# Rebuild the network with the settings used for the pretrained checkpoint
# (NOTE(review): 28 classes presumably matches the character-level alphabet —
# confirm), then restore its weights.
pretrained_weights = checkpoint['state_dict']
model = DeepSpeech2(n_class=28, n_feats=128)
model.load_state_dict(pretrained_weights)



<All keys matched successfully>

In [65]:
from torch import nn

# Swap the output layer for a freshly initialised one sized to the BPE vocabulary.
model.fc = nn.Linear(
    in_features=1024,
    out_features=VOCAB_SIZE,
    bias=False,
)

In [66]:
import json
from torch.optim import Adam


# Forward slashes work on both Windows and POSIX; the previous backslash path
# relied on the invalid escape sequences "\c" and "\d" and was Windows-only.
with open('hw_asr/configs/deep_speech_2_server_bpe.json', encoding='utf-8') as f:
    config = json.load(f)

# Make sure the output directory exists before saving into it.
Path('tmp').mkdir(parents=True, exist_ok=True)

# Save the model with its new output layer, together with the BPE training
# config and a reset `monitor_best` value.
torch.save({
    'state_dict': model.state_dict(),
    'monitor_best': 0,
    'config': config
}, 'tmp/bpe_model.pth')