In [1]:
# Install the third-party packages imported by the next cell:
# g2p-en (imported as g2p_en, provides G2p) and praatio (TextGrid classes).
!pip install g2p-en
!pip install praatio

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting g2p-en
  Downloading g2p_en-2.1.0-py3-none-any.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
Collecting distance>=0.1.3 (from g2p-en)
  Downloading Distance-0.1.3.tar.gz (180 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.3/180.3 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: distance
  Building wheel for distance (setup.py) ... [?25l[?25hdone
  Created wheel for distance: filename=Distance-0.1.3-py3-none-any.whl size=16258 sha256=1ffccef406f4ffb510031ae2ddcb0d7f31b335a9dd8c3fd095cffc9c00f91238
  Stored in directory: /root/.cache/pip/wheels/e8/bb/de/f71bf63559ea9a921059a5405806f7ff6ed612a9231c4a9309
Successfully built distance
Installing collected packages: distance, g2p-

In [2]:
from g2p_en import G2p

import torch
import torch.nn as nn
import torchaudio

from collections import defaultdict

from praatio import textgrid as tgio
from praatio.data_classes.interval_tier import Interval

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Unzipping corpora/cmudict.zip.


In [3]:
def make_frames(wav):
    """Turn a waveform into MFCC feature frames.

    The waveform is handed to torchaudio's Kaldi-compatible MFCC routine
    without any extra arguments, so the library defaults apply.

    Args:
        wav: waveform tensor as produced by the torchaudio dataset.

    Returns:
        Whatever ``torchaudio.compliance.kaldi.mfcc`` returns for ``wav``
        (presumably one row of cepstral coefficients per frame — TODO confirm).
    """
    kaldi_mfcc = torchaudio.compliance.kaldi.mfcc
    frames = kaldi_mfcc(wav)
    return frames

class LibriSpeech(torch.utils.data.Dataset):
    """LibriSpeech wrapper that yields feature frames instead of raw audio.

    Every item of the underlying torchaudio dataset is returned unchanged,
    except that the waveform is replaced by ``make_frames(waveform)``.

    Args:
        url: name of the LibriSpeech split to use (default ``'dev-clean'``).
    """

    def __init__(self, url='dev-clean'):
        super().__init__()
        # The split is stored under the current directory; download=True lets
        # torchaudio fetch it.
        self.librispeech = torchaudio.datasets.LIBRISPEECH('.', url=url, download=True)

    def __len__(self):
        """Number of utterances in the wrapped split."""
        return len(self.librispeech)

    def __getitem__(self, index):
        """Return ``(frames, sample_rate, text, speaker_id, chapter_id, utterance_id)``."""
        waveform, sample_rate, transcript, speaker, chapter, utterance = self.librispeech[index]
        frames = make_frames(waveform)
        return frames, sample_rate, transcript, speaker, chapter, utterance
  

class Encoder(nn.Module):
    """Acoustic encoder: a strided 1-D convolution followed by a 3-layer LSTM.

    The convolution (kernel 5, stride 4, padding 3) shortens the time axis;
    the LSTM then maps every subsampled frame to a ``hidden_dim`` vector.

    Args:
        input_dim: size of one input feature frame.
        subsample_dim: number of channels produced by the convolution.
        hidden_dim: LSTM hidden size, i.e. the size of one output frame.
    """

    def __init__(self, input_dim=13, subsample_dim=128, hidden_dim=1024):
        super().__init__()
        # The attribute names end up in the state_dict keys, so they must stay
        # stable for previously saved checkpoints to load.
        self.subsample = nn.Conv1d(input_dim, subsample_dim, 5, stride=4, padding=3)
        self.lstm = nn.LSTM(subsample_dim, hidden_dim, batch_first=True, num_layers=3, dropout=0.2)

    def subsampled_lengths(self, input_lengths):
        """Number of frames left after the strided convolution.

        Implements ``floor((L + 2p - k) / s) + 1``, see
        https://github.com/vdumoulin/conv_arithmetic

        Args:
            input_lengths: frame count(s) before subsampling. May be a tensor,
                a Python int, or a sequence of ints.

        Returns:
            An int32 tensor with the same shape as ``input_lengths``.
        """
        p, k, s = self.subsample.padding[0], self.subsample.kernel_size[0], self.subsample.stride[0]
        # as_tensor lets plain ints/lists through; tensors are passed as-is.
        lengths = torch.as_tensor(input_lengths)
        # Floor division keeps integer inputs exact (no float round trip).
        out = torch.div(lengths + 2 * p - k, s, rounding_mode="floor") + 1
        return out.int()

    def forward(self, inputs):
        """Encode feature frames.

        Args:
            inputs: tensor whose last two dimensions are ``(time, input_dim)``.

        Returns:
            Tensor whose last two dimensions are ``(subsampled_time, hidden_dim)``;
            values are non-negative because of the final ReLU.
        """
        x = inputs
        # Conv1d wants channels before time, the LSTM wants time before
        # features, hence the transpose in and out.
        x = self.subsample(x.mT).mT
        x = x.relu()
        x, _ = self.lstm(x)
        return x.relu()


class Vocabulary:
    """Phoneme vocabulary backed by the g2p_en grapheme-to-phoneme converter.

    ``rdictionary`` maps index -> symbol, ``dictionary`` maps symbol -> index.
    Index 0 is the CTC blank and index 1 is the word separator (a space).
    """

    def __init__(self):
        self.g2p = G2p()

        # Phoneme inventory, see http://www.speech.cs.cmu.edu/cgi-bin/cmudict
        # The order defines the class indices, so it must not change.
        self.rdictionary = [
            "ε",  # CTC blank
            " ",
            "AA0", "AA1", "AE0", "AE1", "AH0", "AH1",
            "AO0", "AO1", "AW0", "AW1", "AY0", "AY1",
            "B", "CH", "D", "DH",
            "EH0", "EH1", "ER0", "ER1", "EY0", "EY1",
            "F", "G", "HH",
            "IH0", "IH1", "IY0", "IY1",
            "JH", "K", "L", "M", "N", "NG",
            "OW0", "OW1", "OY0", "OY1",
            "P", "R", "S", "SH", "T", "TH",
            "UH0", "UH1", "UW0", "UW1",
            "V", "W", "Y", "Z", "ZH",
        ]

        self.dictionary = {}
        for index, symbol in enumerate(self.rdictionary):
            self.dictionary[symbol] = index

    def __len__(self):
        """Number of symbols, blank and space included."""
        return len(self.rdictionary)

    def encode(self, text):
        """Convert ``text`` to a LongTensor of phoneme indices.

        Apostrophe tokens are dropped and stress marker ``2`` is folded into
        ``0``. Any other token missing from the inventory raises ``KeyError``.
        """
        indices = []
        for token in self.g2p(text):
            if token == "'":
                continue
            phoneme = token.replace('2', '0')
            indices.append(self.dictionary[phoneme])
        return torch.LongTensor(indices)

    
class Recognizer(nn.Module):
    """Linear classification head that emits per-frame log-probabilities.

    Args:
        feat_dim: size of each incoming feature vector.
        vocab_size: number of output classes (default 55 + 1).
    """

    def __init__(self, feat_dim=1024, vocab_size=55+1):
        super().__init__()
        self.classifier = nn.Linear(in_features=feat_dim, out_features=vocab_size)

    def forward(self, features):
        """Project ``features`` to class scores and normalise over the last axis."""
        logits = self.classifier(features)
        return torch.log_softmax(logits, dim=-1)

In [4]:
# Build the phoneme vocabulary and the two model stages.
# The notebook only runs inference, so both modules are switched to eval mode:
# otherwise the dropout configured on the encoder LSTM stays active and makes
# the predictions noisy and non-reproducible.
vocab = Vocabulary()
encoder = Encoder().eval()
# Size the classifier from the vocabulary instead of relying on a duplicated constant.
recognizer = Recognizer(vocab_size=len(vocab)).eval()

In [5]:
# Restore the trained weights; map_location='cpu' keeps the tensors on the CPU.
# NOTE(review): torch.load deserializes with pickle — only load checkpoints
# that come from a trusted source.
CKPT_PATH = '/content/drive/MyDrive/Colab Notebooks/UCU_розпізнавання_мови/lstm_p3_360+500.pt'
ckpt = torch.load(CKPT_PATH, map_location='cpu')
encoder_state = ckpt['encoder']
recognizer_state = ckpt['recognizer']
encoder.load_state_dict(encoder_state)
recognizer.load_state_dict(recognizer_state)

<All keys matched successfully>

In [6]:
# Take one utterance from the default LibriSpeech split, encode its transcript
# as phoneme indices and run the acoustic encoder over its feature frames.
dataset = LibriSpeech()
audio_frames, sr, text, speaker_id, chapter_id, utterance_id = dataset[100]
phonemes = vocab.encode(text)

# Pure inference: no gradients are needed, so skip building the autograd graph.
with torch.no_grad():
    features = encoder(audio_frames)

speaker_id, chapter_id, utterance_id

100%|██████████| 322M/322M [00:12<00:00, 28.0MB/s]


(1462, 170138, 27)

In [7]:
# Per-frame log-probabilities over the phoneme vocabulary, shape (T, 55+1).
# The module is called directly (not via .forward) so that nn.Module hooks run,
# and gradients are disabled because this is inference only.
with torch.no_grad():
    outputs = recognizer(features)

In [8]:
# Greedy decoding: take the most likely symbol for every output frame and
# print it next to the reference phoneme sequence for a visual comparison.
predicted_speech = outputs.argmax(dim=1)
predicted_speech_str = [vocab.rdictionary[frame_label] for frame_label in predicted_speech.tolist()]
print(predicted_speech_str)

true_labels = [vocab.rdictionary[label] for label in phonemes.tolist()]
print(true_labels)

['ε', 'ε', 'ε', 'ε', 'ε', 'ε', 'ε', 'ε', 'ε', 'ε', 'ε', 'ε', 'ε', 'IH1', 'IH1', 'T', ' ', ' ', 'W', 'AA1', 'Z', ' ', ' ', 'Y', 'Y', 'UW1', 'ε', 'ε', 'TH', ' ', ' ', ' ', 'AH0', 'N', 'D', ' ', ' ', 'P', 'ε', 'AA1', 'V', 'V', 'ER0', 'ER0', 'T', 'ε', 'IY0', ' ', ' ', 'ε', 'IH0', 'N', 'ε', ' ', ' ', 'P', 'R', 'AA0', 'K', 'ε', 'S', 'ε', 'IH1', 'M', 'ε', 'AH0', 'T', 'ε', 'IY0', 'ε', 'ε', 'ε', 'ε', 'ε', 'ε', 'ε', 'ε', ' ', ' ', ' ', 'AE1', 'T', ' ', ' ', 'EH1', 'V', 'R', 'IY0', 'IY0', 'TH', 'IH0', 'NG', ' ', ' ', 'W', 'AA1', 'Z', ' ', ' ', 'Y', 'ε', 'AH1', 'NG', 'ε', ' ', ' ', 'AH0', 'N', 'D', ' ', ' ', 'K', 'ε', 'AY1', 'N', 'D', 'D', 'L', 'IY0', 'IY0', 'ε', 'ε', 'ε', 'ε', 'ε', 'ε']
['IH1', 'T', ' ', 'W', 'AA1', 'Z', ' ', 'Y', 'UW1', 'TH', ' ', 'AH0', 'N', 'D', ' ', 'P', 'AA1', 'V', 'ER0', 'T', 'IY0', ' ', 'AH0', 'N', 'D', ' ', 'P', 'R', 'AA0', 'K', 'S', 'IH1', 'M', 'AH0', 'T', 'IY0', ' ', 'AH0', 'N', 'D', ' ', 'EH1', 'V', 'R', 'IY0', 'TH', 'IH0', 'NG', ' ', 'W', 'AA1', 'Z', ' ', 'Y', 'AH1', 

In [15]:
# Each prediction covers SUBSAMPLE_STRIDE feature frames (the encoder's Conv1d
# stride). NOTE(review): 100 frames per second assumes the MFCC front end uses
# a 10 ms frame shift — confirm against the torchaudio defaults.
SUBSAMPLE_STRIDE = 4
FEATURE_FRAMES_PER_SECOND = 100

# One [start_seconds, end_seconds, token] entry per predicted frame.
phonems_entries = [
    [idx * SUBSAMPLE_STRIDE / FEATURE_FRAMES_PER_SECOND,
     (idx + 1) * SUBSAMPLE_STRIDE / FEATURE_FRAMES_PER_SECOND,
     pred_token]
    for idx, pred_token in enumerate(predicted_speech_str)
]

# Collapse runs of identical consecutive tokens into a single entry that spans
# the whole run. Extending the most recent entry in place guarantees that the
# final run is kept as well, and an empty prediction simply yields an empty list.
phonems_entries_concat = []
for start, end, token in phonems_entries:
    if phonems_entries_concat and phonems_entries_concat[-1][2] == token:
        phonems_entries_concat[-1][1] = end
    else:
        phonems_entries_concat.append([start, end, token])


In [10]:
# Generate a Praat TextGrid with a single interval tier that holds the merged
# phoneme segments. `intervals` maps a tier index to its list of Interval
# objects; only tier 0 is used here.
intervals = defaultdict(list)
intervals[0] = [Interval(start, end, label) for start, end, label in phonems_entries_concat]

# The grid spans from 0 to the end of the last segment.
tg = tgio.Textgrid()
tg.minTimestamp = 0
tg.maxTimestamp = intervals[0][-1].end

tier_name = 'phones'
tg.addTier(tgio.IntervalTier(tier_name, [], minT=0, maxT=tg.maxTimestamp))

# Look the tier up once, then add the segments one by one.
phones_tier = tg.getTier(tier_name)
for interval in intervals[0]:
    phones_tier.insertEntry(interval)

tg.save('test_praat.TextGrid',
        includeBlankSpaces=True,
        format='long_textgrid',
        reportingMode='error')