In [None]:
#@title Parameters
##@markdown -  actual = images*batch_size
# Hyper-parameters for the phoneme transformer. Lines tagged with #@param are
# rendered by Colab as form fields; the other cells read these names as globals.


# use_data: True  -> train on the downloaded pronunciation dictionary,
#           False -> train on randomly generated phoneme sequences.
use_data = True  #@param {type:"boolean"}
# feature_embedding: True -> the model embedding is built from `pretrained_weights`
# (frozen) instead of a learned nn.Embedding.
feature_embedding = False  #@param {type:"boolean"}
# embed_size is passed to the model as d_model (via `emsize` below).
embed_size =  18#@param {type:"number"}
device = "cuda:0"  #@param ["cpu", "cuda:0"]
dropout = 0.2  #@param {type:"number"}
d_hid = 200  #@param {type:"number"} # dimension of the feedforward network model in nn.TransformerEncoder
nlayers =   3#@param {type:"number"} # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead =   2#@param {type:"number"} # number of heads in nn.MultiheadAttention
batch_size =  16#@param {type:"number"}
eval_batch_size =  16#@param {type:"number"}
epochs = 25 #@param {type:"number"}
learning_rate =   0.002#@param {type:"number"} # learning rate
# Training progress is printed every `log_interval` batches.
log_interval = 100 #@param {type:"number"}
# Words whose phoneme list (including the BEG/END markers) is longer than this are skipped.
maximum_word_length =  10#@param {type:"number"}

# Short aliases used by the model / optimizer cells.
lr = learning_rate
emsize = embed_size
# Placeholder; replaced by the list of dictionary words once the data cell has run.
word_names = []






# Data and imports

In [None]:
import math
from typing import Tuple

import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset

In [None]:
import requests

# Download the CMU Pronouncing Dictionary (0.7b) and build:
#   words         : word -> ["BEG", phoneme..., "END"]
#   phonemes      : phoneme symbol -> integer id (ids start at 1; 0 is left for padding)
#   index_to_phon : integer id -> phoneme symbol
# A timeout keeps the cell from hanging forever, and raise_for_status() stops an
# HTTP error page from being parsed as if it were dictionary entries.
response = requests.get("http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b", timeout=60)
response.raise_for_status()
data = response.text.split("\n")
phonemes = {}
index_to_phon = {}
words = {}
maxlen = 0
ip = 1
toolong = 0
toplen = maximum_word_length
for line in data:
  # Skip blank lines and ";;;" comment lines.
  if len(line) == 0 or line[0] == ";":
    continue
  # An entry looks like "WORD  PH1 PH2 ...". Splitting on any whitespace run
  # handles the double space after the word and tolerates stray trailing
  # whitespace, which would otherwise become bogus phoneme symbols.
  fields = line.split()
  if not fields:
    continue
  phonms = ["BEG"] + fields[1:] + ["END"]
  if len(phonms) > toplen:
    toolong += 1
    continue
  words[fields[0]] = phonms
  if len(phonms) > maxlen:
    maxlen = len(phonms)
  for p in phonms:
    if p not in phonemes:
      phonemes[p] = ip
      index_to_phon[ip] = p
      ip += 1

print(f"Got {len(words)} words")
print(f"Got {len(phonemes)} phonemes")
print(f"Word maximum length is {maxlen}")
print(f"{toolong} Words were too long")



Got 113601 words
Got 71 phonemes
Word maximum length is 10
20251 Words were too long


In [None]:
def word2vec(features):
  """Map a sequence of phoneme symbols to a 1-D tensor of their integer ids.

  Args:
      features: iterable of phoneme symbols; each must be a key of the
          module-level ``phonemes`` dict.

  Returns:
      Long tensor with one id per symbol.

  Raises:
      KeyError: if a symbol is not present in ``phonemes``.
  """
  # The dtype is pinned so that an empty sequence still yields an integer
  # tensor (torch.tensor([]) would otherwise default to float32).
  return torch.tensor([phonemes[a] for a in features], dtype=torch.long)

# Encode every pronunciation as phoneme ids and right-pad them to a common
# length, giving one row per word.
vecs = torch.nn.utils.rnn.pad_sequence(
    list(map(word2vec, words.values())),
    batch_first=True,
)
# Word spellings, in the same order as the rows of `vecs`.
word_names = list(words.keys())
data = vecs
print(data.shape)
print(f"all words padded to length {len(vecs[0])} where max length is {maxlen}")

torch.Size([113601, 10])
all words padded to length 10 where max length is 10


In [None]:
# Derive the dataset dimensions either from the loaded dictionary or from
# fixed values used for the synthetic data.
if not use_data:
  # Synthetic setting: fixed sizes, and the embedding width follows the
  # number of features.
  word_maxlength = 10
  data_size = 100000
  num_of_features = 12
  num_of_phonemes = 30
  emsize = num_of_features
else:
  data_size, word_maxlength = len(words), maxlen
  num_of_features = emsize
  # +1 because phoneme ids start at 1, leaving id 0 for padding.
  num_of_phonemes = len(phonemes) + 1

ntokens = num_of_phonemes  # size of vocabulary

In [None]:
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import numpy as np


if not use_data:
  # Random {-1, 0, 1} feature table, one row per phoneme id; row 0 is zeroed.
  pretrained_weights = torch.tensor(np.random.choice([-1,0,1], (num_of_phonemes+1, num_of_features)))
  pretrained_weights[0] = torch.zeros(num_of_features)
  print(pretrained_weights[3])
  # The leading all-zero row of full length forces pad_sequence to pad every
  # sample to word_maxlength; it is dropped again right after padding.
  generated_data = [torch.zeros(word_maxlength)]
  for sample_no in range(data_size):
    length = torch.randint(2, word_maxlength, (1,))
    generated_data.append(torch.randint(1, num_of_phonemes, (length,)))
  generated_data = torch.nn.utils.rnn.pad_sequence(generated_data, batch_first=True)[1:]
  data = generated_data

# 70% / 20% / 10% split into train / validation / test, in storage order.
train_end = int(data_size*0.7)
val_end = int(data_size*0.9)
train_data, train_words = data[:train_end], word_names[:train_end]
val_data, val_words = data[train_end:val_end], word_names[train_end:val_end]
test_data, test_words = data[val_end:], word_names[val_end:]

for split in (train_data, val_data, test_data):
  print(split.shape)


def batchify(dta: Tensor, bsz: int) -> Tensor:
    """Group consecutive rows of ``dta`` into batches of ``bsz`` rows.

    Rows that do not fill a complete batch are dropped from the end.

    Args:
        dta: Tensor of shape [N, ...]; in this notebook [N, word_maxlength].
        bsz: int, batch size

    Returns:
        Tensor of shape [N // bsz, bsz, ...] on the module-level ``device``.
    """
    num_batches = dta.size(0) // bsz
    dta = dta[:num_batches * bsz]
    # Keep whatever trailing dimensions the input has instead of relying on
    # the global word_maxlength matching the data.
    dta = dta.reshape(num_batches, bsz, *dta.shape[1:])
    return dta.to(device)



# Batch each split; .long() converts the ids to int64 so they can be used as
# embedding / one_hot indices.
train_data = batchify(train_data, batch_size).long()  # shape [num_batches, batch_size, word_maxlength]
print(train_data.shape)
val_data = batchify(val_data, eval_batch_size).long()
print(val_data.shape)
test_data = batchify(test_data, eval_batch_size).long()

torch.Size([79520, 10])
torch.Size([22720, 10])
torch.Size([11361, 10])
torch.Size([4970, 16, 10])
torch.Size([1420, 16, 10])


In [None]:
def get_batch(dataset, i):
  """Return batch ``i`` of ``dataset`` together with its one-hot encoding.

  Args:
      dataset: indexable collection of integer id batches (here a tensor of
          shape [num_batches, batch_size, word_maxlength]).
      i: index of the batch to fetch.

  Returns:
      Tuple ``(ids, one_hot)``: ``ids`` is a copy of ``dataset[i]`` and
      ``one_hot`` has one extra trailing axis of size ``num_of_phonemes``.
  """
  # torch.tensor(existing_tensor) emits a copy-construct warning and was done
  # twice; take one explicit copy and reuse it for both outputs.
  batch = torch.as_tensor(dataset[i]).detach().clone()
  return batch, torch.nn.functional.one_hot(batch, num_classes=num_of_phonemes)

In [None]:
# Sanity check: shapes of the id batch and of its one-hot targets for batch 0.
print(get_batch(train_data,0)[0].shape, get_batch(train_data,0)[1].shape)

torch.Size([16, 10]) torch.Size([16, 10, 72])


  


# Model

In [None]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to its input, then applies dropout.

    The table is stored as the buffer ``pe`` of shape [max_len, 1, d_model];
    even feature indices hold sines and odd indices hold cosines.

    Args:
        d_model: size of the feature (last) axis of the input.
        dropout: dropout probability applied after the encodings are added.
        max_len: largest supported size of the input's first axis.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer cosine column than sine
        # columns, so the angle table is trimmed to fit.
        pe[:, 0, 1::2] = torch.cos(angles[:, :d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]

        Returns:
            Tensor of the same shape as ``x``.

        Raises:
            ValueError: if ``x.size(0)`` exceeds the ``max_len`` of the table.
        """
        if x.size(0) > self.pe.size(0):
            raise ValueError(
                f"input length {x.size(0)} exceeds positional table size {self.pe.size(0)}")
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [None]:
class TransformerModel(nn.Module):
    """Embedding -> positional encoding -> TransformerEncoder -> linear decoder.

    Args:
        ntoken: number of output classes (and embedding rows when the
            embedding is learned).
        d_model: embedding / model width.
        nhead: number of attention heads.
        d_hid: width of the feed-forward sub-layer.
        nlayers: number of encoder layers.
        dropout: dropout probability used by the positional encoding and the
            encoder layers.
        features: optional 2-D float tensor [rows, d_model]; when given it is
            used as a frozen embedding table instead of a learned one.

    Raises:
        ValueError: if ``features`` is not 2-D with ``d_model`` columns.
    """

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5, features = None):
        super().__init__()
        self.model_type = 'Transformer'
        # NOTE(review): the positional table is sized from the notebook-level
        # batch sizes because callers pass the batch on dim 0 - confirm this
        # is intended.
        self.pos_encoder = PositionalEncoding(d_model, dropout, max_len=max(batch_size, eval_batch_size))
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.d_model = d_model
        self.decoder = nn.Linear(d_model, ntoken)
        if features is not None:
            if features.dim() != 2 or features.size(1) != d_model:
                raise ValueError(
                    f"features must have shape [rows, {d_model}], got {tuple(features.shape)}")
            self.encoder = nn.Embedding.from_pretrained(features, freeze=True)
        else:
            self.encoder = nn.Embedding(ntoken, d_model)
        # Run for both branches so the decoder gets the same initialisation
        # whether or not pretrained features are supplied.
        self.init_weights()
        self.sftmx = nn.Softmax(dim=2)

    def init_weights(self) -> None:
        """Uniformly initialise the decoder and, if it is trainable, the embedding."""
        initrange = 0.1
        # A frozen (pretrained) embedding must not be overwritten.
        if self.encoder.weight.requires_grad:
            self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor, src_mask: Tensor) -> Tensor:
        """
        Args:
            src: integer Tensor of token ids, shape [dim0, dim1]. The encoder
                layers are built without batch_first, so dim0 is the axis that
                attention and the positional encoding run over.
            src_mask: Tensor, shape [dim0, dim0]

        Returns:
            output Tensor of shape [dim0, dim1, ntoken] holding unnormalised
            scores (no softmax is applied).

        NOTE(review): the training cells pass tensors shaped
        [batch_size, word_maxlength], which makes the batch the attended axis
        - confirm whether a transpose was intended.
        """
        # Scale embeddings by sqrt(d_model) before adding position encodings.
        src = self.encoder(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, src_mask)
        output = self.decoder(output)
        return output


def generate_square_subsequent_mask(sz: int) -> Tensor:
    """Build an [sz, sz] mask: -inf strictly above the diagonal, 0 elsewhere."""
    all_blocked = torch.full((sz, sz), float('-inf'))
    # triu keeps the entries above the main diagonal and zeroes the rest.
    return all_blocked.triu(diagonal=1)

# Train

In [None]:
import copy
import time

# Pick the embedding source: a frozen feature table or a learned embedding.
if feature_embedding:
  try:
    pretrained_weights = pretrained_weights.float()
  except (NameError, AttributeError) as err:
    # In the cells shown here the table is only built in the synthetic-data
    # branch; fail with an explanation rather than a bare NameError.
    raise RuntimeError(
        "feature_embedding=True needs a `pretrained_weights` tensor, "
        "but none has been created") from err
else:
  pretrained_weights = None
model = TransformerModel(ntokens, emsize, nhead, d_hid, nlayers, dropout, features=pretrained_weights).to(device)
criterion = nn.CrossEntropyLoss()

# One batch with an extra leading axis, used only to report the embedding shape.
example = train_data[0].unsqueeze(0)
mask = torch.tensor([1]).unsqueeze(0).float().to(device)

print("size of embedding:", model.encoder(example).shape)

optimizer = torch.optim.SGD(model.parameters(), lr=lr)
# Multiply the learning rate by 0.95 on every scheduler.step(); step_size is
# an epoch count, so it is given as an int.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)
sft = nn.Softmax(dim=2)

def _sequence_loss(output: Tensor, targets: Tensor) -> Tensor:
    """Cross-entropy between logits and one-hot targets over the last axis.

    nn.CrossEntropyLoss treats dim 1 as the class axis, so both tensors are
    flattened to [-1, ntoken] first; otherwise the softmax would be taken
    over the middle axis instead of the vocabulary.
    """
    vocab_size = output.size(-1)
    return criterion(output.reshape(-1, vocab_size),
                     targets.reshape(-1, vocab_size).float())


def train(model: nn.Module) -> None:
    """Run one optimisation pass over every batch of the global ``train_data``.

    Uses the module-level ``optimizer``, ``scheduler``, ``criterion`` and
    ``log_interval``, and reads the global ``epoch`` for the progress line.

    Args:
        model: the network to optimise in place.
    """
    model.train()  # turn on train mode
    total_loss = 0.

    start_time = time.time()
    src_mask = generate_square_subsequent_mask(batch_size).to(device)

    # train_data is indexed per batch, so its first dimension is the batch count.
    num_batches = train_data.size(0)
    for batch in range(num_batches):
        data, targets = get_batch(train_data, batch)
        # The mask must match the first axis of the model input.
        if src_mask.size(0) != data.size(0):
            src_mask = generate_square_subsequent_mask(data.size(0)).to(device)
        output = model(data, src_mask)
        # The factor 10 only rescales the objective, as in the original code.
        loss = 10 * _sequence_loss(output, targets)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        if batch % log_interval == 0 and batch > 0:
            lr = scheduler.get_last_lr()[0]
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            cur_loss = total_loss / log_interval
            # exp() of the (scaled) loss; guard against overflow if it diverges.
            try:
                ppl = math.exp(cur_loss)
            except OverflowError:
                ppl = float('inf')
            # Five decimals so learning rates around 1e-3 do not print as 0.00.
            print(f'| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | '
                  f'lr {lr:.5f} | ms/batch {ms_per_batch:5.2f} | '
                  f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
            total_loss = 0
            start_time = time.time()

def evaluate(model: nn.Module, eval_data: Tensor) -> float:
    """Return the mean per-batch loss of ``model`` over all of ``eval_data``.

    Args:
        model: the network to evaluate (switched to eval mode).
        eval_data: batched ids, indexed per batch on the first axis.

    Returns:
        Average unscaled loss over all batches, or NaN when ``eval_data``
        contains no batches.
    """
    model.eval()  # turn on evaluation mode
    num_batches = eval_data.size(0)
    if num_batches == 0:
        return float('nan')
    total_loss = 0.
    src_mask = generate_square_subsequent_mask(eval_batch_size).to(device)
    with torch.no_grad():
        # Visit every batch: eval_data is already batched, so stepping by
        # eval_batch_size would skip most of it.
        for i in range(num_batches):
            data, targets = get_batch(eval_data, i)
            if src_mask.size(0) != data.size(0):
                src_mask = generate_square_subsequent_mask(data.size(0)).to(device)
            output = model(data, src_mask)
            total_loss += _sequence_loss(output, targets).item()
    return total_loss / num_batches




torch.Size([16, 1]) torch.Size([9]) torch.Size([16, 9])
torch.Size([16, 9]) torch.Size([16, 9])
size of embedding: torch.Size([1, 16, 10, 18])


In [None]:
# Track the best validation result and a snapshot of the model that produced it.
best_model = None
best_val_loss = float('inf')
rule = '-' * 89

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(model)
    # Validation loss is reported on the same x10 scale as the training loss.
    val_loss = 10 * evaluate(model, val_data)
    val_ppl = math.exp(val_loss)
    elapsed = time.time() - epoch_start_time
    summary = (f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
               f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
    print(rule, summary, rule, sep='\n')

    improved = val_loss < best_val_loss
    if improved:
        best_val_loss, best_model = val_loss, copy.deepcopy(model)

    # Decay the learning rate once per epoch.
    scheduler.step()

  


| epoch   1 |   100/  310 batches | lr 0.00 | ms/batch  8.94 | loss  3.23 | ppl    25.31
| epoch   1 |   200/  310 batches | lr 0.00 | ms/batch  8.45 | loss  3.16 | ppl    23.61
| epoch   1 |   300/  310 batches | lr 0.00 | ms/batch  8.58 | loss  3.10 | ppl    22.29
| epoch   1 |   400/  310 batches | lr 0.00 | ms/batch  8.85 | loss  3.02 | ppl    20.47
| epoch   1 |   500/  310 batches | lr 0.00 | ms/batch  8.73 | loss  2.94 | ppl    18.97
| epoch   1 |   600/  310 batches | lr 0.00 | ms/batch  8.53 | loss  2.87 | ppl    17.60
| epoch   1 |   700/  310 batches | lr 0.00 | ms/batch  8.50 | loss  2.81 | ppl    16.59
| epoch   1 |   800/  310 batches | lr 0.00 | ms/batch  8.47 | loss  2.74 | ppl    15.51
| epoch   1 |   900/  310 batches | lr 0.00 | ms/batch  8.55 | loss  2.70 | ppl    14.93
| epoch   1 |  1000/  310 batches | lr 0.00 | ms/batch  8.67 | loss  2.67 | ppl    14.47
| epoch   1 |  1100/  310 batches | lr 0.00 | ms/batch  8.60 | loss  2.66 | ppl    14.34
| epoch   1 |  1200/ 

KeyboardInterrupt: ignored

In [None]:
import random
def vect2word(vect):
  """Turn the first row of a tensor of phoneme ids into a space-separated string.

  Ids that are not greater than 0 are skipped; every other id is looked up in
  the module-level ``index_to_phon``.
  """
  ids = vect[0].detach().to("cpu").numpy()
  return " ".join(index_to_phon[p] for p in ids if p > 0)

def _reconstruct(example, mask):
  """Run ``best_model`` on one encoded word and print the input next to the argmax output.

  Returns the raw model output and the ``torch.max`` result over the class axis.
  """
  # Inference only: no autograd graph is needed.
  with torch.no_grad():
    logits = best_model(example, mask)
  best = torch.max(logits, dim=2)
  # [4:-4] drops the first and last four characters, i.e. "BEG " and " END"
  # when those markers are present.
  print("example", vect2word(example)[4:-4])
  print("result", vect2word(best.indices)[4:-4])
  return logits, best


if best_model is None:
  raise RuntimeError("best_model is not set; run at least one full training epoch first")
best_model.eval()
# Each example is a single row, so a 1x1 mask is all that is needed.
mask = generate_square_subsequent_mask(1).to(device)

# 25 randomly chosen training words.
for i in range(25):
  r1 , r2 = random.randint(0,len(train_data) - 1), random.randint(0,len(train_data[0])-1)
  ex = train_data[r1][r2].unsqueeze(0)
  res, resmax = _reconstruct(ex, mask)
  print()

# Two hand-written words that differ only in the final consonant.
for wrd in ("BEG B AE1 G S END", "BEG B AE1 G Z END"):
  ex = torch.tensor([[phonemes[p] for p in wrd.split(" ")]]).to(device)
  res, resmax = _reconstruct(ex, mask)
  print("vals", resmax.values[0][1:-1])

# bla = torch.tensor([[phonemes["D"], phonemes["T"]]]).to(device)
# encoding = best_model.encoder(bla)[0]
# print(encoding)
# print(encoding[0] - encoding[1])

# bla = torch.tensor([[phonemes["D"], phonemes["N"]]]).to(device)
# encoding = best_model.encoder(bla)[0]
# print(encoding)
# print(encoding[0] - encoding[1])

example EH1 R G AE2 S
result EH1 R G AE2 S

example B R IH1 T N IY2
result B R IH1 T N IH1

example M AA0 R S EH1 S AH0
result M AA0 R S EH1 S AH0

example EH1 M B ER0
result EH1 M B ER0

example G AE2 L AH0 L IY1 OW0 Z
result G AE2 L AH0 L IY1 OW0 Z

example B R AO1 D HH ER0 S T
result B R AE1 D HH ER0 S T

example F AO1 R T N AH0
result F AE1 R T N AH0

example G AO1 N T
result G AE1 N T

example B EH1 R IH0 SH
result B EH1 R IH0 SH

example B AH1 F AH0 T
result B AH1 F AH0 T

example P AE1 K
result P AE1 K

example HH AE1 R AH0
result HH AE1 R AH0

example L IY1 CH
result L IY1 CH

example M AE1 S AH0 K IH0 S T
result M AE1 S AH0 K IH0 S T

example IH2 M P EY1 L D
result END M P EY1 L D

example P IH1 T AH0 D
result P IH1 T AH0 D

example K AH1 B Z
result K AH1 B Z

example B AA0 S T EY1 D OW0
result B AA0 S T EY1 D OW0

example K R IH1 S AH0 N IH0 NG
result K R IH1 S AH0 N IH0 NG

example K AA1 M P OW0 S T
result K AA1 M P OW0 S T

example S IH1 N AH0 M AE0 K S
result S IH1 N AH0 M

In [None]:
# Compare the raw model scores with their softmax over the class axis for the
# last example from the previous cell.
m = nn.Softmax(dim=2)
r = best_model(ex, mask)
print(r.shape)
sft = m(r)
# Row sums of the raw scores, then of the probabilities.
print(r.sum(dim=2))
print(sft.sum(dim=2))
print(sft)

# print(m(best_model(ex, mask)))

torch.Size([1, 4, 70])
tensor([[-26.3453, -27.0656,  17.0586, -91.2527]], device='cuda:0',
       grad_fn=<SumBackward1>)
tensor([[1.0000, 1.0000, 1.0000, 1.0000]], device='cuda:0',
       grad_fn=<SumBackward1>)
tensor([[[9.5487e-07, 5.7577e-05, 3.0597e-08, 7.8769e-11, 3.6094e-07,
          7.4658e-06, 3.7032e-08, 7.5257e-13, 1.4633e-10, 1.0678e-07,
          1.9013e-10, 1.1392e-06, 1.9536e-11, 5.6422e-09, 1.3081e-13,
          4.3037e-09, 4.3951e-07, 1.2884e-08, 9.6486e-05, 1.3890e-09,
          6.2488e-07, 4.6038e-05, 9.7254e-09, 2.6380e-05, 9.7611e-07,
          9.9968e-01, 3.3579e-05, 1.2219e-10, 2.2879e-11, 5.1743e-08,
          7.9277e-11, 7.4144e-07, 1.6101e-11, 2.8078e-10, 2.5263e-10,
          4.9425e-07, 1.1511e-09, 3.3990e-08, 1.2757e-05, 5.2381e-08,
          3.5845e-09, 7.2815e-10, 1.1097e-11, 1.2549e-06, 4.6851e-07,
          2.6210e-07, 1.7313e-06, 8.3643e-07, 2.9552e-08, 7.3294e-07,
          7.2210e-08, 2.6002e-06, 2.1903e-08, 3.0060e-07, 4.2197e-07,
          3.1859e