In [91]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from google.colab import files
import numpy as np

In [15]:
# Open the Colab upload widget so the training corpus can be provided by hand.
# NOTE(review): the recorded output of this cell shows the upload being saved
# as "ptb.train (1).txt", while the next cell opens "ptb.train.txt" - confirm
# that the file being read is the copy you intend to train on.
uploaded = files.upload()

Saving ptb.train.txt to ptb.train (1).txt


In [24]:
# Load the whole training corpus into memory, one list entry per line of the
# file (line terminators are kept on each entry).
with open("ptb.train.txt", 'r') as f:
    lines = list(f)


In [28]:
def get_tokens():
    """Split every entry of the module-level ``lines`` list into characters.

    Returns:
        A list with one element per line; each element is the list of the
        individual characters of that line.
    """
    return list(map(list, lines))


token = get_tokens()

In [31]:
def flatten(tokens):
    """Concatenate a sequence of sequences into a single flat list.

    Args:
        tokens: Iterable whose elements are themselves iterable.

    Returns:
        A new list containing the inner items in their original order.
    """
    flat = []
    for group in tokens:
        flat.extend(group)
    return flat

# Merge the per-line character lists into one continuous character stream.
tokens = flatten(token)
# Show the first 100 characters as a quick sanity check of the stream.
print(tokens[:100])

[' ', 'a', 'e', 'r', ' ', 'b', 'a', 'n', 'k', 'n', 'o', 't', 'e', ' ', 'b', 'e', 'r', 'l', 'i', 't', 'z', ' ', 'c', 'a', 'l', 'l', 'o', 'w', 'a', 'y', ' ', 'c', 'e', 'n', 't', 'r', 'u', 's', 't', ' ', 'c', 'l', 'u', 'e', 't', 't', ' ', 'f', 'r', 'o', 'm', 's', 't', 'e', 'i', 'n', ' ', 'g', 'i', 't', 'a', 'n', 'o', ' ', 'g', 'u', 't', 'e', 'r', 'm', 'a', 'n', ' ', 'h', 'y', 'd', 'r', 'o', '-', 'q', 'u', 'e', 'b', 'e', 'c', ' ', 'i', 'p', 'o', ' ', 'k', 'i', 'a', ' ', 'm', 'e', 'm', 'o', 't', 'e']


In [30]:
def unique_char(tokens):
    """Return the distinct items of ``tokens`` in first-occurrence order.

    Deduplication goes through a dict (insertion-ordered), so each item costs
    one hash lookup instead of a scan of the list of items seen so far.

    Args:
        tokens: Iterable of hashable items (characters in this notebook).

    Returns:
        A list holding every distinct item exactly once, ordered by the
        position at which it first appears in ``tokens``.
    """
    return list(dict.fromkeys(tokens))


# Distinct characters of the corpus; the position of a character in this list
# is later used as its integer id.
uniq_tokens = unique_char(tokens)
# Vocabulary size.
print(len(uniq_tokens))

50


In [None]:
# Character -> integer id lookup; the id of a character is its position in
# uniq_tokens.
vocab = {char: e for e, char in enumerate(uniq_tokens)}

vocab

In [35]:
# Encode the full character stream as integer ids through the vocab lookup.
numerical = list(map(vocab.__getitem__, tokens))
len(numerical)

5101619

In [36]:
def one_hot_data(numerical_list, vocab_size=50):
    """One-hot encode a sequence of integer class ids.

    The rows are filled with a single vectorised index assignment rather than
    a Python-level loop, which matters when the input holds millions of ids.

    Args:
        numerical_list: Sequence of integer ids, each a valid column index
            for a row of width ``vocab_size``.
        vocab_size: Width of every one-hot row.

    Returns:
        A float tensor of shape ``(len(numerical_list), vocab_size)`` in which
        row ``i`` is zero everywhere except for a 1.0 at column
        ``numerical_list[i]``.

    Raises:
        IndexError: If an id is not a valid column index.
    """
    num_rows = len(numerical_list)
    columns = torch.as_tensor(numerical_list, dtype=torch.long)
    result = torch.zeros((num_rows, vocab_size))
    # Pair row i with column numerical_list[i] and set all pairs at once.
    result[torch.arange(num_rows), columns] = 1.0
    return result

In [38]:
seq_length = 64
# One character is held back so that every input position has a
# next-character label available.
num_samples = (len(numerical) - 1) // seq_length
# Pass the vocabulary width explicitly: the reshape below uses
# len(uniq_tokens), so relying on one_hot_data's default width would only
# work when the vocabulary happens to have exactly that many characters.
dataset = one_hot_data(
    numerical[:num_samples * seq_length], vocab_size=len(uniq_tokens)
).reshape(num_samples, seq_length, len(uniq_tokens))
dataset.shape

torch.Size([79712, 64, 50])

In [39]:
batch_size = 32
num_batches = len(dataset) // batch_size
# Lay the samples out as (batch row, batch index, time step, vocab) and then
# reorder the axes to (batch index, time step, batch row, vocab). Row b of
# batch n is sample b * num_batches + n, so a given row reads consecutive
# samples of the corpus from one batch to the next.
train_iter = (
    dataset[:num_batches * batch_size]
    .reshape((batch_size, num_batches, seq_length, len(uniq_tokens)))
    .permute(1, 2, 0, 3)
)
train_iter.shape

torch.Size([2491, 64, 32, 50])

In [41]:
# Labels are the inputs shifted by one character. The slice length is derived
# from batch_size * num_batches (the number of samples actually used by
# train_iter) rather than num_samples, so the reshape also works when
# num_samples is not a multiple of batch_size. The vocabulary width is passed
# explicitly to match the reshape.
labels = one_hot_data(
    numerical[1:num_batches * batch_size * seq_length + 1],
    vocab_size=len(uniq_tokens),
).reshape(batch_size, num_batches, seq_length, len(uniq_tokens))
# Same axis order as train_iter: (batch index, time step, batch row, vocab).
labels = labels.swapaxes(0, 1)
labels = labels.swapaxes(1, 2)
labels.shape

torch.Size([2491, 64, 32, 50])

In [43]:
def textify(embedding):
    """Decode rows of scores/one-hot vectors back into a string.

    Args:
        embedding: 2-D tensor; the arg-max along axis 1 of every row is taken
            as a character id.

    Returns:
        The string built by looking each id up in the module-level
        ``uniq_tokens`` list and concatenating the characters in row order.
    """
    indices = torch.argmax(embedding, axis=1)
    return "".join(uniq_tokens[int(idx)] for idx in indices)

In [44]:
# Sanity check: decode one row (batch 10, row 3) of the inputs and of the
# labels; the label text should be the input text shifted by one character.
print(textify(train_iter[10, :, 3]))
print(textify(labels[10, :, 3]))

ell based in los angeles makes and distributes electronic comput
ll based in los angeles makes and distributes electronic compute


In [81]:
def get_params(vocab_size, num_hiddens):
    """Create the trainable tensors of a single-layer LSTM plus output layer.

    Weights are drawn from a normal distribution with mean 0 and standard
    deviation 0.01; biases start at zero. Every tensor has
    ``requires_grad=True``.

    Args:
        vocab_size: Size of the one-hot input and of the output layer.
        num_hiddens: Number of hidden units.

    Returns:
        A list of 14 tensors ordered as
        ``[W_hi, W_xi, b_i, W_hf, W_xf, b_f, W_ho, W_xo, b_o,
        W_hc, W_xc, b_c, W_hq, b_q]`` (input gate, forget gate, output gate,
        memory cell, output layer).
    """
    def weight(rows, cols):
        return torch.normal(0, 0.01, (rows, cols), requires_grad=True)

    def bias(size):
        return torch.zeros(size, requires_grad=True)

    def gate_params():
        # The input-to-hidden matrix is drawn before the hidden-to-hidden one,
        # but the returned order is (hidden, input, bias).
        W_x = weight(vocab_size, num_hiddens)
        W_h = weight(num_hiddens, num_hiddens)
        return [W_h, W_x, bias(num_hiddens)]

    params = []
    # Input gate, forget gate, output gate, memory cell - in that order.
    for _ in range(4):
        params += gate_params()
    # Output layer projecting the hidden state onto the vocabulary.
    params.append(weight(num_hiddens, vocab_size))
    params.append(bias(vocab_size))
    return params

In [45]:
def init_state(batch_size, num_hiddens):
    """Build the initial LSTM state.

    Args:
        batch_size: Number of rows in the state tensors.
        num_hiddens: Number of columns in the state tensors.

    Returns:
        A 2-tuple of two separate all-zero tensors, each of shape
        ``(batch_size, num_hiddens)``.
    """
    hidden = torch.zeros(batch_size, num_hiddens)
    cell = torch.zeros(batch_size, num_hiddens)
    return (hidden, cell)

In [96]:
def lstm(inputs, state, params):
    """Run an LSTM over a sequence and project every step onto the vocabulary.

    Args:
        inputs: Iterable of per-step inputs; iterating a tensor walks its
            first axis. Presumably each step is (batch, vocab_size) - confirm
            against the caller.
        state: Iterable unpacking to ``(H, C)``, the hidden and cell state.
        params: The 14 tensors in the order produced by ``get_params``.

    Returns:
        A pair ``(outputs, (H, C))``: the per-step ``softmax`` outputs
        concatenated along dim 0, and the state after the last step.
    """
    (W_hi, W_xi, b_i,
     W_hf, W_xf, b_f,
     W_ho, W_xo, b_o,
     W_hc, W_xc, b_c,
     W_hq, b_q) = params

    def pre_activation(x, h, W_x, W_h, b):
        return (x @ W_x) + (h @ W_h) + b

    hidden, cell = state
    step_outputs = []
    for x in inputs:
        # Gate equations follow
        # https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html?highlight=nn+lstm#torch.nn.LSTM
        input_gate = torch.sigmoid(pre_activation(x, hidden, W_xi, W_hi, b_i))
        forget_gate = torch.sigmoid(pre_activation(x, hidden, W_xf, W_hf, b_f))
        output_gate = torch.sigmoid(pre_activation(x, hidden, W_xo, W_ho, b_o))
        candidate = torch.tanh(pre_activation(x, hidden, W_xc, W_hc, b_c))
        cell = forget_gate * cell + input_gate * candidate
        hidden = output_gate * torch.tanh(cell)
        logits = (hidden @ W_hq) + b_q
        step_outputs.append(softmax(logits))
    return torch.cat(step_outputs, dim=0), (hidden, cell)

In [92]:
def predict(prefix, num_chars, model_params=None):
    """Sample ``num_chars`` characters from the model, seeded with ``prefix``.

    The prefix is fed through the LSTM once to warm up the state; after that
    each sampled character is fed back in as the next input.

    Args:
        prefix: Non-empty seed string; every character must be a key of the
            module-level ``vocab``.
        num_chars: Number of characters to generate after the prefix.
        model_params: Parameter list in ``get_params`` order. When omitted,
            the module-level ``params`` is used.

    Returns:
        ``prefix`` followed by the sampled characters.

    Raises:
        ValueError: If ``prefix`` is empty (there would be no input to run).
    """
    if not prefix:
        raise ValueError("prefix must contain at least one character")
    weights = params if model_params is None else model_params
    # Sizes are read off the parameters instead of being hard-coded:
    # weights[0] is W_hi (num_hiddens x num_hiddens), weights[-1] is b_q
    # (one entry per vocabulary symbol).
    num_hiddens = weights[0].shape[0]
    vocab_size = weights[-1].shape[0]

    string = prefix
    # A single sequence is sampled, so the state has one row.
    sample_state = init_state(batch_size=1, num_hiddens=num_hiddens)
    step_input = one_hot_data([vocab[char] for char in prefix], vocab_size=vocab_size)

    # Sampling needs no gradients; this also allows conversion to NumPy.
    with torch.no_grad():
        for _ in range(num_chars):
            outputs, sample_state = lstm(step_input, sample_state, weights)
            # With one state row, the last row of the concatenated outputs is
            # the distribution over the next character.
            probs = outputs[-1].double().numpy()
            choice = int(np.random.choice(vocab_size, p=fix_p(probs)))
            string += uniq_tokens[choice]
            step_input = one_hot_data([choice], vocab_size=vocab_size)
    return string

In [69]:
def fix_p(p):
    """Rescale ``p`` so that its entries sum to one.

    Args:
        p: Array-like supporting ``.sum()`` and scalar multiplication.

    Returns:
        ``p`` itself when its sum already equals 1.0, otherwise a rescaled
        copy ``p * (1 / p.sum())``.
    """
    total = p.sum()
    return p if total == 1.0 else p * (1. / total)

In [95]:
def softmax(X):
    """Numerically stable softmax along dim 1.

    The maximum is subtracted per row (per slice along dim 1) before
    exponentiating. Subtracting one global maximum instead lets rows whose
    values lie far below it underflow to all zeros, which then yields 0/0 = NaN.

    Args:
        X: Tensor of scores. Inputs with fewer than two dimensions are treated
            as a single row.

    Returns:
        A tensor of the same (at least 2-D) shape whose entries along dim 1
        are non-negative and sum to one.
    """
    if X.dim() < 2:
        X = X.reshape(1, -1)
    shifted = X - X.max(dim=1, keepdim=True).values
    X_exp = torch.exp(shifted)
    partition = X_exp.sum(1, keepdim=True)
    return X_exp / partition

In [83]:
def crossentropy(y_hat, y):
    """Mean cross-entropy between predicted distributions and targets.

    The sum runs over the last (class) dimension only, so the outer mean
    really averages over examples; summing over every element first would turn
    the mean into a no-op and make the loss scale with the batch size.

    Args:
        y_hat: Predicted probabilities, classes along the last dimension.
        y: Target distribution (e.g. one-hot), broadcastable against ``y_hat``.

    Returns:
        A scalar tensor: the average over examples of
        ``-sum_k y[k] * log(y_hat[k])``.
    """
    # Keep log() finite when a probability has underflowed to exactly zero;
    # otherwise 0 * log(0) would poison the loss with NaN.
    tiny = torch.finfo(y_hat.dtype).tiny
    per_example = -torch.sum(y * torch.log(y_hat.clamp_min(tiny)), dim=-1)
    return torch.mean(per_example)

In [84]:
def average_ce_loss(outputs, labels):
    """Average the per-step cross-entropy over a sequence.

    ``lstm`` in this notebook returns its per-step outputs concatenated along
    dim 0, while the labels keep a separate leading time axis. When both are
    tensors of different shape but equal size, ``outputs`` is reshaped to the
    shape of ``labels`` so that step ``t`` of one lines up with step ``t`` of
    the other.

    Args:
        outputs: Per-step predictions; either a sequence with one entry per
            step, or a tensor with the steps flattened into dim 0.
        labels: Per-step targets with the time step as the leading axis.

    Returns:
        The sum of ``crossentropy`` over the steps divided by the number of
        steps.

    Raises:
        ValueError: If both arguments are tensors whose sizes differ.
        AssertionError: If the number of steps differs between the two.
    """
    if (torch.is_tensor(outputs) and torch.is_tensor(labels)
            and outputs.shape != labels.shape):
        if outputs.numel() != labels.numel():
            raise ValueError(
                f"outputs {tuple(outputs.shape)} and labels "
                f"{tuple(labels.shape)} hold different numbers of elements"
            )
        outputs = outputs.reshape(labels.shape)
    assert len(labels) == len(outputs), "outputs and labels differ in length"
    # Captured before the loop so the divisor is the number of steps.
    num_steps = len(outputs)
    total_loss = 0
    for step_output, step_label in zip(outputs, labels):
        total_loss = total_loss + crossentropy(step_output, step_label)
    return total_loss / num_steps

In [89]:
def grad_clipping(params, theta):
    """Rescale gradients in place so their global L2 norm is at most ``theta``.

    Args:
        params: Sequence of tensors whose ``.grad`` is populated.
        theta: Maximum allowed norm of all gradients taken together.

    Returns:
        None. Gradients are left untouched when the norm does not exceed
        ``theta``.
    """
    squared_sums = [torch.sum(p.grad ** 2) for p in params]
    norm = torch.sqrt(sum(squared_sums))
    if not norm > theta:
        return
    for p in params:
        p.grad[:] *= theta / norm

In [None]:
num_epochs = 500
num_hiddens = 256
criterion = average_ce_loss
# Sizes are derived from the data pipeline (uniq_tokens, batch_size) so the
# model cannot drift out of sync with the tensors it is trained on.
params = get_params(vocab_size=len(uniq_tokens), num_hiddens=num_hiddens)
lr = 0.01
optimizer = torch.optim.SGD(params, lr)

for epoch in range(num_epochs):
    # The state is reset once per epoch and carried across batches, because
    # row b of batch i+1 continues the text of row b of batch i.
    state = init_state(batch_size=batch_size, num_hiddens=num_hiddens)
    for i in range(num_batches):
        batch_inputs = train_iter[i]
        train_labels = labels[i]
        # Cut the graph at the batch boundary so backprop does not reach into
        # earlier batches; a tuple (not a one-shot generator) keeps the state
        # reusable.
        state = tuple(s.detach() for s in state)
        optimizer.zero_grad()
        y_hat, state = lstm(batch_inputs, state, params)
        l = criterion(y_hat, train_labels)
        l.sum().backward()
        grad_clipping(params, 1)
        optimizer.step()

    with torch.no_grad():
        # Report the loss of the last batch of the epoch; it was already
        # computed above, so it is reused rather than recomputed.
        l_loss = l.detach()
        print(f'loss on epoch {epoch} was {l_loss}')
        print(predict('how many people does it take', 256))