In [1]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.2.2-py3-none-any.whl (346 kB)
[K     |████████████████████████████████| 346 kB 4.9 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.5.0-py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 56.3 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 1.2 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 69.3 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting dill<0.3.5
  Downloading dill-0.3.4-py2.py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.6 MB/s 
[?25hCollecting

In [2]:
!pip install zstandard

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting zstandard
  Downloading zstandard-0.17.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.5 MB)
[K     |████████████████████████████████| 2.5 MB 4.9 MB/s 
[?25hInstalling collected packages: zstandard
Successfully installed zstandard-0.17.0


In [31]:
import math
import time

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import Parameter

import torchtext

import datasets
from enum import IntEnum
class Dim(IntEnum):
    batch = 0
    seq = 1
    feature = 2

In [4]:
torch.manual_seed(0)

<torch._C.Generator at 0x7fa858104630>

In [5]:
dataset = datasets.load_dataset('ptb_text_only')

Downloading builder script:   0%|          | 0.00/2.69k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

Downloading and preparing dataset ptb_text_only/penn_treebank (download: 5.68 MiB, generated: 5.72 MiB, post-processed: Unknown size, total: 11.40 MiB) to /root/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.70M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/135k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/150k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/42068 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3761 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3370 [00:00<?, ? examples/s]

Dataset ptb_text_only downloaded and prepared to /root/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence'],
        num_rows: 42068
    })
    test: Dataset({
        features: ['sentence'],
        num_rows: 3761
    })
    validation: Dataset({
        features: ['sentence'],
        num_rows: 3370
    })
})

In [7]:
dataset['train'][0]

{'sentence': 'aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter'}

In [8]:
dataset['train'][1]

{'sentence': 'pierre <unk> N years old will join the board as a nonexecutive director nov. N'}

In [9]:
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

In [10]:
tokenizer('hello world how are you?')

['hello', 'world', 'how', 'are', 'you', '?']

In [11]:
tokenizer(dataset['train'][1]['sentence'])

['pierre',
 '<unk>',
 'n',
 'years',
 'old',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'nov',
 '.',
 'n']

In [12]:
def tokenize_data(example, tokenizer):
    tokens = {'tokens': tokenizer(example['sentence'])}
    return tokens

In [13]:
tokenized_dataset = dataset.map(tokenize_data, remove_columns=['sentence'], fn_kwargs={'tokenizer': tokenizer})

  0%|          | 0/42068 [00:00<?, ?ex/s]

  0%|          | 0/3761 [00:00<?, ?ex/s]

  0%|          | 0/3370 [00:00<?, ?ex/s]

In [14]:
tokenized_dataset['train'][1]

{'tokens': ['pierre',
  '<unk>',
  'n',
  'years',
  'old',
  'will',
  'join',
  'the',
  'board',
  'as',
  'a',
  'nonexecutive',
  'director',
  'nov',
  '.',
  'n']}

In [15]:
vocab = torchtext.vocab.build_vocab_from_iterator(tokenized_dataset['train']['tokens'],
                                                  min_freq=3)

In [16]:
vocab.get_itos()[:10]

['the', '<unk>', 'n', 'of', 'to', 'a', 'in', 'and', '.', "'"]

In [17]:
len(vocab)

9878

In [18]:
'hello' in vocab

False

In [None]:
# vocab.insert_token('<unk>', 0)

In [20]:
vocab.get_itos()[:10]

['the', '<unk>', 'n', 'of', 'to', 'a', 'in', 'and', '.', "'"]

In [21]:
unk_index = vocab['<unk>']
vocab.set_default_index(unk_index)

In [22]:
vocab['hello']

1

In [23]:
vocab.insert_token('<eos>', 1)

In [24]:
vocab.get_itos()[:10]

['the', '<eos>', '<unk>', 'n', 'of', 'to', 'a', 'in', 'and', '.']

In [25]:
def get_data(dataset, vocab, batch_size):
    data = []
    for example in dataset:
        if example['tokens']:
            tokens = example['tokens'].append('<eos>')
            tokens = [vocab[token] for token in example['tokens']]
            data.extend(tokens)
    data = torch.LongTensor(data)
    n_batches = data.shape[0] // batch_size
    data = data.narrow(0, 0, n_batches * batch_size)
    data = data.view(batch_size, -1)
    return data

In [26]:
batch_size = 65


train_data = get_data(tokenized_dataset['train'], vocab, batch_size)

In [27]:
train_data.shape

torch.Size([65, 14868])

In [28]:
valid_data = get_data(tokenized_dataset['validation'], vocab, batch_size)
test_data = get_data(tokenized_dataset['test'], vocab, batch_size)

In [34]:
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, dropout_rate, tie_weights):
        super().__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, dropout=dropout_rate, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        self.dropout = nn.Dropout(dropout_rate)
        #self.Q = Parameter(torch.Tensor(hidden_sz,embedding_dim))
        #self.R = Parameter(torch.Tensor(embedding_dim,hidden_sz))


        if tie_weights:
            assert embedding_dim == hidden_dim, 'If tying weights then embedding_dim must equal hidden_dim'
            self.embedding.weight = self.fc.weight

        self.init_weights()

    def init_weights(self):
        init_range = 0.1
        self.embedding.weight.data.uniform_(-init_range, init_range)
        self.fc.weight.data.uniform_(-init_range, init_range)
        self.fc.bias.data.zero_()

    def init_hidden(self, batch_size, device):
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_dim).to(device)
        cell = torch.zeros(self.n_layers, batch_size, self.hidden_dim).to(device)
        return hidden, cell

    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach()
        cell = cell.detach()
        return hidden, cell


    def forward(self, input, hidden):
        # input = [batch size, seq len]
        # hidden = [n layers, batch size, hidden dim]
        embedding = self.dropout(self.embedding(input))
        # embedding = [batch size, seq len, embedding dim]
        output, hidden = self.lstm(embedding, hidden)
        # output = [batch size, seq len, hidden dim]
        # hidden = [n layers, batch size, hidden dim]
        output = self.dropout(output)
        prediction = self.fc(output)
        # prediction = [batch size, seq len, vocab size]
        return prediction, hidden

In [35]:
vocab_size = len(vocab)
embedding_dim = 1170
hidden_dim = 1170
n_layers = 1
dropout_rate = 0.70
tie_weights = True

model = LSTM(vocab_size, embedding_dim, hidden_dim, n_layers, dropout_rate, tie_weights)

  "num_layers={}".format(dropout, num_layers))


In [36]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 22,528,869 trainable parameters


In [37]:
lr = 1e-3

optimizer = optim.Adam(model.parameters(), lr=lr,betas=(0,0.999))

In [38]:
criterion = nn.CrossEntropyLoss()

In [39]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(device)

cuda


In [40]:
model = model.to(device)
criterion = criterion.to(device)

In [41]:
def train(model, data, optimizer, criterion, batch_size, max_seq_len, clip, device):
    
    epoch_loss = 0
    model.train()
    n_tokens = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)
    
    for offset in range(0, n_tokens - 1, max_seq_len):
        optimizer.zero_grad()
        input, target, seq_len = get_batch(data, max_seq_len, n_tokens, offset)
        input = input.to(device)
        target = target.to(device)
        # input = [batch size, seq len]
        # target = [batch size, seq len]
        batch_size, seq_len = input.shape
        hidden = model.detach_hidden(hidden)
        # hidden = [n layers, batch size, hidden dim]
        prediction, hidden = model(input, hidden)
        # prediction = [batch size, seq len, vocab size]
        # hidden = [n layers, batch size, hidden dim]
        prediction = prediction.reshape(batch_size * seq_len, -1)
        target = target.reshape(-1)
        # prediction = [batch size * seq len, vocab size]
        # target = [batch size * seq len]
        loss = criterion(prediction, target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    return epoch_loss / n_tokens

In [42]:
def get_batch(data, max_seq_len, n_tokens, offset):
    seq_len = min(max_seq_len, n_tokens - offset - 1)
    input = data[:, offset:offset+seq_len]
    target = data[:, offset+1:offset+seq_len+1]
    return input, target, seq_len

In [43]:
def evaluate(model, data, criterion, batch_size, max_seq_len, device):

    epoch_loss = 0
    model.eval()
    n_tokens = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for offset in range(0, n_tokens - 1, max_seq_len):
            input, target, seq_len = get_batch(data, max_seq_len, n_tokens, offset)
            input = input.to(device)
            target = target.to(device)
            # input = [batch size, seq len]
            # target = [batch size, seq len]
            batch_size, seq_len = input.shape
            hidden = model.detach_hidden(hidden)
            # hidden = [n layers, batch size, hidden dim]
            prediction, hidden = model(input, hidden)
            # prediction = [batch size, seq len, vocab size]
            # hidden = [n layers, batch size, hidden dim]
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)
            # prediction = [batch size * seq len, vocab size]
            # target = [batch size * seq len]
            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    return epoch_loss / n_tokens

In [44]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [45]:
lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

In [46]:
n_epochs = 50
max_seq_len = 50
clip = 0.25

best_valid_loss = float('inf')

for epoch in range(n_epochs):

    start_time = time.monotonic()

    train_loss = train(model, train_data, optimizer, criterion, batch_size, max_seq_len, clip, device)
    valid_loss = evaluate(model, valid_data, criterion, batch_size, max_seq_len, device)
    
    lr_scheduler.step(valid_loss)

    end_time = time.monotonic()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'lstm_lm.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
    print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')

Epoch: 01 | Epoch Time: 0m 43s
	Train Perplexity: 386.621
	Valid Perplexity: 193.970
Epoch: 02 | Epoch Time: 0m 45s
	Train Perplexity: 192.031
	Valid Perplexity: 141.911
Epoch: 03 | Epoch Time: 0m 46s
	Train Perplexity: 148.560
	Valid Perplexity: 119.800
Epoch: 04 | Epoch Time: 0m 47s
	Train Perplexity: 125.093
	Valid Perplexity: 106.110
Epoch: 05 | Epoch Time: 0m 48s
	Train Perplexity: 109.661
	Valid Perplexity: 97.367
Epoch: 06 | Epoch Time: 0m 48s
	Train Perplexity: 98.272
	Valid Perplexity: 92.514
Epoch: 07 | Epoch Time: 0m 49s
	Train Perplexity: 89.678
	Valid Perplexity: 86.897
Epoch: 08 | Epoch Time: 0m 49s
	Train Perplexity: 82.981
	Valid Perplexity: 84.632
Epoch: 09 | Epoch Time: 0m 49s
	Train Perplexity: 77.491
	Valid Perplexity: 82.252
Epoch: 10 | Epoch Time: 0m 49s
	Train Perplexity: 72.873
	Valid Perplexity: 80.297
Epoch: 11 | Epoch Time: 0m 49s
	Train Perplexity: 69.019
	Valid Perplexity: 78.303
Epoch: 12 | Epoch Time: 0m 49s
	Train Perplexity: 65.650
	Valid Perplexity: 77

In [47]:
model.load_state_dict(torch.load('lstm_lm.pt'))

test_loss = evaluate(model, test_data, criterion, batch_size, max_seq_len, device)

print(f'Test Perplexity: {math.exp(test_loss):.3f}')

Test Perplexity: 69.236


In [None]:
def generate(prompt, n_gen_tokens, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(0)
    model.eval()
    tokens = tokenizer(prompt)
    indices = [vocab[t] for t in tokens]
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)
    with torch.no_grad():
        for i in range(n_gen_tokens):
            input = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(input, hidden)
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1) 
            prediction = torch.multinomial(probs, num_samples=1).item()
            indices.append(prediction)

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens

In [None]:
prompt = 'the'
n_gen_tokens = 25
temperature = 0.5
seed = 0

generation = generate(prompt, n_gen_tokens, temperature, model, tokenizer, vocab, device, seed)

In [None]:
generation

In [None]:
temperature = 0.1

generation = generate(prompt, n_gen_tokens, temperature, model, tokenizer, vocab, device, seed)

In [None]:
generation

['the',
 '<unk>',
 '<unk>',
 ',',
 'which',
 'was',
 'the',
 'first',
 'to',
 'be',
 'built',
 'in',
 'the',
 '<unk>',
 '.',
 '<eos>',
 '=',
 '=',
 '=',
 '=',
 'chapel',
 'of',
 'our',
 'lady',
 'of',
 'our']

In [None]:
temperature = 1.5

generation = generate(prompt, n_gen_tokens, temperature, model, tokenizer, vocab, device, seed)

In [None]:
generation

['the',
 'hide',
 'swap',
 'just',
 'leads',
 'landmarks',
 'and',
 'arranged',
 'discussions',
 '3',
 'agree',
 'specifically',
 'with',
 'the',
 'friend',
 'harvest',
 'as',
 'captains',
 'like',
 'tom',
 'bradley',
 'giger',
 'viewed',
 'the',
 'team',
 "'"]

In [None]:
temperature = 0.75

generation = generate(prompt, n_gen_tokens, temperature, model, tokenizer, vocab, device, seed)

In [None]:
generation

['the',
 'highest',
 '<unk>',
 'in',
 'the',
 'united',
 'states',
 '.',
 'it',
 'is',
 'a',
 'oldman',
 'city',
 ',',
 'and',
 'the',
 'st',
 '.',
 'louis',
 'rail',
 'district',
 'has',
 'a',
 'population',
 'of',
 '17']

In [None]:
temperature = 0.8

generation = generate(prompt, n_gen_tokens, temperature, model, tokenizer, vocab, device, seed)

In [None]:
generation

['the',
 'highest',
 'swap',
 'in',
 'the',
 'era',
 '.',
 'the',
 'old',
 '3',
 '@',
 '.',
 '@',
 '06',
 'm',
 '(',
 '3',
 '@',
 '.',
 '@',
 '6',
 'ft',
 ')',
 'wide',
 ',',
 'fifth']

In [None]:
temperature = 0.7

generation = generate(prompt, n_gen_tokens, temperature, model, tokenizer, vocab, device, seed)

In [None]:
generation

['the',
 'highest',
 '<unk>',
 'in',
 'the',
 'united',
 'states',
 '.',
 'it',
 'is',
 'a',
 '<unk>',
 '@-@',
 '<unk>',
 'and',
 'a',
 '@-@',
 '<unk>',
 '@-@',
 'chorus',
 'sample',
 ',',
 'which',
 'features',
 'the',
 '<unk>']