In [1]:
import numpy as np

In [None]:
with open('1268-0.txt', 'r', encoding="utf8") as fp:
    text=fp.read()
    
start_indx = text.find('THE MYSTERIOUS ISLAND')
end_indx = text.find('End of the Project Gutenberg')

text = text[start_indx:end_indx]
char_set = set(text)
print('Total Length:', len(text))
print('Unique Characters:', len(char_set))

Total Length: 1112310
Unique Characters: 80


In [6]:
chars_sorted = sorted(char_set)
char2int = {ch:i for i, ch in enumerate(chars_sorted)}
char_array = np.array(chars_sorted)
text_encoded = np.array([char2int[ch] for ch in text], dtype=np.int32)
print('Paзмep закодированного текста:', text_encoded.shape) 
print(text[:15], '==Кодирование==>', text_encoded[:15])
print(text_encoded[15:21], '==Декодирование==>','' .join(char_array[text_encoded[15:21]]))

Paзмep закодированного текста: (1112310,)
THE MYSTERIOUS  ==Кодирование==> [44 32 29  1 37 48 43 44 29 42 33 39 45 43  1]
[33 43 36 25 38 28] ==Декодирование==> ISLAND


In [7]:
for ex in text_encoded[:5]:
    print('{} -> {}'.format(ex, char_array[ex]))

44 -> T
32 -> H
29 -> E
1 ->  
37 -> M


In [8]:
import torch
from torch.utils.data import Dataset

In [13]:
seq_length = 40
chunk_size = seq_length + 1
text_chunks = [text_encoded[i:i+chunk_size] for i in range(len(text_encoded) - chunk_size + 1)]

In [14]:
class TextDataset(Dataset):
    def __init__(self, text_chunks):
        self.text_chunks = text_chunks

    def __len__(self):
        return len(self.text_chunks)
    
    def __getitem__(self, idx):
        text_chunk = self.text_chunks[idx]
        return text_chunk[:-1].long(), text_chunk[1:].long()
    
seq_dataset = TextDataset(torch.tensor(text_chunks))

  seq_dataset = TextDataset(torch.tensor(text_chunks))


In [15]:
for i, (seq, target) in enumerate(seq_dataset):
    print(' Input (x):', repr(''.join(char_array[seq])))
    print('Target (y):', repr(''.join(char_array[target])))
    print()
    if i == 1:
        break

 Input (x): 'THE MYSTERIOUS ISLAND\n\nby Jules Verne\n\n1'
Target (y): 'HE MYSTERIOUS ISLAND\n\nby Jules Verne\n\n18'

 Input (x): 'HE MYSTERIOUS ISLAND\n\nby Jules Verne\n\n18'
Target (y): 'E MYSTERIOUS ISLAND\n\nby Jules Verne\n\n187'



In [16]:
from torch.utils.data import DataLoader


In [17]:
device = 'cpu'
batch_size = 64

torch.manual_seed(1)
seq_dl = DataLoader(seq_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

In [18]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim) 
        self.rnn_hidden_size = rnn_hidden_size
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, 
                           batch_first=True)
        self.fc = nn.Linear(rnn_hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        out = self.embedding(x).unsqueeze(1)
        out, (hidden, cell) = self.rnn(out, (hidden, cell))
        out = self.fc(out).reshape(out.size(0), -1)
        return out, hidden, cell

    def init_hidden(self, batch_size):
        hidden = torch.zeros(1, batch_size, self.rnn_hidden_size)
        cell = torch.zeros(1, batch_size, self.rnn_hidden_size)
        return hidden.to(device), cell.to(device)
    
vocab_size = len(char_array)
embed_dim = 256
rnn_hidden_size = 512

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size) 
model = model.to(device)
model

RNN(
  (embedding): Embedding(80, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=80, bias=True)
)

In [19]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

num_epochs = 10000 


In [20]:
torch.manual_seed(1)

for epoch in range(num_epochs):
    hidden, cell = model.init_hidden(batch_size)
    seq_batch, target_batch = next(iter(seq_dl))
    seq_batch = seq_batch.to(device)
    target_batch = target_batch.to(device)
    optimizer.zero_grad()
    loss = 0
    for c in range(seq_length):
        pred, hidden, cell = model(seq_batch[:, c], hidden, cell) 
        loss += loss_fn(pred, target_batch[:, c])
    loss.backward()
    optimizer.step()
    loss = loss.item()/seq_length
    if epoch % 500 == 0:
        print(f'Epoch {epoch} loss: {loss:.4f}')
 

Epoch 0 loss: 4.3712
Epoch 500 loss: 1.4108
Epoch 1000 loss: 1.3406
Epoch 1500 loss: 1.2845
Epoch 2000 loss: 1.1918
Epoch 2500 loss: 1.1875
Epoch 3000 loss: 1.1514
Epoch 3500 loss: 1.1670
Epoch 4000 loss: 1.1435
Epoch 4500 loss: 1.1252
Epoch 5000 loss: 1.1679
Epoch 5500 loss: 1.1235
Epoch 6000 loss: 1.1133
Epoch 6500 loss: 1.1532
Epoch 7000 loss: 1.1205
Epoch 7500 loss: 1.1743
Epoch 8000 loss: 1.2022
Epoch 8500 loss: 1.1859
Epoch 9000 loss: 1.0961
Epoch 9500 loss: 1.1491


In [21]:
from torch.distributions.categorical import Categorical

torch.manual_seed(1)

logits = torch.tensor([[1.0, 1.0, 1.0]])

print('Probabilities:', nn.functional.softmax(logits, dim=1).numpy()[0])

m = Categorical(logits=logits)
samples = m.sample((10,))
 
print(samples.numpy())

Probabilities: [0.33333334 0.33333334 0.33333334]
[[0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [1]
 [2]
 [1]
 [1]]


In [22]:
torch.manual_seed(1)

logits = torch.tensor([[1.0, 1.0, 3.0]])

print('Probabilities:', nn.functional.softmax(logits, dim=1).numpy()[0])

m = Categorical(logits=logits)
samples = m.sample((10,))
 
print(samples.numpy())

Probabilities: [0.10650698 0.10650698 0.78698605]
[[0]
 [2]
 [2]
 [1]
 [2]
 [1]
 [2]
 [2]
 [2]
 [2]]


In [23]:
def sample(model, starting_str, 
           len_generated_text=500, 
           scale_factor=1.0):

    encoded_input = torch.tensor([char2int[s] for s in starting_str])
    encoded_input = torch.reshape(encoded_input, (1, -1))

    generated_str = starting_str

    model.eval()
    hidden, cell = model.init_hidden(1)
    hidden = hidden.to('cpu')
    cell = cell.to('cpu')
    for c in range(len(starting_str)-1):
        _, hidden, cell = model(encoded_input[:, c].view(1), hidden, cell) 
    
    last_char = encoded_input[:, -1]
    for i in range(len_generated_text):
        logits, hidden, cell = model(last_char.view(1), hidden, cell) 
        logits = torch.squeeze(logits, 0)
        scaled_logits = logits * scale_factor
        m = Categorical(logits=scaled_logits)
        last_char = m.sample()
        generated_str += str(char_array[last_char])
        
    return generated_str

torch.manual_seed(1)
model.to('cpu')
print(sample(model, starting_str='The island'))

The island for a ringly acqually, they likes; well as if it was of guids had fled in the same time azotic arrival of such was the river, in spranglin; for even it from the road as
you did not make ought that, it
could only open the events of these blots. Neither Neb
exteriored ran
possible among themselves to the corral.

Did you. You knot so,” replied the sailor, “and we shall once apart and his bite.

White the workwhan describe dry sloped over make a family of the pheater
by, they could not return thei


In [24]:
logits = torch.tensor([[1.0, 1.0, 3.0]])

print('Probabilities before scaling:        ', nn.functional.softmax(logits, dim=1).numpy()[0])

print('Probabilities after scaling with 0.5:', nn.functional.softmax(0.5*logits, dim=1).numpy()[0])

print('Probabilities after scaling with 0.1:', nn.functional.softmax(0.1*logits, dim=1).numpy()[0])

Probabilities before scaling:         [0.10650698 0.10650698 0.78698605]
Probabilities after scaling with 0.5: [0.21194156 0.21194156 0.57611686]
Probabilities after scaling with 0.1: [0.3104238  0.3104238  0.37915248]


In [25]:
torch.manual_seed(1)
print(sample(model, starting_str='The island', 
             scale_factor=0.5))

The island deep island; I
nentu”rno away histesred cliffs, but to skw, “Hallo, Pencroft!”  by they d’pentabouts, or.
Cyruss acted powder!--riry off had placed pixeding, camine has
will havemkes, Pencad hgassy my bushes; stepne hurrahr, wlaesly Cyrus borroundadned in
dry times. On 182.
Hollechduritterians,”
repprasented,ed Hardnary, if if Captain,” said the sailor, “Giviod rife.”

Aft, Caga often, Herbert, working,” sawabed Cylinga.
Wreshlast touched Fahad Tralitey.
Eithery vain’s, asscooved,
Nen replied r
