In [13]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from textdataset import TextDataset

In [2]:
# Read the whole source text into memory as a single string.
with open('../Texts/shelley.txt', 'r', encoding="utf-8") as fp:
    text = fp.read()

# Keep only the span that starts at the first 'Letter 1' heading and ends
# just before the 'End of the Project Gutenberg' marker.
start_idx = text.find('Letter 1\n\n')
end_idx = text.find('End of the Project Gutenberg')

# str.find returns -1 when the marker is absent. Slicing with -1 would not
# fail; it would silently produce a wrong (typically empty) corpus and the
# error would only surface much later, so stop here with a clear message.
if start_idx == -1:
    raise ValueError("start marker 'Letter 1' not found in the source text")
if end_idx == -1:
    raise ValueError(
        "end marker 'End of the Project Gutenberg' not found in the source text"
    )
text = text[start_idx:end_idx]

# Set of distinct characters that occur in the trimmed text.
char_set = set(text)

In [3]:
# Vocabulary: characters in sorted order; a character's id is its position.
chars_sorted = sorted(char_set)
char_array = np.array(chars_sorted)            # id -> character lookup
char2int = dict(zip(chars_sorted, range(len(chars_sorted))))  # character -> id

# Encode the full text as a 1-D int32 array of character ids.
text_encoded = np.fromiter((char2int[ch] for ch in text), dtype=np.int32)

# Sanity checks: overall shape, a forward encoding and a reverse decoding.
print('Text encoded shape:', text_encoded.shape)
print(text[:15], '== Encoding ==>', text_encoded[:15])
print(
    text_encoded[15:21],
    '== Reverse ==>',
    ''.join(char_array[text_encoded[15:21]]),
)

# Show the id -> character mapping for the first five encoded positions.
for ex, ch in zip(text_encoded[:5], char_array[text_encoded[:5]]):
    print('{} -> {}'.format(ex, ch))

Text encoded shape: (437427,)
Letter 1

_To M == Encoding ==> [36 57 72 72 57 70  1 13  0  0 52 44 67  1 37]
[70 71 10  1 43 53] == Reverse ==> rs. Sa
36 -> L
57 -> e
72 -> t
72 -> t
57 -> e


In [5]:
seq_length = 40         # number of characters in each input sequence
# One extra character per chunk, so a chunk holds seq_length + 1 ids
# (presumably split downstream into an input and a one-step-shifted target).
chunk_size = seq_length + 1
# Slide a window of chunk_size ids over the encoded text, one position at a
# time. The last valid start index is len(text_encoded) - chunk_size, so the
# range bound needs the "+ 1"; without it the final full-length chunk is
# silently dropped.
text_chunks = [text_encoded[i:i + chunk_size]
               for i in range(len(text_encoded) - chunk_size + 1)]

In [11]:
# np.array stacks the list of equal-length chunks into one 2-D array, which is
# then converted to a tensor and handed to the project's TextDataset.
seq_dataset = TextDataset(torch.tensor(np.array(text_chunks)))

# Preview the first two (input, target) pairs, decoded back to characters.
for i, (seq, target) in enumerate(seq_dataset):
    print(' Input (x): ',
          repr(''.join(char_array[seq])))
    # Label spelling corrected ("Traget" -> "Target"); the width is unchanged,
    # so it still lines up with the ' Input (x): ' label above.
    print('Target (y): ',
          repr(''.join(char_array[target])))
    print()
    if i == 1:  # stop after the second example
        break

 Input (x):  'Letter 1\n\n_To Mrs. Saville, England._\n\n\n'
Traget (y):  'etter 1\n\n_To Mrs. Saville, England._\n\n\nS'

 Input (x):  'etter 1\n\n_To Mrs. Saville, England._\n\n\nS'
Traget (y):  'tter 1\n\n_To Mrs. Saville, England._\n\n\nSt'



In [14]:
# Seed torch's global RNG before the loader is built.
torch.manual_seed(1)

batch_size = 64
# Shuffled mini-batches; drop_last=True discards a final batch that would
# contain fewer than batch_size examples.
seq_dl = DataLoader(
    dataset=seq_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
)