### Text Preprocessing

In [1]:
import sys
sys.path.insert(0, '..')

import torch
import random

from google.colab import files
import io
from PIL import Image
import matplotlib.image as mpimg
from pathlib import Path

# you can donwload images from (http://d2l-data.s3-accelerate.amazonaws.com/timemachine.txt)
my_file = Path("./timemachine.txt") 
if not my_file.is_file():
    data_to_load = files.upload()

with open('timemachine.txt', 'r') as f:
    lines = f.readlines()
    raw_dataset = ' '.join(' '.join(lines).lower().split())

print('number of characters: ', len(raw_dataset))
print(raw_dataset[0:70])

Saving timemachine.txt to timemachine.txt
number of characters:  178605
the time machine, by h. g. wells [1898] i the time traveller (for so i


### Character Index

In [2]:
idx_to_char = list(set(raw_dataset))
char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])
vocab_size = len(char_to_idx)
print(char_to_idx)

{'o': 0, '_': 1, 'x': 2, 'y': 3, 'g': 4, 'v': 5, '"': 6, '1': 7, 'i': 8, 'm': 9, 'z': 10, 'j': 11, 'n': 12, '.': 13, "'": 14, 'r': 15, 'd': 16, '8': 17, ':': 18, '?': 19, 'c': 20, 'h': 21, '!': 22, ' ': 23, 's': 24, 'e': 25, '9': 26, '(': 27, 'w': 28, '[': 29, ',': 30, 'a': 31, 'f': 32, 'u': 33, ';': 34, ']': 35, ')': 36, '-': 37, 'k': 38, 'l': 39, 'q': 40, 'b': 41, 't': 42, 'p': 43}


Converting it back to text

In [3]:
corpus_indices = [char_to_idx[char] for char in raw_dataset]
sample = corpus_indices[:20]
print('chars:', ''.join([idx_to_char[idx] for idx in sample]))
print('indices:', sample)

chars: the time machine, by
indices: [42, 21, 25, 23, 42, 8, 9, 25, 23, 9, 31, 20, 21, 8, 12, 25, 30, 23, 41, 3]


### Random Sampling

In [4]:
# This function is saved in the d2l package for future use.
def data_iter_random(corpus_indices, batch_size, num_steps):
    # offset for the iterator over the data for uniform starts
    offset = int(random.uniform(0,num_steps))
    corpus_indices = corpus_indices[offset:]
    # subtract 1 extra since we need to account for the sequence length
    num_examples = ((len(corpus_indices) - 1) // num_steps) - 1
    # discard half empty batches
    num_batches = num_examples // batch_size
    example_indices = list(range(0, num_examples * num_steps, num_steps))
    random.shuffle(example_indices)
    
    # This returns a sequence of the length num_steps starting from pos.
    def _data(pos):
        return corpus_indices[pos: pos + num_steps]

    for i in range(0, batch_size * num_batches, batch_size):
        # batch_size indicates the random examples read each time.
        batch_indices = example_indices[i:(i+batch_size)]
        X = [_data(j) for j in batch_indices]
        Y = [_data(j + 1) for j in batch_indices]       

        yield torch.tensor(X), torch.tensor(Y)

### Example

Batch size 2 and time steps is 5 for a sequence of length 30.

In [5]:
my_seq = list(range(30))
for X, Y in data_iter_random(my_seq, batch_size=2, num_steps=5):
    print('X: ', X, '\nY:', Y)

X:  tensor([[12, 13, 14, 15, 16],
        [17, 18, 19, 20, 21]]) 
Y: tensor([[13, 14, 15, 16, 17],
        [18, 19, 20, 21, 22]])
X:  tensor([[ 7,  8,  9, 10, 11],
        [ 2,  3,  4,  5,  6]]) 
Y: tensor([[ 8,  9, 10, 11, 12],
        [ 3,  4,  5,  6,  7]])


### Sequential partitioning

Adjacent positioning of minibatches. This way we can retain the latent state between batches. 

In [7]:
# This function is saved in the d2l package for future use.
def data_iter_consecutive(corpus_indices, batch_size, num_steps):
    # offset for the iterator over the data for uniform starts
    offset = int(random.uniform(0,num_steps))
    # slice out data - ignore num_steps and just wrap around
    num_indices = ((len(corpus_indices) - offset) // batch_size) * batch_size
    indices = torch.tensor(corpus_indices[offset:(offset + num_indices)])
    indices = indices.reshape((batch_size,-1))
    # need to leave one last token since targets are shifted by 1
    num_epochs = ((num_indices // batch_size) - 1) // num_steps

    for i in range(0, num_epochs * num_steps, num_steps):
        X = indices[:,i:(i+num_steps)]
        Y = indices[:,(i+1):(i+1+num_steps)]
        yield X, Y

### Example partitioning

In [8]:
for X, Y in data_iter_consecutive(my_seq, batch_size=2, num_steps=6):
    print('X: ', X, '\nY:', Y)

X:  tensor([[ 0,  1,  2,  3,  4,  5],
        [15, 16, 17, 18, 19, 20]]) 
Y: tensor([[ 1,  2,  3,  4,  5,  6],
        [16, 17, 18, 19, 20, 21]])
X:  tensor([[ 6,  7,  8,  9, 10, 11],
        [21, 22, 23, 24, 25, 26]]) 
Y: tensor([[ 7,  8,  9, 10, 11, 12],
        [22, 23, 24, 25, 26, 27]])
