In [1]:
from d2l import torch as d2l
import torch
import random

In [2]:
import random
import torch
from d2l import torch as d2l

tokens = d2l.tokenize(d2l.read_time_machine())
# Since each text line is not necessarily a sentence or a paragraph, we
# concatenate all text lines
corpus = [token for line in tokens for token in line]
vocab = d2l.Vocab(corpus)
vocab.token_freqs[:10]

[('the', 2261),
 ('i', 1267),
 ('and', 1245),
 ('of', 1155),
 ('a', 816),
 ('to', 695),
 ('was', 552),
 ('in', 541),
 ('that', 443),
 ('my', 440)]

In [3]:
bigram = [pair for pair in zip(corpus[:-1], corpus[1:])]
bigram_vocab = d2l.Vocab(bigram)

In [4]:
trigram = [trip for trip in zip(corpus[:-2], corpus[1:-1], corpus[2:])]
trigram_vocab = d2l.Vocab(trigram)
trigram_vocab.token_freqs[0:10]

[(('the', 'time', 'traveller'), 59),
 (('the', 'time', 'machine'), 30),
 (('the', 'medical', 'man'), 24),
 (('it', 'seemed', 'to'), 16),
 (('it', 'was', 'a'), 15),
 (('here', 'and', 'there'), 15),
 (('seemed', 'to', 'me'), 14),
 (('i', 'did', 'not'), 14),
 (('i', 'saw', 'the'), 13),
 (('i', 'began', 'to'), 13)]

## Random Sampling

In [14]:
def seq_data_iter_random(corpus, batch_size, num_steps):
#     print(int((len(corpus) - 1)/num_steps))
    initial_indices = list(range((len(corpus) - 1)//num_steps*num_steps))
    random.shuffle(initial_indices)
    
    for i in range(0, int((len(corpus) - 1)/num_steps), batch_size):
        X = []
        y = []
        for j in range(batch_size):
#             rand_idx = random.randint(0, len(corpus) - num_steps - 1)
            X.append(torch.tensor(corpus[initial_indices[i + j]: initial_indices[i + j] + num_steps]))
            y.append(torch.tensor(corpus[initial_indices[i + j] + 1: initial_indices[i + j] + num_steps + 1]))
        yield X, y

In [26]:
my_seq = list(range(35))
for X, Y in seq_data_iter_random(my_seq, batch_size=3, num_steps=5):
    print('X: ', X, '\nY:', Y)

X:  [tensor([ 9, 10, 11, 12, 13]), tensor([25, 26, 27, 28, 29]), tensor([12, 13, 14, 15, 16])] 
Y: [tensor([10, 11, 12, 13, 14]), tensor([26, 27, 28, 29, 30]), tensor([13, 14, 15, 16, 17])]
X:  [tensor([26, 27, 28, 29, 30]), tensor([22, 23, 24, 25, 26]), tensor([24, 25, 26, 27, 28])] 
Y: [tensor([27, 28, 29, 30, 31]), tensor([23, 24, 25, 26, 27]), tensor([25, 26, 27, 28, 29])]


## Sequential Partioning

In [78]:
def seq_data_iter_sequential(corpus, batch_size, num_steps):
    initial_indices = list(range((len(corpus) - 1)//num_steps*num_steps))
    
    batch_start_index = []
    for i in range(0, batch_size):
        batch_start_index.append(i*int((len(corpus) - 1)//num_steps)//batch_size*num_steps)
        
    for i in range(0, int((len(corpus) - 1)/num_steps)//batch_size):
        X = []
        y = []
        for j in range(batch_size):
#             rand_idx = random.randint(0, len(corpus) - num_steps - 1)
#             t = int((len(corpus) - 1)//num_steps)//batch_size*num_steps
            X.append(torch.tensor(corpus[batch_start_index[j] + i*num_steps: batch_start_index[j] + i*num_steps + num_steps]))
            y.append(torch.tensor(corpus[batch_start_index[j] + i*num_steps + 1: batch_start_index[j] + i*num_steps + num_steps + 1]))
        yield torch.stack(X), torch.stack(y)

In [79]:
my_seq = list(range(35))
for X, Y in seq_data_iter_sequential(my_seq, batch_size=2, num_steps=5):
    print('X: ', X, '\nY:', Y)

X:  tensor([[ 0,  1,  2,  3,  4],
        [15, 16, 17, 18, 19]]) 
Y: tensor([[ 1,  2,  3,  4,  5],
        [16, 17, 18, 19, 20]])
X:  tensor([[ 5,  6,  7,  8,  9],
        [20, 21, 22, 23, 24]]) 
Y: tensor([[ 6,  7,  8,  9, 10],
        [21, 22, 23, 24, 25]])
X:  tensor([[10, 11, 12, 13, 14],
        [25, 26, 27, 28, 29]]) 
Y: tensor([[11, 12, 13, 14, 15],
        [26, 27, 28, 29, 30]])
