In [1]:
# Read the entire corpus from disk into memory as one string.
with open('input.txt', mode='r', encoding='utf-8') as file:
    text = file.read()

In [2]:
# Size of the corpus, in characters.
len(text)

1115393

In [7]:
# Vocabulary: every distinct character occurring in the corpus, in sorted order.
# set() removes the duplicates; sorted() accepts any iterable and already
# returns a list, so no intermediate list() is needed.
chars = sorted(set(text))

# Vocabulary size: the number of distinct characters.
vocab_size = len(chars)

In [18]:
# Lookup tables between characters and their positions in `chars`.
encode_dict = {ch: idx for idx, ch in enumerate(chars)}  # character -> integer id
decode_dict = dict(enumerate(chars))  # integer id -> character

def encode(s):
    """Translate a string into the list of its integer token ids.

    Every character of ``s`` is looked up in the module-level ``encode_dict``.

    Args:
        s: iterable of characters (normally a ``str``).

    Returns:
        A list with one ``encode_dict`` value per character, in input order.

    Raises:
        KeyError: if a character is not a key of ``encode_dict``.
    """
    return list(map(encode_dict.__getitem__, s))

def decode(d):
    """Translate a flat sequence of integer token ids back into a string.

    Each id is coerced with ``int()`` before being looked up in the
    module-level ``decode_dict``. Plain ints are unaffected by the coercion;
    it additionally lets integer-like objects that do not hash like an int
    (for example the 0-d elements produced by iterating a 1-D tensor) be
    decoded without an explicit ``.tolist()``.

    Args:
        d: flat iterable of integer-like token ids.

    Returns:
        The decoded string; ``''`` for an empty input.

    Raises:
        KeyError: if an id is not a key of ``decode_dict``.
    """
    # A generator is enough for join(); no temporary list is needed.
    return ''.join(decode_dict[int(n)] for n in d)

In [28]:
import torch

# Encode the whole corpus and store the ids as a 1-D tensor of dtype torch.long.
data = torch.tensor(encode(text), dtype=torch.long)

In [32]:
# Train/validation split: the first 90% of the tokens are used for training,
# the remaining 10% are held out for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [33]:
block_size = 8  # number of consecutive tokens in one context window

# Peek at the first block_size + 1 training tokens (one more than a single window).
train_data[:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [43]:
# Inputs are the first block_size tokens; targets are the same window shifted
# one position to the right, so y[t] is the token that follows x[:t + 1].
x, y = train_data[:block_size], train_data[1:block_size + 1]

print(f"x = {x}")
print(f"y = {y}")

# A single window yields block_size training examples, one per prefix length.
for t in range(block_size):
    context, target = x[:t + 1], y[t]
    print(f"When input is: {context} the target is: {target}")

x = tensor([18, 47, 56, 57, 58,  1, 15, 47])
y = tensor([47, 56, 57, 58,  1, 15, 47, 58])
When input is: tensor([18]) the target is: 47
When input is: tensor([18, 47]) the target is: 56
When input is: tensor([18, 47, 56]) the target is: 57
When input is: tensor([18, 47, 56, 57]) the target is: 58
When input is: tensor([18, 47, 56, 57, 58]) the target is: 1
When input is: tensor([18, 47, 56, 57, 58,  1]) the target is: 15
When input is: tensor([18, 47, 56, 57, 58,  1, 15]) the target is: 47
When input is: tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is: 58


In [61]:
torch.manual_seed(1337)  # seed torch's global RNG so the sampled batches are repeatable
batch_size = 4  # number of sequences get_batch draws per call
block_size = 8  # number of tokens in each sequence get_batch returns

def get_batch(split):
    """Sample a random batch of input/target windows from one data split.

    Reads the module-level ``train_data``, ``val_data``, ``batch_size`` and
    ``block_size``.

    Args:
        split: ``'train'`` selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        A tuple ``(x, y)`` of stacked tensors with ``batch_size`` rows of
        ``block_size`` tokens each. Row ``k`` of ``y`` is row ``k`` of ``x``
        shifted one token to the right in the source data.
    """
    source = train_data if split == 'train' else val_data

    # The exclusive upper bound keeps the shifted target window inside `source`.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts.tolist():
        stop = start + block_size
        inputs.append(source[start:stop])
        targets.append(source[start + 1:stop + 1])

    return torch.stack(inputs), torch.stack(targets)


inputs:
torch.Size([4, 8])
tensor([[53, 59,  6,  1, 58, 56, 47, 40],
        [49, 43, 43, 54,  1, 47, 58,  1],
        [13, 52, 45, 43, 50, 53,  8,  0],
        [ 1, 39,  1, 46, 53, 59, 57, 43]])
targets:
torch.Size([4, 8])
tensor([[59,  6,  1, 58, 56, 47, 40, 59],
        [43, 43, 54,  1, 47, 58,  1, 58],
        [52, 45, 43, 50, 53,  8,  0, 26],
        [39,  1, 46, 53, 59, 57, 43,  0]])
-----
When input is [53], the target is: 59
When input is [53, 59], the target is: 6
When input is [53, 59, 6], the target is: 1
When input is [53, 59, 6, 1], the target is: 58
When input is [53, 59, 6, 1, 58], the target is: 56
When input is [53, 59, 6, 1, 58, 56], the target is: 47
When input is [53, 59, 6, 1, 58, 56, 47], the target is: 40
When input is [53, 59, 6, 1, 58, 56, 47, 40], the target is: 59
When input is [49], the target is: 43
When input is [49, 43], the target is: 43
When input is [49, 43, 43], the target is: 54
When input is [49, 43, 43, 54], the target is: 1
When input is [49, 43, 