#### Torch Import

In [24]:
import torch
import torch.nn as nn
import torch.nn.functional as F

#### CUDA Check

In [25]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print(device)

cuda


#### Basic Test

In [26]:
# Read the contents of the file 'infinite_in_modern_thought.txt' and store it in the variable 'text'
with open('infinite_in_modern_thought.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Print the first 1000 characters of the text
# print(text[:1000])

# Create a list of unique characters in the text and sort them
chars = sorted(list(set(text)))
# print(chars)

# Print the number of unique characters
# print(len(chars))

# print(text[:1000])
chars = sorted(list(set(text)))
# print(chars)
# print(len(chars))

vocab_size = len(chars)

#### Simple Mapping

In [27]:
# Create a dictionary that maps each character to its index in the 'chars' list
string_to_index = {ch: i for i, ch in enumerate(chars)}

# Create a dictionary that maps each index to its corresponding character in the 'chars' list
index_to_string = {i: ch for i, ch in enumerate(chars)}

# Define the 'endcode' function that converts a string to a list of character indices
endcode = lambda s: [string_to_index[c] for c in s]

# Define the 'decode' function that converts a list of character indices to a string
decode = lambda l: ''.join([index_to_string[i] for i in l])

print(endcode("Hello, World!"))

print(decode(endcode("Hello, World!")))

[35, 61, 68, 68, 71, 11, 1, 50, 71, 74, 68, 60, 2]
Hello, World!


#### Torch Mapping

In [28]:
# Create a dictionary that maps each character to its index in the 'chars' list
string_to_index = {ch: i for i, ch in enumerate(chars)}

# Create a dictionary that maps each index to its corresponding character in the 'chars' list
index_to_string = {i: ch for i, ch in enumerate(chars)}

# Define the 'endcode' function that converts a string to a list of character indices
endcode = lambda s: [string_to_index[c] for c in s]

# Define the 'decode' function that converts a list of character indices to a string
decode = lambda l: ''.join([index_to_string[i] for i in l])

# Create a tensor from the encoded text using torch.tensor
data = torch.tensor(endcode(text), dtype=torch.long)

print(data[:100])

tensor([97, 47, 64, 61,  1, 43, 74, 71, 66, 61, 59, 76,  1, 34, 77, 76, 61, 70,
        58, 61, 74, 63,  1, 61, 29, 71, 71, 67,  1, 71, 62,  1, 42, 70,  1, 76,
        64, 61,  1, 76, 64, 61, 71, 74, 81,  1, 71, 62,  1, 76, 64, 61,  1, 65,
        70, 62, 65, 70, 65, 76, 61,  1, 65, 70,  1, 69, 71, 60, 61, 74, 70,  1,
        76, 64, 71, 77, 63, 64, 76,  0,  1,  1,  1,  1,  0, 47, 64, 65, 75,  1,
        61, 58, 71, 71, 67,  1, 65, 75,  1, 62])


#### Train Validation Split

In [29]:
# Split the data into train and validation sets
data_size = int(0.9 * len(data))  # Calculate the size of the train data

train_data = data[:data_size]  # Assign the first 90% of the data to the train_data variable
val_data = data[data_size:]  # Assign the remaining 10% of the data to the val_data variable

#### Tensor Process Flow

In [30]:
block_size = 8

x = train_data[:block_size]
y = train_data[1:block_size + 1]

for t in range(block_size):
    # Get the context by slicing the input sequence up to the current position
    context = x[:t + 1]
    
    # Get the target by selecting the next character in the input sequence
    target = y[t]
    
    # Print the context and target
    print(f"when input is {context}, the target: {target}")

when input is tensor([97]), the target: 47
when input is tensor([97, 47]), the target: 64
when input is tensor([97, 47, 64]), the target: 61
when input is tensor([97, 47, 64, 61]), the target: 1
when input is tensor([97, 47, 64, 61,  1]), the target: 43
when input is tensor([97, 47, 64, 61,  1, 43]), the target: 74
when input is tensor([97, 47, 64, 61,  1, 43, 74]), the target: 71
when input is tensor([97, 47, 64, 61,  1, 43, 74, 71]), the target: 66


#### Split Dataset

In [31]:
# Split the data into train and validation sets
data_size = int(0.9 * len(data))  # Calculate the size of the train data

train_data = data[:data_size]  # Assign the first 90% of the data to the train_data variable
val_data = data[data_size:]  # Assign the remaining 10% of the data to the val_data variable

# Define the get_batch function that returns a batch of data
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

# Set the batch size
batch_size = 4

# Get a batch of training data
x, y = get_batch('train')

# Print the shape of x and y
print(x.shape, y.shape)

# Print the values of x and y
print(x)
print(y)

torch.Size([4, 8]) torch.Size([4, 8])
tensor([[61, 80, 57, 69, 72, 68, 61, 75],
        [60, 60, 65, 70, 63,  1, 76, 64],
        [61, 76, 57, 74, 81,  1, 71, 74],
        [ 1, 76, 64, 61,  1, 57, 68, 68]], device='cuda:0')
tensor([[80, 57, 69, 72, 68, 61, 75,  1],
        [60, 65, 70, 63,  1, 76, 64, 61],
        [76, 57, 74, 81,  1, 71, 74,  1],
        [76, 64, 61,  1, 57, 68, 68, 65]], device='cuda:0')


#### Bigram (Autoregressive Model)

In [32]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        logits = self.token_embedding_table(idx)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self.forward(idx)
            logits = logits[:, -1, :]  # get the last token's logits
            probs = F.softmax(logits, dim=-1)  # get the probabilities of the last token
            idx_next = torch.multinomial(probs, num_samples=1)  # sample the next token
            idx = torch.cat((idx, idx_next), dim=1)  # append the next token to the current sequence
        return idx
    
model = BigramLanguageModel(vocab_size)
m = model.to(device)

context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=100)[0].tolist())
print(generated_chars)


wRogN;J’LS[%#zr!—1epE﻿R

—Fj.H6GbD”Jm+7möê)D/TAk*6
—-k3L”™hEGe﻿*FDH”&4O+hO﻿Dc—oFk0_hJan“6WeæC7ébPABC


#### Block Size and Batch Size

In [33]:
"""
NOTE:
- The block size determines the size of each block in the tensor.
- The batch size determines the number of blocks to be processed in parallel.
"""

'\nNOTE:\n- The block size determines the size of each block in the tensor.\n- The batch size determines the number of blocks to be processed in parallel.\n'