#### Torch Import

In [48]:
import torch
import torch.nn as nn
import torch.nn.functional as F

#### CUDA Check

In [49]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print(device)

cuda


#### Basic Test

In [50]:
# Read the contents of the file 'infinite_in_modern_thought.txt' and store it in the variable 'text'
with open('infinite_in_modern_thought.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Print the first 1000 characters of the text
# print(text[:1000])

# Create a list of unique characters in the text and sort them
chars = sorted(list(set(text)))
# print(chars)

# Print the number of unique characters
# print(len(chars))

# print(text[:1000])
chars = sorted(list(set(text)))
# print(chars)
# print(len(chars))

vocab_size = len(chars)

#### Simple Mapping

In [51]:
# Create a dictionary that maps each character to its index in the 'chars' list
string_to_index = {ch: i for i, ch in enumerate(chars)}

# Create a dictionary that maps each index to its corresponding character in the 'chars' list
index_to_string = {i: ch for i, ch in enumerate(chars)}

# Define the 'endcode' function that converts a string to a list of character indices
endcode = lambda s: [string_to_index[c] for c in s]

# Define the 'decode' function that converts a list of character indices to a string
decode = lambda l: ''.join([index_to_string[i] for i in l])

print(endcode("Hello, World!"))

print(decode(endcode("Hello, World!")))

[35, 61, 68, 68, 71, 11, 1, 50, 71, 74, 68, 60, 2]
Hello, World!


#### Torch Mapping

In [52]:
# Create a dictionary that maps each character to its index in the 'chars' list
string_to_index = {ch: i for i, ch in enumerate(chars)}

# Create a dictionary that maps each index to its corresponding character in the 'chars' list
index_to_string = {i: ch for i, ch in enumerate(chars)}

# Define the 'endcode' function that converts a string to a list of character indices
endcode = lambda s: [string_to_index[c] for c in s]

# Define the 'decode' function that converts a list of character indices to a string
decode = lambda l: ''.join([index_to_string[i] for i in l])

# Create a tensor from the encoded text using torch.tensor
data = torch.tensor(endcode(text), dtype=torch.long)

print(data[:100])

tensor([97, 47, 64, 61,  1, 43, 74, 71, 66, 61, 59, 76,  1, 34, 77, 76, 61, 70,
        58, 61, 74, 63,  1, 61, 29, 71, 71, 67,  1, 71, 62,  1, 42, 70,  1, 76,
        64, 61,  1, 76, 64, 61, 71, 74, 81,  1, 71, 62,  1, 76, 64, 61,  1, 65,
        70, 62, 65, 70, 65, 76, 61,  1, 65, 70,  1, 69, 71, 60, 61, 74, 70,  1,
        76, 64, 71, 77, 63, 64, 76,  0,  1,  1,  1,  1,  0, 47, 64, 65, 75,  1,
        61, 58, 71, 71, 67,  1, 65, 75,  1, 62])


#### Train Validation Split

In [53]:
# Split the data into train and validation sets
data_size = int(0.9 * len(data))  # Calculate the size of the train data

train_data = data[:data_size]  # Assign the first 90% of the data to the train_data variable
val_data = data[data_size:]  # Assign the remaining 10% of the data to the val_data variable

#### Tensor Process Flow

In [54]:
block_size = 8

x = train_data[:block_size]
y = train_data[1:block_size + 1]

for t in range(block_size):
    # Get the context by slicing the input sequence up to the current position
    context = x[:t + 1]
    
    # Get the target by selecting the next character in the input sequence
    target = y[t]
    
    # Print the context and target
    print(f"when input is {context}, the target: {target}")

when input is tensor([97]), the target: 47
when input is tensor([97, 47]), the target: 64
when input is tensor([97, 47, 64]), the target: 61
when input is tensor([97, 47, 64, 61]), the target: 1
when input is tensor([97, 47, 64, 61,  1]), the target: 43
when input is tensor([97, 47, 64, 61,  1, 43]), the target: 74
when input is tensor([97, 47, 64, 61,  1, 43, 74]), the target: 71
when input is tensor([97, 47, 64, 61,  1, 43, 74, 71]), the target: 66


#### Split Dataset

In [55]:
# Split the data into train and validation sets
data_size = int(0.9 * len(data))  # Calculate the size of the train data

train_data = data[:data_size]  # Assign the first 90% of the data to the train_data variable
val_data = data[data_size:]  # Assign the remaining 10% of the data to the val_data variable

# Define the get_batch function that returns a batch of data
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

# Set the batch size
batch_size = 4

# Get a batch of training data
x, y = get_batch('train')

# Print the shape of x and y
print(x.shape, y.shape)

# Print the values of x and y
print(x)
print(y)

torch.Size([4, 8]) torch.Size([4, 8])
tensor([[64, 61,  1, 79, 64, 71, 68, 61],
        [ 1,  7, 79, 64, 65, 59, 64,  1],
        [ 0, 61, 80, 65, 75, 76, 61, 70],
        [79, 57, 81,  1, 57, 75,  1, 76]], device='cuda:0')
tensor([[61,  1, 79, 64, 71, 68, 61,  1],
        [ 7, 79, 64, 65, 59, 64,  1, 79],
        [61, 80, 65, 75, 76, 61, 70, 59],
        [57, 81,  1, 57, 75,  1, 76, 64]], device='cuda:0')


#### Bigram (Autoregressive Model)

In [56]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()  # Set the model to evaluation mode
    for split in ['train', 'val']:
        losses = torch.zeros(eval_interval)  # Initialize a tensor to store the losses
        for i in range(eval_interval):
            x, y = get_batch(split)  # Get a batch of data
            logits, loss = model(x, y)  # Forward pass through the model
            losses[i] = loss.item()  # Store the loss value
        out[split] = losses.mean()  # Calculate the mean loss for the split
    model.train()  # Set the model back to training mode
    return out

In [57]:
class BigramLanguageModel(nn.Module):
    # Initialize the model with the vocabulary size
    def __init__(self, vocab_size):
        # Call the parent class's constructor
        super().__init__()
        # Create an embedding table to map token indices to vectors
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    # Define the forward pass
    def forward(self, idx, targets=None):
        # Get the logits (unnormalized probabilities) for the input indices
        logits = self.token_embedding_table(idx)
        # If targets are not provided, set the loss to None
        if targets is None:
            loss = None
        # If targets are provided, calculate the cross-entropy loss
        else:
            # Get the shape of the logits tensor (batch size, sequence length, vocab size)
            B, T, C = logits.shape
            # Reshape the logits tensor to (batch size * sequence length, vocab size)
            logits = logits.view(B*T, C)
            # Reshape the targets tensor to (batch size * sequence length)
            targets = targets.view(B*T)
            # Calculate the cross-entropy loss
            loss = F.cross_entropy(logits, targets)
        # Return the logits and loss
        return logits, loss

    # Define the generate method
    def generate(self, idx, max_new_tokens):
        # Generate new tokens one at a time
        for _ in range(max_new_tokens):
            # Get the logits and loss for the current input indices
            logits, loss = self.forward(idx)
            # Get the logits for the last token in the sequence
            logits = logits[:, -1, :]
            # Convert the logits to probabilities
            probs = F.softmax(logits, dim=-1)
            # Sample the next token from the probabilities
            idx_next = torch.multinomial(probs, num_samples=1)
            # Append the next token to the current sequence
            idx = torch.cat((idx, idx_next), dim=1)
        # Return the generated sequence
        return idx

# Create an instance of the BigramLanguageModel with the specified vocabulary size
model = BigramLanguageModel(vocab_size)
# Move the model to the specified device (e.g. GPU)
m = model.to(device)

# Create a context tensor with a single token (index 0)
context = torch.zeros((1, 1), dtype=torch.long, device=device)
# Generate a sequence of 100 tokens starting from the context
generated_chars = decode(m.generate(context, max_new_tokens=100)[0].tolist())
# Print the generated sequence
print(generated_chars)


R*cZf•lD(fa—YDRODbrêpé$0:w!n+U﻿_0QZ“ W;kf2N—ew•%bz*J…wF[?eaöNl!BQi+éôêO!H™&U™W﻿Dv﻿k[1T-WE2pAl”##na X


In [61]:
max_iter = 5000
learning_rate = 3e-4

eval_interval = 100

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iter):
    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    # Get a batch of training data
    x, y = get_batch('train')
    # Get the logits and loss
    logits, loss = model(x, y)
    # Backpropagate the loss
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    # Print the loss every 100 iterations
    # if iter % 100 == 0:
    #     print(f"loss: {loss.item()}")

print(loss.item())

step 0: train loss 4.0148, val loss 4.1414
step 100: train loss 4.0100, val loss 4.0979
step 200: train loss 4.0318, val loss 4.0931
step 300: train loss 3.9532, val loss 4.0376
step 400: train loss 3.9455, val loss 4.0262
step 500: train loss 3.9304, val loss 4.0580
step 600: train loss 3.8994, val loss 4.0182
step 700: train loss 3.8809, val loss 4.0294
step 800: train loss 3.8775, val loss 3.9768
step 900: train loss 3.8537, val loss 3.9811
step 1000: train loss 3.8524, val loss 3.9463
step 1100: train loss 3.8164, val loss 3.9090
step 1200: train loss 3.7919, val loss 3.9211
step 1300: train loss 3.7717, val loss 3.9284
step 1400: train loss 3.7755, val loss 3.9130
step 1500: train loss 3.7795, val loss 3.8395
step 1600: train loss 3.7286, val loss 3.8843
step 1700: train loss 3.7139, val loss 3.8227
step 1800: train loss 3.6971, val loss 3.8138
step 1900: train loss 3.6886, val loss 3.8349
step 2000: train loss 3.7093, val loss 3.8718
step 2100: train loss 3.6681, val loss 3.7504


#### Some Optimization Algorithms and Loss Functions

**MSE (Mean Squared Error):** MSE is a commonly used loss function in regression problems. It measures the average squared difference between the predicted and actual values. The lower the MSE, the better the model's performance.

**GD (Gradient Descent):** GD is an optimization algorithm used to minimize the loss function of a model. It iteratively updates the model's parameters in the direction of steepest descent of the loss function. GD can be slow for large datasets as it requires computing the gradients for the entire dataset in each iteration.

**Momentum:** Momentum is an extension of GD that helps accelerate convergence and overcome local minima. It introduces a momentum term that accumulates the gradients of previous iterations and uses it to update the model's parameters. This helps the model to continue moving in the right direction even when the gradients are small.

**RMSprop (Root Mean Square Propagation):** RMSprop is an optimization algorithm that adapts the learning rate for each parameter based on the average of recent squared gradients. It helps to speed up convergence by reducing the learning rate for parameters with large gradients and increasing it for parameters with small gradients.

**Adam:** Adam (Adaptive Moment Estimation) is an optimization algorithm that combines the benefits of both momentum and RMSprop. It maintains a running average of both the gradients and the squared gradients, and uses them to update the model's parameters. Adam is known for its fast convergence and good performance on a wide range of problems.

**AdamW:** AdamW is a variant of the Adam optimizer that incorporates weight decay regularization. Weight decay helps prevent overfitting by adding a penalty term to the loss function that discourages large parameter values. AdamW is particularly effective when dealing with models with large numbers of parameters.

In [62]:
# Create a context tensor with a single token (index 0)
context = torch.zeros((1, 1), dtype=torch.long, device=device)
# Generate a sequence of 100 tokens starting from the context
generated_chars = decode(m.generate(context, max_new_tokens=100)[0].tolist())
# Print the generated sequence
print(generated_chars)


Y#Yâö4™_F[™SêmöV!1ây&K1Tm)K#DuHyag™o/l,x’-sisthqr[fDq.#+ks.#G“c+!YMO_O&$•—DéV,*J0Mmmes9wa7quF.dra wh


#### Block Size and Batch Size

In [60]:
"""
NOTE:
- The block size determines the size of each block in the tensor.
- The batch size determines the number of blocks to be processed in parallel.
"""

'\nNOTE:\n- The block size determines the size of each block in the tensor.\n- The batch size determines the number of blocks to be processed in parallel.\n'