In [None]:
import torch
# Use the Intel XPU backend only when this PyTorch build exposes it and a device
# is actually present; otherwise fall back to the CPU so the notebook still runs.
_xpu = getattr(torch, "xpu", None)
device = "xpu" if _xpu is not None and _xpu.is_available() else "cpu"
print(f"Using device: {device}")

In [None]:
# Load the entire training corpus into memory as a single string.
with open(r"dataset/01.txt", "r", encoding='utf-8') as file:
    content = file.read()

# Quick sanity checks, left disabled:
# print(len(content))
# print(content[:100])  # first 100 characters

# The vocabulary is every distinct character of the corpus, in sorted order.
chars = sorted(set(content))
vocab_size = len(chars)
print("Vocab size:", vocab_size)
print("Characters:", ''.join(chars))

# Lookup tables between characters and integer ids; a character's id is its
# position in the sorted `chars` list.
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}
def encode(s):
    """Convert the string `s` into a list of integer ids, one per character.

    Each character is looked up in `stoi`; a character that is not a key of
    `stoi` raises KeyError.
    """
    ids = []
    for ch in s:
        ids.append(stoi[ch])
    return ids
def decode(l):
    """Convert an iterable of integer ids `l` back into a string.

    Each id is looked up in `itos`; an id that is not a key of `itos` raises
    KeyError.
    """
    pieces = [itos[i] for i in l]
    return ''.join(pieces)

# Round-trip a sample string through encode/decode as a sanity check.
sample_ids = encode("hello world")
print(sample_ids)
print(decode(sample_ids))

# NOTE(review): `devices` (plural) is not read by any code visible in this
# notebook; the model is placed using `device`. Confirm whether this line was
# meant to override `device`.
devices = "cpu"

Vocab size: 82
Characters: 
 !'()*,-.0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz–—‘’“”…
[56, 53, 60, 60, 63, 1, 71, 63, 66, 60, 52]
hello world


In [None]:
import torch
# Encode the whole corpus as one 1-D tensor of token ids.
data = torch.tensor(encode(content), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100])  # first 100 ids, to eyeball the encoding

# Hold out the final 10% of the tokens for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

# Show how a single chunk of block_size + 1 tokens yields block_size training
# examples: every prefix of the chunk is a context, and its target is the token
# that immediately follows that prefix.
block_size = 8  # context length for predictions

x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context, target = x[:t+1], y[t]
    print(f"when input is {context.tolist()} the target: {target}")


torch.Size([439477]) torch.int64
tensor([35, 66,  9,  1, 49, 62, 52,  1, 35, 66, 67,  9,  1, 26, 69, 66, 67, 60,
        53, 73,  7,  1, 63, 54,  1, 62, 69, 61, 50, 53, 66,  1, 54, 63, 69, 66,
         7,  1, 38, 66, 57, 70, 53, 68,  1, 26, 66, 57, 70, 53,  7,  1, 71, 53,
        66, 53,  1, 64, 66, 63, 69, 52,  1, 68, 63,  1, 67, 49, 73,  1, 68, 56,
        49, 68,  1, 68, 56, 53, 73,  1, 71, 53, 66, 53,  1, 64, 53, 66, 54, 53,
        51, 68, 60, 73,  1, 62, 63, 66, 61, 49])
when input is [35] the target: 66
when input is [35, 66] the target: 9
when input is [35, 66, 9] the target: 1
when input is [35, 66, 9, 1] the target: 49
when input is [35, 66, 9, 1, 49] the target: 62
when input is [35, 66, 9, 1, 49, 62] the target: 52
when input is [35, 66, 9, 1, 49, 62, 52] the target: 1
when input is [35, 66, 9, 1, 49, 62, 52, 1] the target: 35


In [None]:
torch.manual_seed(42)
def get_batch(split):
    """Sample a random batch of inputs x and next-token targets y.

    Reads the notebook globals `train_data`, `val_data`, `block_size`,
    `batch_size` and `device` at call time.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        A tuple (x, y) of (batch_size, block_size) tensors placed on `device`.
        Each row of `y` is the window of the corresponding row of `x` shifted
        one position forward in the source sequence.
    """
    data = train_data if split == 'train' else val_data
    # Random start offsets; the upper bound leaves room for the shifted target window.
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    # The model is moved to `device`, so the batch must live there too;
    # otherwise the forward pass fails with a device mismatch.
    return x.to(device), y.to(device)

# Draw one small batch and print it to inspect the input/target layout.
batch_size = 4
x, y = get_batch('train')
print("inputs:", x, sep="\n")
print("targets:", y, sep="\n")

inputs:
tensor([[57, 55, 56, 68,  1, 50, 53,  1],
        [68,  1, 57, 67, 62, 78, 68,  1],
        [27, 24, 23, 25, 33,  0,  0,  0],
        [ 1, 56, 63, 60, 52, 57, 62, 55]])
targets:
tensor([[55, 56, 68,  1, 50, 53,  1, 62],
        [ 1, 57, 67, 62, 78, 68,  1, 68],
        [24, 23, 25, 33,  0,  0,  0,  0],
        [56, 63, 60, 52, 57, 62, 55,  1]])


In [None]:
import torch.nn as nn
from torch.nn import functional as F
class BigramLanguageModel(nn.Module):
    """Bigram model: the logits for the next token depend only on the current token."""

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) integer tensor of token ids.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            A tuple (logits, loss). Without targets, logits has shape (B, T, C)
            and loss is None. With targets, logits is flattened to (B*T, C) and
            loss is F.cross_entropy(logits, targets) over the flattened batch.
        """
        logits = self.token_embedding_table(idx)  # (B,T,C)
        if targets is None:
            return logits, None

        B, T, C = logits.shape
        # F.cross_entropy expects (N, C) logits and (N,) targets. reshape is used
        # instead of view so that non-contiguous targets are accepted as well.
        logits = logits.reshape(B*T, C)
        targets = targets.reshape(B*T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never backpropagates; skip building an autograd graph per step
    def generate(self, idx, max_new_tokens):
        """Extend each sequence in idx by max_new_tokens sampled tokens.

        Args:
            idx: (B, T) integer tensor of token ids forming the current context.
            max_new_tokens: number of tokens to sample and append.

        Returns:
            A (B, T + max_new_tokens) integer tensor: idx followed by the samples.
        """
        for _ in range(max_new_tokens):
            # get the predictions
            logits, _loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :]  # (B,C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B,C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B,1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B,T+1)
        return idx
    
# Assumes vocab_size, x and y were defined by the earlier cells.
model = BigramLanguageModel(vocab_size).to(device)

# The model's parameters now live on `device`; the inputs must be on the same
# device or the forward pass raises a device-mismatch error. `.to` is a no-op
# when the tensor is already there.
logits, loss = model(x.to(device), y.to(device))
print("logits shape:", logits.shape)
print("loss:", loss)

# Start generation from a single token with id 0, created on the model's device.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(model.generate(context, max_new_tokens=100)[0].tolist()))  # Decode the sampled tokens for a visual check



logits shape: torch.Size([32, 82])
loss: tensor(5.0049, grad_fn=<NllLossBackward0>)

ivdmYl‘OVFkq(Er5ce8RSS?xBeJzwX!oG9J‘'82vmF7E,)1f’oPtY3qwp
—.(;olqBlBWG;M(hCy*vk…Q7B4ZQF4‘?TT(EoReQ!X


In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
batch_size = 32
max_steps = 10000  # number of optimisation steps
for steps in range(max_steps):
    xb, yb = get_batch('train')
    # Keep the batch on the same device as the model; `.to` is a no-op when
    # the tensors are already there.
    xb, yb = xb.to(device), yb.to(device)

    logits, loss = model(xb, yb)
    # Clear gradients from the previous step before backpropagating this one.
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Report only the loss of the final step.
print(f"step {steps+1}: loss {loss.item()}")

In [None]:
# Seed generation with a single token of id 0, created on the same device as the
# model so the embedding lookup does not hit a device mismatch.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(model.generate(context, max_new_tokens=100)[0].tolist()))  # Decode the sampled tokens for a visual check