# Build a minGPT

In [1]:
# Import the required packages
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from dataclasses import dataclass
import math
torch.manual_seed(1024)

<torch._C.Generator at 0x1ffde586210>

In [2]:
import os
# Debugging aids for CUDA errors.
# NOTE(review): these variables are presumably only honoured if they are set
# before the first CUDA call in the process - confirm against the PyTorch docs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"  # report CUDA errors synchronously
os.environ['TORCH_USE_CUDA_DSA'] = "1"    # request device-side assertions

# Only touch the CUDA runtime when a GPU is actually present, so this cell also
# runs on CPU-only machines (the model setup further down falls back to 'cpu').
if torch.cuda.is_available():
    torch.cuda.empty_cache()              # release unused cached memory blocks
    torch.cuda.reset_peak_memory_stats()  # reset the peak-memory statistics
    torch.cuda.synchronize()              # wait until all queued GPU work is done

In [3]:
# Custom GPT hyper-parameters
@dataclass
class GPTConfig:
    """Hyper-parameters of the GPT model.

    All annotated attributes are dataclass fields and can be overridden through
    the constructor, e.g. ``GPTConfig(n_layer=6)``. ``hidden_dim`` and
    ``head_size`` are derived values: they are recomputed for every instance in
    ``__post_init__`` so that they always agree with ``n_embd`` and ``n_head``.

    Raises:
        ValueError: if ``n_head`` is not positive or ``n_embd`` is not
            divisible by ``n_head``.
    """

    block_size: int = 512  # maximum text length (max_seq)
    batch_size: int = 4  # original note: 12
    n_layer: int = 2  # original note: 12
    n_head: int = 12  # original note: 12
    n_embd: int = 768  # embedding size, a.k.a. hidden_dim / hidden_size
    # Class-level defaults only: these two are intentionally left unannotated
    # so that they are not dataclass fields. Each instance gets its own values
    # in __post_init__.
    # Original note on hidden_dim: kept for tying the embedding weights.
    hidden_dim = n_embd
    dropout: float = 0.1
    head_size = n_embd // n_head
    # The official GPT-2 tokenizer is used.
    vocab_size: int = 50257

    def __post_init__(self):
        """Validate the head layout and derive ``hidden_dim`` / ``head_size``."""
        if self.n_head <= 0:
            raise ValueError(f"n_head must be positive, got {self.n_head}")
        if self.n_embd % self.n_head != 0:
            # The attention heads are concatenated back to hidden_dim, which
            # only works when the embedding splits evenly across heads.
            raise ValueError(
                f"n_embd ({self.n_embd}) must be divisible by n_head ({self.n_head})"
            )
        self.hidden_dim = self.n_embd
        self.head_size = self.n_embd // self.n_head
    

In [4]:
# Define the GPT architecture

# 1. Single-head attention
class SingleHeadAttention(nn.Module):
    """A single causal self-attention head.

    Projects the input from ``config.hidden_dim`` to ``config.head_size`` with
    separate query/key/value linear layers and applies masked scaled
    dot-product attention, so a position only attends to itself and earlier
    positions.
    """

    def __init__(self, config):
        super().__init__()
        in_features, out_features = config.hidden_dim, config.head_size
        self.head_size = out_features
        self.query = nn.Linear(in_features, out_features)
        self.key = nn.Linear(in_features, out_features)
        self.value = nn.Linear(in_features, out_features)

        # The causal mask is registered as a buffer: it follows the module
        # across devices but is not a parameter, so no gradient is computed.
        # It is lower-triangular with side length block_size.
        causal = torch.ones(config.block_size, config.block_size).tril()
        self.register_buffer("attention_mask", causal)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        """Return the attended values for a 3-D input ``x``.

        The last dimension of the result is ``head_size``; the sequence length
        is taken from the second dimension of ``x``.
        """
        _, seq_len, _ = x.size()
        queries = self.query(x)
        keys = self.key(x)
        values = self.value(x)

        scores = torch.matmul(queries, keys.transpose(-2, -1))
        # Hide future positions, then scale by sqrt(head_size); both must
        # happen before the softmax.
        is_future = self.attention_mask[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(is_future, float('-inf')) / math.sqrt(self.head_size)

        attention = self.dropout(F.softmax(scores, dim=-1))
        return attention @ values
        

In [5]:
# 2. Multi-head attention
class MultiHeadAttention(nn.Module):
    """Runs ``config.n_head`` independent attention heads and merges them.

    The head outputs are concatenated along the last dimension, passed through
    a ``hidden_dim -> hidden_dim`` output projection and then through dropout.
    """

    def __init__(self, config):
        super().__init__()
        self.heads = nn.ModuleList(
            SingleHeadAttention(config) for _ in range(config.n_head)
        )
        width = config.hidden_dim
        self.proj = nn.Linear(width, width)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        """Apply every head to ``x`` and return the projected concatenation."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        return self.dropout(self.proj(merged))

In [6]:
# 3. Feed-forward network (MLP)

class Feedforward(nn.Module):
    """Position-wise MLP: expand to 4x ``hidden_dim``, GELU, project back, dropout."""

    def __init__(self, config):
        super().__init__()
        width = config.hidden_dim
        inner_width = 4 * width
        layers = [
            nn.Linear(width, inner_width),
            nn.GELU(),
            nn.Linear(inner_width, width),
            nn.Dropout(config.dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return ``x`` passed through the MLP; the last dimension is preserved."""
        transformed = self.net(x)
        return transformed
        

In [7]:
# 4. Transformer block
class Block(nn.Module):
    """One pre-norm transformer block.

    Each sub-layer (multi-head attention, then the feed-forward network) is
    applied to a layer-normalised copy of its input and added back onto that
    input as a residual.
    """

    def __init__(self, config):
        super().__init__()
        norm_width = config.n_embd
        self.att = MultiHeadAttention(config)
        self.ffn = Feedforward(config)
        self.ln1 = nn.LayerNorm(norm_width)
        self.ln2 = nn.LayerNorm(norm_width)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        after_attention = x + self.att(self.ln1(x))
        return after_attention + self.ffn(self.ln2(after_attention))

In [8]:
# 5. GPT model
class GPT(nn.Module):
    """Decoder-only language model.

    Token and position embeddings are summed, passed through ``config.n_layer``
    ``Block`` modules and a final LayerNorm, and projected to vocabulary logits
    by ``lm_head``.

    Args:
        config: object providing ``vocab_size``, ``n_embd``, ``block_size`` and
            ``n_layer`` (plus whatever ``Block`` reads from it). It is kept on
            the instance as ``self.config`` because ``generate`` and
            ``forward`` need ``block_size``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        # (embedding, position, norm, mlp, block)
        self.token_embding_table = nn.Embedding(config.vocab_size, config.n_embd)
        self.position_embedding_table = nn.Embedding(config.block_size, config.n_embd)
        self.blocks = nn.Sequential(
            *[Block(config) for _ in range(config.n_layer)]
        )
        self.ln_final = nn.LayerNorm(config.n_embd)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # Small language models commonly tie lm_head to the token embedding to
        # save parameters (a Linear 4 -> 8 stores its weight with shape 8 x 4).
        # The two weights are NOT tied here; they are independent parameters.
        self.apply(self._init_weight)

    def _init_weight(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute logits and, when ``targets`` is given, the cross-entropy loss.

        Args:
            idx: input token ids of shape (batch_size, seq_len).
            targets: optional target token ids with the same shape as ``idx``.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            (batch_size, seq_len, vocab_size) and ``loss`` is ``None``. With
            targets, ``logits`` is flattened to (batch_size * seq_len,
            vocab_size) and ``loss`` is the cross-entropy over all positions.

        Raises:
            ValueError: if ``seq_len`` exceeds ``config.block_size``; there is
                no position embedding beyond that length.
        """
        batch_size, seq_len = idx.size()
        if seq_len > self.config.block_size:
            # Fail with a readable message instead of an out-of-range lookup
            # in the position embedding table.
            raise ValueError(
                f"sequence length {seq_len} exceeds block_size {self.config.block_size}"
            )
        token_emb = self.token_embding_table(idx)  # (batch_size, seq_len, n_embd)
        pos_emb = self.position_embedding_table(
            # Create the positions on the same device as the input ids.
            torch.arange(seq_len, device=idx.device)
        )
        # Classic interview question: why can the token and position
        # embeddings simply be added?
        x = token_emb + pos_emb  # (batch_size, seq_len, n_embd)
        x = self.blocks(x)
        x = self.ln_final(x)
        logits = self.lm_head(x)  # (batch_size, seq_len, vocab_size)
        if targets is None:
            return logits, None

        batch, seq_len, vocab_size = logits.size()
        logits = logits.view(batch * seq_len, vocab_size)
        targets = targets.view(batch * seq_len)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Sample ``max_new_tokens`` tokens autoregressively.

        Args:
            idx: prompt token ids of shape (batch, seq_len).
            max_new_tokens: number of tokens to append.

        Returns:
            Tensor of shape (batch, seq_len + max_new_tokens): the prompt
            followed by the sampled tokens.

        Gradients are disabled while sampling. Dropout is only disabled if the
        caller has put the model in eval mode; this method does not change the
        training flag.
        """
        block_size = self.config.block_size
        for _ in range(max_new_tokens):
            # Keep at most the last block_size tokens as context.
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)  # (batch, seq_len, vocab_size)
            logits = logits[:, -1, :]  # logits of the last position only
            probs = F.softmax(logits, dim=-1)
            # Random sampling from the predicted distribution.
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)  # (batch, seq_len + 1)
        return idx
            

In [9]:
# !pip install tiktoken

In [10]:
# Dataset that prepares training samples for the DataLoader
class MyDataset(Dataset):
    """Token-level language-modelling dataset built from a JSON-lines file.

    Every line of the file is expected to be a JSON object with a ``"text"``
    entry. The texts are encoded with the GPT-2 ``tiktoken`` encoding, joined
    into one token stream with ``<|endoftext|>`` after each text, and cut into
    chunks of ``block_size + 1`` tokens. Item ``i`` yields ``(x, y)`` where
    ``y`` is ``x`` shifted by one token.

    Args:
        path: path of the ``.jsonl`` file.
        block_size: number of input tokens per sample (maximum position length).
        max_lines: only the first ``max_lines`` lines of the file are read;
            ``None`` reads the whole file.

    Raises:
        ValueError: if ``block_size`` is not positive.
    """

    def __init__(self, path, block_size=512, max_lines=1000):
        import tiktoken

        if block_size <= 0:
            raise ValueError(f"block_size must be positive, got {block_size}")

        self.enc = tiktoken.get_encoding("gpt2")
        self.block_size = block_size  # maximum position length
        self.max_lines = max_lines

        # Texts are separated by a special token; for GPT-2 it is <|endoftext|>.
        self.eos_token = self.enc.encode(
            "<|endoftext|>",
            allowed_special={"<|endoftext|>"}
        )[0]

        full_encoded = []
        for text in self._read_texts(path, max_lines):
            full_encoded.extend(self.enc.encode(text))
            full_encoded.append(self.eos_token)

        self.encoded_data = self._split_into_chunks(
            full_encoded, self.block_size, self.eos_token
        )

    @staticmethod
    def _read_texts(path, max_lines):
        """Return the ``"text"`` strings of the first ``max_lines`` lines of ``path``."""
        import json

        texts = []
        with open(path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if max_lines is not None and i >= max_lines:
                    break
                # Best effort: a malformed line, or one without a string
                # "text" entry, is skipped. Other errors are not hidden.
                try:
                    text = json.loads(line.strip())["text"]
                except (json.JSONDecodeError, KeyError, TypeError):
                    continue
                if isinstance(text, str):
                    texts.append(text)
        return texts

    @staticmethod
    def _split_into_chunks(tokens, block_size, pad_token):
        """Cut ``tokens`` into chunks of ``block_size + 1``, padding the last one."""
        chunk_len = block_size + 1  # one extra token serves as the final target
        chunks = []
        # The stride is block_size, so consecutive chunks share one token.
        for start in range(0, len(tokens), block_size):
            chunk = tokens[start:start + chunk_len]
            if len(chunk) < chunk_len:
                # Too short: fill the remainder with pad_token (the EOS token).
                chunk = chunk + [pad_token] * (chunk_len - len(chunk))
            chunks.append(chunk)
        return chunks

    def __len__(self):
        """Return the number of chunks."""
        return len(self.encoded_data)

    def __getitem__(self, idx):
        """Return ``(x, y)`` long tensors: the chunk without its last / first token."""
        chunk = self.encoded_data[idx]
        x = torch.tensor(chunk[:-1], dtype=torch.long)
        y = torch.tensor(chunk[1:], dtype=torch.long)
        return x, y

    def encode(self, text):
        """Encode text into token ids."""
        return self.enc.encode(text)

    def decode(self, ids):
        """Decode token ids back into text."""
        return self.enc.decode(ids)
            

In [11]:
# Build the dataset, split it 90% / 10% at random, and wrap both parts in
# loaders with batches of 12; only the training loader shuffles.
corpus_path = '../data/seq-monkey-data/seq_monkey_small.jsonl'
train_data = MyDataset(corpus_path)
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset=train_data, lengths=[0.9, 0.1]
)

train_loader = DataLoader(dataset=train_dataset, batch_size=12, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=12, shuffle=False)

In [12]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = GPT(GPTConfig()).to(device)

# Report how many parameters the model has in total.
total_para = 0
for param in model.parameters():
    total_para += param.numel()
print(f'Total parameters:{total_para / 1e6} M')

optimizer = torch.optim.AdamW(params=model.parameters(), lr=3e-4)
# Cosine learning-rate schedule.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer=optimizer, T_max=1000)

Total parameters:120.116736 M


In [13]:
# Print the tensor shapes of the first training batch only.
for first_batch in train_loader:
    x, y = first_batch
    print(x.shape, y.shape)
    break

torch.Size([12, 512]) torch.Size([12, 512])


In [14]:
device  # notebook cell: echo the device string selected above

'cuda'

In [15]:
# Training loop (pasted)
def train(model, optimizer, scheduler, train_loader, val_loader, device, epoch):
    """Run one training pass over ``train_loader``.

    For every batch the model is called as ``model(x, targets=y)`` and must
    return ``(logits, loss)``. The optimizer and the learning-rate scheduler
    are both stepped once per batch, and the loss is printed every 100 batches.

    ``val_loader`` is accepted but not used here; ``epoch`` only appears in the
    log line.

    Returns:
        The sum (not the mean) of the per-batch losses.
    """
    model.train()
    total_loss = 0
    for batch_idx, batch in enumerate(train_loader):
        # Move both tensors of the batch to the target device.
        inputs, labels = (tensor.to(device) for tensor in batch)

        # Forward pass.
        _, loss = model(inputs, targets=labels)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Advance the learning-rate schedule.
        scheduler.step()

        batch_loss = loss.item()
        total_loss += batch_loss

        if batch_idx % 100 == 0:
            print(f'Epoch: {epoch}, Batch: {batch_idx}, Loss: {batch_loss:.4f}')
    return total_loss

def eval(model, val_loader, device):
    """Return the summed validation loss over ``val_loader``.

    The model is switched to eval mode and gradients are disabled. As in
    ``train``, the model is called as ``model(x, targets=y)`` and must return
    ``(logits, loss)``. The result is the sum (not the mean) of batch losses.

    Note: this function shadows the builtin ``eval`` within this module.
    """
    model.eval()
    with torch.no_grad():
        val_loss = sum(
            model(inputs.to(device), targets=labels.to(device))[1].item()
            for inputs, labels in val_loader
        )
    return val_loss

# Checkpoints are written to ./checkpoints; create the directory up front.
os.makedirs('checkpoints', exist_ok=True)

for epoch in range(2):
    train_loss = train(model, optimizer, scheduler, train_loader, val_loader, device, epoch)
    val_loss = eval(model, val_loader, device)

    # train() and eval() return summed losses; divide by the batch counts.
    avg_train_loss = train_loss / len(train_loader)
    avg_val_loss = val_loss / len(val_loader)
    print(f'Epoch: {epoch}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}')

    # Save a checkpoint after every epoch, including optimizer and scheduler
    # state alongside the model weights.
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'val_loss': avg_val_loss,
    }
    torch.save(checkpoint, f'checkpoints/model_epoch_{epoch}.pt')
    

Epoch: 0, Batch: 0, Loss: 10.9697
Epoch: 0, Batch: 100, Loss: 3.6738
Epoch: 0, Batch: 200, Loss: 3.7228
Epoch: 0, Train Loss: 3.9688, Val Loss: 3.4325
Epoch: 1, Batch: 0, Loss: 3.3986
Epoch: 1, Batch: 100, Loss: 3.3431
Epoch: 1, Batch: 200, Loss: 3.2778
Epoch: 1, Train Loss: 3.2875, Val Loss: 3.1650
