# Import packages

In [1]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
# access bz2 file
import os
import tarfile

path = "/content/drive/MyDrive/LLMs-Zero-to-Hero_bilibili/mobvoi_seq_monkey_general_open_corpus.jsonl.tar.bz2"

# # Open the tar.bz2 file
# try:
#     with tarfile.open(path, "r:bz2") as tar:
#         # You can list the contents of the archive
#         print("Contents of the archive:")
#         tar.list()

#         # Or extract the contents to a directory
#         # tar.extractall("/content/extracted_data")
#         # print("File extracted to /content/extracted_data")

# except tarfile.ReadError as e:
#     print(f"Error reading the tar file: {e}")
# except FileNotFoundError:
#     print(f"Error: File not found at {path}")
# except Exception as e:
#     print(f"An unexpected error occurred: {e}")

In [3]:
# !tar -xvjf /content/drive/MyDrive/LLMs-Zero-to-Hero_bilibili/mobvoi_seq_monkey_general_open_corpus.jsonl.tar.bz2 -C /content/drive/MyDrive/LLMs-Zero-to-Hero_bilibili/


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from dataclasses import dataclass
import math

torch.manual_seed(1024)

<torch._C.Generator at 0x7dda84110db0>

# Defin GPT parameters

In [5]:
@dataclass
class GPTConfig:
    block_size: int = 512 # max sequence. The max length of a text
    batch_size: int = 12
    n_layer: int = 2
    n_head: int = 12
    n_embd: int = 768 # hidden_dim, hidden_size
    hidden_dim = n_embd

    # to tie_embedding_weight
    dropout:float = 0.1
    head_size = n_embd // n_head

    # vocab_size takes certain length of word. 50257 is the default GPT choice
    vocab_size = 50257

In [6]:

# @dataclass
# class GPTConfig:
#     block_size: int = 512   # 这里其实应该是文本的最大长度（ max_seq_len）
#     batch_size: int = 12
#     n_layer: int = 6
#     n_head: int = 12
#     n_embd: int = 768    # n_embd 也叫 hidden_dim, hiden_size, 这里我同时设置了和 embed_dim 一样
#     head_size: int = n_embd // n_head
#     dropout: float = 0.1
#     # # tiktoken 使用的是 GPT-2 的词表，大约有 50257 个token
#     vocab_size: int = 50257

# 3. Define the GPT structure

In [7]:
class SingleHeadAttention(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.key = nn.Linear(config.hidden_dim, config.head_size)
    self.value = nn.Linear(config.hidden_dim, config.head_size)
    self.query = nn.Linear(config.hidden_dim, config.head_size)
    self.head_size = config.head_size

    # attention mask. Register with register_buffer.
    # This helps save memory and process faster since no computing graidents
    self.register_buffer(
        "attention_mask",
        #tril means bottom triagnle
        # Block_size is 512
        torch.tril(
            torch.ones(config.block_size, config.block_size)
        )
    )
    self.dropout = nn.Dropout(config.dropout)

    # forward mechanism
  def forward(self, x):
      batch_size, seq_len, hidden_dim = x.size()
      k = self.key(x)
      q = self.query(x)
      v = self.value(x)
      weight = q @ k.transpose(-2, -1) # @ is simple torch.matmul
      weight = weight.masked_fill(
          self.attention_mask[:seq_len, :seq_len] == 0,
          float('-inf')
      ) / math.sqrt(self.head_size)

      # when compute wight, need divide by sqrt d_k
      weight = F.softmax(weight, dim = -1)

      # dropout ned place after weight
      weight = self.dropout(weight)
      return weight @ v

# 2. multi haed attention
class MultiHeadAttention(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.heads = nn.ModuleList(
        [SingleHeadAttention(config)
        for _ in range(config.n_head)
          ]
    )

    self.proj = nn.Linear(config.hidden_dim, config.hidden_dim)
    self.dropout = nn.Dropout(config.dropout)

  def forward(self, x):
    output = torch.cat(
        [h(x) for h in self.heads],
        dim = -1

    )

    output = self.proj(output)
    output = self.dropout(output)
    return output

# 3 feed forward(MLP)
class FeedForward(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.net = nn.Sequential(
        # refer to the flow chart, it has bottlent. Increase dim and decrease dim
        nn.Linear(config.hidden_dim, 4 * config.hidden_dim), # Swiglu # 8/3
        nn.GELU(),  # you may use nn.Relu() too
        nn.Linear( 4*config.hidden_dim, config.hidden_dim),
        nn.Dropout(config.dropout)
        )
  def forward(self, x):
    return self.net(x)


# 4 block
class Block(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.att = MultiHeadAttention(config)
    self.ffn = FeedForward(config)
    self.ln1 = nn.LayerNorm(config.hidden_dim)
    self.ln2 = nn.LayerNorm(config.hidden_dim)

  def forward(self, x):
    x = x +self.att(self.ln1(x))
    x = x +self.ffn(self.ln2(x))
    return x

# 5 GPT
class GPT(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.config = config # Store the config object

    # (embedding, position, norm, mlp, block)
    # position embedding from 0,1,xxx embedding upgrade to ROPE
    # norm layer norm -> rms norm
    # MLP -> Swiglu
    # MHA -> GQA
    self.token_embedding_table = nn.Embedding(config.vocab_size, config.n_embd)
    self.position_embedding_table = nn.Embedding(config.block_size, config.n_embd)
    self.blocks = nn.Sequential(
        # it has n blocks
        *[Block(config) for _ in range(config.n_layer)]
    )

    self.ln_final = nn.LayerNorm(config.n_embd)
    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias = False)

    # SLM models would use tie-weight to reduce embeeding parameters
    # very important
    # above, you see its from vocab size to n_embd, and n_embd to vocab_size. it works simplely with next line of code
    # Because the linear layer, it is from dimension of 4 to 8, the weight's actual shape is 8 by 4
    # xW^T

    # self.token_embedding_table.weight = self.lm_head.weight
    self.apply(self._init_weights)


    """
    # option A (my earlier suggestion; common in GPT code)
    self.lm_head.weight = self.token_embedding_table.weight

    # option B (equivalent)
    self.token_embedding_table.weight = self.lm_head.weight

    # Why do most repos do “A”?
    # Convention.
    # People usually think “output layer uses the same weights as the input embedding,”
    # so they set the output to reference the embedding.
    # Either direction is fine—just pick one and be consistent.
    """

  def _init_weights(self, module):
    if isinstance(module, nn.Linear):
      # initialize eto normal distr
      torch.nn.init.normal_(module.weight, mean = 0.0, std = 0.02)
      if module.bias is not None:
        torch.nn.init.zeros_(module.bias)
    elif isinstance(module, nn.Embedding):
      torch.nn.init.normal_(module.weight, mean = 0.0, std = 0.02)

  def forward(self, idx, targets = None):
    # dix input is token ids,
    # target is target token ids
    # shape are the same
    batch, seq_len = idx.size()

    assert seq_len <= self.config.block_size, "seq_len is too long"

    token_emb = self.token_embedding_table(idx)
    pos_emb = self.position_embedding_table(
        # mak sure pos token and input idx ae in th same devic
        torch.arange(seq_len, device = idx.device)
    )

    # Question: token mbedding and position emebedding are addable?

    x = token_emb +pos_emb
    x = self.blocks(x)
    x = self.ln_final(x)
    logits = self.lm_head(x)

    if targets is None:
      loss = None
    else:
      batch, seq_len, vocab_size = logits.size()
      logits = logits.view(batch * seq_len, vocab_size)
      targets = targets.view(batch * seq_len)
      loss = F.cross_entropy(logits, targets)
    return logits, loss

  def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # 如果序列太长，只取最后 block_size 个token
            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
            # 获取预测
            logits, _ = self(idx_cond)
            # 只关注最后一个时间步的预测
            logits = logits[:, -1, :]  # becomes (B, vocab_size)
            # 应用softmax获取概率
            probs = F.softmax(logits, dim=-1)
            # 采样下一个token
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # 附加到序列上
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

# 4 Construct input Dataset
know the input value is importnat

In [8]:
import json
import tiktoken
class MyDataset(Dataset):

  def __init__(self, path, block_size = 512):
    self.enc = tiktoken.get_encoding("gpt2")
    self.block_size = block_size

    self.eos_token = self.enc.encode(
        "<|endoftext|>",
        allowed_special = {"<|endoftext|>"}
    )[0]



    self.encoded_data = []
    self.max_lines = 1000

    raw_data = []
    with open(path, 'r') as f:
      for i, line in enumerate(f):
        if i >= self.max_lines:
          break

        try:
          text = json.loads(line.strip())['text']
          raw_data.append(text)
        except Exception as e:
          continue

    full_encoded = []
    for text in raw_data:
      encoded_text = self.enc.encode(text) # list
      full_encoded.extend(encoded_text + [self.eos_token])

    # block_size is 512
    # long -> short (512)

    for i in range(0, len(full_encoded), self.block_size):
      chunk = full_encoded[i: i + self.block_size+1] # 512. Shift the it to right to 513
      if len(chunk) < self.block_size + 1:
        chunk = chunk + [self.eos_token] * (self.block_size +1 -len(chunk))
      self.encoded_data.append(chunk)

  def __len__(self):
    return len(self.encoded_data)

  def __getitem__(self, idx):
    chunk = self.encoded_data[idx]
    x = torch.tensor(chunk[:-1], dtype=torch.long)
    y = torch.tensor(chunk[1:], dtype = torch.long)
    return x,y

  def encode(self, text):
    return self.enc.encode(text)

  def decode(self,ids):
    return self.enc.decode(ids)



In [9]:
# some random test
import tiktoken
enc = tiktoken.get_encoding("gpt2")

enc.encode(
        "<|endoftext|>",
        allowed_special = {"<|endoftext|>"}
    )

print("---------")
enc.encode(
        "<|endoftext|>",
        allowed_special = {"<|endoftext|>"}
    )[0]

---------


50256

# 5. Implementing related functions

## take look the dataset

In [10]:
dataset_path = "/content/drive/MyDrive/LLMs-Zero-to-Hero_bilibili/mobvoi_seq_monkey_general_open_corpus.jsonl"

my_dataset = MyDataset(dataset_path, block_size=512)
print(len(my_dataset))
print(my_dataset[0])


3700
(tensor([28839,   101,   162,   253,    98, 13783,   226,   164,   247,   248,
        28156,   222,   161,    95,   252,   161,   222,   120,   163,   101,
          236, 10310,   241, 18796,   101, 20998,   239,   163,    98,   101,
          162,    94,   230, 20015,   114, 40792,   171,   120,   234, 30585,
          116, 30585,   116,   162,   114,   231, 20998,   232, 32573,   249,
          165,    94,   117, 45911,   247,   162,   232,   113,   163,   101,
          236,   165,    95,   251,   161,   240,   234,   163,   101,   236,
          162,   105,   122,   162,   235,   253, 13783,   109, 21410,   164,
          106,    97, 22522,   248,   161,   240,   234, 13783,   226, 49426,
          228, 16764, 28839,   101,   164,   106,    94,   163,   106,   245,
          163,   101,   236,   162,   105,   122,   162,   235,   253, 13783,
          109, 33768,   114,   171,   120,   234, 17358,   223, 38834, 17358,
          223, 49546, 32573,   249,   165,    94,   117, 4

In [11]:
raw_data = []
with open(dataset_path, 'r') as f:
  for i, line in enumerate(f):
    if i >= 1000:
      break

    try:
      text = json.loads(line.strip())['text']
      raw_data.append(text)
    except Exception as e:
      continue

raw_data[0:10]

['在查处虚开增值税专用发票案件中，常常涉及进项留抵税额和税款损失的认定和处理。在计算税款损失时，要不要将进项留抵税额包括在内？\n对此，实务中存在意见分歧。\n有人主张归并，即计算税款损失时包括进项留抵税额；\n有人主张剥离，即计算税款损失时剔除进项留抵税额。分析这个问题，需要确定进项留抵税额与税款损失之间是什么关系。\n理清这二者之间的关系，首先需要了解增值税的概念和其抵扣机制。增值税是以商品（货物、服务等）在流转过程中产生的增值额作为计税依据而征收的一种流转税。为避免重复征税，在增值税中存在抵扣链条机制。\n一般而言，交易上游企业缴纳的税额，交易下游企业可以对相应的税额进行抵扣。\n对增值税一般纳税人来说，其购进货物、服务等取得增值税专用发票，发票上的税额是进项税额。\n其出售货物、服务等，向购买方开具增值税专用发票，发票的税额是销项税额。\n一般情况下，销项税额减去进项税额的金额是应纳税额，企业根据应纳税额按期申报纳税。\n其次需要了解进项留抵税额的概念及产生原因。\n在计算销项税额和进项税额的差额时，有时会出现负数，即当期进项税额大于当期销项税额。这个差额在当期未实现抵扣，为进项留抵税额，在以后纳税人有销项税额时再进行抵扣。\n企业产生进项留抵税额的主要原因是其进项税额和销项税额时间上的不一致。\n例如，企业前期集中采购货物和服务，投资大，销项税率低于进项税率等。\n从税款抵扣的角度看，进项留抵税额只是购进的这部分进项税额参与到增值税应纳税额的计算过程中，但是其对应的进项税额抵扣还未真正实现，一般要等到其未来有相应的销项税额时，才能真正实现进项税额抵扣。\n可见，进项留抵税额处于不确定状态，能否抵扣受到很多因素影响，例如企业经营中断，没有销项税额，这时进项留抵税额就无法实现抵扣。但如果企业按照税收政策规定申请进项留抵退税，进项税额抵扣就随之实现。\n最后需要了解税款损失的概念。\n税款损失，通常是指因虚开增值税专用发票，导致国家税款被骗或者流失的金额。关于税款损失，实务中有多种表述。\n例如，北京大学法学院教授陈兴良曾谈到虚开行为本身不会造成国家税款损失，只有利用发票抵扣时才会造成国家税款损失。刘兵等编著的《虚开增值税专用发票案例司法观点和案例解析》一书中提到：“给国家税款造成损失的数额，实际上就是被骗取的国家税款在侦查终结以前无法追回的部分。”\n赵清海

## def train function and loading the dataset

In [12]:
train_dataset = MyDataset(dataset_path)
train_dataset, val_dataset = torch.utils.data.random_split(train_dataset, [0.9, 0.1])
train_loader = DataLoader(train_dataset, batch_size = 12, shuffle = True)
val_loader = DataLoader(val_dataset, batch_size = 12, shuffle = False)


In [13]:
model = GPT(GPTConfig())
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params}")

optimizer = torch.optim.AdamW(model.parameters(), lr = 3e-4)

# cosine learning rate
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max = 1000)



Total number of parameters: 91765248


In [14]:
for x,y in train_loader:
  print(x.shape, y.shape)
  break

torch.Size([12, 512]) torch.Size([12, 512])


In [15]:
def train(model, optimizer, scheduler,train_loader, val_loader,device):
  model.train()
  total_loss = 0
  for batch_idx, (x,y) in enumerate(train_loader):
    # move data onto devices
    x, y = x.to(device), y.to(device)

    # forward probagation
    logits, loss = model(x, targets = y)

    # back propagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # adjust learning rate
    scheduler.step()

    total_loss + loss.item()

    if batch_idx % 100 ==0:
      print(f'epoch:{epoch}, Batch:{batch_idx}, loss:{loss.item():.4f}')
    return total_loss

def eval(model, val_loader, device):
  model.eval()
  val_loss = 0

  with torch.no_grad():
    for x, y in val_loader:
      x, y = x.to(device), y.to(device)
      logits, loss = model(x, targets = y)
      val_loss += loss.item()
  return val_loss / len(val_loader)

# run epoch
for epoch in range(2):
  train_loss = train(model, optimizer, scheduler, train_loader, val_loader, device)
  val_loss = eval(model, val_loader, device)
  print(f'epoch:{epoch}, train_loss:{train_loss:.4f}, val_loss:{val_loss:.4f}')

  # save the model
  avg_val_loss = val_loss / len(val_loader)
  checkpoint = {
      'epoch': epoch,
      'model_state_dict': model.state_dict(),
      'optimizer_state_dict': optimizer.state_dict(),
      'scheduler_state_dict': scheduler.state_dict(),
      'avg_val_loss': avg_val_loss
  }
  torch.save(checkpoint, 'checkpoint.pth')


epoch:0, Batch:0, loss:11.0228
epoch:0, train_loss:0.0000, val_loss:9.8709
epoch:1, Batch:0, loss:9.8877
epoch:1, train_loss:0.0000, val_loss:9.2978
