# 3-Pretrain
预训练时模型经历的第一个阶段，在该阶段，模型将会吸收知识，学习尽可能正确的下一词语预测范式。该Notebook仅对预训练流程进行展示和学习，wandb和ddp不会涉及。

In [1]:
import os
import argparse
import time
import math
import warnings
import tqdm
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import DataLoader
from contextlib import nullcontext
from transformers import AutoTokenizer

from dataset import PretrainDataset
from model import MiniMindForCausalLM, MiniMindConfig

warnings.filterwarnings('ignore')

In [5]:
def Logger(content):
    print(content)
    
def get_lr(current_step, total_steps, lr):
    return lr / 10 + 0.5 * lr * (1 + math.cos(math.pi * current_step / total_steps))

def train_epoch(epoch):
    loss_fct = nn.CrossEntropyLoss(reduction='none')
    start_time = time.time()
    pbar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch+1}")
    
    for step, (X, Y, loss_mask) in pbar:
        X = X.to()
        
def init_model(lm_config):
    tokenizer = AutoTokenizer.from_pretrained('../model/minimind_tokenizer')
    model = MiniMindForCausalLM(lm_config).to(args.device)
    print(f"LLM 总参数量：{sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6:.3f} 百万")
    return model, tokenizer

In [None]:
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="MiniMind Pretraining")
    parser.add_argument("--out_dir", type=str, default="../out")
    parser.add_argument("--epochs", type=int, default=1)
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--learing_rate", type=float, default=5e-4)
    parser.add_argument("--device", type=str, default="cuda:0" if torch.cuda.is_available() else 'cpu')
    parser.add_argument("--dtype", type=str, default="bfloat16")
    parser.add_argument("--accumulation_steps", type=int, default=8)
    parser.add_argument("--grad_clip", type=float, default=1.0)
    parser.add_argument("--warmup_iters", type=int, default=0)
    parser.add_argument("--log_interval", type=int, default=1)
    parser.add_argument("--save_interval", type=int, default=10)
    parser.add_argument("--hidden_size", type=int, default=512)
    parser.add_argument("--num_hidden_layers", type=int, default=8)
    parser.add_argument("--max_seq_len", type=int, default=512)
    parser.add_argument("--data_path", type=str, default="../dataset/pretrain_hq.jsonl")
    args = parser.parse_args()
    
    lm_config = MiniMindConfig(hidden_size=args.hidden_size, num_hidden_layers=args.num_hidden_layers)
    model, tokenizer = init_model()
    
    train_dataset = PretrainDataset(args.data_path, tokenizer, max_length=args.max_seq_len)
    
    train_loader = DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        pin_memory=True,
        drop_last=False,
        shuffle=False,
    )
    print(f"模型位于设备：{model.device}, 词表长度：{tokenizer.vocab_size}, DataLoader：{train_loader}")

usage: ipykernel_launcher.py [-h] [--out_dir OUT_DIR] [--epochs EPOCHS]
                             [--batch_size BATCH_SIZE]
                             [--learing_rate LEARING_RATE] [--device DEVICE]
                             [--dtype DTYPE]
                             [--accumulation_steps ACCUMULATION_STEPS]
                             [--grad_clip GRAD_CLIP]
                             [--warmup_iters WARMUP_ITERS]
                             [--log_interval LOG_INTERVAL]
                             [--save_interval SAVE_INTERVAL]
                             [--hidden_size HIDDEN_SIZE]
                             [--num_hidden_layers NUM_HIDDEN_LAYERS]
                             [--max_seq_len MAX_SEQ_LEN]
                             [--data_path DATA_PATH]
ipykernel_launcher.py: error: unrecognized arguments: --f=/Users/mei/Library/Jupyter/runtime/kernel-v38391394359e5903425b80f6b0cb02f4901a7675e.json


SystemExit: 2