In [1]:
import torch

# Select "cuda" when a CUDA device is usable, otherwise fall back to "cpu".
if torch.cuda.is_available():
    device_type = "cuda"
else:
    device_type = "cpu"

In [2]:
# Echo the selected device type as this cell's output.
device_type

'cuda'

In [3]:

import torch, platform
# Report the PyTorch build, CUDA visibility and host platform, one
# "label: value" line per item.
gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "N/A"
env_report = [
    ("torch:", torch.__version__),
    ("cuda runtime in wheel:", torch.version.cuda),
    ("cuda available:", torch.cuda.is_available()),
    ("num devices:", torch.cuda.device_count()),
    ("device name:", gpu_name),
    ("platform:", platform.platform()),
]
for label, value in env_report:
    print(label, value)



torch: 2.3.0+cu121
cuda runtime in wheel: 12.1
cuda available: True
num devices: 1
device name: NVIDIA GeForce RTX 4070 Ti
platform: Windows-10-10.0.26100-SP0


In [5]:
import os
import sys
# __package__ = "trainer"
# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

import argparse
import time
import math
import warnings
import torch
import torch.distributed as dist
from torch import optim, nn
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import DataLoader, DistributedSampler
from contextlib import nullcontext
from transformers import AutoTokenizer
from model.model_minimind import MiniMindConfig, MiniMindForCausalLM
from dataset.lm_dataset import PretrainDataset

  from .autonotebook import tqdm as notebook_tqdm


# Pretrain
# Python 3.10
# CUDA build of PyTorch: pip install torch==2.3.0 --index-url https://download.pytorch.org/whl/cu121

Environment and project path

In [6]:
import os, sys, platform, torch
from pathlib import Path
# Treat the notebook's working directory as the project root.
# Presumably this is what lets the project-local `model` and `dataset`
# packages be imported by later cells -- confirm the notebook is launched
# from the repository root.
project_root = Path.cwd().resolve()
# Guard the append so that re-running this cell does not keep adding the
# same directory to sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))
print("Project root:", project_root)
print("Python:", sys.executable)
print("Torch:", torch.__version__, "CUDA in wheel:", torch.version.cuda)
print("CUDA available:", torch.cuda.is_available(), "Count:", torch.cuda.device_count())
if torch.cuda.is_available():
    print("Device name:", torch.cuda.get_device_name(0))

Project root: C:\Users\lvbab\Desktop\minitest
Python: c:\Users\lvbab\anaconda3\envs\mini310\python.exe
Torch: 2.3.0+cu121 CUDA in wheel: 12.1
CUDA available: True Count: 1
Device name: NVIDIA GeForce RTX 4070 Ti


imports

In [14]:

import time, math, warnings
import torch
import torch.distributed as dist
from torch import optim, nn
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import DataLoader, DistributedSampler
from contextlib import nullcontext
from transformers import AutoTokenizer
from model.model_minimind import MiniMindConfig, MiniMindForCausalLM
from dataset.lm_dataset import PretrainDataset

# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation and precision warnings emitted by
# torch/transformers -- consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

def Logger(content, ddp=False):
    """Print ``content`` to stdout.

    Args:
        content: Object handed to ``print``.
        ddp: When true, only the process whose ``dist.get_rank()`` is 0
            prints; all other ranks stay silent. When false the rank is
            never queried and the message is always printed.
    """
    if ddp and dist.get_rank() != 0:
        return
    print(content)


def get_lr(current_step, total_steps, lr):
    """Cosine learning-rate schedule with a constant floor of ``lr / 10``.

    The value starts at ``1.1 * lr`` for ``current_step == 0`` and decays to
    ``lr / 10`` when ``current_step == total_steps``.

    Args:
        current_step: Step index the rate is requested for.
        total_steps: Number of steps over which the half cosine is spread.
        lr: Base learning rate.

    Returns:
        The learning rate for ``current_step`` as a float.

    Raises:
        ZeroDivisionError: If ``total_steps`` is 0.
    """
    angle = math.pi * current_step / total_steps
    cosine_part = 0.5 * lr * (1 + math.cos(angle))
    floor = lr / 10
    return floor + cosine_part

Settings

In [26]:

from types import SimpleNamespace
from pathlib import Path
from contextlib import nullcontext

# Every tunable value for this run is collected in one namespace.
args = SimpleNamespace(
    out_dir = "./out",
    epochs = 1,                    # default 1; start with a single epoch for the teaching demo
    batch_size = 32,
    learning_rate = 5e-4,
    device = "cuda:0" if torch.cuda.is_available() else "cpu",
    dtype = "bfloat16",            # switch to "float16" if this causes problems
    use_wandb = False,
    wandb_project = "MiniMind-Pretrain",
    num_workers = 1,
    ddp = False,                   # usually a single GPU, so keep False for now
    accumulation_steps = 8,
    grad_clip = 1.0,
    warmup_iters = 0,
    log_interval = 100,
    save_interval = 100,
    hidden_size = 512,
    num_hidden_layers = 8,
    max_seq_len = 512,
    use_moe = False,
    data_path = "./dataset/pretrain_hq.jsonl",   # change this to your own data file
)

# Create the output directory.
Path(args.out_dir).mkdir(parents=True, exist_ok=True)
device_type = "cuda" if "cuda" in args.device else "cpu"

# Build the mixed-precision context from args.dtype. Previously autocast was
# created without a dtype, so the "dtype" setting had no effect and CUDA runs
# always used autocast's float16 default. Any other dtype name (for example
# "float32") now runs without autocast.
amp_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}.get(args.dtype)
if device_type == "cpu" or amp_dtype is None:
    ctx = nullcontext()
else:
    ctx = torch.autocast(device_type=device_type, dtype=amp_dtype)
Logger(f"Using device: {args.device}")


Using device: cuda:0


Tokenizer and init

In [27]:
# The tokenizer is loaded from the "model" directory under the project root.
tokenizer_path = str(project_root / "model")
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

# Build the model configuration from the run settings, instantiate the model
# and move it to the configured device.
lm_config = MiniMindConfig(
    hidden_size=args.hidden_size,
    num_hidden_layers=args.num_hidden_layers,
    use_moe=args.use_moe,
)
model = MiniMindForCausalLM(lm_config)
model = model.to(args.device)

# Count only parameters that receive gradients, reported in millions.
trainable_count = sum(param.numel() for param in model.parameters() if param.requires_grad)
trainable_params_m = trainable_count / 1e6
Logger(f"Total Params：{trainable_params_m:.3f} M")

Total Params：25.830 M


Load

In [28]:

from pathlib import Path

# Use args.data_path as given when it exists relative to the current working
# directory; otherwise resolve it against the project root.
if Path(args.data_path).exists():
    data_path = args.data_path
else:
    data_path = str((project_root / args.data_path).resolve())
assert Path(data_path).exists(), f"数据文件不存在：{data_path}"

train_ds = PretrainDataset(data_path, tokenizer, max_length=args.max_seq_len)

# A DistributedSampler is only attached when distributed training is requested.
if args.ddp:
    train_sampler = DistributedSampler(train_ds)
else:
    train_sampler = None

train_loader = DataLoader(
    train_ds,
    sampler=train_sampler,
    batch_size=args.batch_size,
    shuffle=False,
    drop_last=False,
    num_workers=args.num_workers,
    pin_memory=True,
)

# Number of batches per epoch; the training loop uses it for the LR schedule
# and for its time estimate.
iter_per_epoch = len(train_loader)
Logger(f"iter_per_epoch = {iter_per_epoch}")

# Gradient scaling is switched on for both half-precision dtype names.
use_grad_scaler = args.dtype in ['float16', 'bfloat16']
scaler = torch.cuda.amp.GradScaler(enabled=use_grad_scaler)
optimizer = optim.AdamW(model.parameters(), lr=args.learning_rate)

iter_per_epoch = 44160


Train

In [None]:
def save_ckpt(epoch, print_msg=False):
    """Save the global ``model``'s weights, cast to float16, under ``args.out_dir``.

    The file is named ``pretrain_{hidden_size}[_moe]_{epoch}.pth``. The weights
    are first written to a sibling ``.tmp`` file and then renamed over the
    target, so an interrupted save (for example a KeyboardInterrupt in the
    notebook) cannot leave a truncated file in place of the previous
    checkpoint. The model is put back into training mode even if saving fails.

    Args:
        epoch: Value embedded in the checkpoint file name.
        print_msg: When true, log the saved path through ``Logger``.
    """
    moe_path = '_moe' if args.use_moe else ''
    fn = f'pretrain_{args.hidden_size}{moe_path}_{epoch}.pth'
    ckp_path = Path(args.out_dir) / fn
    ckp = str(ckp_path)
    tmp_path = ckp_path.with_name(ckp_path.name + '.tmp')

    model.eval()
    try:
        # Every tensor in the state dict is stored in half precision.
        state_dict = {k: v.half() for k, v in model.state_dict().items()}
        torch.save(state_dict, str(tmp_path))
        # Path.replace overwrites an existing destination.
        tmp_path.replace(ckp_path)
    finally:
        # Always return to training mode, including when the save raised.
        model.train()

    if print_msg:
        Logger(f"Saved: {ckp}")

In [30]:

def train_epoch(epoch, wandb=None, ddp=False):
    """Train the global ``model`` for one pass over ``train_loader``.

    Uses the notebook-level ``optimizer``, ``scaler``, ``ctx``, ``args`` and
    ``iter_per_epoch``. Gradients are accumulated over
    ``args.accumulation_steps`` batches before each optimizer step. A trailing
    partial group at the end of the epoch is also applied, so those gradients
    are not dropped or carried into the next epoch.

    Args:
        epoch: Zero-based epoch index; used for the LR schedule, the log line
            and the checkpoint file name.
        wandb: Accepted for interface compatibility; not used in this function.
        ddp: Forwarded to ``Logger`` so that only rank 0 logs when true.
    """
    loss_fct = nn.CrossEntropyLoss(reduction='none')
    start_time = time.time()
    model.train()
    for step, (X, Y, loss_mask) in enumerate(train_loader):
        X = X.to(args.device)
        Y = Y.to(args.device)
        loss_mask = loss_mask.to(args.device)

        # The schedule is evaluated on the global step across all epochs.
        lr = get_lr(epoch * iter_per_epoch + step, args.epochs * iter_per_epoch, args.learning_rate)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        with ctx:
            res = model(X)
            # Per-position loss, reshaped back to Y's shape so the mask can be applied.
            loss = loss_fct(res.logits.view(-1, res.logits.size(-1)), Y.view(-1)).view(Y.size())
            # Masked mean; presumably loss_mask is 1 on real tokens and 0 on
            # padding -- confirm against PretrainDataset.
            loss = (loss * loss_mask).sum() / loss_mask.sum()
            # Add the auxiliary loss when the model output carries one.
            loss = loss + getattr(res, "aux_loss", 0.0)
            # Divide so the accumulated gradient matches one large batch.
            loss = loss / args.accumulation_steps

        scaler.scale(loss).backward()

        # Step on every full accumulation group, and also on the final batch
        # so a trailing partial group is not lost.
        is_last_batch = (step + 1) == iter_per_epoch
        if (step + 1) % args.accumulation_steps == 0 or is_last_batch:
            # Unscale before clipping so the threshold applies to the true gradients.
            scaler.unscale_(optimizer)
            nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)

        if step % args.log_interval == 0:
            spend_time = time.time() - start_time
            # Estimated minutes remaining in this epoch, from the mean step time so far.
            eta_min = int(spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60)
            Logger(f"Epoch:[{epoch+1}/{args.epochs}]({step}/{iter_per_epoch}) "
                   f"loss:{loss.item()*args.accumulation_steps:.3f} lr:{optimizer.param_groups[-1]['lr']:.12f} "
                   f"epoch_Time:{eta_min}min:", ddp)

        if (step + 1) % args.save_interval == 0:
            save_ckpt(epoch)


In [31]:

# Run every configured epoch on a single process (no wandb, no DDP).
for epoch in range(args.epochs):
    train_epoch(epoch=epoch, wandb=None, ddp=False)
# Final checkpoint after the loop; `epoch` still holds the last loop value.
save_ckpt(epoch=epoch, print_msg=True)

Epoch:[1/1](0/44160) loss:8.954 lr:0.000550000000 epoch_Time:4367min:
Epoch:[1/1](100/44160) loss:6.462 lr:0.000549993674 epoch_Time:183min:
Epoch:[1/1](200/44160) loss:6.318 lr:0.000549974695 epoch_Time:150min:
Epoch:[1/1](300/44160) loss:6.848 lr:0.000549943065 epoch_Time:149min:
Epoch:[1/1](400/44160) loss:5.910 lr:0.000549898786 epoch_Time:151min:
Epoch:[1/1](500/44160) loss:6.166 lr:0.000549841859 epoch_Time:150min:


KeyboardInterrupt: 