# 引入必要库

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, DataCollatorForLanguageModeling, TrainingArguments, Trainer
from accelerate import Accelerator
from datasets import load_dataset
from typing import Dict, List

# 数据集预处理

In [None]:
# Where the raw corpus comes from, and the tokenizer that matches it.
dataset_name_or_path = "TinyStoriesV2_SpecialTokens"
tokenizer = AutoTokenizer.from_pretrained('story_tokenizer_2048')

# Load the train split, shuffle it with a fixed seed, and keep the first
# 100000 rows of the shuffled order as the training subset.
full_train_split = load_dataset(dataset_name_or_path, split='train')
shuffled_train_split = full_train_split.shuffle(seed=42)
train_split = shuffled_train_split.select(range(100000))

print(train_split)

In [None]:
def process_func(
    examples: Dict[str, List]
) -> Dict[str, List]:
    """Tokenize a batch of texts and cap every sequence at 512 tokens, EOS included.

    Args:
        examples: Batch dictionary; its ``'text'`` entry is handed to the tokenizer.

    Returns:
        A dictionary with two parallel lists:
        ``"input_ids"`` - for each text, its trailing 511 token ids followed by
        the tokenizer's EOS id;
        ``"attention_mask"`` - a list of ones of the same length as each sequence.
    """
    # The tokenizer is imported and loaded inside the function, so the function does
    # not rely on any module-level state (presumably for the worker processes used
    # by `map(num_proc=...)` - confirm before hoisting it out).
    from transformers import AutoTokenizer
    tok = AutoTokenizer.from_pretrained('story_tokenizer_2048')
    max_token = 512
    keep = max_token - 1  # one slot is reserved for the EOS token

    # No special tokens are added by the tokenizer; EOS is appended manually below.
    batch_ids = tok(examples['text'], add_special_tokens=False)['input_ids']

    # Over-long sequences keep their LAST `keep` tokens (the head is dropped).
    capped_ids = [ids[-keep:] + [tok.eos_token_id] for ids in batch_ids]
    masks = [[1] * len(seq) for seq in capped_ids]

    return {
        "input_ids": capped_ids,
        "attention_mask": masks
    }

# Number of worker processes used by `Dataset.map` for tokenization.
num_proc = 8

# Shuffle with a fixed seed. An unseeded shuffle produces a different row order on
# every run, which makes the notebook non-reproducible; as far as the `datasets`
# fingerprinting mechanism is documented, it also prevents the tokenized result
# below from being reused from the on-disk cache.
train_split = train_split.shuffle(seed=42)

# Tokenize in batches across `num_proc` workers and drop the original columns so
# that only the columns returned by `process_func` remain.
train_split = train_split.map(
    process_func,
    batched=True,
    num_proc=num_proc,
    remove_columns=train_split.column_names,
    desc='Running tokenizer on train_set: '
)

# Show one processed example as a sanity check.
print(train_split[0])

In [None]:
from transformers import DataCollatorForLanguageModeling

# Collator that turns lists of tokenized examples into training batches.
# `mlm=False` disables masked-language-model masking, i.e. the collator is used
# in its causal-LM mode for this notebook.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# 实例化模型

In [None]:
hidden_size = 128


def round_up_to_multiple(value: float, multiple: int) -> int:
    """Return the smallest multiple of ``multiple`` that is >= ``value``.

    Unlike ``(int(value / multiple) + 1) * multiple``, a ``value`` that is already
    an exact multiple is returned unchanged instead of being bumped to the next one.
    """
    whole_blocks = int(value // multiple)
    if whole_blocks * multiple < value:
        whole_blocks += 1
    return whole_blocks * multiple


# The MLP width is 8/3 of the hidden size, rounded up to a multiple of 128.
intermediate_size = round_up_to_multiple(hidden_size * 8 / 3, 128)

# Hyperparameters of the small Llama-style model, collected in one place.
llama_config_kwargs = dict(
    hidden_size=hidden_size,
    intermediate_size=intermediate_size,
    num_attention_heads=8,
    num_hidden_layers=2,
    num_key_value_heads=4,
    tie_word_embeddings=True,
    vocab_size=2048,
    max_position_embeddings=512,
)

# Build the configuration object for the "llama" model type from those values.
config = AutoConfig.for_model(model_type="llama", **llama_config_kwargs)

print(config)

In [None]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'


def init_model():
    """Create a new model from the module-level `config` and re-initialize its weights.

    The model is instantiated in float32, moved to the module-level `device`,
    re-initialized (Kaiming-uniform for multi-dimensional weights, zeros for
    biases), and a per-parameter size table is printed.

    Returns:
        The initialized model, located on `device`.
    """
    model = AutoModelForCausalLM.from_config(
        config,
        torch_dtype=torch.float32   # full-precision training
    ).to(device)                    # move to `device`

    def kaiming_initialization(model):
        """Apply Kaiming-uniform init to weight tensors with >1 dim; zero all biases."""
        for name, param in model.named_parameters():
            if 'weight' in name and param.dim() > 1:
                torch.nn.init.kaiming_uniform_(param, mode='fan_in', nonlinearity='leaky_relu')
            elif 'bias' in name:
                # Bias terms are simply initialized to 0.
                torch.nn.init.constant_(param, 0)

    kaiming_initialization(model)

    def print_model_parameters(model):
        """Print every named parameter with its shape and element count, plus the total."""
        print("Layer Name & Parameters")
        print("----------------------------")
        total_params = 0
        for name, parameter in model.named_parameters():
            param_size = parameter.size()
            # numel() gives the element count directly as an int; no need to build
            # a temporary tensor from the shape and reduce it.
            param_count = parameter.numel()
            total_params += param_count
            print(f"{name:50} | Size: {str(param_size):30} | Count: {str(param_count):20}")
        print("----------------------------")
        print(f"Total Parameters: {total_params} ({total_params / 1000000:.1f} M)")

    print_model_parameters(model)
    return model

# 开始训练

In [None]:
# def train_function(config=None):
#     with wandb.init(config=config,project='huggingface'):
#         # 从 wandb 获取超参数
#         config = wandb.config


#         training_args = TrainingArguments(
#                 output_dir="saves",
#                 do_train=True,                              # 是否做训练
#                 do_eval=False,                               # 是否做评估
#                 per_device_train_batch_size=config.batch_size,              # 每设备批次
#                 gradient_accumulation_steps=1,              # 梯度累计步大小，省显存，但小模型没必要，用 1 收敛比较快
#                 learning_rate=config.learning_rate,                         # 学习率大小
#                 lr_scheduler_type=config.lr_scheduler_type,                 # 学习率调度策略，LLM 训练一般都用余弦
#                 bf16=True,
#                 logging_steps=5,                           # 打印步骤间隔
#                 num_train_epochs=2,                         # 训练轮数，2 ~ 3 即可
#                 save_steps=10000000,                            # 检查点保存步骤间隔
#                 seed=3407,report_to=None
#             )

#         trainer=Trainer(
#             model=init_model(),
#             args=training_args,
#             train_dataset=train_split,
#             tokenizer=tokenizer,
#             data_collator=data_collator,
#         )
#         trainer.train()

# def scan():
#     import os
#     import wandb
#     wandb.init( project='huggingface')
#     sweep_config = {
#         "method": "bayes",  # 使用贝叶斯优化
#         "metric": {
#             "name": "train/loss",  # 优化的指标是训练 loss
#             "goal": "minimize",  # 目标是最小化训练 loss
#         },
#         "parameters": {
#             "learning_rate": {
#                 "min": 1e-4,
#                 "max": 5e-3,
#                 "distribution": "log_uniform_values",  # 学习率的分布是对数均匀分布
#             },
#             "batch_size": {"values": [16, 32, 64]},"lr_scheduler_type":{"values": ["cosine", "constant"]}
#         },
#     }
#     os.environ["WANDB_TIMEOUT"] = "60"
#     sweep_id = wandb.sweep(sweep_config)
#     wandb.agent(sweep_id, train_function)

training_args = TrainingArguments(
    output_dir="saves",
    do_train=True,                          # run training
    do_eval=False,                          # no evaluation
    per_device_train_batch_size=16,         # batch size per device
    gradient_accumulation_steps=1,          # no accumulation; not needed for a model this small
    learning_rate=0.004629403549377777,     # learning rate
    lr_scheduler_type="constant",           # constant learning-rate schedule
    bf16=True,
    logging_steps=5,                        # log every 5 steps
    num_train_epochs=2,                     # number of training epochs
    save_steps=10000000,                    # checkpoint interval in steps
    # NOTE(review): depending on the transformers version, `report_to=None` may not
    # mean "no reporting" (the string "none" does) - confirm which is intended.
    seed=3407, report_to=None
)

# Seed PyTorch before the model is built. init_model() draws random initial
# weights, and it runs before the Trainer exists, so without this the starting
# weights would not be covered by `seed` above and would differ between runs.
torch.manual_seed(training_args.seed)
model = init_model()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

In [None]:
def inference(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    input_text: str = "Once upon a time, ",
    max_new_tokens: int = 16
):
    """Sample a continuation of `input_text` from `model` and print it.

    Args:
        model: Model whose `generate` method is used for sampling.
        tokenizer: Tokenizer used to encode the prompt and decode the result.
        input_text: Prompt to continue.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        None. The decoded text (special tokens skipped) is printed.
    """
    # Place the prompt tensors on the device of the model that was passed in,
    # rather than on a module-level device that may not match it.
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Sampling with top-k / top-p filtering; EOS doubles as the padding id.
    outputs = model.generate(
        **inputs,
        pad_token_id=tokenizer.eos_token_id,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_k=40,
        top_p=0.9,
        temperature=0.6
    )

    # Only the first (and only) sequence of the batch is decoded.
    generated_text = tokenizer.decode(
        outputs[0],
        skip_special_tokens=True
    )
    print(generated_text)



In [24]:
# Prompt begins with the story-start marker followed by the opening words.
story_prompt = "<|start_story|>Once upon a time, "

# Generate up to 256 new tokens from the trained model and print the story.
inference(model, tokenizer, input_text=story_prompt, max_new_tokens=256)

Once upon a time, there was a little boy named Tim. Tim had a toy car that he loved to play with. One day, he went to the park with his mom. Tim saw a toy car on the ground. Tim wanted to play with the car to his mom and said, "Mom, can I play with your car with my car too?"
His mom said, "Yes, but we must not take turns." Tim felt sad, but he knew he had to go. He asked his mom for help. His mom said, "Okay, let's clean it together." They went to play together and played the toy car. They had a lot of fun.
After they finished the car together, Tim and his mom were surprised. They did not know that the car was not a toy car like it was a magic car. Tim had an idea. He put the car in the car and put the car on it. He pushed the car on the car on the car car and pulled it down. Tim was so happy. He played with the car with his car all day long, and Tim was very happy.<|end_story|>


In [None]:
# Cast the model to bfloat16 and write it to the "saves_bfloat16" directory.
# NOTE(review): torch.nn.Module.to is documented to modify the module in place, so
# `bf16_model` is presumably the same object as `model`, which is then left in
# bfloat16 as well - confirm if the float32 weights are needed afterwards.
bf16_model = model.to(torch.bfloat16)
bf16_model.save_pretrained("saves_bfloat16")