In [1]:
import json 

# Load the corpus from its JSON file into memory.
corpus_path = "../user_data/corpus.json"
with open(corpus_path, mode="r", encoding="utf-8") as corpus_file:
    corpus = json.load(corpus_file)

In [2]:
import json
from transformers import BertTokenizer


# 1. Build the vocabulary from the de-duplicated corpus tokens.
# The special tokens ([PAD], [CLS], [SEP], [UNK], [MASK]) own the first ids.
special_tokens = {"[PAD]": 0, "[CLS]": 1, "[SEP]": 2, "[UNK]": 3, "[MASK]": 4}
# Skip corpus tokens that collide with a special token: merging them in would
# overwrite the reserved id, and the in-memory ids would then drift away from
# the line order written to vocab.txt below.
all_tokens = {
    token for sentence in corpus for token in sentence if token not in special_tokens
}
# Regular tokens are numbered right after the special ones, in sorted order, so
# the offset follows the size of `special_tokens` instead of a hard-coded 5.
vocab = {
    **special_tokens,
    **{
        token: idx
        for idx, token in enumerate(sorted(all_tokens), start=len(special_tokens))
    },
}
print(len(vocab))

# 2. Save the vocabulary to a file (vocab.txt), one token per line.
# The dict iterates in id order, so each token lands on the line matching its id.
with open("../user_data/vocab.txt", "w", encoding="utf-8") as vocab_file:
    for token in vocab:
        vocab_file.write(f"{token}\n")

# 3. Load the vocabulary file and create the BertTokenizer.
tokenizer = BertTokenizer(vocab_file="../user_data/vocab.txt", do_lower_case=False)

# 4. Smoke-test the tokenizer on the first corpus entry.
sample_text = corpus[0]
encoded = tokenizer.encode(sample_text, add_special_tokens=True)
print(f"Encoded: {encoded}")
print(f"Decoded: {tokenizer.decode(encoded)}")

4510
Encoded: [1, 4439, 4438, 4461, 513, 4502, 4398, 4459, 280, 2470, 2942, 2781, 861, 2528, 624, 2477, 3391, 4489, 280, 2470, 2942, 3169, 2781, 2528, 624, 4458, 4501, 2965, 2600, 2362, 2477, 4437, 4499, 4460, 4386, 4388, 4462, 4438, 4461, 1246, 4502, 1829, 4459, 207, 314, 2942, 2781, 861, 2528, 624, 2477, 3391, 4489, 2528, 2477, 4425, 280, 2470, 2781, 861, 672, 4425, 3169, 2781, 2528, 624, 4458, 4501, 1631, 1627, 2362, 2477, 4437, 4499, 4460, 4369, 4359, 4462, 4463, 4443, 4466, 4441, 4451, 4485, 4497, 4457, 4498, 2213, 2884, 1121, 929, 619, 3189, 402, 2361, 1723, 2681, 1190, 1121, 929, 3347, 1050, 653, 755, 527, 616, 2942, 2443, 154, 4488, 4460, 4385, 4397, 4474, 4451, 4485, 4497, 4457, 4498, 2213, 2884, 4409, 4435, 20, 20, 30, 402, 1969, 1080, 3169, 4104, 653, 755, 2443, 154, 4488, 4460, 4388, 4474, 4451, 4485, 4497, 283, 2884, 207, 3711, 708, 4115, 2353, 2443, 1101, 4278, 2975, 546, 1950, 3056, 2281, 2876, 3376, 2870, 1022, 4053, 2940, 3056, 2465, 1307, 1893, 2057, 866, 819, 2719, 2

2024-09-16 21:19:48.112680: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-16 21:19:48.134399: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-16 21:19:48.141170: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-16 21:19:48.159235: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Decoded: [CLS] <begin_EduExps> <begin_EduExp> <education> 14722706&14853292 <schoolLevel> 62&63&50&53&54&54 <department> 14717874 14854807 14925705 14923942 14780072 14856354 14725004 14854817 15638181 <major> 14717874 14854807 14925705 15632285 14923942 14856354 14725004 <courses> <school> 14925738 14857910 14853024 14854817 <GPA> <ranking> <duration> 53 54 <end_EduExp> <begin_EduExp> <education> 14785451&14721174 <schoolLevel> 14844587&14847385&15641759&14788518 <department> 14716590 14718849 14925705 14923942 14780072 14856354 14725004 14854817 15638181 <major> 14856354 14854817 88 14717874 14854807 14923942 14780072 14726844 88 15632285 14923942 14856354 14725004 <courses> <school> 14792344 14792334 14853024 14854817 <GPA> <ranking> <duration> 51 50 <end_EduExp> <end_EduExps> <begin_WorkExps> <end_WorkExps> <begin_ProjectExps> <begin_projectExp> <jobTitle> <projectDesc> <companyName> <projectName> 14850237 14925193 14783159 14781097 14724797 15634611 14720387 14853021 14794135 1485

In [3]:
import random
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    """Dataset that serves fixed-length encoded samples from a pre-tokenised corpus.

    Args:
        corpus: Indexable collection whose entries are token sequences.
        tokenizer: Object exposing ``encode_plus`` and a ``vocab`` mapping.
        max_length: Length every encoded sample is padded / truncated to.
    """

    def __init__(self, corpus, tokenizer, max_length=2048):
        self.corpus = corpus
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of entries in the corpus."""
        return len(self.corpus)

    def __getitem__(self, idx):
        """Encode the corpus entry at ``idx`` into model inputs.

        Entries longer than ``max_length - 2`` tokens are cropped to a randomly
        positioned window of that size before encoding.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` tensors, each with
            the leading batch dimension squeezed away.

        Raises:
            ValueError: if an encoded id is not smaller than the tokenizer's
                vocabulary size.
        """
        tokens = self.corpus[idx]

        # Two positions are held back from the window, presumably for the
        # [CLS]/[SEP] pair the tokenizer adds -- TODO confirm.
        window = self.max_length - 2
        if len(tokens) > window:
            # randint is inclusive on both ends, so the last valid start is
            # len(tokens) - window.
            start_idx = random.randint(0, len(tokens) - window)
            tokens = tokens[start_idx: start_idx + window]

        # Encode to exactly max_length ids: pad short inputs, truncate long ones.
        # (The keyword was previously misspelled "trunction", which the
        # tokenizer silently ignored.)
        encoding = self.tokenizer.encode_plus(
            tokens,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)

        # Guard against ids that would index past the embedding table.
        max_input_id = input_ids.max().item()
        vocab_size = len(self.tokenizer.vocab)
        if max_input_id >= vocab_size:
            raise ValueError(f"input_id {max_input_id} exceeds vocab_size {vocab_size}")

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }

# Build the dataset; `tokenizer` and `corpus` must already be defined.
dataset = CustomDataset(corpus=corpus, tokenizer=tokenizer)

In [None]:
from transformers import BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments


# 1. BERT model configuration.
#    Vocabulary size, maximum sequence length and PAD id are taken from the
#    tokenizer / dataset; the remaining values are architecture hyper-parameters
#    (hidden size, attention heads, Transformer layers, token-type vocabulary).
config = BertConfig(
    vocab_size=len(tokenizer.vocab),
    pad_token_id=tokenizer.pad_token_id,
    max_position_embeddings=dataset.max_length,
    type_vocab_size=2,
    hidden_size=768,
    num_hidden_layers=6,
    num_attention_heads=12,
)

# 2. Instantiate a BERT masked-LM model from the configuration alone
#    (no pretrained checkpoint is loaded).
model = BertForMaskedLM(config=config)

# 3. Collator for masked-language-model training; masks 15% of the tokens.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15,
    mlm=True,
)

# 4. Training arguments: 100 epochs, batch size 16 per device, fp16 enabled,
#    logging and checkpointing once per epoch, at most 2 checkpoints retained.
training_args = TrainingArguments(
    output_dir="../user_data/output",
    logging_dir="../user_data/logs",
    overwrite_output_dir=True,
    num_train_epochs=100,
    per_device_train_batch_size=16,
    fp16=True,
    logging_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
)

# 5. Wire the model, arguments, collator, custom dataset and custom tokenizer
#    into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# 6. Start training.
trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
1660,5.5831
3320,5.1423
4980,4.6695
6640,3.5875
8300,2.8465
9960,2.472
11620,2.2326
13280,2.0187
14940,1.8609
16600,1.7373


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

