In [None]:
from transformers import GPT2Tokenizer
import jieba
# 适配gpt2的jieba tokenizer
class JiebaGPT2Tokenizer:
    def __init__(self, gpt2_tokenizer_path, max_length=50):
        self.gpt2_tokenizer = GPT2Tokenizer.from_pretrained(gpt2_tokenizer_path)
        self.max_length = max_length
        if self.gpt2_tokenizer.pad_token is None:
            self.gpt2_tokenizer.add_special_tokens({"pad_token": "[PAD]"})
        self.pad_token_id=self.gpt2_tokenizer.pad_token_id

    def tokenize(self, text):
        text=str(text)
        words = jieba.lcut(text)
        tokens = []
        for word in words:
            tokens.extend(self.gpt2_tokenizer.tokenize(word))
        return tokens
    
    def convert_tokens_to_ids(self, tokens):
        return self.gpt2_tokenizer.convert_tokens_to_ids(tokens)
    
    def encode(self, text, truncation=True, padding="max_length"):
        tokens = self.tokenize(text)
        # 将 tokens 转换为 input_ids
        input_ids = self.convert_tokens_to_ids(tokens)
        # 截断或填充到固定长度
        if truncation:
            input_ids = input_ids[:self.max_length]
        # Pad if necessary
        attention_mask = [1] * len(input_ids)  # Initialize mask for actual words
        if padding == "max_length" and len(input_ids) < self.max_length:
            pad_length = self.max_length - len(input_ids)
            input_ids += [self.pad_token_id] * pad_length
            attention_mask += [0] * pad_length  # Padding tokens should have mask 0

        return {"input_ids": input_ids, "attention_mask": attention_mask}

In [None]:
import pandas as pd
from datasets import Dataset, Features, Value

df = pd.read_csv("combined_danmaku.csv")

texts = df["弹幕内容"].tolist()  # 假设列名为 "text"
df=df.dropna()
df['弹幕内容'] = df['弹幕内容'].astype(str)

dataset = Dataset.from_pandas(df[["弹幕内容"]])
dataset = dataset.rename_column("弹幕内容", "text")

dataset

Dataset({
    features: ['text', '__index_level_0__'],
    num_rows: 69970
})

In [None]:
from transformers import GPT2LMHeadModel

def load_model_and_tokenizer(model_name="gpt2"):
    tokenizer = JiebaGPT2Tokenizer(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)

    model.resize_token_embeddings(len(tokenizer.gpt2_tokenizer))

    model.config.pad_token_id = tokenizer.pad_token_id

    return tokenizer, model

In [84]:
def preprocess_data(dataset, tokenizer):
    """
    对数据集进行分词处理，并确保 `input_ids`, `labels`, `attention_mask` 存在
    """
    def tokenize_function(examples):
        encoded_inputs = [tokenizer.encode(text) for text in examples["text"]]

        return {
            "input_ids": [entry["input_ids"] for entry in encoded_inputs],
            "attention_mask": [entry["attention_mask"] for entry in encoded_inputs],
            "labels": [entry["input_ids"] for entry in encoded_inputs]  # Labels same as input_ids
        }

    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    return tokenized_dataset


In [None]:
from transformers import Trainer, TrainingArguments

# 定义训练参数
training_args = TrainingArguments(
    output_dir="./gpt2-danmaku-jieba", 
    overwrite_output_dir=True,
    num_train_epochs=3,             
    per_device_train_batch_size=8,   
    save_steps=500,                  
    save_total_limit=2,                
    logging_dir="./logs",              
    prediction_loss_only=True,
)


In [86]:
tokenizer, model = load_model_and_tokenizer()
tokenized_dataset = preprocess_data(dataset, tokenizer)

Map: 100%|██████████| 69970/69970 [00:18<00:00, 3860.49 examples/s]


In [87]:
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1) 

In [88]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)
trainer.train()

Step,Training Loss
100,5.565
200,1.5402
300,1.3189
400,1.2976
500,1.2701
600,1.2599
700,1.1896
800,1.1992
900,1.1962
1000,1.1729


TrainOutput(global_step=23616, training_loss=0.9970842268731859, metrics={'train_runtime': 5556.6017, 'train_samples_per_second': 33.999, 'train_steps_per_second': 4.25, 'total_flos': 4820608339200000.0, 'train_loss': 0.9970842268731859, 'epoch': 3.0})

In [89]:
model.save_pretrained("./gpt2-danmaku-jieba")
tokenizer.gpt2_tokenizer.save_pretrained("./gpt2-danmaku-jieba")

('./gpt2-danmaku-jieba\\tokenizer_config.json',
 './gpt2-danmaku-jieba\\special_tokens_map.json',
 './gpt2-danmaku-jieba\\vocab.json',
 './gpt2-danmaku-jieba\\merges.txt',
 './gpt2-danmaku-jieba\\added_tokens.json')

In [None]:
model = GPT2LMHeadModel.from_pretrained("./gpt2-danmaku-jieba")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-danmaku-jieba")

input_text = "1"
input_ids = tokenizer.encode(input_text, return_tensors="pt")

output = model.generate(
    input_ids,
    max_length=10,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
    top_k=50,
    top_p=0.95,
    temperature=0.1,
)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


1.5倍速通
