In [15]:
# Install the transformers library
# !pip install transformers

import os
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

from google.colab import drive
# Mount Google Drive at ./mount; the corpus and stopword paths used later in
# this notebook are all located under this mount point.
drive.mount('./mount')


Drive already mounted at ./mount; to attempt to forcibly remount, call drive.mount("./mount", force_remount=True).


In [16]:

# Function to load stopwords
def load_stopwords(file_path):
    """Read a stopword list from a text file containing one entry per line.

    Args:
        file_path: Path to the stopword file.

    Returns:
        list[str]: The stopwords in file order, with the trailing newline
        removed. Blank lines are skipped.
    """
    # "utf-8-sig" behaves like "utf-8" but also drops a leading byte-order
    # mark; with plain "utf-8" the BOM stays glued to the first stopword,
    # which then never matches anything in the corpus.
    with open(file_path, "r", encoding="utf-8-sig", errors="ignore") as f:
        # Only the newline is stripped so that whitespace-only entries
        # (e.g. a literal space used as a stopword) are preserved.
        return [word for word in (line.strip('\n') for line in f) if word]

# Function to preprocess corpus
def preprocess_corpus(text, cn_stopwords):
    """Remove every occurrence of each stopword from ``text``.

    The stopwords are applied one after another in the order given, so a
    removal may create a new match for a later stopword.

    Args:
        text: The corpus string to clean.
        cn_stopwords: Iterable of substrings to delete.

    Returns:
        str: ``text`` with all stopword occurrences deleted.
    """
    cleaned = text
    for stopword in cn_stopwords:
        cleaned = cleaned.replace(stopword, "")
    return cleaned

# Define corpus path in Google Drive
corpus_path = './mount/My Drive/Colab Notebooks/BH/1-2/DL/3/chinese_corpus/'

# Read specific text files ('越女剑' in filename).
# os.listdir() returns names in arbitrary order, so the listing is sorted to
# make the merged corpus identical from run to run.
texts = []
for file_name in sorted(os.listdir(corpus_path)):
    if '越女剑' in file_name and file_name.endswith('.txt'):
        with open(os.path.join(corpus_path, file_name), 'r', encoding='utf-8') as file:
            texts.append(file.read())

# Fail fast instead of silently writing an empty training file, which would
# only surface later as a confusing error when the dataset is built.
if not texts:
    raise FileNotFoundError(
        f"No .txt file with '越女剑' in its name was found in {corpus_path}"
    )

# Merge texts from selected files
merged_content = ''.join(texts)

# Load Chinese stopwords
stopwords_file_path = './mount/My Drive/Colab Notebooks/BH/1-2/DL/3/stopwords-zh.txt'
cn_stopwords = load_stopwords(stopwords_file_path)

# Preprocess merged content by removing stopwords
merged_content = preprocess_corpus(merged_content, cn_stopwords)

# Save preprocessed content to a file. The output name does not contain
# '越女剑', so it is not picked up as input when this cell is re-run.
output_file_path = './mount/My Drive/Colab Notebooks/BH/1-2/DL/3/chinese_corpus/all.txt'
with open(output_file_path, 'w', encoding='utf-8') as f:
    f.write(merged_content)

print(f"Preprocessed content saved to {output_file_path}")


Preprocessed content saved to ./mount/My Drive/Colab Notebooks/BH/1-2/DL/3/chinese_corpus/all.txt


In [17]:
!pip install accelerate -U



In [None]:


# Load the pretrained GPT-2 tokenizer and language-model head ('gpt2' checkpoint).
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Build the training dataset from the preprocessed corpus file written earlier.
dataset = TextDataset(tokenizer=tokenizer, file_path=output_file_path, block_size=128)

# mlm=False turns off masked-language-model masking in the collator.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training configuration: one epoch, batch size 4 per device, a checkpoint
# every 10,000 steps with save_total_limit=2.
training_args = TrainingArguments(
    output_dir="./gpt2_jin_yong",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Train the model.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)
trainer.train()

# Text generation helper
def generate_text_transformer(seed_text, next_words, model, tokenizer):
    """Generate a continuation of ``seed_text`` with a causal language model.

    Args:
        seed_text: Prompt string. When empty, generation starts from the
            tokenizer's beginning-of-sequence token instead.
        next_words: Number of tokens to generate beyond the prompt.
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer exposing ``encode``, ``decode``, ``bos_token``
            and ``eos_token_id``.

    Returns:
        str: The decoded sequence (prompt plus continuation) with special
        tokens removed.
    """
    # An empty prompt encodes to a zero-length tensor, which generate()
    # cannot continue from; start from the BOS token in that case.
    prompt = seed_text if seed_text else tokenizer.bos_token
    # After training the model may be on an accelerator while the encoded
    # prompt is created on the CPU, so move the prompt to the model's device.
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(model.device)
    output = model.generate(
        input_ids,
        max_length=next_words + input_ids.shape[1],
        num_return_sequences=1,
        # GPT-2 defines no pad token; pass EOS explicitly as the padding id.
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example: generate 50 tokens starting from an empty seed and print the result.
generated_text = generate_text_transformer(
    seed_text="",
    next_words=50,
    model=model,
    tokenizer=tokenizer,
)
print(generated_text)
