In [2]:
import pandas as pd

data = pd.read_csv("formatted_csv.csv")

In [3]:
dialogues = []
for index, row in data.iterrows():
    speaker1_text = row['Speaker1']
    speaker2_text = row['Speaker2']
    
    dialogue = f"{speaker1_text}\n{speaker2_text}\n"
    dialogues.append(dialogue)

# 将对话文本保存到文件中（每个对话一行）
with open("processed_dialogues.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(dialogues))


In [4]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")

max_token_length = 1024  # 根据模型的最大限制设置适当的长度

with open("processed_dialogues.txt", "r", encoding="utf-8") as file:
    dialogues = file.read().splitlines()

tokenized_dialogues = []
for dialogue in dialogues:
    tokens = tokenizer.encode(dialogue, add_special_tokens=True, truncation=True, max_length=max_token_length)
    tokenized_dialogues.append(tokens)


In [5]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_name = "gpt2-medium"  # 选择合适的模型规模
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

In [6]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments


# 从处理好的文本文件中加载对话数据
with open("processed_dialogues.txt", "r", encoding="utf-8") as file:
    dialogues = file.read().splitlines()


In [7]:
dialogues

['there .',
 'where ?',
 '',
 'you have my word . as a gentleman',
 'you re sweet .',
 '',
 'hi .',
 'looks like things worked out tonight huh ?',
 '',
 'have fun tonight ?',
 'tons',
 '',
 'well no . . .',
 'then that s all you had to say .',
 '',
 'then that s all you had to say .',
 'but',
 '',
 'but',
 'you always been this selfish ?',
 '',
 'do you listen to this crap ?',
 'what crap ?',
 '',
 'what good stuff ?',
 ' the real you . ',
 '',
 'wow',
 'let s go .',
 '',
 'she okay ?',
 'i hope so .',
 '',
 'they do to !',
 'they do not !',
 '',
 'did you change your hair ?',
 'no .',
 '',
 'no .',
 'you might wanna think about it',
 '',
 'who ?',
 'joey .',
 '',
 'great',
 'would you mind getting me a drink cameron ?',
 '',
 'it s more',
 'expensive ?',
 '',
 'where ve you been ?',
 'nowhere . . . hi daddy .',
 '',
 'what ?',
 'in th . for a month',
 '',
 'in th . for a month',
 'why ?',
 '',
 'why ?',
 'he was like a total babe',
 '',
 'he was like a total babe',
 'but you hate joey

In [8]:
# 将对话文本转换为输入数据
tokenized_dialogues = [tokenizer.encode(dialogue, add_special_tokens=True) for dialogue in dialogues]

# 将tokenized数据写入文件
with open("tokenized_dialogues.txt", "w", encoding="utf-8") as file:
    for tokens in tokenized_dialogues:
        file.write(" ".join(map(str, tokens)) + "\n")

In [9]:
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="tokenized_dialogues.txt",  # 文件路径包含tokenized数据
    block_size=100
)



In [10]:
len(dataset)

11437

In [11]:
dataset[0]

tensor([   23, 17657,   767,  2414,   198,  6200,    18,  7265,  2091,   198,
          198,  3365,  2624, 49125,   718,  1433,  1315,  4790,   767,  2414,
        36561, 36100, 26063,  1983,   198,  3365,  2624, 32591,  3126,  1959,
          767,  2414,   198,   198,    20, 22572,   767,  2414,   198,  4051,
         1899,  9415,   642,  3459,  1105,  3559,   513, 16243, 44541,  7388,
         2425, 34620,  2075,  7265,  2091,   198,   198,  1415,  8628,  1105,
         3553,  7388,  2425,  7265,  2091,   198,  1983, 25270,   198,   198,
         1821,  4310,   718,  2231,   767,  2414,   767,  2414,   767,  2414,
          198,  5332,  1731, 40660, 32158,   604,  3324, 39937, 25240, 40654,
          860,   940,   767,  2414,   198,   198,  5332,  1731, 40660, 32158])

In [12]:
# 设置微调参数
training_args = TrainingArguments(
    output_dir="./chatbot_model",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=2,
)

In [13]:
# 创建Trainer并开始微调
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

In [14]:
trainer.train()

  0%|          | 0/3575 [00:00<?, ?it/s]

ValueError: The batch received was empty, your model won't be able to train on it. Double-check that your training dataset contains keys expected by the model: input_ids,past_key_values,attention_mask,token_type_ids,position_ids,head_mask,inputs_embeds,encoder_hidden_states,encoder_attention_mask,labels,use_cache,output_attentions,output_hidden_states,return_dict,label_ids,label,labels.