In [137]:
import pandas as pd
import csv
import re



In [138]:
# Load the CSV file
file_path = '/Users/coco/Downloads/learning_chatbot/data/dataset_qa.csv'
df = pd.read_csv(file_path)

# Check the dataframe
print(df.head())


   Id                                           Question  \
0   1                              What is data science?   
1   2  What are the key steps in the data science pro...   
2   3  What is the difference between supervised and ...   
3   4                Explain the bias-variance tradeoff.   
4   5                       What is feature engineering?   

                                              Answer  
0  Data science is an interdisciplinary field tha...  
1  The key steps typically include problem defini...  
2  Supervised learning involves training a model ...  
3  The bias-variance tradeoff is the balance betw...  
4  Feature engineering is the process of selectin...  


In [139]:
# Basic preprocessing: removing any NaNs and ensuring text is clean
df.dropna(inplace=True)

# If needed, you can clean the text data
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = text.strip()  # Remove leading/trailing spaces
    return text

df['Question'] = df['Question'].apply(clean_text)
df['Answer'] = df['Answer'].apply(clean_text)

# Verify the cleaned data
print(df.head())

   Id                                           Question  \
0   1                              What is data science?   
1   2  What are the key steps in the data science pro...   
2   3  What is the difference between supervised and ...   
3   4                Explain the bias-variance tradeoff.   
4   5                       What is feature engineering?   

                                              Answer  
0  Data science is an interdisciplinary field tha...  
1  The key steps typically include problem defini...  
2  Supervised learning involves training a model ...  
3  The bias-variance tradeoff is the balance betw...  
4  Feature engineering is the process of selectin...  


In [142]:
# 将问题和答案组合成适合微调的格式
# 这将取决于具体的模型要求。这里，我们将创建一个提示和完成的列表。
data = []
for index, row in df.iterrows():
    prompt = f"Q: {row['Question']}\nA:"
    completion = f" {row['Answer']}"
    data.append({'prompt': prompt, 'completion': completion})

# 将格式化的数据保存为JSON文件以供训练
import json

formatted_file_path = '/Users/coco/Downloads/learning_chatbot/data/formatted_qadataset.jsonl'
with open(formatted_file_path, 'w') as outfile:
    for entry in data:
        json.dump(entry, outfile)
        outfile.write('\n')

# 确认数据已正确格式化
print(data[:3])  # 显示前3个条目

[{'prompt': 'Q: What is data science?\nA:', 'completion': ' Data science is an interdisciplinary field that uses scientific methods, processes, algorithms, and systems to extract knowledge and insights from data.'}, {'prompt': 'Q: What are the key steps in the data science process?\nA:', 'completion': ' The key steps typically include problem definition, data collection, data preparation, exploratory data analysis, modeling, evaluation, and deployment.'}, {'prompt': 'Q: What is the difference between supervised and unsupervised learning?\nA:', 'completion': ' Supervised learning involves training a model on labeled data, where the algorithm learns the relationship between input features and target labels. Unsupervised learning deals with unlabeled data and aims to find hidden patterns or structures within the data.'}]


In [147]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import torch.nn as nn
from transformers import GPT2LMHeadModel

class CustomGPT2Model(GPT2LMHeadModel):
    def forward(self, input_ids, attention_mask=None, labels=None):
        # 使用 GPT-2 模型的父类方法进行前向传播
        outputs = super().forward(input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        
        print(f"Logits shape: {logits.shape}")
        print(f"Logits sample: {logits[0, -1, :5].detach().cpu().numpy()}")

        loss = None
        if labels is not None:
            print(f"Labels shape: {labels.shape}")
            print(f"Labels sample: {labels[0, :5].detach().cpu().numpy()}")

            # Shift the logits and labels for calculating the loss
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()

            print(f"Shifted Logits shape: {shift_logits.shape}")
            print(f"Shifted Labels shape: {shift_labels.shape}")

            try:
                # 计算损失
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
                print(f"Calculated loss: {loss.item()}")
            except Exception as e:
                print(f"Error calculating loss: {e}")

        return {'loss': loss, 'logits': logits, 'past_key_values': outputs.past_key_values}
# 加载预训练模型和分词器
model_name = 'gpt2'  # 你也可以使用其他模型
model = CustomGPT2Model.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)


# 添加一个新的填充标记
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

# 加载你格式化的数据集
dataset = load_dataset('json', data_files=formatted_file_path, split='train')

# 对数据集进行分词和填充
def tokenize_function(examples):
    return tokenizer(
        examples['prompt'],
        truncation=True,
        padding='max_length',
        max_length=512,
        pad_to_multiple_of=8,
    )

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 微调参数
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    save_steps=500,
    save_total_limit=2,
    learning_rate=5e-5,
    logging_dir='./logs',            # 日志保存路径
    logging_steps=10,
)

# 初始化Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    tokenizer=tokenizer,
)



In [148]:
# 微调模型
trainer.train()

Logits shape: torch.Size([4, 512, 50258])
Logits sample: [-105.8998  -107.28401 -108.54804 -111.86737 -105.87482]


TypeError: unsupported operand type(s) for /: 'NoneType' and 'int'

In [116]:
# 保存微调后的模型
model.save_pretrained('/Users/coco/Downloads/learning_chatbot/models/fine_tuned_model')
tokenizer.save_pretrained('/Users/coco/Downloads/learning_chatbot/models/fine_tuned_model')

('/Users/coco/Downloads/learning_chatbot/models/fine_tuned_model/tokenizer_config.json',
 '/Users/coco/Downloads/learning_chatbot/models/fine_tuned_model/special_tokens_map.json',
 '/Users/coco/Downloads/learning_chatbot/models/fine_tuned_model/vocab.json',
 '/Users/coco/Downloads/learning_chatbot/models/fine_tuned_model/merges.txt',
 '/Users/coco/Downloads/learning_chatbot/models/fine_tuned_model/added_tokens.json')

In [117]:
from transformers import pipeline

# 加载微调后的模型
qa_pipeline = pipeline('text-generation', model='/Users/coco/Downloads/learning_chatbot/models/fine_tuned_model')



Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [150]:
questions = [
    "What is the difference between classification and clustering?",
    "How do you handle missing data in a dataset?",
    "Explain the concept of overfitting."
]

# 使用模型生成答案
for question in questions:
    prompt = f"Q: {question}\nA:"
    answers = qa_pipeline(
    prompt, 
    max_length=150, 
    temperature=0.7, 
    top_k=50, 
    top_p=0.9,
    num_return_sequences=1
)
    generated_text = answers[0]['generated_text']
    print(f"Q: {question}\nA: {generated_text}\n")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Q: What is the difference between classification and clustering?
A: Q: What is the difference between classification and clustering?
A: A clustering algorithm is a series of algorithms that perform a single task. For example, a classification algorithm may perform multiple tasks at a time, for example, sorting through an array. The classification algorithm uses a set of weights to determine which tasks are correct. For example, the algorithm can determine which images are good and which are bad. The classification algorithm can also choose which tasks are correct, for example, grouping a list of images together.
A clustering algorithm is a set of algorithms that performs a single task. For example, a classification algorithm may perform multiple tasks at a time, for example, sorting through an array. The classification algorithm uses a set of weights



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Q: How do you handle missing data in a dataset?
A: Q: How do you handle missing data in a dataset?
A: I'm not really a big fan of missing data. It's an issue of looking at what's really important. I'm not saying I don't care about that. I'm not really a huge fan of missing data. I'm just not sure how to handle it.
But I do like to think that I can handle missing data. I don't like the idea of looking at everything and then trying to figure out how to deal with it. And I think that's a big reason I'm doing this.
I think that the next big question is how do we solve this problem?
Q: How do we get people to look at their data in a

Q: Explain the concept of overfitting.
A: Q: Explain the concept of overfitting.
A: Overfitting is a term that has been around for a while. It is defined as a way of making a device that is more comfortable to use.
A: The more comfortable you are with your device, the less comfortable it will be to use. If you have an over-fitting device, the more you will be a