In [1]:
import pandas as pd

# Load dataset
dataset = pd.read_csv("Chatbot.csv")

# Check the dataset structure
print(dataset.head())

# Ensure the dataset has clear columns 'query' and 'response'
if "query" not in dataset.columns or "response" not in dataset.columns:
    raise ValueError("The dataset must contain 'query' and 'response' columns")


                                   query  \
0     "How do I sign up for an account?"   
1       "How do I log in to my account?"   
2          "How do I reset my password?"   
3          "How do I delete my account?"   
4  "How do I change my account details?"   

                                            response  
0  "To sign up, click the 'Sign Up' button on the...  
1  "To log in, click the 'Login' button, enter yo...  
2  "To reset your password, go to the login page ...  
3  "To delete your account, navigate to your acco...  
4  "Go to your profile settings and click on 'Edi...  


In [2]:
from datasets import Dataset

# Prepare the dataset by concatenating query and response with markers
def add_markers(examples):
    examples['text'] = f"<|query|> {examples['query']} <|response|> {examples['response']}"
    return examples

# Convert pandas DataFrame to HuggingFace Dataset
dataset = Dataset.from_pandas(dataset)
dataset = dataset.map(add_markers)

# Print example with markers
print(dataset[0])


Map:   0%|          | 0/19 [00:00<?, ? examples/s]

{'query': '"How do I sign up for an account?"', 'response': '"To sign up, click the \'Sign Up\' button on the homepage, fill out the required information, and confirm your email."', 'text': '<|query|> "How do I sign up for an account?" <|response|> "To sign up, click the \'Sign Up\' button on the homepage, fill out the required information, and confirm your email."'}


In [7]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments

# Initialize the GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # Set EOS token as pad token

# Tokenize function to handle input formatting
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)

# Apply tokenization
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Set input_ids as labels for language modeling task
def prepare_data_for_gpt2(examples):
    import torch

    # Convert input_ids to tensors if they are not already
    input_ids = torch.tensor(examples['input_ids']) if not isinstance(examples['input_ids'], torch.Tensor) else examples['input_ids']
    
    # Clone input_ids to create labels
    examples['labels'] = input_ids.clone()
    return examples


# Apply the preparation
final_dataset = tokenized_dataset.map(prepare_data_for_gpt2)

model = GPT2LMHeadModel.from_pretrained('gpt2')
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=20,
    per_device_train_batch_size=12,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=final_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()




Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Step,Training Loss
10,2.3222
20,0.4541
30,0.296
40,0.2393


TrainOutput(global_step=40, training_loss=0.8278794407844543, metrics={'train_runtime': 567.3947, 'train_samples_per_second': 0.67, 'train_steps_per_second': 0.07, 'total_flos': 24822743040000.0, 'train_loss': 0.8278794407844543, 'epoch': 20.0})

In [15]:
    input_text = "How can i search courses?"
    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="pt")

    # Generate response from the fine-tuned model
    output = model.generate(
        inputs['input_ids'], 
        attention_mask=inputs['attention_mask'], 
        max_length=150,
        eos_token_id=tokenizer.eos_token_id,
        do_sample= True,
        temperature=0.7,  # Adjust for more creativity vs. determinism
        top_k=50,         # Consider top k most likely next words
        top_p=0.9,  
        no_repeat_ngram_size=3
    )

    # Decode and print the response
    response = tokenizer.decode(output[0], skip_special_tokens=True)
   
    print(f"Chatbot response: {response}\n")


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Chatbot response: How can i search courses?

To find out more courses, go to the courses page and click the 'Search'.



In [10]:
import os
print(os.getcwd())


C:\Users\DELL\Desktop\test
