In [None]:
import pandas as pd
import numpy as np
import dataclasses

# Load the raw chatbot conversation log into a DataFrame.
# NOTE(review): the path is absolute (/content/...), so the CSV must already be
# present at that location before this cell runs.
file_path = "/content/chatbot_dataset.csv"
data = pd.read_csv(file_path)

In [None]:
# Preview the first rows to check that the columns loaded as expected.
data.head()

Unnamed: 0,User ID,User Utterance,Bot Response,Timestamp,Context/Session ID,Entities,User Feedback,Conversation Outcome,User Profile,Channel/Platform,Language,User Emotion/Sentiment,Location,User Segment
0,uKqYhMMQ5S,Charge bar between follow student.,Important law into large example range. Player...,2023-10-31 18:02:06,4dfe56dc-efe2-49c6-be9f-ce5b84ca4de4,event,negative,incomplete,Annette Henderson,social media,German,confused,Sydney,returning customers
1,YOonrpgxp9,Bad every reflect huge contain.,Policy argue agree character go recent. When r...,2023-10-31 18:02:06,74c97616-9df6-460d-991d-c6f6f6ae5354,location,neutral,specific outcome,Nicholas Haney,mobile app,Chinese,frustrated,London,returning customers
2,V0IwFXGYAg,Glass remember many dog director under.,Total rise unit recent data away. Business air...,2023-10-31 18:02:06,f3034d75-d552-4a6c-8c73-0f6b441a7bdf,event,negative,incomplete,David Smith,website chat,Spanish,excited,Tokyo,premium users
3,3mtwyCBGqy,Help charge record many talk tough.,Artist today decade. Civil score hospital othe...,2023-10-31 18:02:06,46c7bf6a-1da6-4e27-baf5-a138f1e0a1b2,service,negative,incomplete,Susan Wilson,website chat,English,frustrated,Sydney,new users
4,dSQOFGb8Pq,Position not man much material.,Character serve receive interview interest ord...,2023-10-31 18:02:06,2487c6ce-65c1-48a4-a76f-0aa6a1e92dda,service,neutral,specific outcome,Jennifer Harmon,mobile app,Spanish,confused,New York,returning customers


In [None]:
# Keep only the columns selected for training.
# `data` is rebound to the narrowed frame, so the remaining CSV columns are no
# longer reachable through `data` after this cell.
# NOTE(review): "Entities" is kept here but is not read by any later cell in this notebook.
columns_needed = ["User Utterance", "Bot Response", "Context/Session ID", "Entities"]
data = data[columns_needed]

In [None]:
from transformers import AutoTokenizer

# Load the pretrained GPT-2 tokenizer.
# The preprocessing function below works on raw strings and does not use it.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Preprocessing Function
def preprocess_data_with_context(data, max_context_turns=3):
    """Build prompt/target pairs whose prompts carry recent history of the same session.

    Rows are processed in frame order. For every row the prompt is
    ``"Context: <history>\\nUser: <utterance>\\nBot:"`` where ``<history>`` is the
    space-joined text of the most recent exchanges already seen for that row's
    session ID (empty for the first row of a session).

    Args:
        data: DataFrame with "User Utterance", "Bot Response" and
            "Context/Session ID" columns.
        max_context_turns: How many previous exchanges of the session to include
            in the prompt. Defaults to 3; 0 disables the history.

    Returns:
        Tuple ``(inputs, targets)`` of two equally long lists: the prompt strings
        and the corresponding bot responses.

    Raises:
        KeyError: If one of the required columns is missing.
        ValueError: If ``max_context_turns`` is negative.
    """
    if max_context_turns < 0:
        raise ValueError("max_context_turns must be >= 0")

    inputs = []
    targets = []
    context_mapping = {}  # session ID -> list of "User: ... Bot: ..." exchanges seen so far

    # Walk the three needed columns in lockstep instead of building a Series per
    # row with iterrows().
    rows = zip(data["User Utterance"], data["Bot Response"], data["Context/Session ID"])
    for user_utterance, bot_response, session_id in rows:
        history = context_mapping.setdefault(session_id, [])

        # A bare history[-0:] slice would return the whole list, so zero is handled explicitly.
        recent_exchanges = history[-max_context_turns:] if max_context_turns else []
        session_context = " ".join(recent_exchanges)

        inputs.append(f"Context: {session_context}\nUser: {user_utterance}\nBot:")
        targets.append(bot_response)

        # Record this exchange only after the prompt is built, so a prompt never
        # contains its own answer.
        history.append(f"User: {user_utterance} Bot: {bot_response}")

    return inputs, targets

# Build the prompt/target lists from the narrowed DataFrame.
questions, answers = preprocess_data_with_context(data)

# Show the first prompt/target pair. These are raw strings; nothing has been
# tokenized at this point.
print(f"Sample Input: {questions[0]}")
print(f"Sample Target: {answers[0]}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Sample Input: Context: 
User: Charge bar between follow student.
Bot:
Sample Target: Important law into large example range. Player seem force with partner sometimes happen southern.


In [None]:
from transformers import AutoTokenizer

# Initialize tokenizer for GPT-2 (re-created here; replaces the instance from the previous cell)
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Reuse the EOS token as the padding token so that padding="max_length" in the
# tokenization below has a pad token to fill with.
# NOTE(review): with pad == EOS, padding cannot be told apart from a real EOS by token id alone.
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset for GPT-2
def tokenize_dataset(questions, answers, tokenizer, max_length=512):
    """Tokenize parallel question/answer texts into fixed-length ID tensors.

    Every text is truncated or padded to exactly ``max_length`` tokens.

    Args:
        questions: Iterable of prompt strings.
        answers: Iterable of target strings, positionally aligned with ``questions``.
        tokenizer: Callable tokenizer that accepts ``max_length``, ``truncation``,
            ``padding`` and ``return_tensors`` and returns an object exposing
            ``input_ids`` with a leading batch dimension of size 1.
        max_length: Length every sequence is padded/truncated to.

    Returns:
        Tuple ``(input_ids, target_ids)``: two lists of 1-D tensors, one entry per
        question/answer pair.

    Raises:
        ValueError: If ``questions`` and ``answers`` differ in length.
    """
    # Materialise both inputs so pairing is positional. Indexing a pandas Series
    # with answers[i] would be a label lookup, and generators have no len().
    questions = list(questions)
    answers = list(answers)
    if len(questions) != len(answers):
        raise ValueError(
            f"questions and answers must have the same length "
            f"(got {len(questions)} and {len(answers)})"
        )

    def _encode(text):
        encoded = tokenizer(
            text, max_length=max_length, truncation=True, padding="max_length", return_tensors="pt"
        )
        # Drop only the batch dimension. A bare squeeze() would also collapse the
        # sequence dimension when max_length == 1.
        return encoded.input_ids.squeeze(0)

    input_ids = []
    target_ids = []
    for question, answer in zip(questions, answers):
        input_ids.append(_encode(question))
        target_ids.append(_encode(answer))

    return input_ids, target_ids

# Tokenize every question/answer pair up front with the default max_length of 512.
# NOTE(review): tokenized_inputs / tokenized_targets are not referenced by any later
# cell in this notebook; the training cell tokenizes again inside ChatDataset.
tokenized_inputs, tokenized_targets = tokenize_dataset(questions, answers, tokenizer)



In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, AdamW
from torch.utils.tensorboard import SummaryWriter

# Initialize the tokenizer (the model itself is loaded further down in this cell)
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set EOS token as pad token

# Re-read the full CSV; this rebinds `data`, replacing the column-narrowed frame
# created in an earlier cell.
data = pd.read_csv("/content/chatbot_dataset.csv")

# Preprocessing function
def preprocess_data_with_context(data):
    """Return the user utterances and bot responses of ``data`` as two plain lists.

    NOTE(review): this redefinition shares its name with the context-building
    function defined earlier in the notebook and replaces it once this cell runs.
    Despite the name, no session context is added to the questions here.

    Args:
        data: DataFrame with "User Utterance" and "Bot Response" columns.

    Returns:
        Tuple ``(questions, answers)`` of two lists in row order.
    """
    utterance_column, response_column = (
        data[column] for column in ("User Utterance", "Bot Response")
    )
    return utterance_column.tolist(), response_column.tolist()

# Rebind questions/answers to the plain column lists; this overwrites the
# context-formatted prompts produced by the earlier preprocessing cell.
questions, answers = preprocess_data_with_context(data)

# Define the dataset class
class ChatDataset(Dataset):
    """Causal-LM training examples built from question/answer pairs.

    Each item is ONE token sequence ``question + separator + answer + EOS``,
    padded to ``max_length``. A causal language model compares the prediction at
    position ``i`` with the label at position ``i + 1`` of the *same* sequence, so
    ``labels`` must line up token-for-token with ``input_ids``. Tokenizing the
    question and the answer as two independent padded sequences does not satisfy
    that.

    Label positions belonging to the prompt and to padding are set to
    ``IGNORE_INDEX`` so the loss is computed on the answer (and its EOS) only.
    The mask is derived from sequence lengths, not token ids, because the pad
    token may be the same id as EOS and the real EOS must stay in the loss.

    Args:
        questions: Sequence of prompt strings.
        answers: Sequence of response strings, aligned with ``questions``.
        tokenizer: Tokenizer exposing ``encode(text, add_special_tokens=False)``,
            ``eos_token_id`` and ``pad_token_id``.
        max_length: Fixed length of every returned tensor.
        separator: Text inserted between the question and the answer.

    Raises:
        ValueError: If ``questions`` and ``answers`` differ in length.
    """

    # Label value skipped by the cross-entropy loss used by the model.
    IGNORE_INDEX = -100

    def __init__(self, questions, answers, tokenizer, max_length=512, separator="\n"):
        if len(questions) != len(answers):
            raise ValueError(
                f"questions and answers must have the same length "
                f"(got {len(questions)} and {len(answers)})"
            )
        self.questions = questions
        self.answers = answers
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.separator = separator

    def __len__(self):
        """Number of question/answer pairs."""
        return len(self.questions)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for pair ``idx``.

        All three are 1-D ``torch.long`` tensors of length ``max_length``.
        """
        eos_id = self.tokenizer.eos_token_id
        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            # Fall back to EOS (or 0); padded positions are masked out anyway.
            pad_id = eos_id if eos_id is not None else 0

        prompt_ids = list(
            self.tokenizer.encode(self.questions[idx] + self.separator, add_special_tokens=False)
        )
        answer_ids = list(self.tokenizer.encode(self.answers[idx], add_special_tokens=False))
        if eos_id is not None:
            # Teach the model where a response ends.
            answer_ids.append(eos_id)

        # When the pair is too long, keep the answer and trim the prompt from the
        # left, so there is always something to compute a loss on.
        answer_ids = answer_ids[: self.max_length]
        room_for_prompt = self.max_length - len(answer_ids)
        prompt_ids = prompt_ids[-room_for_prompt:] if room_for_prompt > 0 else []

        sequence = prompt_ids + answer_ids
        pad_count = self.max_length - len(sequence)

        input_ids = sequence + [pad_id] * pad_count
        attention_mask = [1] * len(sequence) + [0] * pad_count
        labels = (
            [self.IGNORE_INDEX] * len(prompt_ids)
            + answer_ids
            + [self.IGNORE_INDEX] * pad_count
        )

        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
            "labels": torch.tensor(labels, dtype=torch.long),
        }

# Create the dataset
dataset = ChatDataset(questions, answers, tokenizer)

# Load the pre-trained GPT-2 model
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Training Arguments
# Evaluation stays disabled: "no" is the TrainingArguments default, so the
# strategy keyword (renamed in newer transformers releases) is simply omitted.
# NOTE: save_steps / save_total_limit are consumed by Trainer.train(), which this
# cell never calls; the manual loop below only saves once at the end.
training_args = TrainingArguments(
    output_dir="./gpt2-chatbot",  # Save model checkpoints
    num_train_epochs=3,  # Number of epochs
    per_device_train_batch_size=8,  # Batch size
    save_steps=500,  # Save checkpoints every 500 steps
    save_total_limit=2,  # Keep only the last 2 checkpoints
    logging_dir="./logs",  # Directory to save logs
    logging_steps=50,  # Log every 50 steps
    disable_tqdm=False,  # Enable progress bars
)

# Initialize the optimizer. torch.optim.AdamW replaces the deprecated
# transformers.AdamW; eps and weight_decay are pinned to the values the
# transformers implementation used by default, so the update rule is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# The Trainer is used only to build the batched, shuffled training DataLoader.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,  # The dataset for training
)

# Build the DataLoader once, instead of on every logging step and every epoch.
train_dataloader = trainer.get_train_dataloader()
steps_per_epoch = len(train_dataloader)
num_epochs = int(training_args.num_train_epochs)
# Read the device after Trainer construction, since the Trainer may have moved the model.
device = next(model.parameters()).device

# Define TensorBoard writer; the try/finally guarantees it is flushed and closed
# even if training is interrupted.
writer = SummaryWriter(log_dir="./logs")
try:
    # Custom training loop with TensorBoard logging
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        for step, batch in enumerate(train_dataloader):
            # No-op when the loader already yields tensors on the model's device.
            batch = {name: tensor.to(device) for name, tensor in batch.items()}

            # Forward pass
            outputs = model(**batch)
            loss = outputs.loss
            loss_value = loss.item()
            total_loss += loss_value

            # Backward pass and optimization step
            loss.backward()
            optimizer.step()  # Manually handle the optimizer step
            optimizer.zero_grad()  # Reset gradients after each step

            # Log loss to TensorBoard
            if step % training_args.logging_steps == 0:
                writer.add_scalar("Loss/train", loss_value, epoch * steps_per_epoch + step)

        # max(..., 1) avoids a ZeroDivisionError when the dataset is empty.
        avg_train_loss = total_loss / max(steps_per_epoch, 1)
        print(f"Epoch {epoch + 1}/{num_epochs} - Avg. Training Loss: {avg_train_loss:.4f}")
finally:
    # Close the TensorBoard writer
    writer.close()

# Save the trained model and tokenizer
model.save_pretrained("./gpt2-chatbot")
tokenizer.save_pretrained("./gpt2-chatbot")




Epoch 1/3 - Avg. Training Loss: 0.1927
Epoch 2/3 - Avg. Training Loss: 0.1724
Epoch 3/3 - Avg. Training Loss: 0.1715


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the trained model and tokenizer from ./gpt2-chatbot, the directory the
# training cell wrote with save_pretrained(). Both names are rebound here.
tokenizer = AutoTokenizer.from_pretrained("./gpt2-chatbot")
model = AutoModelForCausalLM.from_pretrained("./gpt2-chatbot")

# Function to generate a response based on user input
def generate_response(input_text, model, tokenizer, max_length=50):
    """Generate the model's reply to ``input_text``.

    Args:
        input_text: Prompt string to continue.
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        max_length: Upper bound on the TOTAL sequence length (prompt tokens plus
            generated tokens), as passed to ``model.generate``.

    Returns:
        The newly generated text only, with the prompt removed and surrounding
        whitespace stripped. Output varies between calls because sampling is
        enabled.
    """
    # Calling the tokenizer (rather than .encode) also yields the attention mask.
    encoded = tokenizer(input_text, return_tensors="pt")
    input_ids = encoded["input_ids"].to(model.device)
    attention_mask = encoded["attention_mask"].to(model.device)

    # Pass the pad id explicitly so generate() does not have to guess it.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        do_sample=True,  # top_p / temperature are only honoured when sampling
        top_p=0.92,
        temperature=0.7,
        pad_token_id=pad_token_id,
    )

    # The returned sequence starts with the prompt tokens; keep only what was generated.
    generated_ids = outputs[0][input_ids.shape[-1]:]
    response = tokenizer.decode(generated_ids, skip_special_tokens=True)

    return response.strip()

# Smoke-test the fine-tuned model with a single prompt and print the exchange.
user_input = "I want to ask something"
response = generate_response(user_input, model, tokenizer)

print(f"User: {user_input}")
print(f"Bot: {response}")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


User: I want to ask something
Bot: I want to ask something..
