In [39]:
import pandas as pd
import re

In [40]:
# read json file
data = pd.read_json("data/combined_dataset.json", lines=True)

In [41]:
data.shape

(3512, 2)

In [42]:
data.head()

Unnamed: 0,Context,Response
0,I'm going through some things with my feelings...,"If everyone thinks you're worthless, then mayb..."
1,I'm going through some things with my feelings...,"Hello, and thank you for your question and see..."
2,I'm going through some things with my feelings...,First thing I'd suggest is getting the sleep y...
3,I'm going through some things with my feelings...,Therapy is essential for those that are feelin...
4,I'm going through some things with my feelings...,I first want to let you know that you are not ...


In [43]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3512 entries, 0 to 3511
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Context   3512 non-null   object
 1   Response  3512 non-null   object
dtypes: object(2)
memory usage: 55.0+ KB


In [44]:
data.describe()

Unnamed: 0,Context,Response
count,3512,3512.0
unique,995,2480.0
top,I have so many issues to address. I have a his...,
freq,94,4.0


# Data cleaning

In [45]:
# remove duplicate values
data.duplicated().sum()

760

In [46]:
data.drop_duplicates(inplace=True)

In [47]:
data.shape

(2752, 2)

In [48]:
# remove missing and null values
data.isna().sum()

Context     0
Response    0
dtype: int64

In [49]:
# Remove Extra Spaces, Tabs, and Newlines
data['Context'] = data['Context'].str.replace(r"\s+", " ", regex=True).str.strip()

In [50]:
# Standardize Capitalization
data['Context'] = data['Context'].str.lower()
data['Context'] = data['Context'].str.replace(r"(^\w|\.\s*\w)", lambda m: m.group().upper(), regex=True)


In [51]:
# Remove Sensitive Data
data['Context'] = data['Context'].str.replace(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", "[EMAIL]", regex=True)
data['Context'] = data['Context'].str.replace(r"\b\d{10}\b", "[PHONE NUMBER]", regex=True)


In [52]:
# Normalize Punctuation
data['Context'] = data['Context'].str.replace(r"[?!]+", lambda m: m.group()[0], regex=True)
data['Context'] = data['Context'].str.replace(r"([.,!?])(\w)", r"\1 \2", regex=True)
data['Context'] = data['Context'].str.replace(r"\s([.,!?])", r"\1", regex=True)


In [53]:
data['Response'] = data['Response'].str.replace(r"\s+", " ", regex=True).str.strip()


In [54]:
data['Response'] = data['Response'].str.lower()
data['Response'] = data['Response'].str.replace(r"(^\w|\.\s*\w)", lambda m: m.group().upper(), regex=True)


In [55]:
data['Response'] = data['Response'].str.replace(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", "[EMAIL]", regex=True)
data['Response'] = data['Response'].str.replace(r"\b\d{10}\b", "[PHONE NUMBER]", regex=True)


In [56]:
data['Response'] = data['Response'].str.replace(r"[?!]+", lambda m: m.group()[0], regex=True)
data['Response'] = data['Response'].str.replace(r"([.,!?])(\w)", r"\1 \2", regex=True)
data['Response'] = data['Response'].str.replace(r"\s([.,!?])", r"\1", regex=True)


In [57]:
# Save cleaned data
data.to_csv('data\\cleaned_chatbot_data.csv', index=False)

In [58]:
from nltk.tokenize import word_tokenize

In [59]:
processed_context = data["Context"]
processed_response = data["Response"]


In [60]:
processed_response

0       If everyone thinks you're worthless, then mayb...
1       Hello, and thank you for your question and see...
2       First thing i'd suggest is getting the sleep y...
3       Therapy is essential for those that are feelin...
4       I first want to let you know that you are not ...
                              ...                        
3504    Hi. This is an excellent question! i think tha...
3508    I'm sorry you have tension between you and you...
3509    The true answer is, "no one can really say wit...
3510    How do you help yourself to believe you requir...
3511                             Hmm this is a tough one!
Name: Response, Length: 2752, dtype: object

In [61]:
processed_context

0       I'm going through some things with my feelings...
1       I'm going through some things with my feelings...
2       I'm going through some things with my feelings...
3       I'm going through some things with my feelings...
4       I'm going through some things with my feelings...
                              ...                        
3504    After first meeting the client, what is the pr...
3508    My boyfriend is in recovery from drug addictio...
3509    The birth mother attempted suicide several tim...
3510    I think adult life is making him depressed and...
3511    I just took a job that requires me to travel f...
Name: Context, Length: 2752, dtype: object

In [67]:
import re
from nltk.corpus import stopwords

# Cleaning the text
def clean_text(text):
    # Lowercasing
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Optional: Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

# Using a for loop to process the dataset
processed_context = []
processed_response = []

for i in range(len(data)):
    context = data["Context"].iloc[i]  # Use .iloc for positional indexing
    response = data["Response"].iloc[i]  # Use .iloc for positional indexing
    
    processed_context.append(clean_text(context))
    processed_response.append(clean_text(response))

# Add cleaned data back to the DataFrame
data["Cleaned_Context"] = processed_context
data["Cleaned_Response"] = processed_response




In [69]:
X = data["Cleaned_Context"]
y = data["Cleaned_Response"]

In [70]:
X

0       im going things feelings barely sleep nothing ...
1       im going things feelings barely sleep nothing ...
2       im going things feelings barely sleep nothing ...
3       im going things feelings barely sleep nothing ...
4       im going things feelings barely sleep nothing ...
                              ...                        
3504    first meeting client process counselor facilit...
3508    boyfriend recovery drug addiction recently got...
3509    birth mother attempted suicide several times p...
3510    think adult life making depressed often sleep ...
3511    took job requires travel far away home family ...
Name: Cleaned_Context, Length: 2752, dtype: object

In [71]:
y

0       everyone thinks youre worthless maybe need fin...
1       hello thank question seeking advice feelings w...
2       first thing id suggest getting sleep need impa...
3       therapy essential feeling depressed worthless ...
4       first want let know alone feelings always some...
                              ...                        
3504    hi excellent question think answer probably va...
3508    im sorry tension bf relationship means two peo...
3509    true answer one really say certainty variables...
3510    help believe require offers get relationship f...
3511                                        hmm tough one
Name: Cleaned_Response, Length: 2752, dtype: object

In [83]:
from transformers import GPT2Tokenizer

# Load GPT-2 Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Add padding token if not already included in the tokenizer
tokenizer.pad_token = tokenizer.eos_token


tokenized_pairs = []
max_length = 30  # You can adjust this value based on your input data

for context, response in zip(X, y):
    tokenized_context = tokenizer(context, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')
    tokenized_response = tokenizer(response, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')
    
    # Append tokenized results to the list
    tokenized_pairs.append((tokenized_context, tokenized_response))

# Print results
for i, (tokenized_context, tokenized_response) in enumerate(tokenized_pairs):
    print(f"Pair {i+1}:")
    print("Tokenized Context:", tokenized_context)
    print("Tokenized Response:", tokenized_response)
    print()


Pair 1:
Tokenized Context: {'input_ids': tensor([[  320,  1016,  1243,  7666,  8523,  3993,  2147,   892,   545, 28063,
           815,   429,   220,   425,  1239,  3088, 39496,  7341,   220,   425,
          1464,  2227,  4259,  2428,  1239,   651,  1088,  1487,  4203, 28063]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1]])}
Tokenized Response: {'input_ids': tensor([[47057,  6834,   345,   260, 28063,  3863,   761,  1064,   649,   661,
          8181,  6411,  1919,  4732,  1048,  3160,  1263,  4588,  2116, 31869,
          4306,   467,  2835,  2835,  2111,  1833,   345,   260, 28063,   467]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1]])}

Pair 2:
Tokenized Context: {'input_ids': tensor([[  320,  1016,  1243,  7666,  8523,  3993,  2147,   892,   545, 28063,
           815,   429,   220,   425,  1239,  3088, 39496,  7341,  

In [84]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW


In [90]:
class ChatbotDataset(Dataset):
    def __init__(self, tokenized_contexts, tokenized_responses, max_length):
        self.tokenized_contexts = tokenized_contexts
        self.tokenized_responses = tokenized_responses
        self.max_length = max_length

    def __len__(self):
        return len(self.tokenized_contexts)

    def __getitem__(self, idx):
        # Ensure the context and response are padded to the same length
        input_ids = self.tokenized_contexts[idx]['input_ids'].squeeze(0)
        attention_mask = self.tokenized_contexts[idx]['attention_mask'].squeeze(0)
        labels = self.tokenized_responses[idx]['input_ids'].squeeze(0)
        
        # If the label length is smaller than max_length, pad the label sequence
        padding_length = self.max_length - labels.size(0)
        if padding_length > 0:
            padding = torch.full((padding_length,), tokenizer.pad_token_id)
            labels = torch.cat([labels, padding])

        # Convert labels to Long type
        labels = labels.long()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
        }


In [91]:
# Convert tokenized_pairs to lists of input_ids and attention_masks
input_contexts = [pair[0] for pair in tokenized_pairs]
input_responses = [pair[1] for pair in tokenized_pairs]

dataset = ChatbotDataset(input_contexts, input_responses, max_length=30)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

In [92]:
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))

# Use AdamW optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)




In [94]:
# Define training parameters
epochs = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Use AdamW optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Scheduler for learning rate adjustment
num_training_steps = epochs * len(dataloader)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in dataloader:
        optimizer.zero_grad()
    
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
    
        # Check tensor types to confirm
        print(f"Input IDs type: {input_ids.dtype}, Attention Mask type: {attention_mask.dtype}, Labels type: {labels.dtype}")
    
        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
    
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
    
        total_loss += loss.item()


    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")




Input IDs type: torch.int64, Attention Mask type: torch.int64, Labels type: torch.int64
Input IDs type: torch.int64, Attention Mask type: torch.int64, Labels type: torch.int64
Input IDs type: torch.int64, Attention Mask type: torch.int64, Labels type: torch.int64
Input IDs type: torch.int64, Attention Mask type: torch.int64, Labels type: torch.int64
Input IDs type: torch.int64, Attention Mask type: torch.int64, Labels type: torch.int64
Input IDs type: torch.int64, Attention Mask type: torch.int64, Labels type: torch.int64
Input IDs type: torch.int64, Attention Mask type: torch.int64, Labels type: torch.int64
Input IDs type: torch.int64, Attention Mask type: torch.int64, Labels type: torch.int64
Input IDs type: torch.int64, Attention Mask type: torch.int64, Labels type: torch.int64
Input IDs type: torch.int64, Attention Mask type: torch.int64, Labels type: torch.int64
Input IDs type: torch.int64, Attention Mask type: torch.int64, Labels type: torch.int64
Input IDs type: torch.int64, Att

KeyboardInterrupt: 

In [None]:
model.save_pretrained("fine_tuned_chatbot")
tokenizer.save_pretrained("fine_tuned_chatbot")


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the model and tokenizer
model = GPT2LMHeadModel.from_pretrained("fine_tuned_chatbot")
tokenizer = GPT2Tokenizer.from_pretrained("fine_tuned_chatbot")
model.eval()


In [None]:
def generate_response(model, tokenizer, context, max_length=50):
    # Tokenize input context
    input_ids = tokenizer.encode(context, return_tensors="pt")

    # Generate response
    response_ids = model.generate(input_ids, max_length=max_length, pad_token_id=tokenizer.eos_token_id, 
                                   no_repeat_ngram_size=2, temperature=0.7)

    # Decode response
    response = tokenizer.decode(response_ids[0], skip_special_tokens=True)
    return response


In [None]:
while True:
    user_input = input("You: ")
    if user_input.lower() in ['exit', 'quit']:
        break

    bot_response = generate_response(model, tokenizer, user_input)
    print(f"Chatbot: {bot_response}")
