In [1]:
import pandas as pd

In [2]:
# read json file
data = pd.read_json("data/combined_dataset.json", lines=True)

In [3]:
data

Unnamed: 0,Context,Response
0,I'm going through some things with my feelings...,"If everyone thinks you're worthless, then mayb..."
1,I'm going through some things with my feelings...,"Hello, and thank you for your question and see..."
2,I'm going through some things with my feelings...,First thing I'd suggest is getting the sleep y...
3,I'm going through some things with my feelings...,Therapy is essential for those that are feelin...
4,I'm going through some things with my feelings...,I first want to let you know that you are not ...
...,...,...
3507,My grandson's step-mother sends him to school ...,Absolutely not! It is never in a child's best ...
3508,My boyfriend is in recovery from drug addictio...,I'm sorry you have tension between you and you...
3509,The birth mother attempted suicide several tim...,"The true answer is, ""no one can really say wit..."
3510,I think adult life is making him depressed and...,How do you help yourself to believe you requir...


In [4]:
# Display dataset info
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3512 entries, 0 to 3511
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Context   3512 non-null   object
 1   Response  3512 non-null   object
dtypes: object(2)
memory usage: 55.0+ KB


### Data cleaning
#### Remove duplicates

In [5]:
data.duplicated().sum()

760

In [6]:
data.drop_duplicates(inplace=True)

In [7]:
data.duplicated().sum()

0

#### Remove missing values

In [8]:
data.isna().sum()

Context     0
Response    0
dtype: int64

#### Clean the Context Text

In [9]:
import re

In [10]:
# Remove Extra Spaces, Tabs, and Newlines
data['Context'] = data['Context'].str.replace(r"\s+", " ", regex=True).str.strip()

In [11]:
# Standardize Capitalization
data['Context'] = data['Context'].str.lower()
data['Context'] = data['Context'].str.replace(r"(^\w|\.\s*\w)", lambda m: m.group().upper(), regex=True)


In [12]:
# Remove Sensitive Data
data['Context'] = data['Context'].str.replace(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", "[EMAIL]", regex=True)
data['Context'] = data['Context'].str.replace(r"\b\d{10}\b", "[PHONE NUMBER]", regex=True)


In [13]:
# Normalize Punctuation
data['Context'] = data['Context'].str.replace(r"[?!]+", lambda m: m.group()[0], regex=True)
data['Context'] = data['Context'].str.replace(r"([.,!?])(\w)", r"\1 \2", regex=True)
data['Context'] = data['Context'].str.replace(r"\s([.,!?])", r"\1", regex=True)


#### Clean the Response Text

In [14]:
data['Response'] = data['Response'].str.replace(r"\s+", " ", regex=True).str.strip()


In [15]:
data['Response'] = data['Response'].str.lower()
data['Response'] = data['Response'].str.replace(r"(^\w|\.\s*\w)", lambda m: m.group().upper(), regex=True)


In [16]:
data['Response'] = data['Response'].str.replace(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", "[EMAIL]", regex=True)
data['Response'] = data['Response'].str.replace(r"\b\d{10}\b", "[PHONE NUMBER]", regex=True)


In [17]:
data['Response'] = data['Response'].str.replace(r"[?!]+", lambda m: m.group()[0], regex=True)
data['Response'] = data['Response'].str.replace(r"([.,!?])(\w)", r"\1 \2", regex=True)
data['Response'] = data['Response'].str.replace(r"\s([.,!?])", r"\1", regex=True)


In [18]:
# Save to a new JSON file
data.to_json("data/cleaned_dataset.json", orient="records", lines=True)

print("Dataset cleaned and saved as 'data/cleaned_dataset.json'.")


Dataset cleaned and saved as 'data/cleaned_dataset.json'.


In [19]:
clean_data = pd.read_json("data/cleaned_dataset.json", lines=True)

In [20]:
clean_data

Unnamed: 0,Context,Response
0,I'm going through some things with my feelings...,"If everyone thinks you're worthless, then mayb..."
1,I'm going through some things with my feelings...,"Hello, and thank you for your question and see..."
2,I'm going through some things with my feelings...,First thing i'd suggest is getting the sleep y...
3,I'm going through some things with my feelings...,Therapy is essential for those that are feelin...
4,I'm going through some things with my feelings...,I first want to let you know that you are not ...
...,...,...
2747,"After first meeting the client, what is the pr...",Hi. This is an excellent question! i think tha...
2748,My boyfriend is in recovery from drug addictio...,I'm sorry you have tension between you and you...
2749,The birth mother attempted suicide several tim...,"The true answer is, ""no one can really say wit..."
2750,I think adult life is making him depressed and...,How do you help yourself to believe you requir...


### Tokenization

In [21]:
from transformers import AutoTokenizer

In [22]:
# Initialize the tokenizer
model_name = "gpt2"  # Replace with your desired model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Assign the eos_token as the padding token
tokenizer.pad_token = tokenizer.eos_token


In [23]:
# Tokenize the input and output columns
input_ids = tokenizer(list(clean_data["Context"]), padding=True, truncation=True, max_length=512, return_tensors="pt").input_ids
output_ids = tokenizer(list(clean_data["Response"]), padding=True, truncation=True, max_length=512, return_tensors="pt").input_ids

In [26]:
input_ids

tensor([[   40,  1101,  1016,  ..., 50256, 50256, 50256],
        [   40,  1101,  1016,  ..., 50256, 50256, 50256],
        [   40,  1101,  1016,  ..., 50256, 50256, 50256],
        ...,
        [  464,  4082,  2802,  ..., 50256, 50256, 50256],
        [   40,   892,  4044,  ..., 50256, 50256, 50256],
        [   40,   655,  1718,  ..., 50256, 50256, 50256]])

In [24]:
# Combine into a single dataset
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, input_ids, output_ids):
        self.input_ids = input_ids
        self.output_ids = output_ids

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "labels": self.output_ids[idx],
        }

# Create the dataset object
dataset = TextDataset(input_ids, output_ids)

#### Fine-Tune the Model

In [25]:
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments

# Load the model
model = AutoModelForCausalLM.from_pretrained(model_name)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10,
    save_total_limit=2,
    logging_dir="./logs",
)

# Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

# Fine-tune the model
trainer.train()





ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`

#### Save the Fine-Tuned Model


In [None]:
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")
print("Model and tokenizer saved!")


####  Deploy as a Chatbot

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_saved_model(model_path):
    """
    Load the saved model and tokenizer.
    """
    try:
        # Load the tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        
        # Load the model
        model = AutoModelForCausalLM.from_pretrained(model_path)
        
        # Move model to GPU if available
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = model.to(device)
        
        return model, tokenizer
    
    except Exception as e:
        print(f"Error loading saved model: {e}")
        return None, None

def generate_response(model, tokenizer, prompt, max_length=200):
    """
    Generate a response using the saved model.
    """
    try:
        # Prepare the input
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        
        # Generate response
        outputs = model.generate(
            inputs.input_ids, 
            max_length=max_length, 
            num_return_sequences=1,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7
        )
        
        # Decode the response
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        return response
    
    except Exception as e:
        print(f"Error generating response: {e}")
        return ""

def interactive_chat():
    """
    Interactive chat interface to input questions and get responses.
    """
    # Path to the saved model
    model_path = "./fine_tuned_model"
    
    # Load the saved model and tokenizer
    model, tokenizer = load_saved_model(model_path)
    
    if not model or not tokenizer:
        print("Failed to load the saved model.")
        return
    
    print("🤖 Interactive Model Response")
    print("Type 'exit' to quit the program")
    
    while True:
        try:
            # Get user input
            user_input = input("\nEnter your question: ").strip()
            
            # Check for exit condition
            if user_input.lower() in ['exit', 'quit', 'bye']:
                print("Goodbye! 👋")
                break
            
            # Generate and print response
            if user_input:
                response = generate_response(model, tokenizer, user_input)
                print("\nModel's Response:", response)
        
        except KeyboardInterrupt:
            print("\n\nChat interrupted. Goodbye! 👋")
            break
        except Exception as e:
            print(f"An error occurred: {e}")

if __name__ == "__main__":
    interactive_chat()