In [3]:
!pip install tf-keras

Collecting tf-keras
  Downloading tf_keras-2.20.1-py3-none-any.whl.metadata (1.8 kB)
Downloading tf_keras-2.20.1-py3-none-any.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ---------------------------------------- 1.7/1.7 MB 22.7 MB/s  0:00:00
Installing collected packages: tf-keras
Successfully installed tf-keras-2.20.1


In [5]:
import pandas as pd
import torch
import re
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from torch.utils.data import Dataset




In [6]:
# Loading our dataset
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,Context,Response
0,I'm going through some things with my feelings...,"If everyone thinks you're worthless, then mayb..."
1,I'm going through some things with my feelings...,"Hello, and thank you for your question and see..."
2,I'm going through some things with my feelings...,First thing I'd suggest is getting the sleep y...
3,I'm going through some things with my feelings...,Therapy is essential for those that are feelin...
4,I'm going through some things with my feelings...,I first want to let you know that you are not ...


In [7]:
import re

# Functioning to clean text
def clean_text(text):
    if isinstance(text, str):  
        text = text.lower()
        text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  
    return text

# Appling the function to clean the 'Context' and 'Response' columns
df["Cleaned_Questions"] = df["Context"].apply(clean_text)
df["Cleaned_Answers"] = df["Response"].apply(clean_text)

# Displaying processed data
df.head()

Unnamed: 0,Context,Response,Cleaned_Questions,Cleaned_Answers
0,I'm going through some things with my feelings...,"If everyone thinks you're worthless, then mayb...",im going through some things with my feelings ...,if everyone thinks youre worthless then maybe ...
1,I'm going through some things with my feelings...,"Hello, and thank you for your question and see...",im going through some things with my feelings ...,hello and thank you for your question and seek...
2,I'm going through some things with my feelings...,First thing I'd suggest is getting the sleep y...,im going through some things with my feelings ...,first thing id suggest is getting the sleep yo...
3,I'm going through some things with my feelings...,Therapy is essential for those that are feelin...,im going through some things with my feelings ...,therapy is essential for those that are feelin...
4,I'm going through some things with my feelings...,I first want to let you know that you are not ...,im going through some things with my feelings ...,i first want to let you know that you are not ...


In [8]:
# Loading GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Adding special tokens
tokenizer.add_special_tokens({"sep_token": "[SEP]", "pad_token": "[PAD]"})  

# Formating dataset as "question [SEP] answer"
df["Formatted_Text"] = df["Cleaned_Questions"] + " [SEP] " + df["Cleaned_Answers"]

df["Formatted_Text"] = df["Formatted_Text"].fillna("")

df["Formatted_Text"] = df["Formatted_Text"].astype(str)

# Check for the first few rows of the cleaned column
df["Formatted_Text"].head()


0    im going through some things with my feelings ...
1    im going through some things with my feelings ...
2    im going through some things with my feelings ...
3    im going through some things with my feelings ...
4    im going through some things with my feelings ...
Name: Formatted_Text, dtype: object

In [9]:
# Tokenize dataset with proper padding
encodings = tokenizer(
    df["Formatted_Text"].tolist(),
    padding=True,  
    truncation=True,
    max_length=128,
    return_tensors="pt"
)
encodings.keys()

KeysView({'input_ids': tensor([[  320,  1016,   832,  ...,   284,   262,   976],
        [  320,  1016,   832,  ...,   284,   466,   674],
        [  320,  1016,   832,  ...,  3785,   340,   503],
        ...,
        [ 1169,  4082,  2802,  ...,  1738,   284,   915],
        [   72,   892,  4044,  ..., 50258, 50258, 50258],
        [   72,   655,  1718,  ..., 50258, 50258, 50258]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])})

In [10]:
class MentalHealthDataset(Dataset):
    def __init__(self, encodings):
        self.input_ids = encodings["input_ids"]
        self.attention_mask = encodings["attention_mask"]
        
        # Labels are same as input_ids 
        self.labels = encodings["input_ids"]  
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx], 
        }

# Creating dataset with labels
dataset = MentalHealthDataset(encodings)

In [11]:
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size

train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, val_size, test_size])

print(f"Training: {len(train_dataset)}, Validation: {len(val_dataset)}, Test: {len(test_dataset)}")

Training: 2809, Validation: 351, Test: 352


In [12]:
# Loading GPT-2 model with text generation head
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Resizing our token embeddings 
model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(50259, 768)

In [12]:
training_args = TrainingArguments(
    output_dir="./gpt2_mental_health_1",
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    num_train_epochs=25,
    learning_rate=5e-5,
    save_steps=500,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,   # present but not used automatically
)
trainer.train()
# run validation manually:
metrics = trainer.evaluate()
print(metrics)


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
500,3.1534
1000,2.3778
1500,1.9706
2000,1.6752
2500,1.4452
3000,1.2514
3500,1.1104
4000,0.9966
4500,0.9099
5000,0.8331


{'eval_loss': 1.5282549858093262, 'eval_runtime': 8.4, 'eval_samples_per_second': 41.786, 'eval_steps_per_second': 4.286, 'epoch': 25.0}


In [None]:
# # Define training arguments
# training_args = TrainingArguments(
#     output_dir="./gpt2_mental_health_128",
#     evaluation_strategy="epoch",
#     per_device_train_batch_size=10,
#     per_device_eval_batch_size=10,
#     num_train_epochs=25,
#     learning_rate=5e-5,
#     save_steps=500,
#     logging_dir="./logs",
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
# )


In [None]:
trainer.train()

In [12]:
# Get the total size of the dataset
dataset_size = len(test_dataset)

# Set subset size 
subset_size = min(1000, dataset_size)

# Create subset safely
small_test_dataset = torch.utils.data.Subset(test_dataset, range(subset_size))

# Run evaluation on the data
trainer.evaluate(small_test_dataset)


{'eval_loss': 1.538917064666748,
 'eval_runtime': 8.3586,
 'eval_samples_per_second': 42.112,
 'eval_steps_per_second': 4.307,
 'epoch': 25.0}

In [1]:
import matplotlib.pyplot as plt

history = trainer.state.log_history
train_data = [(log['epoch'], log['loss']) for log in history if 'loss' in log]
eval_data = [(log['epoch'], log['eval_loss']) for log in history if 'eval_loss' in log]

train_epochs, train_losses = zip(*train_data) if train_data else ([], [])
eval_epochs, eval_losses = zip(*eval_data) if eval_data else ([], [])

plt.figure(figsize=(12, 5))
plt.plot(train_epochs, train_losses, 'b-o', label='Training Loss')
plt.plot(eval_epochs, eval_losses, 'r-o', label='Validation Loss')
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training vs Validation Loss")
plt.grid(True)
plt.legend()
plt.savefig("training_progress.png", dpi=300)
plt.show()


NameError: name 'trainer' is not defined

In [5]:
!pip install nltk absl rouge_score

Collecting nltk
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)


ERROR: Could not find a version that satisfies the requirement absl (from versions: none)
ERROR: No matching distribution found for absl


In [3]:
import evaluate

# Load the ROUGE and BLEU metrics
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

# Define a function for generating predictions
def generate_predictions(model, tokenizer, dataset):
    model.eval()
    predictions = []
    references = []
    
    for idx in range(len(dataset)):
        input_ids = dataset[idx]['input_ids'].unsqueeze(0).to(model.device)  # Add batch dimension and move to device
        attention_mask = dataset[idx]['attention_mask'].unsqueeze(0).to(model.device)
        
        # Generate prediction with explicit pad_token_id
        outputs = model.generate(
        input_ids=input_ids, 
        attention_mask=attention_mask, 
        max_new_tokens=50,  # Limit the number of new tokens generated
        pad_token_id=tokenizer.eos_token_id  # Explicitly set the pad token ID
)

        pred_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Get the actual text from the dataset
        target_text = tokenizer.decode(dataset[idx]['labels'], skip_special_tokens=True)
        
        predictions.append(pred_text)
        references.append([target_text])  
    
    return predictions, references

# Generate predictions and references
predictions, references = generate_predictions(model, tokenizer, small_test_dataset)

# Compute ROUGE and BLEU scores
rouge_score = rouge.compute(predictions=predictions, references=references)
bleu_score = bleu.compute(predictions=predictions, references=references)

# Printing the scores
print("ROUGE Score:", rouge_score)
print("BLEU Score:", bleu_score)

Downloading builder script: 5.94kB [00:00, 5.97MB/s]
Downloading extra modules: 4.07kB [00:00, 4.10MB/s]                   
Downloading extra modules: 3.34kB [00:00, 334kB/s]


NameError: name 'model' is not defined

In [13]:
# Saving the  model & tokenizer
model.save_pretrained("mental_health_chatbot_gpt2")
tokenizer.save_pretrained("mental_health_chatbot_gpt2")

print("Model and tokenizer saved successfully!")

Model and tokenizer saved successfully!


In [4]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load your fine-tuned model
model_path = "mental_health_chatbot_gpt2"

tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Move model to GPU if available
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50259, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50259, bias=False)
)

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load model and tokenizer
model_path = "mental_health_chatbot_gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def clean_text(text):
    return text.strip()

def chatbot_response(user_input):
    user_input = clean_text(user_input)
    input_text = user_input + " [SEP]"

    # Tokenize
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

    # Generate response
    output = model.generate(
        input_ids,
        max_length=100,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode
    response = tokenizer.decode(output[:, input_ids.shape[-1]:][0], skip_special_tokens=True)
    return response

# Chat loop
while True:
    user_input = input("You: ")
    if user_input.lower() == "exit":
        break
    response = chatbot_response(user_input)
    print(f"Chatbot: {response}")


Chatbot:  i am so sorry to hear that you are going through thisthis is a very normal human response to pain i dont know how old you are but i would suggest that you be between the ages of 15 and 18 if you are in good health and have access to a good health insurance i would suggest that you reach out to a local mental health professional and they could help you schedule a time to talk to someone hi there thank you for your question i have a few thoughts
Chatbot:  i would say that you need to start looking for a better job and getting a professional relationship with your fiancee living longer may be a good thing if you are able to find one and the relationship will improve the long termÂ  if your fiancee is not happy with the amount of time you have been married for example she may be resentful and may be seeking validation from youit is possible that she is seeking validation from you from someone who is
