# **Installations and Imports**

In [1]:
!pip install datasets
!pip install transformers datasets accelerate
!pip install gradio

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [2]:
import pandas as pd
from datasets import load_dataset, Dataset
import json
from transformers import LlamaTokenizer, LlamaForCausalLM, TrainingArguments, Trainer, pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
from sklearn.model_selection import train_test_split
import gradio as gr

# **MentalChat16K Dataset**

In [None]:
'''
This dataset consists of synthetic data that could optionally be added:

ds = load_dataset("ShenLab/MentalChat16K")
train_data = ds['train']

MentalChat16K_Df = pd.DataFrame({
    'question': train_data['input'],
    'answer': train_data['output']
})

MentalChat16K_Df.to_csv('MentalChat16K.csv', index = False)

'''

# **Mental Health Chatbot Dataset**

In [3]:
ds = load_dataset("heliosbrahma/mental_health_chatbot_dataset")

train_data = ds['train']
texts = train_data['text']
questions = []
answers = []

for text in texts:
    exchanges = text.split("<HUMAN>:")
    for exchange in exchanges[1:]:
        if "<ASSISTANT>:" in exchange:
            question, answer = exchange.split("<ASSISTANT>:")
            questions.append(question.strip())
            answers.append(answer.strip())

MentalHealthChatbot_Df = pd.DataFrame({
    'question': questions,
    'answer': answers
})

MentalHealthChatbot_Df.to_csv("mental_health_chatbot_parsed.csv", index=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# **Counsel Chat Dataset**

In [4]:
'''
This dataset should first be downloaded online from kaggle,
then the file called 'oneHotData.csv' contains the questions and answer pairs.
'''
CounselChat = pd.read_csv('/content/oneHotData.csv')
questionlist = []
answerlist = []
for i in CounselChat['questionFull']:
  questionlist.append(i)
for i in CounselChat['answerText']:
  answerlist.append(i)

CounselChatDf = pd.DataFrame({
    'question': questionlist,
    'answer': answerlist
})

CounselChatDf.to_csv("CounselChat.csv", index=False)

# **Mental Health FAQ Dataset**

In [5]:
'''
This dataset should first be downloaded online from kaggle,
It then gives the file 'Mental_Health_FAQ.csv'. This file is processed for usage here.
'''
FAQ = pd.read_csv('/content/Mental_Health_FAQ.csv')
questionlist = []
answerlist = []
for i in FAQ['Questions']:
  questionlist.append(i)
for i in FAQ['Answers']:
  answerlist.append(i)

FAQ_Dataframe = pd.DataFrame({
    'question': questionlist,
    'answer': answerlist
})

FAQ_Dataframe.to_csv("FAQ_Dataframe.csv", index=False)

# **Merging the dataframes**

In [6]:
'''
Merging all questions and answers from the created dataframes together
and shuffeling them randomly.
OPTIONAL: add the MentalChat16K_Df to fill the data with synthetic data.
'''
QA_Input_Df = pd.concat([FAQ_Dataframe, CounselChatDf, MentalHealthChatbot_Df], ignore_index=True)
QA_Input_Df = QA_Input_Df.sample(frac=1, random_state=42).reset_index(drop=True)
QA_Input_Df.to_csv("QA_Input_Df.csv", index=False)

# **Pre-process the data**

In [7]:
#Load the dataset
data = pd.read_csv("QA_Input_Df.csv")
#Clean the dataset: fill NaN values and convert to strings
data['question'] = data['question'].fillna("").astype(str)
data['answer'] = data['answer'].fillna("").astype(str)
#Convert to a Hugging Face Dataset
formatted_data = {
    "instruction": data['question'].tolist(),
    "response": data['answer'].tolist()
}
hf_dataset = Dataset.from_dict(formatted_data)

print(f"Dataset prepared with {len(hf_dataset)} entries.")

Dataset prepared with 1646 entries.


In [8]:
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
#Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
#Add padding token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

print("TinyLlama model and tokenizer loaded successfully.")

TinyLlama model and tokenizer loaded successfully.


In [9]:
#Split the dataset into training and evaluation sets (80/20 split)
train_data, eval_data = train_test_split(data, test_size=0.2, random_state=42)

#Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_data.rename(columns={"question": "instruction", "answer": "response"}))
eval_dataset = Dataset.from_pandas(eval_data.rename(columns={"question": "instruction", "answer": "response"}))

#Tokenize the datasets
def tokenize_function(examples):
    return tokenizer(
        examples["instruction"],
        text_pair=examples["response"],
        padding="max_length",
        truncation=True,
        max_length=512
    )

#Tokenize training and evaluation datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)

#Remove unnecessary columns
tokenized_train_dataset = tokenized_train_dataset.remove_columns(["instruction", "response"])
tokenized_eval_dataset = tokenized_eval_dataset.remove_columns(["instruction", "response"])

#Format for PyTorch
tokenized_train_dataset.set_format("torch")
tokenized_eval_dataset.set_format("torch")

Map:   0%|          | 0/1316 [00:00<?, ? examples/s]

Map:   0%|          | 0/330 [00:00<?, ? examples/s]

# **Fine-tune the model**

In [10]:
#Define training arguments
training_args = TrainingArguments(
    output_dir="./Mental_Health_Chatbot",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    eval_strategy="steps",
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
    num_train_epochs=3,
    learning_rate=5e-5,
    fp16=True,
    logging_dir="./logs",
    logging_steps=50,
    report_to="none"
)

#Define a custom Trainer to compute the loss
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs["input_ids"].clone()
        labels[:, :-1] = inputs["input_ids"][:, 1:]
        labels[:, -1] = -100
        #Forward pass
        outputs = model(**inputs)
        logits = outputs.logits
        #Compute the loss
        loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100)
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

#Initialize the custom Trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer
)

#Fine-tune the model
trainer.train()

#Save the fine-tuned model
trainer.save_model("./Mental_Health_Chatbot")
print("Model 'Mental_Health_Chatbot' fine-tuned and saved.")

  trainer = CustomTrainer(


Step,Training Loss,Validation Loss
100,1.0309,No log
200,0.4892,No log


Model 'Mental_Health_Chatbot' fine-tuned and saved.


# **Testing the model**

**In order to test the model, you don't have to run all the code cells above except for the installations and imports. They are just for illustration of how the model was built. To test the model, load all the files in 'Final_Trained_Model' into the environment in a folder that you must call 'Mental_Health_Chatbot' and run all code cells below. The model.safetensors file is 4.4 GB, so some environments do not support this upload. You can then upload it to google drive and acces it from there using the first code cell below.**

In [5]:
'''
If you want to acces the model.safetensors from your google drive, run this code and then drag the model.safetensors file from the 'drive' folder to the 'Mental_Health_Chatbot' folder.
Note that uploading the 4.4 GB file to your drive can take a few minutes.
'''
from google.colab import drive
drive.mount('/content/drive') #Adjust this to pathname to the path you need.

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
'''
Run this when you are on an NVIDIA GPU like the A100 from google colab, if you are on your cpu, use the next code block.
'''
#Load the model and tokenizer
model_name = "./Mental_Health_Chatbot"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda")

In [20]:
'''
Use this only when you run the model on your local CPU. Note that the model's responses take longer on your local cpu (even up to 2 minutes).
'''
#Load the model and tokenizer
model_name = "./Mental_Health_Chatbot"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [22]:
#Check device availability
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

#Function to handle chatbot responses
def chat_with_bot(user_input, history=None):
    if history is None:
        history = []
    try:
        #Format the input with conversation history
        history_text = "".join([f"Human: {msg}\nAssistant: {resp}\n" for msg, resp in history])
        prompt = f"{history_text}Human: {user_input}\nAssistant:"
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        #Generate the response
        outputs = model.generate(
            **inputs,
            max_length=len(inputs["input_ids"][0]) + 500,  #Edit this to change the max output length of the LLM
            temperature=0.7,
            top_k=50,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id
        )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        #Extract the assistant's response
        assistant_response = response[len(prompt):].strip()
        history.append((user_input, assistant_response))
        return assistant_response, history
    except Exception as e:
        print(f"Error during generation: {e}")
        return "Sorry, I couldn't process that. Please try again.", history

#Building an interface for the chatbot with gradio
chat_interface = gr.Interface(
    fn=chat_with_bot,
    inputs=["text", "state"],
    outputs=["text", "state"],
    title="Mental Health Chatbot",
    description="Ask questions about mental health, and the chatbot will provide answers.",
    examples=[
        ["What is depression?"],
        ["How can I manage anxiety?"],
        ["What are the symptoms of a panic attack?"]
    ]
)

In [23]:
#Run this to launch the chatbot
chat_interface.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://2df4d8a582be21c4c4.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


