### Data Preparation

In [1]:
# Import pandas for data analysis
import pandas as pd

#df  = pd.read_csv("/content/Merged_data_QA.csv")
df = pd.read_csv("./Mental_health.csv", encoding='windows-1252', encoding_errors='replace')
df.head() #show first five rows

Unnamed: 0,Questions,Answers
0,"What is mental health, and why is it important?","Mental health refers to our emotional, psychol..."
1,How do I maintain mental health during stressf...,Managing your mental health in the midst of st...
2,How do I know if I need trauma therapy?,Deciding whether to pursue trauma?focused ther...
3,What are the most common medications for menta...,Here’s an overview of the major classes of psy...
4,What are the side effects of antidepressants?,"Antidepressants differ in their mechanisms, bu..."


In [2]:
df.shape # 118 rows | 2 cols

(592, 2)

In [3]:
len(df.Questions)

592

In [4]:
len(df.Answers)

592

In [5]:
pip install cleantext

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [6]:
import os
print(os.getcwd())


!pip install cleantext
import nltk
nltk.download('stopwords')

c:\Users\mohda\Documents\AI Health Assistant\Notebook
Defaulting to user installation because normal site-packages is not writeable


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mohda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
# Function to clean text data by removing unwanted characters and formatting
import cleantext
import os
print(os.getcwd())
print("Current directory:", os.getcwd())
print("Directory contents:", os.listdir())

def clean(textdata):
    '''Use the cleantext library to remove extra spaces, lowercase the text,
        and remove numbers and punctuation'''
    cleaned_text = []
    for i in textdata:
        cleaned_text.append(cleantext.clean(str(i), extra_spaces=True, lowercase=True, stopwords=False, stemming=False, numbers=True, punct=True, clean_all = True))
        
    return cleaned_text

c:\Users\mohda\Documents\AI Health Assistant\Notebook
Current directory: c:\Users\mohda\Documents\AI Health Assistant\Notebook
Directory contents: ['Clean_medication_output.csv', 'clean_Mental_health_data.csv', 'Data_cleaned_medication_QA_data.csv', 'Diabetes Classification.ipynb', 'diabetes.csv', 'logs', 'Medication Chat.ipynb', 'medication_info_model', 'Medicine Classification.ipynb', 'Mental_health.csv', 'Mental_health.ipynb', 'Mental_health_info_model', 'random_forest_model.joblib', 'results']


In [8]:
# Apply the clean function to the questions and answers columns

df.Questions = list(clean(df.Questions))
df.Answers = list(clean(df.Answers))

In [9]:
# Save the cleaned data into a new CSV file & save
df.to_csv("clean_Mental_health_data.csv", index=False)

### GPT-2 Model

In [10]:
pip install datasets

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.



In [11]:
!pip install torch

Defaulting to user installation because normal site-packages is not writeable


In [12]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
import torch

# Load the GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')




In [13]:
# Set the padding token for the tokenizer to be the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

# Maximum sequence length that GPT-2 can handle
max_length = tokenizer.model_max_length
print(max_length)

1024


In [14]:
from datasets import load_dataset

# Load the cleaned QA dataset as a training set using the 'datasets' library
dataset = load_dataset('csv', data_files={'train': 'clean_Mental_health_data.csv'}, split='train')

Generating train split: 0 examples [00:00, ? examples/s]

In [15]:
#Function to tokenize questions and answers and prepare them for the model
def tokenize_function(examples):

  '''1. Combine each question and answer into a single input string
     2. Tokenize the combined text using the GPT-2 tokenizer
     3. Set the labels to be the same as the input_ids (shifted to predict the next word)
     4. Return the tokenized output. '''
    
  combined_text = [str(q) + " " + str(a) for q, a in zip(examples['Questions'], examples['Answers'])]
  tokenized_output = tokenizer(combined_text, padding='max_length', truncation=True, max_length=128)
  tokenized_output['labels'] = tokenized_output['input_ids'].copy()
  
  return tokenized_output

In [16]:
# Tokenize the entire dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/592 [00:00<?, ? examples/s]

In [17]:
import torch

if torch.cuda.is_available():
    device_name = torch.cuda.get_device_name(0)
    device_type = "GPU"
    total_memory = torch.cuda.get_device_properties(0).total_memory / 1e9  # GB
else:
    import psutil
    device_name = "CPU"
    device_type = "CPU"
    total_memory = psutil.virtual_memory().total / 1e9  # GB

print(f"Device: {device_type} ({device_name})")
print(f"Memory: {total_memory:.2f} GB")

from transformers import TrainingArguments, Trainer


# Define training arguments for the GPT-2 model
training_args = TrainingArguments(
    output_dir='./results',  # Directory to save model outputs
    num_train_epochs=3,  # Train for 50 epochs
    fp16 = True,
    per_device_train_batch_size=8, # Batch size during training
    per_device_eval_batch_size=32,  # Batch size during evaluation
    eval_strategy="no", # Evaluation can slow training.
    save_strategy="epoch", #Saves checkpoints only once per epoch instead of every few steps
    warmup_steps=500,  # Warmup steps for learning rate scheduler
    weight_decay=0.01,  # Weight decay for regularization
    logging_dir='./logs',  # Directory for saving logs
    logging_steps=500,  # Log every 10 steps
    save_steps=2000,  # Save model checkpoints every 1000 steps
)

Device: CPU (CPU)
Memory: 6.38 GB


In [18]:
# Trainer class to handle training process
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [19]:
# Train the model
trainer.train()
model.config.loss_type = None  # Avoid setting unrecognized loss_type 

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


In [20]:
# Save the model
trainer.save_model('Mental_health_model.joblib')

In [21]:
# Function to generate a response based on a user prompt (testing the model)
def generate_response(prompt):
    inputs = tokenizer.encode(prompt, return_tensors="pt")
    outputs = model.generate(inputs, max_length=150, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)

    # Decode the generated output
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Remove the prompt from the response
    if response.startswith(prompt):
        response = response[len(prompt):].strip()  # Remove the prompt from the response

    return response


In [22]:
# Example conversation
user_input = "How can I become a Data Scientist?"
bot_response = generate_response(user_input)
print("Bot Response:", bot_response)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Bot Response: Data scientists are the most effective tools for building a team of professionals who understand the challenges of data science and the challenges of data science can be incredibly powerful for both the team and the organization.

Data science is about building a team of professionals who understand the challenges of data science and the challenges of data analytics can be incredibly powerful for both the team and the organization

here are some key tools for becoming a data scientist



   understand the challenges of data science

here are some key tools for becoming a data scientist

    understand the challenges of data science

   you need to understand the challenges of data science

  you need to understand the challenges of


# Use this for google notbook

In [23]:
# Copying the model to Google Drive (optional)
import shutil

# Path to the file in Colab
#colab_file_path = '/content/counsel_model/model.safetensors'

# Path to your Google Drive
#drive_file_path = '/content/drive/MyDrive'

# Copy the file
# shutil.copy(colab_file_path, drive_file_path)

In [24]:
# Use this for anoconda and vs code 

In [27]:
import shutil

# Path to the source file on your system
source_file_path = r'C:\Users\mohda\Documents\AI Health Assistant\Notebook\medication_info_model\model.safetensors'

# Destination folder where you want to copy the file
destination_folder = r'C:\Users\mohda\Documents\AI Health Assistant\backend\models\Mental_health_model'

destination_file_path = destination_folder + r'\model.safetensors'
# Copy the fil
#;[{}] ea
shutil.copy(source_file_path, destination_file_path)

print("File copied successfully!")

File copied successfully!
