Mental Health Counsel Chatbot

Kaggle Notebook: Mental Health Counsel Chatbot
Description: Provides mental health counseling data, which we used to supplement information from the primary dataset and align topics for consistent categorization.
Mental Health Synthetic Dataset

Kaggle Dataset: Mental Health Synthetic Dataset
Description: This primary dataset contains synthetic data on mental health symptoms, demographics, and treatment, forming the basis for model training and recommendation generation.

In [1]:
import pandas as pd

mental_df = pd.read_csv("mental_health.csv")
mental_df.columns

Index(['User ID', 'Age', 'Gender', 'Symptoms', 'Duration (weeks)',
       'Previous Diagnosis', 'Therapy History', 'Medication',
       'Diagnosis / Condition', 'Suggested Therapy', 'Self-care Advice',
       'Urgency Level', 'Mood', 'Stress Level'],
      dtype='object')

In [2]:
# Renmae columns: easy to follow up.
mental_df = mental_df.rename(columns= {
    'Diagnosis / Condition' : 'Diagnosis',
    'Self-care Advice': 'Self_Care_Advice',
    'Therapy History' : 'Therapy_History',
    'Stress Level' : 'Stress_Level',
    'Urgency Level' : 'Urgency_Level',
    'Suggested Therapy': 'Suggested_Therapy',
    'Duration (weeks)': 'Duration',
    'Previous Diagnosis': 'Prev_Diagnosis'
})

# Check renamed columns
mental_df.columns

Index(['User ID', 'Age', 'Gender', 'Symptoms', 'Duration', 'Prev_Diagnosis',
       'Therapy_History', 'Medication', 'Diagnosis', 'Suggested_Therapy',
       'Self_Care_Advice', 'Urgency_Level', 'Mood', 'Stress_Level'],
      dtype='object')

In [3]:
symptoms_unique_values = mental_df['Symptoms'].unique()
therapy_unique_values = mental_df['Suggested_Therapy'].unique()
diagnosis_unique_values = mental_df['Diagnosis'].unique()

print("Symtoms unique values \n", symptoms_unique_values)
print("therapy_unique_values \n", therapy_unique_values)
print("diagnosis_unique_values \n", diagnosis_unique_values)

Symtoms unique values 
 ['feeling anxious' 'excessive worry' 'trouble sleeping'
 'loss of interest in activities' 'panic attacks' 'lack of concentration'
 'feeling irritable' 'feeling sad' 'feeling overwhelmed']
therapy_unique_values 
 ['Support Groups' 'Cognitive Behavioral Therapy' 'Psychotherapy'
 'Mindfulness-Based Therapy' 'No Therapy Needed']
diagnosis_unique_values 
 ['Panic Disorder' 'Depression' 'Anxiety' 'Burnout' 'Stress']


A label encoder is a technique that converts non-numerical data into numerical values, 
which is useful for machine learning and data analysis. <br>
It's often used when working with categorical data, such as ordinal data, 
where there's a hierarchy among the values

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
# Target supposed to be Target Supposed.
#  th
def build_self_test_self_care_advice():
    le_diagnosis = LabelEncoder()
    le_symtoms = LabelEncoder()
    le_self_care = LabelEncoder()
    le_therapy = LabelEncoder()
    # mental_df['Duration'] is already Int style, we don't have to encode it
    
    mental_df['Diagnosis_encoded'] = le_diagnosis.fit_transform(mental_df['Diagnosis'])
    mental_df['Symptoms_encoded'] = le_symtoms.fit_transform(mental_df['Symptoms'])
    mental_df['Self_Care_Advice_encoded'] = le_self_care.fit_transform(mental_df['Self_Care_Advice'])
    mental_df['Suggested_Therapy_encoded'] = le_therapy.fit_transform(mental_df['Suggested_Therapy'])

    # Training Data
    X = mental_df[['Diagnosis_encoded', 'Symptoms_encoded']]
    y_self_care = mental_df['Self_Care_Advice_encoded']
    y_therapy = mental_df['Suggested_Therapy_encoded']

    X_train, X_test, y_self_care_train, y_self_care_test, y_therapy_train, y_therapy_test = train_test_split(X, y_self_care, y_therapy, test_size=0.2, random_state=42)
    # Train models for Self Care Advice and Suggested Therapy
    model_self_care = RandomForestClassifier()
    model_therapy = RandomForestClassifier()

    model_self_care.fit(X_train, y_self_care_train)
    model_therapy.fit(X_train, y_therapy_train)

    # Make predictions
    self_care_pred = model_self_care.predict(X_test)
    therapy_pred = model_therapy.predict(X_test)

    # Display classification reports
    print("Self Care Advice Classification Report:")
    self_care_report = classification_report(y_self_care_test, self_care_pred, target_names=le_self_care.classes_)
    print(self_care_report)
    print("\nSuggested Therapy Classification Report:")
    self_therapy_report = classification_report(y_therapy_test, therapy_pred, target_names=le_therapy.classes_)
    print(self_therapy_report)
    return model_self_care , model_therapy, le_diagnosis, le_symtoms, le_self_care, le_therapy, self_care_report, self_therapy_report

# Initial Reports

### Self Care Advice Classification Report

| Self Care Advice       | Precision | Recall | F1-Score | Support |
|------------------------|-----------|--------|----------|---------|
| Breathing Exercises    | 0.20      | 0.17   | 0.18     | 166     |
| Exercise               | 0.17      | 0.24   | 0.20     | 181     |
| Journaling             | 0.32      | 0.14   | 0.20     | 191     |
| Meditation             | 0.11      | 0.03   | 0.04     | 116     |
| Take Breaks            | 0.17      | 0.33   | 0.23     | 164     |
| Talk to a Friend       | 0.22      | 0.21   | 0.21     | 182     |
| **Accuracy**           |           |        | 0.19     | 1000    |
| **Macro Avg**          | 0.20      | 0.19   | 0.18     | 1000    |
| **Weighted Avg**       | 0.21      | 0.19   | 0.18     | 1000    |

### Suggested Therapy Classification Report

| Suggested Therapy               | Precision | Recall | F1-Score | Support |
|---------------------------------|-----------|--------|----------|---------|
| Cognitive Behavioral Therapy    | 0.16      | 0.13   | 0.14     | 200     |
| Mindfulness-Based Therapy       | 0.14      | 0.03   | 0.05     | 189     |
| No Therapy Needed               | 0.17      | 0.09   | 0.11     | 187     |
| Psychotherapy                   | 0.17      | 0.31   | 0.22     | 202     |
| Support Groups                  | 0.24      | 0.34   | 0.28     | 222     |
| **Accuracy**                    |           |        | 0.19     | 1000    |
| **Macro Avg**                   | 0.17      | 0.18   | 0.16     | 1000    |
| **Weighted Avg**                | 0.18      | 0.19   | 0.17     | 1000    |



In [5]:
# model_self_care , model_therapy, le_diagnosis, le_symtoms, le_self_care, le_therapy, self_care_report, self_therapy_report = build_model()
# Save model for purpose
import joblib
def save_model(model_self_care , model_therapy, le_diagnosis, le_symtoms, le_self_care, le_therapy ):
    joblib.dump(model_self_care, "model_self_care.pkl")
    joblib.dump(model_therapy, "model_therapy.pkl")
    joblib.dump(le_diagnosis, "le_diagnosis.pkl")
    joblib.dump(le_symtoms, "le_symtoms.pkl")
    joblib.dump(le_self_care, "le_self_care.pkl")
    joblib.dump(le_therapy, "le_therapy.pkl")

# save_model(model_self_care , model_therapy, le_diagnosis, le_symtoms, le_self_care, le_therapy)

Current accuracy 0.18 precison also,
To imporve this models. <br>
There are serveral ways to imporve ways.<br>
Add more parameters (which contains demographic infomrationm which user can simply input them), also re mapping based Diagnosis.<br>
I created three value includes 2 informaiton.<br>
It wil help to organize better modeling. <br>
Current features has <br>


In [6]:
print(mental_df['Gender'].unique())
print(mental_df['Duration'].unique())
print(mental_df['Urgency_Level'].unique())
print(mental_df['Stress_Level'].unique())
print(mental_df['Prev_Diagnosis'].unique())

['Other' 'Female' 'Non-binary' 'Male']
[29 37 47 35 22  8 31 20 21  9 38 43 30  2  3 34 36 19 33 23 46 49 48 41
 11 16 10 45 13  4 39 12 51 24 17 32  1 14 44 15 26  5 40 27 42 18  6 50
 28 25  7]
['Moderate' 'High' 'Low' 'Critical']
[ 1  4  5  2  6  9 10  8  7  3]
['OCD' 'None' 'PTSD' 'Bipolar Disorder' 'Anxiety' 'Depression']


In [7]:
# Diagnosis Group

def group_diagnosis(row):
    if row['Diagnosis'] in ['Panic Disorder', 'Anxiety']:
        return 'Anxiety Disorders'
    elif row['Diagnosis'] in ['Depression', 'Burnout']:
        return 'Mood Disorders'
    elif row['Diagnosis'] == 'Stress':
        return 'Stress-Related Disorders'
def group_prev_diagnosis(row):
    if row['Prev_Diagnosis'] in ['Panic Disorder', 'Anxiety', 'OCD']:
        return 'Anxiety Disorders'
    elif row['Prev_Diagnosis'] in ['Depression', 'Bipolar Disorder']:
        return 'Mood Disorders'
    elif row['Prev_Diagnosis'] in ['Stress', 'PTSD']:
        return 'Stress-Related Disorders'
    else:
        return ''

def re_map_gender(row):
    if row['Gender'] == 'Male':
        return 1
    elif row['Gender'] == 'Female':
        return 2
    else:
        return 3
def re_map_urgency_level(row):
    if row["Urgency_Level"] == "Low":
        return 1
    elif row["Urgency_Level"] == "Moderate":
        return 2
    elif row["Urgency_Level"] == "High":
        return 3
    else:
        return 4
    
# Bascially
def improved_test_reports_diagnosis(mental_df):
    # Diagnosis Group
    mental_df["Diagnosis_Group"] = mental_df.apply(group_diagnosis, axis=1)
    mental_df['Prev_Diagnosis_Group'] = mental_df.apply(group_prev_diagnosis, axis=1)
    mental_df["Re_Gender"] = mental_df.apply(re_map_gender, axis=1)
    mental_df["Urgency_Level"] = mental_df.apply(re_map_urgency_level, axis=1)
    
    le_diagnosis_group = LabelEncoder()
    le_prev_Diagnosis_group = LabelEncoder()
    le_symtoms = LabelEncoder()

    mental_df['Diagnosis_Group_encoded'] = le_diagnosis_group.fit_transform(mental_df['Diagnosis_Group'])
    mental_df['Prev_Diagnosis_Group_encoded'] =  le_prev_Diagnosis_group.fit_transform(mental_df['Prev_Diagnosis'])
    mental_df['Symptoms_encoded'] = le_symtoms.fit_transform(mental_df['Symptoms'])

    # Training Data with duration
    X = mental_df[['Age', 'Symptoms_encoded', "Re_Gender", "Prev_Diagnosis_Group_encoded", "Duration", "Stress_Level", "Urgency_Level"]] 
    y_diagnosis = mental_df['Diagnosis_Group_encoded']

    X_train, X_test, y_diagnosis_train, y_diagnosis_test = train_test_split(X, y_diagnosis, test_size=0.2, random_state=42)
    
    # Train model for Diagnosis
    model_diagnosis = RandomForestClassifier()
    model_diagnosis.fit(X_train, y_diagnosis_train)

    # Make predictions
    diagnos_pred = model_diagnosis.predict(X_test)

    # Display classification reports
    print("Diagnosis Group Classification Report:")
    diagnosis_report = classification_report(y_diagnosis_test, diagnos_pred, target_names=le_diagnosis_group.classes_)
    print(diagnosis_report)
    
    return model_diagnosis, le_diagnosis_group, diagnosis_report
improved_test_reports_diagnosis(mental_df)


Diagnosis Group Classification Report:
                          precision    recall  f1-score   support

       Anxiety Disorders       0.46      0.64      0.54       422
          Mood Disorders       0.49      0.42      0.45       440
Stress-Related Disorders       0.23      0.07      0.11       138

                accuracy                           0.46      1000
               macro avg       0.39      0.38      0.37      1000
            weighted avg       0.44      0.46      0.44      1000



(RandomForestClassifier(),
 LabelEncoder(),
 '                          precision    recall  f1-score   support\n\n       Anxiety Disorders       0.46      0.64      0.54       422\n          Mood Disorders       0.49      0.42      0.45       440\nStress-Related Disorders       0.23      0.07      0.11       138\n\n                accuracy                           0.46      1000\n               macro avg       0.39      0.38      0.37      1000\n            weighted avg       0.44      0.46      0.44      1000\n')

In [8]:
from sklearn.svm import SVC
def improved_test_reports_diagnosis_svm(mental_df):
    # Same preprocessing steps
    mental_df["Diagnosis_Group"] = mental_df.apply(group_diagnosis, axis=1)
    mental_df['Prev_Diagnosis_Group'] = mental_df.apply(group_prev_diagnosis, axis=1)
    mental_df["Re_Gender"] = mental_df.apply(re_map_gender, axis=1)
    mental_df["Urgency_Level"] = mental_df.apply(re_map_urgency_level, axis=1)

    le_diagnosis_group = LabelEncoder()
    le_prev_Diagnosis_group = LabelEncoder()
    le_symtoms = LabelEncoder()

    mental_df['Diagnosis_Group_encoded'] = le_diagnosis_group.fit_transform(mental_df['Diagnosis_Group'])
    mental_df['Prev_Diagnosis_Group_encoded'] = le_prev_Diagnosis_group.fit_transform(mental_df['Prev_Diagnosis'])
    mental_df['Symptoms_encoded'] = le_symtoms.fit_transform(mental_df['Symptoms'])

    X = mental_df[['Age', 'Symptoms_encoded', "Re_Gender", "Prev_Diagnosis_Group_encoded", "Duration", "Stress_Level", "Urgency_Level"]] 
    y_diagnosis = mental_df['Diagnosis_Group_encoded']

    X_train, X_test, y_diagnosis_train, y_diagnosis_test = train_test_split(X, y_d|iagnosis, test_size=0.2, random_state=42)

    # Train model using Support Vector Machine
    model_diagnosis = SVC(kernel='poly')  # You can also try 'rbf' or 'poly'
    model_diagnosis.fit(X_train, y_diagnosis_train)

    # Make predictions
    diagnos_pred = model_diagnosis.predict(X_test)

    # Display classification report
    print("Diagnosis Group Classification Report (SVM):")
    diagnosis_report = classification_report(y_diagnosis_test, diagnos_pred, target_names=le_diagnosis_group.classes_)
    print(diagnosis_report)

    return model_diagnosis, le_diagnosis_group, diagnosis_report

improved_test_reports_diagnosis_svm(mental_df)

NameError: name 'y_d' is not defined

In [33]:
import nltk
from nltk.corpus import wordnet
from itertools import chain

# Based on questions and topics, Create chains then finds vlaues.
def find_related_words():
    nltk.download('wordnet')

    words = {
        'Panic': 'panic',
        'Disorder': 'disorder',
        'Depression': 'depression',
        'Burnout': 'burnout',
        'Stress': 'stress'
    }

    # Initialize a dictionary to store the results
    related_words = {}

    for key, word in words.items():
        word_synsets = wordnet.synsets(word)
        similar_words = set(chain(*[synset.lemma_names() for synset in word_synsets]))
        related_words[key] = list(similar_words)

    return related_words

# Get the related words and print the dictionary
similar_words_dict = find_related_words()
print(similar_words_dict)

{'Panic': ['terror', 'affright', 'panic', 'scare'], 'Disorder': ['perturb', 'disorderliness', 'distract', 'disarray', 'disquiet', 'trouble', 'cark', 'disorder', 'unhinge', 'upset'], 'Depression': ['depression', 'slump', 'natural_depression', 'impression', 'Great_Depression', 'depressive_disorder', 'clinical_depression', 'economic_crisis', 'Depression', 'imprint', 'low'], 'Burnout': [], 'Stress': ['accentuate', 'accent', 'punctuate', 'stress', 'tension', 'try', 'tenseness', 'emphasis', 'emphasize', 'emphasise', 'strain', 'focus']}


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ykim\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [34]:
# Result for related words
# {'Panic': ['panic', 'scare', 'terror', 'affright'], 
# 'Disorder': ['distract', 'upset', 'trouble', 'disorderliness', 'cark', 'disarray', 'disorder', 'perturb', 'disquiet', 'unhinge'], 
# 'Depression': ['impression', 'clinical_depression', 'slump', 'Great_Depression', 'Depression', 'low', 'depressive_disorder', 'natural_depression', 'imprint', 'economic_crisis', 'depression'], 
# 'Burnout': [], 
# 'Stress': ['strain', 'emphasis', 'emphasize', 'tension', 'punctuate', 'focus', 'accentuate', 'try', 'tenseness', 'emphasise', 'stress', 'accent']}


In [35]:
# Counsel data training.

In [36]:
counsel_df = pd.read_csv("counselchat-data.csv")

from collections import Counter
# Ensure you've downloaded the WordNet corpus
# Display the word frequencies
def get_word_frequencies(counsel_df):
    print("Orignal length" + str(len(counsel_df)))
    counsel_df = counsel_df[['questionText', 'topics','answerText']]
    all_words = ' '.join(counsel_df['topics'].astype(str)).replace(',', '').split()
    # Count the frequency of each word
    word_count = Counter(all_words)
    print("\nWord frequencies in the 'topics' column:")
    found_selected_count = 0
    stress_count = 0
    depression_count = 0
    disorder_count = 0
    anxiety_count = 0
    burn_out_count = 0
    for word, count in word_count.items():
        if word.__contains__("Stress"):
            stress_count +=count
        elif word.__contains__("Depression"):
            depression_count += count
        elif word.__contains__("Disorder"):
            disorder_count += count
        elif word.__contains__("Anxiety"):
            anxiety_count += count
        elif word.__contains__("Burnout"):
            burn_out_count += count
        else:
            print(f"{word} :  {count}")
    found_selected_count = stress_count + depression_count + disorder_count + anxiety_count + burn_out_count
    print("total counts" + str(len(counsel_df)))
    print("stress_count", stress_count)
    print("depression_count", depression_count)
    print("disorder_count", disorder_count)
    print("anxiety_count", anxiety_count)
    print("burn_out_count", burn_out_count)
    print("Found selcted count", found_selected_count)

get_word_frequencies(counsel_df)



Orignal length1482

Word frequencies in the 'topics' column:
Family :  119
Conflict :  91
Substance :  14
AbuseAddiction :  9
Behavioral :  49
ChangeSocial :  3
Relationships :  214
Relationship :  45
Dissolution :  68
Anger :  32
Management :  25
Sleep :  10
Improvement :  17
Professional :  34
EthicsLegal :  6
& :  45
Regulatory :  20
Social :  41
RelationshipsMarriage :  11
MarriageIntimacy :  26
Domestic :  9
ViolenceAnger :  2
ManagementFamily :  3
Human :  50
Sexuality :  33
ManagementSleep :  2
Military :  3
Issues :  10
RelationshipsDomestic :  3
Violence :  10
ViolenceRelationship :  1
Marriage :  25
Grief :  20
and :  23
Loss :  9
ConflictChildren :  1
Adolescents :  9
MarriageRelationship :  4
TraumaHuman :  1
RelationshipsIntimacy :  42
ManagementParenting :  1
Intimacy :  17
Workplace :  9
SexualityMarriage :  3
LGBTQ :  29
SpiritualityFamily :  2
Ethics :  23
ViolenceRelationships :  1
ConflictRelationships :  5
Self-esteem :  29
Self-esteemRelationships :  12
Parenting :

In [56]:
# https://my.clevelandclinic.org/health/diseases/22295-mental-health-disorders

import nltk
from nltk.corpus import wordnet

counsel_df = pd.read_csv("counselchat-data.csv")
target_Keywords = {
    'Anxiety Disorders': ['panic disorder', 'anxiety'],
    'Mood Disorders': ['depression', 'burnout'],
    'Stress-Related Disorders': ['stress', 'PTSD']
}

similar_words_dict = {
    'panic': ['panic', 'scare', 'terror', 'affright'],
    'disorder': ['distract', 'upset', 'trouble', 'disorderliness', 'cark', 'disarray', 'disorder', 'perturb', 'disquiet', 'unhinge'],
    'depression': ['impression', 'clinical_depression', 'slump', 'Great_Depression', 'depression', 'low', 'depressive_disorder', 'natural_depression', 'imprint', 'economic_crisis', 'depression'],
    'burnout': [],
    'stress': ['strain', 'emphasis', 'emphasize', 'tension', 'punctuate', 'focus', 'accentuate', 'try', 'tenseness', 'emphasise', 'stress', 'accent']
}

def group_diagnosis(row):
    all_words = str(row['topics']).lower().split()

    for disorder, keywords in target_Keywords.items():
        for keyword in keywords:
            if any(keyword in word for word in all_words):
                return disorder
            if keyword in similar_words_dict:
                # Check if any of the similar words match
                if any(sim_word in all_words for sim_word in similar_words_dict[keyword]):
                    return disorder
    return None
# Original 120.
# print(similar_words_dict)
# Adding new counsel_df based on items
counsel_df["re_diagnosis"] = counsel_df.apply(group_diagnosis, axis=1)
# print(counsel_df['re_diagnosis'])
# print(len(counsel_df['re_diagnosis']))

# cd remapping diagnosis.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Function to create and train the model
# TfidfVectorizer
# LogicRegression
def create_diagnosis_model(train_data, target_column='re_diagnosis'):
    # Extract text and target columns
    X_train = train_data['topics']
    y_train = train_data[target_column]

    # TF-IDF vectorization
    tfidf = TfidfVectorizer(max_features=5000)
    X_train_tfidf = tfidf.fit_transform(X_train)

    # Train a logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_tfidf, y_train)

    # Evaluate model accuracy on the training set
    X_train_pred = model.predict(X_train_tfidf)
    accuracy = accuracy_score(y_train, X_train_pred)
    print(f"Training Accuracy: {accuracy}")

    return model, tfidf

# Function to make predictions using the trained model
def predict_missing_diagnoses(df, model, tfidf, target_column='re_diagnosis'):
    # Extarct out df re_diagnosis is None
    df_test = df[df[target_column].isna()]

    X_test = df_test['topics']
    print(f"Number of rows in test set: {len(X_test)}")
    X_test = df_test['topics'].fillna('')  # Replace NaN values with an empty string
    non_empty_mask = X_test.str.strip() != ''

    # 한번더 걸러준다
    df_test = df_test[non_empty_mask]
    X_test =  X_test[non_empty_mask]

    # 
    X_test_tfidf = tfidf.transform(X_test)

    # Predict missing diagnoses
    predictions = model.predict(X_test_tfidf)

    # Assign predictions back to the DataFrame
    df.loc[df[target_column].isna() & non_empty_mask, target_column] = predictions

    return df

# Example usage
# Step 1: Create model
# Training and prediction process
df_train = counsel_df[counsel_df['re_diagnosis'].notna()]
model, tfidf = create_diagnosis_model(df_train)
remapped_consel_df = predict_missing_diagnoses(counsel_df, model, tfidf)



# remapped_consel_df.to_csv("remapped_consel_df", index=False)




Training Accuracy: 0.9976133651551312
Number of rows in test set: 1063


In [18]:
import sys
print("Environment path:", sys.executable)

Environment path: c:\Users\ykim\AppData\Local\anaconda3\python.exe


In [None]:
print(len(remapped_consel_df))
print(remapped_consel_df.columns)
remapped_consel_df = remapped_consel_df[['questionText', 'topics', 're_diagnosis', 'answerText']]
remapped_consel_df.to_csv("remapped_consel_df", index=False)

In [16]:
# Finally, train NLP gpt3.
# Starting from 
train_df = pd.read_csv('remapped_consel_df')


import pandas as pd
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW
from torch.utils.data import Dataset, DataLoader

# Load and preprocess data
train_df = pd.read_csv('remapped_consel_df')
train_df["input_text"] = "Question: " + train_df["questionText"] + " Topics: " + train_df["topics"] + " Diagnosis: " + train_df["re_diagnosis"] + " Response:"
train_df["target_text"] = train_df["answerText"]

# Initialize the model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Set padding token for GPT2, which does not have one by default
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

class ChatBotDataSet(Dataset):
    def __init__(self, tokenizer, input_texts, target_texts, max_len=128):
        self.tokenizer = tokenizer
        self.input_texts = input_texts
        self.target_texts = target_texts
        self.max_len = max_len

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, index):
        # Encode input and target texts with truncation and padding
        input_encodings = self.tokenizer(
            self.input_texts[index],
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        target_encodings = self.tokenizer(
            self.target_texts[index],
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            "input_ids": input_encodings["input_ids"].squeeze(),
            "attention_mask": input_encodings["attention_mask"].squeeze(),
            "labels": target_encodings["input_ids"].squeeze(),
        }

# Prepare dataset and dataloader
input_texts = train_df["input_text"].tolist()
target_texts = train_df["target_text"].tolist()
dataset = ChatBotDataSet(tokenizer, input_texts, target_texts)
train_loader = DataLoader(dataset, batch_size=2, shuffle=True)

# Model training setup
optimizer = AdamW(model.parameters(), lr=5e-5)
model.train()

for epoch in range(3):  # Adjust epochs as needed
    for batch in train_loader:
        optimizer.zero_grad()
        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["labels"]
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        print(f"Epoch: {epoch}, Loss: {loss.item()}")


TypeError: 'GPT2Tokenizer' object is not callable