In [2]:
!pip install keras-cv


Collecting keras-cv
  Downloading keras_cv-0.9.0-py3-none-any.whl.metadata (12 kB)
Collecting keras-core (from keras-cv)
  Downloading keras_core-0.1.7-py3-none-any.whl.metadata (4.3 kB)
Downloading keras_cv-0.9.0-py3-none-any.whl (650 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m650.7/650.7 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading keras_core-0.1.7-py3-none-any.whl (950 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m950.8/950.8 kB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: keras-core, keras-cv
Successfully installed keras-core-0.1.7 keras-cv-0.9.0


In [3]:
import pandas as pd
import numpy as np
import string
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from keras_cv.losses import FocalLoss
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional



In [7]:
# Raw inputs: df1 is the survey export, df2 the combined statements file.
# Both are read as UTF-8 from the working directory.
df1 = pd.read_csv("survey.csv", encoding="utf-8")
df2 = pd.read_csv("Combined Data.csv", encoding="utf-8")

In [8]:
# Align df2's schema with df1: the free-text column becomes 'comments'
# and the target column becomes 'treatment'.
df2 = df2.rename(columns={'statement': 'comments', 'status': 'treatment'})


In [None]:
df2

Unnamed: 0.1,Unnamed: 0,comments,treatment
0,0,oh my gosh,Anxiety
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,3,I've shifted my focus to something else but I'...,Anxiety
4,4,"I'm restless and restless, it's been a month n...",Anxiety
...,...,...,...
53038,53038,Nobody takes me seriously I’ve (24M) dealt wit...,Anxiety
53039,53039,"selfishness ""I don't feel very good, it's lik...",Anxiety
53040,53040,Is there any way to sleep better? I can't slee...,Anxiety
53041,53041,"Public speaking tips? Hi, all. I have to give ...",Anxiety


In [9]:
# Keep only the text and target columns of the survey frame, then discard
# rows whose 'comments' is missing (rows missing only 'treatment' are kept).
df1 = df1[['comments', 'treatment']].dropna(subset=['comments'])


In [None]:
df1

Unnamed: 0,comments,treatment
13,I'm not on my company's health insurance which...,No
15,I have chronic low-level neurological issues t...,Yes
16,My company does provide healthcare but not to ...,Yes
24,Relatively new job. Ask again later,Yes
25,Sometimes I think about using drugs for my me...,Yes
...,...,...
1223,Although my employer does everything they can ...,No
1232,I work at a large university with a track reco...,Yes
1234,i'm in a country with social health care so my...,Yes
1245,In australia all organisations of a certain si...,Yes


In [10]:
# Keep only the text and target columns of the combined frame, then discard
# rows whose 'comments' is missing (rows missing only 'treatment' are kept).
df2 = df2[['comments', 'treatment']].dropna(subset=['comments'])


In [None]:
df2

Unnamed: 0,comments,treatment
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety
...,...,...
53038,Nobody takes me seriously I’ve (24M) dealt wit...,Anxiety
53039,"selfishness ""I don't feel very good, it's lik...",Anxiety
53040,Is there any way to sleep better? I can't slee...,Anxiety
53041,"Public speaking tips? Hi, all. I have to give ...",Anxiety


In [11]:

# Normalize and clean comments
import re
def clean_text(text):
    """Normalise a raw comment into lowercase alphanumeric words.

    The input is coerced with ``str()`` and lowercased, then three
    substitutions run in order: anything starting with ``http`` up to the
    next whitespace is removed, every character that is not a letter,
    digit or whitespace is removed, and runs of whitespace collapse to a
    single space. Leading and trailing whitespace is stripped.

    Args:
        text: Any object; it is converted with ``str()`` first.

    Returns:
        The cleaned string.
    """
    cleaned = str(text).lower()
    # Order matters: URLs must go before punctuation is stripped, otherwise
    # the URL body would survive as a run of letters.
    substitutions = (
        (r"http\S+", ""),
        (r"[^a-zA-Z0-9\s]", ""),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [12]:
# Terms that are repeated in the text to give them extra weight downstream.
important_terms = ["depressed", "suicidal", "anxious", "bipolar", "sad", "hopeless"]


def emphasize_keywords(text):
    """Repeat each important term three times wherever it occurs as a whole word.

    Matching is done on word boundaries, so a term embedded in a longer
    word (e.g. "sad" inside "disadvantage", "hopeless" inside
    "hopelessness") is left untouched instead of being split apart.
    Matching is case-sensitive; callers are expected to pass lowercased
    text. The repeated copies are joined by single spaces and no trailing
    space is added.

    Args:
        text: The string to process.

    Returns:
        The string with every whole-word occurrence of a term in
        ``important_terms`` replaced by that term written three times.
    """
    if not important_terms:
        return text
    # Built on each call so later edits to `important_terms` are honoured;
    # the `re` module caches the compiled pattern internally.
    alternatives = "|".join(re.escape(term) for term in important_terms)
    pattern = r"\b(?:" + alternatives + r")\b"
    # A single pass guarantees already-inserted repetitions are never re-expanded.
    return re.sub(pattern, lambda match: " ".join([match.group(0)] * 3), text)


In [13]:
# Survey data: keep text + target, drop rows without a comment, then derive
#   clean_comments - comment passed through clean_text and emphasize_keywords
#   label          - 1 for treatment 'Yes', 0 for 'No' (other values are not in the mapping)
df1 = (
    df1.loc[:, ['comments', 'treatment']]
    .dropna(subset=['comments'])
    .assign(
        clean_comments=lambda frame: frame['comments'].apply(clean_text).apply(emphasize_keywords),
        label=lambda frame: frame['treatment'].map({'Yes': 1, 'No': 0}),
    )
)

In [14]:
# Combined data: keep text + target, drop rows without a comment, then derive
#   clean_comments - comment passed through clean_text and emphasize_keywords
#   label          - 0 when the status reads 'normal' (case-insensitive), 1 for anything else
df2 = (
    df2.loc[:, ['comments', 'treatment']]
    .dropna(subset=['comments'])
    .assign(
        clean_comments=lambda frame: frame['comments'].apply(clean_text).apply(emphasize_keywords),
        label=lambda frame: frame['treatment'].apply(
            lambda status: 0 if str(status).lower() == 'normal' else 1
        ),
    )
)


In [None]:
df1.shape

(164, 4)

In [None]:
df2.shape

(52681, 4)

In [15]:
from nltk.stem import WordNetLemmatizer
import nltk
# Fetch the WordNet corpus (the captured output shows it being downloaded to
# /root/nltk_data) before creating the lemmatizer.
nltk.download('wordnet')
# Single shared lemmatizer instance; lemmatize_text reads it as a global.
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize every whitespace-separated token of ``text``.

    Tokens are passed one by one to the module-level ``lemmatizer`` and the
    results are re-joined with single spaces, so any run of whitespace in
    the input is normalised to one space.

    Args:
        text: The string to lemmatize.

    Returns:
        The space-joined lemmatized tokens.
    """
    lemmas = (lemmatizer.lemmatize(token) for token in text.split())
    return " ".join(lemmas)

# Lemmatize the cleaned text of both frames, overwriting the column in place.
for frame in (df1, df2):
    frame['clean_comments'] = frame['clean_comments'].apply(lemmatize_text)


[nltk_data] Downloading package wordnet to /root/nltk_data...


In [16]:
# Stack the survey rows on top of the combined-data rows, keeping only the
# model inputs (cleaned text + binary label) and renumbering the index from 0.
merged_df = pd.concat(
    [frame[['clean_comments', 'label']] for frame in (df1, df2)],
    ignore_index=True,
)

In [None]:
merged_df

Unnamed: 0,clean_comments,label
0,im not on my company health insurance which co...,0
1,i have chronic lowlevel neurological issue tha...,1
2,my company doe provide healthcare but not to m...,1
3,relatively new job ask again later,1
4,sometimes i think about using drug for my ment...,1
...,...,...
52840,nobody take me seriously ive 24m dealt with de...,1
52841,selfishness i dont feel very good it like i do...,1
52842,is there any way to sleep better i cant sleep ...,1
52843,public speaking tip hi all i have to give a pr...,1


In [None]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52845 entries, 0 to 52844
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   clean_comments  52845 non-null  object
 1   label           52845 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 825.8+ KB


In [None]:
print(merged_df['label'].value_counts())

label
1    36444
0    16401
Name: count, dtype: int64


In [17]:
# Build the vocabulary from every cleaned comment; words not seen here are
# mapped to the "<OOV>" token at prediction time.
# NOTE(review): the vocabulary is fit on the full corpus, i.e. before the
# train/test split performed later - confirm this is intended.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(merged_df['clean_comments'])

# Integer-encode each comment and bring every sequence to length 100,
# padding at the end of short sequences.
sequences = tokenizer.texts_to_sequences(merged_df['clean_comments'])
padded_sequences = pad_sequences(sequences, maxlen=100, padding='post')

# Model inputs and targets.
X, y = padded_sequences, merged_df['label']


In [18]:
# Parse the GloVe file into {token: vector}. Each line is read as a token
# followed by its whitespace-separated float components.
embedding_index = {}
embedding_dim = 100
with open("glove.6B.100d.txt", encoding='utf8') as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        embedding_index[fields[0]] = np.asarray(fields[1:], dtype='float32')


# One row per tokenizer index, plus row 0 which no word uses. Words without
# a GloVe vector keep their all-zero row.
word_index = tokenizer.word_index
MAX_VOCAB = len(word_index) + 1
embedding_matrix = np.zeros((MAX_VOCAB, embedding_dim))
# Every index in word_index is below MAX_VOCAB by construction, so no bounds
# check is needed before writing the row.
for token, row in word_index.items():
    vector = embedding_index.get(token)
    if vector is not None:
        embedding_matrix[row] = vector

In [19]:
y.shape

(52845,)

In [20]:
# Hold out 20% of the rows for testing; stratifying on y keeps the label
# ratio the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [21]:
from sklearn.utils import class_weight
# 'balanced' per-class weights computed from the training labels only.
weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train,
)
# Keras expects {class_index: weight}. Keys are the positions 0..n-1, which
# coincide with the labels here because the labels are 0 and 1.
class_weights = dict(enumerate(weights))
print("Class Weights:", class_weights)

Class Weights: {0: np.float64(1.6110052587455224), 1: np.float64(0.7250214371462871)}


In [22]:
# Notebook-level hyper-parameters (kept as globals for later cells).
# max_length mirrors the maxlen passed to pad_sequences when X was built.
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100
max_length = 100

# Frozen pre-trained embeddings -> bidirectional LSTM -> dropout -> sigmoid.
# `input_length` is intentionally not passed to Embedding: it is deprecated
# and ignored (with a warning) by Keras 3, and the sequence length is
# inferred from the data on the first call to fit().
model = Sequential([
    Embedding(
        input_dim=MAX_VOCAB,
        output_dim=embedding_dim,
        weights=[embedding_matrix],  # initialise from the GloVe matrix
        trainable=False,             # keep the pre-trained vectors fixed
    ),
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.6),
    Dense(1, activation='sigmoid'),
])

model.compile(loss=FocalLoss(), optimizer='adam', metrics=['accuracy'])

# Train with class weights.
# NOTE(review): the held-out test split is also used as validation data here,
# so the reported val_* metrics are not independent of any tuning based on them.
model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=3,
    batch_size=32,
    class_weight=class_weights,
)




Epoch 1/3
[1m1322/1322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 64ms/step - accuracy: 0.8053 - loss: 0.0361 - val_accuracy: 0.9107 - val_loss: 0.0205
Epoch 2/3
[1m1322/1322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 63ms/step - accuracy: 0.8934 - loss: 0.0228 - val_accuracy: 0.9090 - val_loss: 0.0201
Epoch 3/3
[1m1322/1322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 63ms/step - accuracy: 0.9085 - loss: 0.0194 - val_accuracy: 0.8957 - val_loss: 0.0208


<keras.src.callbacks.history.History at 0x7bc8ed67a4d0>

In [23]:
model.summary()

In [24]:
def preprocess_text_pipeline(text):
    """Run a raw comment through the same preprocessing used for training.

    The steps are applied in order: ``clean_text``, then
    ``emphasize_keywords``, then ``lemmatize_text``.

    Args:
        text: The raw comment.

    Returns:
        The fully preprocessed string.
    """
    return lemmatize_text(emphasize_keywords(clean_text(text)))

def predict_comment(comment, threshold=0.4, maxlen=100):
    """Score a single comment with the trained model and print the result.

    The comment is preprocessed with ``preprocess_text_pipeline``, encoded
    with the global ``tokenizer``, post-padded to ``maxlen`` and passed to
    the global ``model``. Four lines are printed: the original comment, the
    preprocessed comment, the predicted probability and the resulting label.

    Args:
        comment: Raw text to classify.
        threshold: Probabilities greater than or equal to this value are
            reported as "High Risk", anything lower as "Low Risk".
            Defaults to 0.4, the cut-off previously hard-coded here.
        maxlen: Sequence length passed to ``pad_sequences``. Defaults to
            100, matching the length used when the training data was padded.

    Returns:
        None. The result is only printed.
    """
    processed = preprocess_text_pipeline(comment)
    sequence = tokenizer.texts_to_sequences([processed])
    padded = pad_sequences(sequence, maxlen=maxlen, padding='post')
    # predict() returns one row per input and one column per output unit;
    # there is a single input and a single sigmoid unit.
    prob = model.predict(padded, verbose=0)[0][0]
    label = "High Risk" if prob >= threshold else "Low Risk"

    print(f"📝 Entered Comment: {comment}")
    print(f"🔍 Cleaned Comment: {processed}")
    print(f"📊 Probability: {prob:.4f}")
    print(f"🔔 Prediction: {label}")

# Interactive demo: input() blocks until the user types a comment, which is
# then scored and printed by predict_comment.
user_input = input("Enter your mental health comment: ")
predict_comment(user_input)


Enter your mental health comment: she feels hopeless
📝 Entered Comment: she feels hopeless
🔍 Cleaned Comment: she feel hopeless hopeless hopeless
📊 Probability: 0.4286
🔔 Prediction: High Risk


In [None]:
print(merged_df['label'].value_counts())

label
1    36444
0    16401
Name: count, dtype: int64


In [25]:
# Persist the trained model (HDF5 file) and the fitted tokenizer (pickle) so
# predictions can be served without retraining. Both are needed together:
# the tokenizer defines the word indices the model was trained on.
import pickle

model.save("mental_health_model.h5")
with open("tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

