In [None]:
import pandas as pd

# Read the CSV into a DataFrame (the path is kept exactly as recorded).
csv_path = "/Combined Data.csv"
data = pd.read_csv(csv_path)

# Show which columns came back from the file.
print("Column names:", data.columns)


Column names: Index(['Unnamed: 0', 'statement', 'status'], dtype='object')


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re

# Download NLTK resources
# 'stopwords' provides the English stop-word list that clean_text reads;
# 'wordnet' is presumably the corpus WordNetLemmatizer depends on — confirm.
# The last call is the final expression of the cell, so its return value is
# what the notebook displays (True in the recorded run).
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Map mental health status labels to numeric values
label_mapping = {'Normal': 0, 'Depression': 1, 'Suicidal': 2, 'Anxiety': 3, 'Stress': 4, 'Bipolar': 5, 'Personality disorder': 6}

# Series.map turns every value that is not a key of the mapping into NaN, so
# re-running this cell on an already-encoded column would erase every label.
# Encode only while the column still holds something other than the codes.
if not data['status'].isin(list(label_mapping.values())).all():
    encoded_status = data['status'].map(label_mapping)

    # Fail loudly instead of feeding NaN targets into the model later on.
    unmapped = data.loc[encoded_status.isna(), 'status']
    if not unmapped.empty:
        raise ValueError(
            f"{len(unmapped)} row(s) have a status outside label_mapping: "
            f"{sorted(unmapped.astype(str).unique())}"
        )

    data['status'] = encoded_status.astype('int64')


In [None]:
# Text cleaning function
# Patterns are compiled once rather than being looked up on every call.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_DIGITS_RE = re.compile(r'\d+')
# Lazily filled cache for the default stop-word set and lemmatizer, so the
# NLTK resources are loaded once instead of once per token / per statement.
_CLEAN_TEXT_DEFAULTS = {}


def clean_text(text, stop_words=None, lemmatizer=None):
    """Normalise a raw statement into a space-joined string of lemmatised tokens.

    Steps, in order: lowercase, remove punctuation, remove digits, split on
    whitespace, drop stop words, lemmatise every remaining token.

    Args:
        text: The raw statement as a ``str``.
        stop_words: Optional collection of words to drop. When omitted, the
            NLTK English stop-word list is loaded once and cached as a
            ``frozenset`` (constant-time membership checks).
        lemmatizer: Optional object exposing ``lemmatize(word)``. When
            omitted, a single shared ``WordNetLemmatizer`` is created on first
            use and reused afterwards.

    Returns:
        The cleaned text; an empty string when no token survives.
    """
    if stop_words is None:
        stop_words = _CLEAN_TEXT_DEFAULTS.get('stop_words')
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))
            _CLEAN_TEXT_DEFAULTS['stop_words'] = stop_words

    if lemmatizer is None:
        lemmatizer = _CLEAN_TEXT_DEFAULTS.get('lemmatizer')
        if lemmatizer is None:
            lemmatizer = WordNetLemmatizer()
            _CLEAN_TEXT_DEFAULTS['lemmatizer'] = lemmatizer

    text = text.lower()
    text = _PUNCTUATION_RE.sub('', text)  # Remove punctuation
    text = _DIGITS_RE.sub('', text)       # Remove digits

    # NOTE(review): punctuation is stripped before the stop-word check, so a
    # contraction such as "don't" is compared as "dont" and presumably no
    # longer matches the stop-word list — confirm whether that is intended.
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )


In [None]:

# Run the cleaner over every statement: missing entries become empty strings
# and every value is coerced to str before clean_text sees it.
statements_as_text = data['statement'].fillna('').astype(str)
data['statement'] = statements_as_text.apply(clean_text)

In [None]:
# Pull the model inputs (cleaned statements) and targets (status codes)
# out of the frame as plain arrays.
texts, labels = (data[column].values for column in ('statement', 'status'))

In [None]:

# Turn each statement into a fixed-length sequence of word indices.
max_len = 100  # length every sequence is padded / truncated to

# Vocabulary is learned from the statements, capped through num_words.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index

# Encode the statements, then pad them into one 2-D array.
sequences = tokenizer.texts_to_sequences(texts)
data_padded = pad_sequences(sequences, maxlen=max_len)


In [None]:

# Train-test split
# The classes are strongly imbalanced (the smallest class has over ten times
# fewer samples than the largest), so stratify on the labels to keep every
# class's share the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    data_padded,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [None]:

# Build Bidirectional LSTM model
# The tokenizer only emits indices below its num_words cap, so the embedding
# table needs no more rows than that; sizing it to the full word_index would
# allocate rows that can never be looked up.
vocab_size = len(word_index) + 1
if tokenizer.num_words:
    vocab_size = min(vocab_size, tokenizer.num_words)

# One output unit per status class defined in label_mapping.
num_classes = len(label_mapping)

model = Sequential([
    # Declares the fixed sequence length; replaces Embedding's deprecated
    # input_length argument.
    tf.keras.Input(shape=(max_len,)),
    Embedding(input_dim=vocab_size, output_dim=128),
    Bidirectional(LSTM(64, return_sequences=True)),  # full sequence for the next LSTM
    Dropout(0.5),
    Bidirectional(LSTM(32)),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),  # one probability per mental health status
])



In [None]:

# Configure training: Adam optimiser, sparse categorical cross-entropy for
# the integer class labels, and accuracy as the reported metric.
compile_options = dict(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.compile(**compile_options)


In [None]:

# Train the model
# Validation loss stops improving after the first couple of epochs while the
# training loss keeps falling, so stop once val_loss stalls and roll back to
# the best weights instead of evaluating the last (overfitted) epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)
history = model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stopping],
)


Epoch 1/5
[1m531/531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m296s[0m 543ms/step - accuracy: 0.5232 - loss: 1.2355 - val_accuracy: 0.6766 - val_loss: 0.8259
Epoch 2/5
[1m531/531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m327s[0m 554ms/step - accuracy: 0.7191 - loss: 0.7495 - val_accuracy: 0.7295 - val_loss: 0.7129
Epoch 3/5
[1m531/531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m308s[0m 528ms/step - accuracy: 0.7931 - loss: 0.5906 - val_accuracy: 0.7342 - val_loss: 0.7184
Epoch 4/5
[1m531/531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m323s[0m 529ms/step - accuracy: 0.8217 - loss: 0.5035 - val_accuracy: 0.7403 - val_loss: 0.7236
Epoch 5/5
[1m531/531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 526ms/step - accuracy: 0.8589 - loss: 0.4091 - val_accuracy: 0.7315 - val_loss: 0.8000


In [None]:

# Score the held-out set: one row of class scores per sample, then pick the
# index of the highest score as the predicted class.
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)


[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 60ms/step


In [None]:
# Evaluate the predictions against the true test labels.
accuracy = accuracy_score(y_test, y_pred_classes)

# Precision, recall and F1 are all averaged with class-support weighting.
precision, recall, f1 = (
    scorer(y_test, y_pred_classes, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_test, y_pred_classes)

# Emit the report one item per line.
print(
    f"Test Accuracy: {accuracy * 100:.2f}%",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1 Score: {f1:.2f}",
    "Confusion Matrix:",
    conf_matrix,
    sep="\n",
)

Test Accuracy: 74.90%
Precision: 0.75
Recall: 0.75
F1 Score: 0.75
Confusion Matrix:
[[3017  102   93   34   56   10   15]
 [  83 2093  785   36   15   42   46]
 [  77  464 1464    3    5    5    0]
 [  29   62   11  608   22   38    9]
 [  75  107   25   67  230   19   34]
 [  47   51    8   17    7  426   24]
 [  46   49    9   13    8   15  108]]


In [None]:
from scipy.stats import ttest_ind

# Replace these with your actual F1-scores from repeated runs
# (the values below are placeholders, not results measured by this notebook).
lstm_f1_scores = [0.75, 0.74, 0.75, 0.76, 0.75]
gru_f1_scores = [0.76, 0.77, 0.76, 0.76, 0.77]


def significance_verdict(t_stat, p_value, alpha=0.05):
    """Turn a GRU-vs-LSTM t-test result into a one-line verdict.

    Args:
        t_stat: Test statistic computed as GRU scores minus LSTM scores, so a
            positive value means the GRU mean is higher.
        p_value: Two-sided p-value of the test.
        alpha: Significance threshold.

    Returns:
        A sentence naming the better model, or stating that the difference is
        not significant.
    """
    # `not p_value < alpha` also routes a NaN p-value to "no difference".
    if not p_value < alpha:
        return "There is no significant difference between the GRU and LSTM models."
    # A significant result only favours the GRU when its mean is the higher one.
    if t_stat > 0:
        return "The GRU model's performance is significantly better than the LSTM model."
    return "The LSTM model's performance is significantly better than the GRU model."


# Perform T-test (GRU first, so a positive statistic favours the GRU)
t_stat, p_value = ttest_ind(gru_f1_scores, lstm_f1_scores)

# Display the results
print(f"T-statistic: {t_stat}")
print(f"P-value: {p_value}")
print(significance_verdict(t_stat, p_value))


T-statistic: 3.5
P-value: 0.00807908226041189
The GRU model's performance is significantly better than the LSTM model.


In [None]:
import numpy as np

# Illustration only: nothing below calls the trained model. The sentences,
# true labels and "predicted" labels are all hand-written stand-ins.
# Swap in the real test statements, y_test and model predictions to use it.
mock_test_data = [
    "I feel really good today!",
    "I'm feeling very anxious about tomorrow.",
    "I don't know how to continue with life.",
    "I'm managing, but things are hard.",
    "I have a lot of energy and feel unstoppable!",
]

# Hand-written labels, aligned position-by-position with mock_test_data.
mock_true_labels = ["Normal", "Anxiety", "Suicidal", "Stress", "Bipolar"]
mock_predicted_labels = ["Normal", "Anxiety", "Suicidal", "Stress", "Normal"]

# One (statement, true label, predicted label) tuple per example.
predictions_output = [
    (statement, true_label, predicted_label)
    for statement, true_label, predicted_label in zip(
        mock_test_data, mock_true_labels, mock_predicted_labels
    )
]
predictions_output


[('I feel really good today!', 'Normal', 'Normal'),
 ("I'm feeling very anxious about tomorrow.", 'Anxiety', 'Anxiety'),
 ("I don't know how to continue with life.", 'Suicidal', 'Suicidal'),
 ("I'm managing, but things are hard.", 'Stress', 'Stress'),
 ('I have a lot of energy and feel unstoppable!', 'Bipolar', 'Normal')]