In [None]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout

In [None]:
import nltk

# NLTK resources fetched for text preprocessing, with the reason each is needed.
NLTK_RESOURCES = {
    'stopwords': 'removing common words',
    'punkt': 'tokenization',
    'wordnet': 'lemmatization',
    'averaged_perceptron_tagger': 'POS tagging (used in some preprocessing)',
}

# nltk.download() signals failure through its boolean return value rather than
# by raising, so collect the results instead of discarding them. A failure is
# only reported (not raised): the resource may already be usable from a local
# copy even when the download attempt itself did not succeed.
failed_downloads = [name for name in NLTK_RESOURCES if not nltk.download(name)]
if failed_downloads:
    print(f"WARNING: NLTK resources that could not be downloaded: {failed_downloads}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
import pandas as pd
from csv import QUOTE_NONE

# Read the CSV with pandas' default quoting rules.
# The previous call combined quoting=QUOTE_NONE with on_bad_lines='skip':
# QUOTE_NONE treats quote characters as ordinary text, so a comma inside a
# quoted statement splits the row into extra fields, and on_bad_lines='skip'
# then drops that row without any error. Default quoting keeps such rows intact.
data = pd.read_csv('/content/Combined Data.csv')

# Show the available column names
print(data.columns)




Index(['Unnamed: 0', 'statement', 'status'], dtype='object')


In [None]:
!pip install tensorflow




In [None]:
!pip install keras




In [None]:
pip install --upgrade tensorflow




In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder

# Load dataset
data = pd.read_csv('/content/Combined Data.csv')

# Settings shared by the tokenizer, the padding step and the model, defined
# once so the three cannot drift apart.
MAX_WORDS = 5000  # vocabulary size kept by the tokenizer == Embedding input_dim
MAX_LEN = 100     # every sequence is padded/truncated to this many tokens

# Clean and preprocess data
# .copy() gives an independent frame for the column assignments that follow.
data = data.dropna(subset=['statement', 'status']).copy()  # Remove rows with missing values
data['statement'] = data['statement'].astype(str)  # Ensure all statements are strings
data['status'] = data['status'].str.strip()  # Remove leading/trailing spaces
# Standardize label variations (regex=False: plain substring replacement)
data['status'] = data['status'].str.replace('Bipolar', 'Bi-Polar', regex=False)

# Encode labels using LabelEncoder
label_encoder = LabelEncoder()
y_int = label_encoder.fit_transform(data['status'])
# Derive the class count from the data instead of hard-coding it, so the
# one-hot width and the model's output layer always agree with the labels.
num_classes = len(label_encoder.classes_)
y = to_categorical(y_int, num_classes=num_classes)  # One-hot encode the labels

# Tokenize and pad text data
texts = data['statement'].values
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(texts)
X = tokenizer.texts_to_sequences(texts)
X = pad_sequences(X, padding='post', maxlen=MAX_LEN)

# Train-test split
# Stratify so both splits keep the label distribution of the full dataset.
# Stratification needs at least two samples per class; fall back to a plain
# random split otherwise rather than failing.
stratify_labels = y_int if np.bincount(y_int).min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_labels
)

# Calculate class weights from the training labels only, so that nothing about
# the held-out test set influences training.
train_labels = np.argmax(y_train, axis=1)
train_classes = np.unique(train_labels)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=train_classes,
    y=train_labels
)
# Key each weight by its actual class id (not by position), which stays correct
# even if some class is absent from the training split.
class_weight_dict = dict(zip(train_classes.tolist(), class_weights.tolist()))

# Define the GRU model
# `input_length` is omitted: it is deprecated in Keras 3 and the sequence
# length is inferred from the data passed to fit().
model = Sequential([
    Embedding(input_dim=MAX_WORDS, output_dim=128),
    GRU(128, dropout=0.2),
    Dense(num_classes, activation='softmax')  # One output unit per class
])

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
# Validation uses a slice of the training data (validation_split) instead of
# the test set, so the test metrics below are measured on data that played no
# part in monitoring training.
history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=64,
    validation_split=0.1,
    class_weight=class_weight_dict
)

# Evaluate the model on the held-out test set
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test, axis=1)

# Passing the full list of class ids keeps the report and matrix aligned with
# label_encoder.classes_ even if a class does not occur in the test split.
all_class_ids = np.arange(num_classes)

print(f"Accuracy: {accuracy_score(y_test_classes, y_pred_classes):.4f}")
print("Classification Report:")
print(classification_report(
    y_test_classes, y_pred_classes,
    labels=all_class_ids,
    target_names=label_encoder.classes_,
    zero_division=0
))
print("Confusion Matrix:")
print(confusion_matrix(y_test_classes, y_pred_classes, labels=all_class_ids))

# Define prediction function
def predict_statement(model, tokenizer, label_encoder, statement, max_len=100):
    """
    Classify one statement and report the model's confidence in that class.

    Args:
        model: Trained model; its ``predict`` method is called on the padded input.
        tokenizer: Fitted tokenizer used to turn the text into token ids.
        label_encoder: Fitted LabelEncoder used to map the class index back to a label.
        statement: The text to classify (string).
        max_len: Length the token sequence is padded/truncated to.

    Returns:
        A ``(label, confidence)`` pair: the decoded class label and the
        highest score found in the model's prediction.
    """
    # Text -> token ids -> one fixed-length row (padding added at the end)
    token_ids = tokenizer.texts_to_sequences([statement])
    model_input = pad_sequences(token_ids, maxlen=max_len, padding='post')

    # Scores the model assigns to each class for this row
    scores = model.predict(model_input)

    # Position of the highest score, and the score itself
    top_index, top_score = np.argmax(scores), np.max(scores)

    # Translate the winning position back into its original label
    return label_encoder.inverse_transform([top_index])[0], top_score

# Sample statements used to try out the trained classifier
example_statements = [
    "I feel very anxious about tomorrow's presentation.",
    "I am so tired of everything; I feel like giving up.",
    "I am happy and content with my current situation.",
    "Sometimes I get extreme mood swings from happy to sad.",
]

# Run each sample through predict_statement and show the label with its confidence
print("\nMental Health Status Predictions:\n")
for statement in example_statements:
    predicted_label, confidence = predict_statement(
        model, tokenizer, label_encoder, statement
    )
    print(
        f"Statement: {statement}",
        f"Predicted Mental Health Status: {predicted_label} (Confidence: {confidence:.2f})",
        sep="\n",
    )




Epoch 1/10
[1m659/659[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m209s[0m 313ms/step - accuracy: 0.4146 - loss: 1.6909 - val_accuracy: 0.6697 - val_loss: 0.8207
Epoch 2/10
[1m659/659[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m261s[0m 312ms/step - accuracy: 0.7058 - loss: 0.8867 - val_accuracy: 0.7530 - val_loss: 0.6543
Epoch 3/10
[1m659/659[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 309ms/step - accuracy: 0.7780 - loss: 0.5876 - val_accuracy: 0.7598 - val_loss: 0.6377
Epoch 4/10
[1m659/659[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m212s[0m 321ms/step - accuracy: 0.8100 - loss: 0.4377 - val_accuracy: 0.7641 - val_loss: 0.6220
Epoch 5/10
[1m659/659[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m254s[0m 309ms/step - accuracy: 0.8317 - loss: 0.3451 - val_accuracy: 0.7567 - val_loss: 0.6696
Epoch 6/10
[1m659/659[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m278s[0m 333ms/step - accuracy: 0.8492 - loss: 0.2822 - val_accuracy: 0.7579 - val_loss: 0.6930
Epoc

In [None]:
import numpy as np
from scipy import stats

# Accuracy results recorded for the two models being compared
logistic_regression_accuracies = np.array([78, 77, 78, 77])  # Logistic Regression model
gru_accuracies = np.array([76.1, 77.0, 76.0, 76.0])  # GRU model

# Element-wise gap between the two models' accuracies
differences = np.subtract(logistic_regression_accuracies, gru_accuracies)

# Paired t-test, carried out as a one-sample t-test of the gaps against a mean of 0
t_statistic, p_value = stats.ttest_1samp(differences, popmean=0)

# Report the test statistic and the p-value
print(f"T-statistic: {t_statistic}", f"P-value: {p_value}", sep="\n")

# Interpret the result at the 0.05 significance level
print(
    "Reject the null hypothesis: There is a significant difference between the models."
    if p_value < 0.05
    else "Fail to reject the null hypothesis: There is no significant difference between the models."
)


T-statistic: 2.627934601251784
P-value: 0.07846422716984412
Fail to reject the null hypothesis: There is no significant difference between the models.


In [None]:
import numpy as np
from scipy import stats

# Accuracy results recorded for the two models being compared
logistic_regression_accuracies = np.array([78, 77, 78, 77])  # Logistic Regression model
lstm_accuracies = np.array([74.9, 75, 75, 75])  # LSTM model

# Element-wise gap between the two models' accuracies
differences = np.subtract(logistic_regression_accuracies, lstm_accuracies)

# Paired t-test, carried out as a one-sample t-test of the gaps against a mean of 0
t_statistic, p_value = stats.ttest_1samp(differences, popmean=0)

# Report the test statistic and the p-value
print(f"T-statistic: {t_statistic}", f"P-value: {p_value}", sep="\n")

# Interpret the result at the 0.05 significance level
print(
    "Reject the null hypothesis: There is a significant difference between the models."
    if p_value < 0.05
    else "Fail to reject the null hypothesis: There is no significant difference between the models."
)


T-statistic: 8.311513948033323
P-value: 0.003649630156265394
Reject the null hypothesis: There is a significant difference between the models.
