In [1]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import the necessary libraries
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Ensure nltk resources are available
import nltk
nltk.download('stopwords')

nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize the lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Custom stop words list with negations retained
custom_stop_words = stop_words - {'not', 'no', 'but', 'because'}

# Function to preprocess text
def preprocess_text(text):
    # Convert text to Lowercase
    text = text.lower()
    # Removing special characters and numbers
    text = re.sub(r'[^\w\s,.]', '', text)  # Remove punctuation, Keep periods and commas for sentence flow
    text = re.sub(r'\d+', '', text)  # Remove numbers
    # Tokenization
    tokens = text.split()
    # Removing stop words and stemming
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in custom_stop_words]
    # Rejoin tokens into a single string
    text = ' '.join(tokens)
    return text

2024-09-19 10:52:41.992240: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-19 10:52:42.727670: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-19 10:52:42.949485: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-19 10:52:44.606999: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package stopwords to


In [3]:
import pandas as pd
df = pd.read_csv('Mental-Health-Twitter.csv', index_col=0)


In [4]:
df.shape

(20000, 10)

In [5]:
df.head()

Unnamed: 0,post_id,post_created,post_text,user_id,followers,friends,favourites,statuses,retweets,label
0,637894677824413696,Sun Aug 30 07:48:37 +0000 2015,It's just over 2 years since I was diagnosed w...,1013187241,84,211,251,837,0,1
1,637890384576778240,Sun Aug 30 07:31:33 +0000 2015,"It's Sunday, I need a break, so I'm planning t...",1013187241,84,211,251,837,1,1
2,637749345908051968,Sat Aug 29 22:11:07 +0000 2015,Awake but tired. I need to sleep but my brain ...,1013187241,84,211,251,837,0,1
3,637696421077123073,Sat Aug 29 18:40:49 +0000 2015,RT @SewHQ: #Retro bears make perfect gifts and...,1013187241,84,211,251,837,2,1
4,637696327485366272,Sat Aug 29 18:40:26 +0000 2015,It’s hard to say whether packing lists are mak...,1013187241,84,211,251,837,1,1


In [6]:
# Import necessary libraries
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical

# Load your dataset
data = pd.read_csv('Mental-Health-Twitter.csv', index_col=0)  # Replace with your dataset path
texts = data['post_text'].apply(preprocess_text)  # Apply preprocessing
labels = data['label']  # Replace with your label column name

labels = np.array(labels)
# labels = to_categorical(labels, num_classes=2)

# Apply preprocessing
# processed_texts = texts.apply(preprocess_text)

# Calculate the length of each processed text
lengths = texts.apply(lambda x: len(x.split()))

# Calculate the average length
average_length = round(lengths.mean())

print(f"Average length of reviews: {average_length}")

# Split the dataset
x_train, x_test, y_train, y_test = train_test_split(texts, labels, test_size=0.3, random_state=42, stratify=labels)

# Tokenization and Padding
max_features = 10000  # Number of unique words to keep
maxlen = average_length * 2  # Maximum length of sequences

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(x_train)
x_train_sequences = tokenizer.texts_to_sequences(x_train)
x_test_sequences = tokenizer.texts_to_sequences(x_test)

x_train_padded = pad_sequences(x_train_sequences, maxlen=maxlen)
x_test_padded = pad_sequences(x_test_sequences, maxlen=maxlen)

Average length of reviews: 8


In [7]:
x_train

10227    rt bonmotvivant lili reinhart say betty veroni...
6597     nativeandnaive legendxofxzach one time freshma...
6355     zaynmalik dont forget belong zayn, heart milli...
14190                         battlafield baba_khan uh huh
8059     rt historyinpics mcdonalds menu early httpt.co...
                               ...                        
335      kid screen electronics addiction httpst.corwcx...
4235     gradingus would answered yes say specify sausa...
1617     toda adventure natural option depression easie...
1815     annie testing help sufferer depressive disorde...
13728                                          friend dead
Name: post_text, Length: 14000, dtype: object

In [8]:
from tensorflow.keras import layers, models
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(
    monitor='val_loss',    # Watch the validation loss
    patience=2,            # Stop after 3 epochs of no improvement
    restore_best_weights=True  # Revert to the best model
)

# Build the LSTM model
model = models.Sequential([
    layers.Embedding(input_dim=max_features, output_dim=128),
    layers.LSTM(64, return_sequences=True),
    layers.LSTM(32),
    layers.Dense(1, activation='sigmoid') #It outputs a value between 0 and 1, representing the probability that the input belongs to the positive class.
])

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy', Precision(), Recall()])

# Train the model
history = model.fit(x_train_padded, y_train, epochs=8, validation_data=(x_test_padded, y_test), callbacks=[early_stopping])

Epoch 1/8
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 19ms/step - accuracy: 0.7382 - loss: 0.4966 - precision: 0.7789 - recall: 0.6453 - val_accuracy: 0.8615 - val_loss: 0.2961 - val_precision: 0.9085 - val_recall: 0.8040
Epoch 2/8
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 18ms/step - accuracy: 0.9284 - loss: 0.1651 - precision: 0.9292 - recall: 0.9281 - val_accuracy: 0.8727 - val_loss: 0.2763 - val_precision: 0.8658 - val_recall: 0.8820
Epoch 3/8
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 19ms/step - accuracy: 0.9675 - loss: 0.0866 - precision: 0.9679 - recall: 0.9672 - val_accuracy: 0.8727 - val_loss: 0.4151 - val_precision: 0.8734 - val_recall: 0.8717
Epoch 4/8
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 18ms/step - accuracy: 0.9786 - loss: 0.0566 - precision: 0.9780 - recall: 0.9793 - val_accuracy: 0.8693 - val_loss: 

2024-09-19 10:53:02.720651: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


In [9]:
# Evaluate the model and unpack all values
results = model.evaluate(x_test_padded, y_test, verbose=2)
test_loss = results[0]
test_acc = results[1]
test_precision = results[2]
test_recall = results[3]

print(f"Test loss: {test_loss}")
print(f"Test accuracy: {test_acc}")
print(f"Test precision: {test_precision}")
print(f"Test recall: {test_recall}")


188/188 - 1s - 3ms/step - accuracy: 0.8727 - loss: 0.2763 - precision: 0.8658 - recall: 0.8820
Test loss: 0.2763007879257202
Test accuracy: 0.8726666569709778
Test precision: 0.8658376932144165
Test recall: 0.8820000290870667


In [10]:
from sklearn.metrics import classification_report, confusion_matrix

# Get predictions for classification metrics
y_pred = (model.predict(x_test_padded) > 0.5).astype("int32")  # Convert probabilities to 0 or 1

# Calculate Precision, Recall, and F1-Score
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Calculate Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.86      0.87      3000
           1       0.87      0.88      0.87      3000

    accuracy                           0.87      6000
   macro avg       0.87      0.87      0.87      6000
weighted avg       0.87      0.87      0.87      6000

Confusion Matrix:
[[2590  410]
 [ 354 2646]]
