In [1]:
pip install nltk



In [2]:
# Import the necessary libraries
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Ensure nltk resources are available
import nltk
nltk.download('stopwords')

nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize the lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Custom stop words list with negations retained
custom_stop_words = stop_words - {'not', 'no', 'but', 'because'}

# Function to preprocess text
def preprocess_text(text):
    # Convert text to Lowercase
    text = text.lower()
    # Removing special characters and numbers
    text = re.sub(r'[^\w\s,.]', '', text)  # Remove punctuation, Keep periods and commas for sentence flow
    text = re.sub(r'\d+', '', text)  # Remove numbers
    # Tokenization
    tokens = text.split()
    # Removing stop words and applying lemmatizer tool
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in custom_stop_words]
    # Rejoin tokens into a single string
    text = ' '.join(tokens)
    return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [5]:
import pandas as pd
df = pd.read_csv('Mental-Health-Twitter.csv', index_col=0)


In [6]:
df.shape

(20000, 10)

In [7]:
df.head()

Unnamed: 0,post_id,post_created,post_text,user_id,followers,friends,favourites,statuses,retweets,label
0,637894677824413696,Sun Aug 30 07:48:37 +0000 2015,It's just over 2 years since I was diagnosed w...,1013187241,84,211,251,837,0,1
1,637890384576778240,Sun Aug 30 07:31:33 +0000 2015,"It's Sunday, I need a break, so I'm planning t...",1013187241,84,211,251,837,1,1
2,637749345908051968,Sat Aug 29 22:11:07 +0000 2015,Awake but tired. I need to sleep but my brain ...,1013187241,84,211,251,837,0,1
3,637696421077123073,Sat Aug 29 18:40:49 +0000 2015,RT @SewHQ: #Retro bears make perfect gifts and...,1013187241,84,211,251,837,2,1
4,637696327485366272,Sat Aug 29 18:40:26 +0000 2015,It’s hard to say whether packing lists are mak...,1013187241,84,211,251,837,1,1


In [8]:
# Import necessary libraries
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical

# Load your dataset
data = pd.read_csv('Mental-Health-Twitter.csv', index_col=0)
texts = data['post_text'].apply(preprocess_text)  # Apply preprocessing
labels = data['label']

labels = np.array(labels)
# labels = to_categorical(labels, num_classes=2)

# Apply preprocessing
# processed_texts = texts.apply(preprocess_text)

# Calculate the length of each processed text
lengths = texts.apply(lambda x: len(x.split()))

# Calculate the average length
average_length = round(lengths.mean())

print(f"Average length of reviews: {average_length}")

# Split the dataset
x_train, x_test, y_train, y_test = train_test_split(texts, labels, test_size=0.3, random_state=42, stratify=labels)

# Tokenization and Padding
max_features = 10000  # Number of unique words to keep
maxlen = average_length * 2  # Maximum length of sequences

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(x_train)
x_train_sequences = tokenizer.texts_to_sequences(x_train)
x_test_sequences = tokenizer.texts_to_sequences(x_test)

x_train_padded = pad_sequences(x_train_sequences, maxlen=maxlen)
x_test_padded = pad_sequences(x_test_sequences, maxlen=maxlen)

Average length of reviews: 8


In [9]:
x_train

Unnamed: 0,post_text
10227,rt bonmotvivant lili reinhart say betty veroni...
6597,nativeandnaive legendxofxzach one time freshma...
6355,"zaynmalik dont forget belong zayn, heart milli..."
14190,battlafield baba_khan uh huh
8059,rt historyinpics mcdonalds menu early httpt.co...
...,...
335,kid screen electronics addiction httpst.corwcx...
4235,gradingus would answered yes say specify sausa...
1617,toda adventure natural option depression easie...
1815,annie testing help sufferer depressive disorde...


In [13]:
from tensorflow.keras import layers, models
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(
    monitor='val_loss',    # Watch the validation loss
    patience=4,            # Stop after 4 epochs of no improvement
    restore_best_weights=True  # Revert to the best model
)

# Build the LSTM model
model = models.Sequential([
    layers.Embedding(input_dim=max_features, output_dim=128),
    layers.LSTM(64, return_sequences=True),
    layers.LSTM(32),
    layers.Dense(1, activation='sigmoid') #It outputs a value between 0 and 1, representing the probability that the input belongs to the positive class.
])

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy', Precision(), Recall()])

# Train the model
history = model.fit(x_train_padded, y_train, epochs=8, validation_data=(x_test_padded, y_test), callbacks=[early_stopping])

Epoch 1/8
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 40ms/step - accuracy: 0.7316 - loss: 0.4997 - precision_1: 0.7122 - recall_1: 0.8398 - val_accuracy: 0.8705 - val_loss: 0.2763 - val_precision_1: 0.8729 - val_recall_1: 0.8673
Epoch 2/8
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 37ms/step - accuracy: 0.9385 - loss: 0.1547 - precision_1: 0.9361 - recall_1: 0.9428 - val_accuracy: 0.8762 - val_loss: 0.2653 - val_precision_1: 0.8858 - val_recall_1: 0.8637
Epoch 3/8
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 54ms/step - accuracy: 0.9679 - loss: 0.0849 - precision_1: 0.9690 - recall_1: 0.9661 - val_accuracy: 0.8735 - val_loss: 0.3559 - val_precision_1: 0.8813 - val_recall_1: 0.8633
Epoch 4/8
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 36ms/step - accuracy: 0.9816 - loss: 0.0527 - precision_1: 0.9825 - recall_1: 0.9805 - val_accuracy: 0.8688 - val_loss: 0.4025 - val_precision_1: 0.8767 - val_recal

In [14]:
# Evaluate the model and unpack all values
results = model.evaluate(x_test_padded, y_test, verbose=2)
test_loss = results[0]
test_acc = results[1]
test_precision = results[2]
test_recall = results[3]

print(f"Test loss: {test_loss}")
print(f"Test accuracy: {test_acc}")
print(f"Test precision: {test_precision}")
print(f"Test recall: {test_recall}")


188/188 - 2s - 8ms/step - accuracy: 0.8762 - loss: 0.2653 - precision_1: 0.8858 - recall_1: 0.8637
Test loss: 0.2653316557407379
Test accuracy: 0.8761666417121887
Test precision: 0.885811984539032
Test recall: 0.8636666536331177


In [15]:
from sklearn.metrics import classification_report, confusion_matrix

# Get predictions for classification metrics
y_pred = (model.predict(x_test_padded) > 0.5).astype("int32")  # Convert probabilities to 0 or 1

# Calculate Precision, Recall, and F1-Score
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Calculate Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.89      0.88      3000
           1       0.89      0.86      0.87      3000

    accuracy                           0.88      6000
   macro avg       0.88      0.88      0.88      6000
weighted avg       0.88      0.88      0.88      6000

Confusion Matrix:
[[2666  334]
 [ 409 2591]]


In [17]:
from tensorflow.keras.models import load_model

# Save the model
model.save('sent_model.keras')  # or 'my_model' for the TensorFlow SavedModel format

# Load the model

model = load_model('sent_model.keras')


  saveable.load_own_variables(weights_store.get(inner_path))


In [19]:
# Test loaded Model
# Get predictions for classification metrics
y_pred = (model.predict(x_test_padded) > 0.5).astype("int32")  # Convert probabilities to 0 or 1

# Calculate Precision, Recall, and F1-Score
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Calculate Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.89      0.88      3000
           1       0.89      0.86      0.87      3000

    accuracy                           0.88      6000
   macro avg       0.88      0.88      0.88      6000
weighted avg       0.88      0.88      0.88      6000

Confusion Matrix:
[[2666  334]
 [ 409 2591]]
