In [12]:
import pandas as pd
import numpy as np
import nltk
import re
import pickle
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Fetch the NLTK stopword corpus; the call reports when the package is already up to date.
# NOTE(review): no visible cell uses the stopword list - confirm this download is still needed.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nazla\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
# Load the Reddit mental-health sample.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs
# where this exact file location exists.
df = pd.read_csv("C:/Users/nazla/Documents/A. College Tasks 3rd Year/A. International Research/datasets/reddit_mentalhealth_sample.csv")

# Keep only the post text and its subreddit. `.copy()` makes this an independent
# frame, so the column assignments in later cells do not raise
# SettingWithCopyWarning or depend on whether pandas returned a view.
df = df[['content', 'subreddit']].copy()
df.head()

Unnamed: 0,content,subreddit
0,My #1 biggest fear is death. Losing my conscio...,Anxiety
1,Trying my 4th med out soon. The first 3 anxiet...,Anxiety
2,Nauseated when thinking about a holiday How ca...,Anxiety
3,Angerxiety? Does anyone else experience anger ...,Anxiety
4,No idea how to deal with new anxiety For as lo...,Anxiety


In [34]:
# Convert text to lowercase and remove special characters
def clean_text(text):
    """Normalise one post for tokenisation.

    Steps, in order: lowercase; drop URLs (tokens starting with ``http`` or
    ``www``); drop ``@mentions`` and the ``#`` character (the hashtag word
    itself is kept); drop every character that is neither a word character nor
    whitespace (underscores therefore survive); drop digit runs.
    Whitespace is not collapsed, so removed tokens can leave double spaces.

    Args:
        text: The raw post. ``None`` and NaN (what pandas uses for empty CSV
            cells) are treated as an empty post; any other non-string value
            is converted with ``str()`` first.

    Returns:
        The cleaned string (possibly empty).
    """
    # Empty CSV cells arrive as float NaN (NaN != NaN) or None; calling
    # .lower() on them would raise AttributeError and abort the whole .apply().
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\@\w+|\#', '', text)  # Remove @mentions and the '#' sign (tag word stays)
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation (underscore counts as \w and is kept)
    text = re.sub(r'\d+', '', text)  # Remove numbers
    return text

# Run the text normaliser over every post, replacing the raw text column
df['content'] = df['content'].map(clean_text)

# Turn each subreddit name into an integer class id; the fitted encoder is kept
# so the ids can be mapped back to names later
sentiment_label_encoder = LabelEncoder()
sentiment_label_encoder.fit(df['subreddit'])
df['subreddit_label'] = sentiment_label_encoder.transform(df['subreddit'])

In [35]:
# Persist the fitted subreddit label encoder to disk for deployment
with open("subreddit_label_encoder.pkl", "wb") as encoder_file:
    pickle.dump(sentiment_label_encoder, encoder_file)

In [36]:
# Build the vocabulary from the cleaned posts.
# The tokenizer has to be created and fitted in the notebook itself; without this
# the cell only works when `tokenizer` is left over from an earlier kernel session.
vocab_size = 10000  # keep in sync with the Embedding input_dim used by the model
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")  # unseen words map to <OOV>
tokenizer.fit_on_texts(df['content'])

# Convert text to integer sequences and pad/truncate them to a fixed length
sequences = tokenizer.texts_to_sequences(df['content'])
max_length = 50  # Max sequence length
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding="post", truncating="post")

In [37]:
# Hold out 20% of the padded sequences, stratified on the class id so both
# partitions keep the same subreddit proportions (fixed seed for repeatability)
labels_s = df['subreddit_label']
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    padded_sequences,
    labels_s,
    test_size=0.2,
    random_state=42,
    stratify=labels_s,
)

# One-hot encode the integer class ids for categorical cross-entropy
num_classes_s = len(sentiment_label_encoder.classes_)
y_train_s, y_test_s = (
    tf.keras.utils.to_categorical(class_ids, num_classes_s)
    for class_ids in (y_train_s, y_test_s)
)

In [38]:
# Function to create LSTM model
def create_lstm_model(output_classes, vocab_size=10000, sequence_length=None):
    """Build and compile a two-layer LSTM text classifier.

    Architecture: Embedding(128) -> LSTM(128, sequences) -> Dropout(0.2)
    -> LSTM(64) -> Dropout(0.2) -> Dense(32, relu) -> Dense(softmax).

    Args:
        output_classes: Number of target classes (width of the softmax layer).
        vocab_size: Size of the token index space fed to the Embedding layer.
            Defaults to 10000, the value previously hard-coded here.
        sequence_length: Length of each padded input sequence. When ``None``
            the notebook-level ``max_length`` is used, as before.

    Returns:
        A compiled Keras ``Sequential`` model (categorical cross-entropy loss,
        Adam optimizer, accuracy metric).
    """
    if sequence_length is None:
        sequence_length = max_length  # fall back to the notebook-level setting

    model = Sequential([
        # An explicit Input layer replaces Embedding's `input_length` argument,
        # which is deprecated in Keras 3, and still lets summary() show shapes
        # before training.
        tf.keras.Input(shape=(sequence_length,)),
        Embedding(input_dim=vocab_size, output_dim=128),
        LSTM(128, return_sequences=True),  # emit the full sequence for the next LSTM
        Dropout(0.2),
        LSTM(64),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dense(output_classes, activation='softmax')  # Multi-class classification
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [39]:
# Build the subreddit classifier, print its layer summary, then train for
# 10 epochs while evaluating on the held-out split after every epoch
sentiment_model = create_lstm_model(num_classes_s)
sentiment_model.summary()
sentiment_model.fit(
    X_train_s,
    y_train_s,
    epochs=10,
    batch_size=32,
    validation_data=(X_test_s, y_test_s),
)



Epoch 1/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 1s/step - accuracy: 0.1263 - loss: 1.9471 - val_accuracy: 0.1500 - val_loss: 1.9442
Epoch 2/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 585ms/step - accuracy: 0.1572 - loss: 1.9371 - val_accuracy: 0.1714 - val_loss: 1.9413
Epoch 3/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 552ms/step - accuracy: 0.2702 - loss: 1.8729 - val_accuracy: 0.1857 - val_loss: 1.9709
Epoch 4/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 653ms/step - accuracy: 0.3384 - loss: 1.6606 - val_accuracy: 0.2429 - val_loss: 1.9817
Epoch 5/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 308ms/step - accuracy: 0.4549 - loss: 1.2852 - val_accuracy: 0.2357 - val_loss: 2.1780
Epoch 6/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 493ms/step - accuracy: 0.6639 - loss: 0.9434 - val_accuracy: 0.2857 - val_loss: 2.5436
Epoch 7/10
[1m18/18[0m 

<keras.src.callbacks.history.History at 0x1f725e08590>

In [41]:
# Save the trained model. The trained object is `sentiment_model`; a bare `model`
# is not defined anywhere in the notebook and fails on a fresh kernel.
sentiment_model.save("subreddit_lstm_model.h5")

# Save the fitted tokenizer as well: inference must reuse the exact word index
# used for training, and the message below reports it as saved.
with open("subreddit_tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

print("Model, tokenizer, and label encoder saved successfully!")



Model, tokenizer, and label encoder saved successfully!
