In [None]:
import pandas as pd
import numpy as np
import nltk
import re
import pickle
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Fetch the NLTK stopword corpus into the local nltk_data directory.
# NOTE(review): no cell visible in this notebook reads the stopword list —
# confirm whether this download is still needed.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Load the Reddit mental-health sample from the working directory.
raw_df = pd.read_csv("reddit_mentalhealth_sample.csv")

# Keep only the post text and its subreddit label.
# - Rows missing either field carry no usable training signal, so they are
#   dropped and the index is renumbered.
# - The explicit .copy() makes `df` an independent frame, so column
#   assignments in later cells do not raise SettingWithCopyWarning.
df = (
    raw_df[['content', 'subreddit']]
    .dropna(subset=['content', 'subreddit'])
    .reset_index(drop=True)
    .copy()
)
df.head()

Unnamed: 0,content,subreddit
0,My #1 biggest fear is death. Losing my conscio...,Anxiety
1,Trying my 4th med out soon. The first 3 anxiet...,Anxiety
2,Nauseated when thinking about a holiday How ca...,Anxiety
3,Angerxiety? Does anyone else experience anger ...,Anxiety
4,No idea how to deal with new anxiety For as lo...,Anxiety


In [None]:
def clean_text(text):
    """Normalize a raw post: lowercase it and strip URLs, mentions, punctuation and digits.

    Args:
        text: The raw post. ``None`` and float NaN (missing values) are
            accepted; any other non-string value is converted with ``str()``.

    Returns:
        The cleaned string. Missing input yields ``""``. Whitespace is left
        untouched, so removed tokens may leave consecutive spaces behind.
    """
    # Missing values would otherwise crash on .lower(); NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs (http\S+ already covers https)
    text = re.sub(r'@\w+|#', '', text)  # Remove @mentions and the '#' sign of hashtags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    return text

# Run every post through clean_text, overwriting the raw text column
df['content'] = df['content'].map(clean_text)

# Turn subreddit names into integer class ids.
# (Despite its name, this encoder holds subreddit classes, not sentiments.)
sentiment_label_encoder = LabelEncoder()
sentiment_label_encoder.fit(df['subreddit'])
df['subreddit_label'] = sentiment_label_encoder.transform(df['subreddit'])

In [None]:
# Persist the fitted label encoder to disk for deployment
with open("subreddit_label_encoder.pkl", mode="wb") as encoder_file:
    pickle.dump(sentiment_label_encoder, encoder_file)

In [None]:
# Fit the word index on the cleaned posts. num_words caps the vocabulary used
# when encoding (by word frequency); unknown words map to the "<OOV>" token.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=10000)
tokenizer.fit_on_texts(df['content'])

# Persist the fitted tokenizer to disk for deployment
with open("tokenizer.pkl", mode="wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [None]:
# Encode each post as a list of word indices
sequences = tokenizer.texts_to_sequences(df['content'])

# Force every sequence to exactly max_length tokens: longer posts lose their
# tail, shorter ones are padded at the end
max_length = 50
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_length,
    truncating="post",
    padding="post",
)

In [None]:
# Hold out 20% of the posts, stratified so both splits keep the same
# subreddit proportions; the fixed random_state makes the split repeatable
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    padded_sequences,
    df['subreddit_label'],
    test_size=0.2,
    stratify=df['subreddit_label'],
    random_state=42,
)

# One-hot encode the integer class ids of both splits
num_classes_s = len(sentiment_label_encoder.classes_)
y_train_s, y_test_s = (
    tf.keras.utils.to_categorical(split_labels, num_classes_s)
    for split_labels in (y_train_s, y_test_s)
)

In [None]:
def create_lstm_model(output_classes, vocab_size=10000, embedding_dim=128, sequence_length=None):
    """Build and compile a stacked-LSTM text classifier.

    Architecture: Embedding -> LSTM(128, returning sequences) -> Dropout(0.2)
    -> LSTM(64) -> Dropout(0.2) -> Dense(32, relu) -> Dense(softmax).

    Args:
        output_classes: Number of target classes (units of the softmax layer).
        vocab_size: Number of rows in the embedding table. It should cover
            every index the tokenizer can emit. Defaults to 10000.
        embedding_dim: Width of each embedding vector. Defaults to 128.
        sequence_length: Number of tokens per input sequence. Defaults to the
            notebook-level ``max_length``.

    Returns:
        A ``Sequential`` model compiled with categorical cross-entropy loss,
        the Adam optimizer and an accuracy metric.
    """
    if sequence_length is None:
        sequence_length = max_length

    model = Sequential([
        # Declare the input shape explicitly: Embedding's `input_length`
        # argument is deprecated in Keras 3, and an explicit Input builds the
        # model immediately so summary() can report shapes and parameters.
        tf.keras.Input(shape=(sequence_length,)),
        Embedding(input_dim=vocab_size, output_dim=embedding_dim),
        LSTM(128, return_sequences=True),  # Full sequence feeds the second LSTM
        Dropout(0.2),
        LSTM(64),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dense(output_classes, activation='softmax')  # Multi-class classification
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
# Train Subreddit Analysis Model
sentiment_model = create_lstm_model(num_classes_s)
sentiment_model.summary()

# The recorded run shows val_loss climbing after epoch 3 while training loss
# keeps falling. Stop once validation loss stalls and roll back to the best
# epoch so the model saved afterwards is not the overfit one.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Keep the History object for later inspection instead of leaking its repr
# as the cell output.
history = sentiment_model.fit(
    X_train_s,
    y_train_s,
    epochs=10,
    batch_size=32,
    validation_data=(X_test_s, y_test_s),
    callbacks=[early_stopping],
)



Epoch 1/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 40ms/step - accuracy: 0.1525 - loss: 1.9460 - val_accuracy: 0.1643 - val_loss: 1.9372
Epoch 2/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.1737 - loss: 1.9289 - val_accuracy: 0.1714 - val_loss: 1.9299
Epoch 3/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.2792 - loss: 1.8104 - val_accuracy: 0.1857 - val_loss: 1.9089
Epoch 4/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.2918 - loss: 1.7613 - val_accuracy: 0.2143 - val_loss: 2.0404
Epoch 5/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.3218 - loss: 1.5674 - val_accuracy: 0.1857 - val_loss: 2.0147
Epoch 6/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.4383 - loss: 1.3001 - val_accuracy: 0.2429 - val_loss: 2.3427
Epoch 7/10
[1m18/18[0m [32m━━━━

<keras.src.callbacks.history.History at 0x7f0218f04690>

In [None]:
# Write the trained network to disk as an HDF5 file
sentiment_model.save(filepath="subreddit_lstm_model.h5")

# Confirmation covers the artifacts written by this and the earlier cells
print("Model, tokenizer, and label encoder saved successfully!")



Model, tokenizer, and label encoder saved successfully!
