In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

# Load the CSV 
df = pd.read_csv("mental_disorders_reddit.csv", encoding='utf-8-sig')

# Preprocessing 
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    if pd.isnull(text):
        return ""
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = text.lower()
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

# Apply cleaning to the main post body
df["clean_text"] = df["selftext"].apply(clean_text)

df = df[df["clean_text"].str.strip() != ""]

# Final dataset preview
print(df[["clean_text", "subreddit"]].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\garav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\garav\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                          clean_text subreddit
0  anyone else think important part life relation...       BPD
1  hello fellow friend im bpd spectrum discourage...       BPD
2  f bf told today said wish could better like do...       BPD
3  okay im open many thing ive done past im proud...       BPD
4                                            removed       BPD


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode subreddits
le = LabelEncoder()
df["label_encoded"] = le.fit_transform(df["subreddit"])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_text"], df["label_encoded"], test_size=0.2, random_state=42
)

print("Classes:", list(le.classes_))


Classes: ['Anxiety', 'BPD', 'bipolar', 'depression', 'mentalillness', 'schizophrenia']


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# Predict
y_pred = model.predict(X_test_tfidf)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=le.classes_))


Accuracy: 0.7038341363265123
               precision    recall  f1-score   support

      Anxiety       0.81      0.80      0.80     33493
          BPD       0.78      0.76      0.77     46579
      bipolar       0.74      0.41      0.53      9407
   depression       0.56      0.80      0.66     31264
mentalillness       0.52      0.20      0.29      8704
schizophrenia       0.66      0.21      0.32      4012

     accuracy                           0.70    133459
    macro avg       0.68      0.53      0.56    133459
 weighted avg       0.71      0.70      0.69    133459



In [5]:
def predict_subreddit(text):
    cleaned = clean_text(text)
    vec = vectorizer.transform([cleaned])
    pred = model.predict(vec)
    return le.inverse_transform(pred)[0]

# Test it
test_text = "I feel like someone eating my brain"
print("Predicted Subreddit:", predict_subreddit(test_text))


Predicted Subreddit: depression
