<a href="https://colab.research.google.com/github/Gianna17159/reddit-depression-detection/blob/main/Reddit_Depression_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Reddit Depression Final Project
Link to the paper: https://dl.acm.org/doi/pdf/10.1145/3578503.3583621

Read through the paper fully before starting the assignment!

In [None]:
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_validate, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier

!pip install pandas==2.2.2

from google.colab import drive
drive.mount("/content/drive")

FOLDER = "/content/drive/MyDrive/cs1460/fp1"
FILEPATH = f"{FOLDER}/student.pkl"

Mounted at /content/drive


In [None]:
!pip install happiestfuntokenizing

Collecting happiestfuntokenizing
  Downloading happiestfuntokenizing-0.0.7.tar.gz (6.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: happiestfuntokenizing
  Building wheel for happiestfuntokenizing (setup.py) ... [?25l[?25hdone
  Created wheel for happiestfuntokenizing: filename=happiestfuntokenizing-0.0.7-py3-none-any.whl size=6711 sha256=546f11d0214b7de8dbea0ac2a3fea70271d8cb76685e397a9e81b596fb302287
  Stored in directory: /root/.cache/pip/wheels/bf/c9/4d/310f0c60855eb7b428558f29d93cf464dbb64c1b8628753395
Successfully built happiestfuntokenizing
Installing collected packages: happiestfuntokenizing
Successfully installed happiestfuntokenizing-0.0.7


## Preprocessing

In [None]:
def load():
  """Read the pickled dataset stored at FILEPATH and return it."""
  with open(FILEPATH, 'rb') as handle:
    return pd.read_pickle(handle)

In [None]:
# List of depression subreddits in the paper
# This is the concatenation of the per-symptom lists below, so "selfhelp" and
# "whatsbotheringyou" appear twice (once for loathing, once for worthlessness).
depression_subreddits = ["Anger",
    "anhedonia", "DeadBedrooms",
    "Anxiety", "AnxietyDepression", "HealthAnxiety", "PanicAttack",
    "bingeeating", "BingeEatingDisorder", "EatingDisorders", "eating_disorders", "EDAnonymous",
    "ForeverAlone", "lonely",
    "cry", "grief", "sad", "Sadness",
    "AvPD", "SelfHate", "selfhelp", "socialanxiety", "whatsbotheringyou",
    "insomnia", "sleep",
    "cfs", "ChronicPain", "Constipation", "EssentialTremor", "headaches", "ibs", "tinnitus",
    "Guilt", "Pessimism", "selfhelp", "whatsbotheringyou"
]
# Per-symptom subreddit lists, used by dataset_generation to pick a post's label.
anger_subreddits = ["Anger"]
anhedonia_subreddits = ["anhedonia", "DeadBedrooms"]
anxiety_subreddits = ["Anxiety", "AnxietyDepression", "HealthAnxiety", "PanicAttack"]
eating_subreddits = ["bingeeating", "BingeEatingDisorder", "EatingDisorders", "eating_disorders", "EDAnonymous"]
loneliness_subreddits = ["ForeverAlone", "lonely"]
sadness_subreddits = ["cry", "grief", "sad", "Sadness"]
# NOTE: "selfhelp" and "whatsbotheringyou" are listed under both loathing and
# worthlessness. dataset_generation checks loathing first, so posts from those
# two subreddits are labelled 7 and only "Guilt"/"Pessimism" posts get label 10.
loathing_subreddits = ["AvPD", "SelfHate", "selfhelp", "socialanxiety", "whatsbotheringyou"]
insomnia_subreddits = ["insomnia", "sleep"]
somatic_subreddits = ["cfs", "ChronicPain", "Constipation", "EssentialTremor", "headaches", "ibs", "tinnitus"]
worthless_subreddits = ["Guilt", "Pessimism", "selfhelp", "whatsbotheringyou"]

#symptoms represented by integers for dataset
#(the position of a name in this list is the integer label stored for a post)
symptoms = ["control",        #0
            "anger",          #1
            "anhedonia",      #2
            "anxiety",        #3
            "eating",         #4
            "loneliness",     #5
            "sadness",        #6
            "loathing",       #7
            "insomnia",       #8
            "somatic",        #9
            "worthlessness"]  #10

In [None]:
from datetime import datetime
import time
from tqdm import tqdm

def dataset_generation(data):
  """
  Build the control and symptom datasets and save them as pickle files.

  Parameters: data -- a pandas dataframe; every row is read through its
              subreddit, author, text and created_utc attributes

  Posts from a subreddit in depression_subreddits are stored with the integer
  label of their symptom (1-10). Every other post is a possible control: it is
  kept (label 0) only if its author also wrote a depression post and the
  control post is at least 180 days older than that author's earliest
  depression post.

  Writes depression_dict.pkl and control_dict.pkl to FOLDER; returns nothing.
  Raises ValueError if a subreddit is in depression_subreddits but in none of
  the per-symptom lists.
  """

  #symptom lists in label order (label = position + 1); a subreddit listed under
  #several symptoms keeps its first label, exactly like an if/elif chain would
  symptom_lists = [anger_subreddits, anhedonia_subreddits, anxiety_subreddits,
                   eating_subreddits, loneliness_subreddits, sadness_subreddits,
                   loathing_subreddits, insomnia_subreddits, somatic_subreddits,
                   worthless_subreddits]
  label_by_subreddit = {}
  for label, subreddits in enumerate(symptom_lists, start=1):
    for subreddit in subreddits:
      label_by_subreddit.setdefault(subreddit, label)
  depression_set = set(depression_subreddits)

  control_dict = {"text": [], "label": []}
  depression_dict = {"text": [], "label": []}

  pos_control_dict = {"text": [], "author": [], "created_utc": []}
  #author -> earliest created_utc among that author's depression posts
  #(a dict keeps every lookup O(1) instead of scanning a list per row)
  earliest_by_author = {}

  #iterate through original data
  for row in tqdm(data.itertuples(), total=data.shape[0], desc='Reading DF'):

    if row.subreddit in depression_set:
      if row.subreddit not in label_by_subreddit:
        raise ValueError(
            f"subreddit {row.subreddit!r} is in depression_subreddits but in no symptom list")

      #update depression dict
      depression_dict["text"].append(row.text)
      depression_dict["label"].append(label_by_subreddit[row.subreddit])

      #remember the earliest depression post of each author
      if (row.author not in earliest_by_author
          or earliest_by_author[row.author] > row.created_utc):
        earliest_by_author[row.author] = row.created_utc
    else:
      #add all non-depression posts to possible control dictionary
      pos_control_dict["text"].append(row.text)
      pos_control_dict["author"].append(row.author)
      pos_control_dict["created_utc"].append(row.created_utc)

  #create df from possible controls and keep only authors with a depression post
  pos_control_df = pd.DataFrame(pos_control_dict)
  pos_control_df = pos_control_df.loc[pos_control_df["author"].isin(list(earliest_by_author))]

  for row in tqdm(pos_control_df.itertuples(), total=pos_control_df.shape[0], desc='Reading PCDF'):

    #if possible control post is at least 180 days older than earliest depression post
    #by same author, add post to final control dataset
    earliest_date = datetime.fromtimestamp(earliest_by_author[row.author])
    cur_date = datetime.fromtimestamp(row.created_utc)
    if (earliest_date - cur_date).days >= 180:
      control_dict["text"].append(row.text)
      control_dict["label"].append(0)

  #create pickle files for both datasets; `with` closes (and flushes) each file
  with open(f"{FOLDER}/depression_dict.pkl", "wb") as f:
    pickle.dump(depression_dict, f)
  with open(f"{FOLDER}/control_dict.pkl", "wb") as f:
    pickle.dump(control_dict, f)



In [None]:
from happiestfuntokenizing.happiestfuntokenizing import Tokenizer
import string

def tokenize(posts : dict) -> dict:
  """
  Strip punctuation from every post and split it into tokens.
  parameter: posts -- dict whose "text" entry is a list of post strings
  returns: posts -- the same dict, with "text" replaced by the list of
           tokenized posts (tokens come from Tokenizer(preserve_case=False))
  """
  tokenizer = Tokenizer(preserve_case=False)
  #translation table that deletes every character in string.punctuation
  strip_punctuation = str.maketrans('', '', string.punctuation)
  posts["text"] = [tokenizer.tokenize(raw_post.translate(strip_punctuation))
                   for raw_post in posts["text"]]
  return posts


In [None]:
from collections import Counter

def stop_words(control: dict) -> list[str]:
  """
  Pick the 100 most frequent tokens of the control dataset as stop words.
  parameter: control -- dict whose "text" entry holds the tokenized control posts
  returns: list of at most 100 tokens, most frequent first
  """
  frequencies = Counter()
  for tokens in control["text"]:
    frequencies.update(tokens)
  return [token for token, _count in frequencies.most_common(100)]


In [None]:
def remove_stop_words(all_posts: dict, stop_words: list[str]) -> dict:
  """
  Remove every stop word from every post.
  parameter: all_posts -- dict whose "text" entry is a list of tokenized posts
             stop_words -- list of stop words
  returns: all_posts -- the same dict; each post list is filtered in place,
           so a post made only of stop words becomes an empty list
  """
  #set membership is O(1); the list would be scanned once per token
  stop_set = set(stop_words)
  for post in all_posts["text"]:
    #rebuild the list instead of calling post.remove() inside a loop over post:
    #removing while iterating skips the element after each removal, which left
    #consecutive stop words in the post
    post[:] = [word for word in post if word not in stop_set]
  return all_posts

In [None]:
#read the raw dataset pickle (FILEPATH); it is the input of dataset_generation
data = load()

In [None]:
#ONLY RUN ONCE
#apply dataset generation and preprocessing steps

dataset_generation(data)
with open(f"{FOLDER}/control_dict.pkl", "rb") as f:
  control_tokenized = tokenize(pickle.load(f))
with open(f"{FOLDER}/depression_dict.pkl", "rb") as f:
  depression_tokenized = tokenize(pickle.load(f))

#stored under its own name so the stop_words() function is not overwritten
#(reusing the name made a second run of this cell fail)
stop_word_list = stop_words(control_tokenized)
control_preprocessed = remove_stop_words(control_tokenized, stop_word_list)
depression_preprocessed = remove_stop_words(depression_tokenized, stop_word_list)

#`with` closes (and flushes) each pickle before later cells read it back
with open(f"{FOLDER}/control_preprocessed.pkl", "wb") as f:
  pickle.dump(control_preprocessed, f)
with open(f"{FOLDER}/depression_preprocessed.pkl", "wb") as f:
  pickle.dump(depression_preprocessed, f)

## Reddit Topics with LDA

 - Don't use MALLET (as the paper does), use some other LDA implementation.

In [None]:
# We highly recommend using the LdaMulticore interface, but feel free to use any other implementation if you prefer.
# from gensim.models import LdaMulticore

# TODO: Your LDA code!
from gensim.models import LdaMulticore
from gensim.corpora import Dictionary


def train_topic_model(corpus: np.ndarray, id2word: dict, num_topics: int,
                      passes: int = 3, random_state: int = 42) -> LdaMulticore:
  """
  Train a topic model on a corpus of posts.

  parameter: corpus -- bag-of-words corpus (this notebook passes a list with
                       one dictionary.doc2bow(...) result per post)
             id2word -- mapping from word ids to words (this notebook passes
                        the gensim Dictionary built from the posts)
             num_topics -- number of topics to train the model with
             passes -- number of training passes over the corpus (default 3)
             random_state -- seed passed to the model (default 42)
  returns: model -- trained LDA model
  """
  model = LdaMulticore(corpus, id2word=id2word, num_topics=num_topics,
                       passes=passes, random_state=random_state)
  return model


def get_topic_probabilities(model, corpus)-> list[float]:
  """
  Build one dense topic vector per post.

  parameter: model -- trained LDA model (needs num_topics and
                      get_document_topics)
             corpus -- iterable of documents accepted by
                       model.get_document_topics
  returns: list with one entry per document; each entry is a list of length
           model.num_topics holding the probability reported for each topic
           and 0 for every topic the model did not report
  """
  def dense_vector(doc):
    #get_document_topics yields (topic id, probability) pairs
    reported = model.get_document_topics(doc)
    vector = [0] * model.num_topics
    for topic_id, probability in reported:
      vector[topic_id] = probability
    return vector

  return [dense_vector(doc) for doc in corpus]

In [None]:
#load datasets (`with` closes each file once it has been read)
with open(f"{FOLDER}/control_preprocessed.pkl", "rb") as f:
  control_preprocessed = pickle.load(f)
with open(f"{FOLDER}/depression_preprocessed.pkl", "rb") as f:
  depression_preprocessed = pickle.load(f)

#recombine control and depression datasets (control posts first)
all_posts = {"text": control_preprocessed["text"] + depression_preprocessed["text"],
             "label": control_preprocessed["label"] + depression_preprocessed["label"]}


In [None]:
#ONLY RUN ONCE
#create dictionary and vector representations of posts, run LDA training, takes ~22 minutes
dictionary = Dictionary(all_posts["text"])
corpus = [dictionary.doc2bow(text) for text in all_posts["text"]]
lda_model = train_topic_model(corpus=corpus,id2word=dictionary, num_topics=200)

#calculate topic probability for each post, takes ~4-5 minutes
topic_probabilities = get_topic_probabilities(lda_model, corpus)

#add topic probabilities for each post to df
all_posts_df = pd.DataFrame(all_posts)
all_posts_df["topic_probabilities"] = topic_probabilities

#pickle, only run once; `with` closes (and flushes) the file before it is read back
with open(f"{FOLDER}/after_lda_df.pkl", "wb") as f:
  pickle.dump(all_posts_df, f)

## RoBERTa Embeddings

In [None]:
#ONLY RUN ONCE

from transformers import AutoConfig, AutoModelForMaskedLM, AutoTokenizer
import torch
import gc

# Load model directly
config = AutoConfig.from_pretrained("distilroberta-base")
# ask the model to return hidden states; get_roberta_embeddings reads output.hidden_states
config.output_hidden_states = True
# posts are passed to the tokenizer as lists of words (is_split_into_words=True);
# presumably add_prefix_space=True is what the tokenizer needs for that input -- TODO confirm
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base", add_prefix_space=True)
roberta_model = AutoModelForMaskedLM.from_pretrained("distilroberta-base", config=config)


def get_roberta_embeddings(model, post, tokenizer, device=None) -> torch.Tensor:
  """
  Uses a pretrained model to get the embedding representation of one post.
  parameters - model: pretrained model that returns hidden_states
               post: one tokenized post (list of words); an empty post is
                     embedded as a single empty string
               tokenizer: tokenizer used to encode the post
               device: device to run on; defaults to 'cuda' when available,
                       otherwise 'cpu'
  returns - 1-D CPU tensor: the mean over the token positions of
            hidden_states[5] for the post
  """
  if device is None:
    #fall back to the CPU instead of crashing when no GPU is present
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
  model.to(device)
  model.eval()

  #represent empty posts, which previously exclusively contained stop words
  if not post:
    post = [""]

  #tokenize (at most 100 tokens), return tensors
  encoded = tokenizer(post, padding=True, max_length=100, truncation=True,
                      is_split_into_words=True, return_tensors='pt')

  with torch.no_grad():
    #put input on the target device, find outputs
    encoded = encoded.to(device)
    output = model(**encoded)
    #access output of 5th layer, which is index 5 when accounting for word embedding output;
    #[0] drops only the batch axis (a bare squeeze() would also drop a length-1 token axis)
    token_embeddings = output.hidden_states[5][0].cpu()
    #memory saving measures
    del encoded, output
    if str(device).startswith('cuda'):
      torch.cuda.empty_cache()
    #find average of token embeddings for post embedding
    post_embeddings = torch.mean(token_embeddings, dim=0)
  return post_embeddings



#get roberta embeddings (~14-15 minutes) for all posts and add to all_posts dataframe
with open(f"{FOLDER}/after_lda_df.pkl", "rb") as f:
  all_posts_df = pickle.load(f)

#iterate the column itself (positional) instead of looking rows up by index label
post_embeddings = [get_roberta_embeddings(roberta_model, post, tokenizer)
                  for post in tqdm(all_posts_df["text"], desc='Getting post embeddings')]

all_posts_df["roberta_embeddings"] = post_embeddings
#`with` closes (and flushes) the file before later cells read it back
with open(f"{FOLDER}/after_roberta_df.pkl", "wb") as f:
  pickle.dump(all_posts_df, f)

## Main

In [None]:
def main(X, y):
  """
  Evaluate a random forest with 5-fold cross validation and print the mean
  training and testing ROC AUC.

  parameters: X -- one feature vector per post
              y -- one label per post (scoring is 'roc_auc')
  returns: nothing; the two mean scores are printed
  """
  #seeded like the KFold split so repeated runs print the same scores
  rf_classifier = RandomForestClassifier(random_state=42)
  cv = KFold(n_splits=5, shuffle=True, random_state=42)
  results = cross_validate(rf_classifier, X=X, y=y, cv=cv, scoring='roc_auc', return_train_score=True)

  #average the per-fold scores
  avg_train_score = np.mean(results['train_score'])
  avg_test_score = np.mean(results['test_score'])
  print(f"Training score: {avg_train_score}")
  print(f"Testing score: {avg_test_score}")

In [None]:
#create symptom dataframes, perform both lda and roberta testing for each symptom
from sklearn.model_selection import train_test_split
with open(f"{FOLDER}/after_roberta_df.pkl", "rb") as f:
  all_posts_df = pickle.load(f)

def symptom_vs_control(label):
  """Return the posts labelled `label` together with all control posts (label 0)."""
  return all_posts_df.loc[(all_posts_df["label"] == label) | (all_posts_df["label"] == 0)]

#symptom datasets with control added for one to one classification
anger_df = symptom_vs_control(1)
anhedonia_df = symptom_vs_control(2)
anxiety_df = symptom_vs_control(3)
eating_df = symptom_vs_control(4)
loneliness_df = symptom_vs_control(5)
sadness_df = symptom_vs_control(6)
loathing_df = symptom_vs_control(7)
insomnia_df = symptom_vs_control(8)
somatic_df = symptom_vs_control(9)
worthless_df = symptom_vs_control(10)

#lda testing with topic probabilities, converted to list for classifier compatibility
lda_runs = [("Anger", anger_df),
            ("Anhedonia", anhedonia_df),
            ("Anxiety", anxiety_df),
            ("Eating", eating_df),
            ("Loneliness", loneliness_df),
            ("Sadness", sadness_df),
            ("Loathing", loathing_df),
            ("Insomnia", insomnia_df),
            ("Somatic", somatic_df),
            ("Worthless", worthless_df)]

for symptom_name, symptom_df in lda_runs:
  print(f"{symptom_name} LDA")
  main(symptom_df["topic_probabilities"].tolist(), symptom_df["label"].tolist())




Anger LDA
Training score: 0.9992857174409266
Testing score: 0.9346233191775315
Anhedonia LDA
Training score: 0.9995597758628092
Testing score: 0.9598393680870757
Anxiety LDA
Training score: 0.9998807385164881
Testing score: 0.9401399206229929
Eating LDA
Training score: 0.9993281109981856
Testing score: 0.9586837041012789
Loneliness LDA
Training score: 0.9998413671993003
Testing score: 0.8647183865958381
Sadness LDA
Training score: 0.9994498659547606
Testing score: 0.8527168959859776
Loathing LDA
Training score: 0.9993851983552184
Testing score: 0.8819954616000654
Insomnia LDA
Training score: 0.9996680930717463
Testing score: 0.9782681950346429
Somatic LDA
Training score: 0.9993919133526784
Testing score: 0.9193625251720301
Worthless LDA
Training score: 0.9981798009859274
Testing score: 0.657632803439651


In [None]:
#distilRoberta testing with post embeddings
#one (name, dataframe) pair per symptom; every dataframe also holds the control posts
roberta_runs = [("Anger", anger_df),
                ("Anhedonia", anhedonia_df),
                ("Anxiety", anxiety_df),
                ("Eating", eating_df),
                ("Loneliness", loneliness_df),
                ("Sadness", sadness_df),
                ("Loathing", loathing_df),
                ("Insomnia", insomnia_df),
                ("Somatic", somatic_df),
                ("Worthless", worthless_df)]

for symptom_name, symptom_df in roberta_runs:
  print(f"{symptom_name} distilRoberta")
  main(symptom_df["roberta_embeddings"].tolist(), symptom_df["label"].tolist())


Anger distilRoberta
Training score: 0.9999993566656874
Testing score: 0.9341479404465952
Anhedonia distilRoberta
Training score: 0.9999994474437205
Testing score: 0.9583828131147701
Anxiety distilRoberta
Training score: 0.9999746613213343
Testing score: 0.9523284098927516
Eating distilRoberta
Training score: 1.0
Testing score: 0.9657040676961428
Loneliness distilRoberta
Training score: 0.9999374560665926
Testing score: 0.9077385469268918
Sadness distilRoberta
Training score: 0.9999018379174591
Testing score: 0.9183706115966425
Loathing distilRoberta
Training score: 0.9999778148156869
Testing score: 0.9284033092691135
Insomnia distilRoberta
Training score: 1.0
Testing score: 0.9773468964051535
Somatic distilRoberta
Training score: 0.9999998027087538
Testing score: 0.9392502671435782
Worthless distilRoberta
Training score: 1.0
Testing score: 0.816783019663774
