# Reddit Depression Final Project
Link to the paper: https://dl.acm.org/doi/pdf/10.1145/3578503.3583621

Read through the paper fully before starting the assignment!

## Installation and Imports

In [1]:
pip install happiestfuntokenizing

Collecting happiestfuntokenizing
  Downloading happiestfuntokenizing-0.0.7.tar.gz (6.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: happiestfuntokenizing
  Building wheel for happiestfuntokenizing (setup.py) ... [?25l[?25hdone
  Created wheel for happiestfuntokenizing: filename=happiestfuntokenizing-0.0.7-py3-none-any.whl size=6710 sha256=57db10ea684d2c716d12c8bd7c4a48951301a0fe0d74d35669da4d27a2d053b7
  Stored in directory: /root/.cache/pip/wheels/bf/c9/4d/310f0c60855eb7b428558f29d93cf464dbb64c1b8628753395
Successfully built happiestfuntokenizing
Installing collected packages: happiestfuntokenizing
Successfully installed happiestfuntokenizing-0.0.7


In [2]:
import numpy as np
import pandas as pd
import re
import nltk
import torch
import multiprocessing

from collections import Counter
from gensim import corpora, models
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from gensim.models import LdaModel
from gensim.models import LdaMulticore
from transformers import RobertaModel, RobertaTokenizer
from happiestfuntokenizing.happiestfuntokenizing import Tokenizer

from datetime import datetime, timedelta
from sklearn.model_selection import cross_validate, cross_val_score, KFold, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

import spacy
nlp = spacy.load("en_core_web_sm")

from nltk.tokenize import word_tokenize
nltk.download('punkt')

import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)

from google.colab import drive
drive.mount('/content/drive')
FILEPATH = 'drive/MyDrive/data/'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Mounted at /content/drive


## Dataset Preparation

In [10]:
def load(filepath):
    """Load pickles"""
    try:
        data = pd.read_pickle(filepath)
        return data

    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Using the load function
FILEPATH = '/content/drive/My Drive/Colab/Final Project/student.pkl'
# FILEPATH = 'student.pkl'
data = load(FILEPATH)

if data is not None:
    print(data.head())


                                                text            author  \
0  does your life feel like a waste mines not a c...        trademeple   
1  Just relapsed again. Any advice I just got to ...          kenny818   
2  Audio and mic not working? So I have a HyperX ...          psyjinks   
3  PG&amp;E: Mylar balloon causes outage in centr...            Majnum   
4                                    Um... Forward?   OldManoftheNorth   

     subreddit  created_utc     date  
0   depression   1504920055  2017-09  
1        NoFap   1507890053  2017-10  
2  techsupport   1513558467  2017-12  
3  nottheonion   1499573023  2017-07  
4        memes   1516842851  2018-01  


In [11]:
def dataset_generation(data):
    """
    Build control and symptom datasets.

    Parameters:
    data (pd.DataFrame): DataFrame containing the Reddit posts data.

    Returns:
    tuple: Two DataFrames (symptom-related posts & control posts)
    """
    # Define the subreddits for each of the 13 depression symptoms
    subreddit_to_symptom = {
    'Anger': ['Anger'],
    'Anhedonia': ['anhedonia', 'DeadBedrooms'],
    'Anxiety': ['Anxiety', 'AnxietyDepression', 'HealthAnxiety', 'PanicAttack'],
    'Concentration deficit': ['DecisionMaking', 'shouldi'],
    'Disordered eating': ['bingeeating', 'BingeEatingDisorder', 'EatingDisorders', 'eating_disorders', 'EDAnonymous'],
    'Fatigue': ['chronicfatigue', 'Fatigue'],
    'Loneliness': ['ForeverAlone', 'lonely'],
    'Sad mood': ['cry', 'grief', 'sad', 'Sadness'],
    'Self-loathing': ['AvPD', 'SelfHate', 'selfhelp', 'socialanxiety', 'whatsbotheringyou'],
    'Sleep problem': ['insomnia', 'sleep'],
    'Somatic complaint': ['cfs', 'ChronicPain', 'Constipation', 'EssentialTremor', 'headaches', 'ibs', 'tinnitus'],
    'Suicidal thoughts and attempts': ['AdultSelfHarm', 'selfharm', 'SuicideWatch'],
    'Worthlessness': ['Guilt', 'Pessimism', 'selfhelp', 'whatsbotheringyou']
    }

    # Reverse the subreddit to symptom mapping for easier lookup
    symptom_to_subreddit = {sub: symptom for symptom, subs in subreddit_to_symptom.items() for sub in subs}

    # Step 1: Label all posts with their corresponding symptoms
    data['symptom'] = data['subreddit'].map(symptom_to_subreddit).fillna('Control')

    ## Step 2: Build a control dataset
    # Find posts from non-symptom-related subreddits
    # by authors who have made at least one symptom-related post
    symptom_authors = data[data['symptom'] != 'Control']['author'].unique()
    control_df = data[(~data['subreddit'].isin(symptom_to_subreddit)) & (data['author'].isin(symptom_authors))]

    # Convert 'created_utc' to a datetime and filter control posts
    control_df['created_utc'] = pd.to_datetime(control_df['created_utc'], unit='s')
    data['created_utc'] = pd.to_datetime(data['created_utc'], unit='s')

    # Get the first symptom post date for each author
    first_symptom_post_dates = data[data['symptom'] != 'Control'].groupby('author')['created_utc'].min()

    # Filter the control posts to be at least 180 days before the first symptom post
    control_df = control_df.join(first_symptom_post_dates.rename('first_symptom_post_date'), on='author')
    control_df = control_df[control_df['created_utc'] < (control_df['first_symptom_post_date'] - pd.Timedelta(days=180))]

    # Drop unnecessary columns
    control_df.drop(['first_symptom_post_date'], axis=1, inplace=True)

    # Remove duplicates if any
    control_df.drop_duplicates(inplace=True)

    ## Step 3: Build a symptom dataset
    # Filter out the control posts
    symptom_df = data[data['symptom'] != 'Control']

    # Remove duplicates if any
    symptom_df.drop_duplicates(inplace=True)

    # Step 4: Keep only post and symptoms
    symptom_df = symptom_df[['symptom','text']]
    control_df = control_df[['symptom','text']]

    return symptom_df, control_df


In [12]:
symptom_df, control_df = dataset_generation(data)

# Check the shape of symptom and control dataset
print("Shape of Symptom Dataset:", symptom_df.shape)
print("Shape of Control Dataset:", control_df.shape)


Shape of Symptom Dataset: (94514, 2)
Shape of Control Dataset: (4369, 2)


In [13]:
df = pd.concat([symptom_df, control_df])
df

Unnamed: 0,symptom,text
20,Suicidal thoughts and attempts,"i'm trying hi, i'm sorry if my writing is bad,..."
39,Loneliness,Only friend has been blanking me for what feel...
67,Anxiety,Study hall social anxiety bruh We had a study ...
72,Anxiety,Positive Thoughts For You - We Are Happy To Pu...
79,Suicidal thoughts and attempts,Starting from a blowup mattress Today was a ve...
...,...,...
1968023,Control,"Mouse basics? Hi there, \r\n\r\nI've recently ..."
1969250,Control,[Serious] If a person has a serious life threa...
1969342,Control,What are the Peasants views on a free and inde...
1969582,Control,For some reason this makes me uncomfortable


In [15]:
pickle_file_path = '/content/drive/My Drive/Colab/Final Project/df.pkl'
# pickle_file_path = 'df.pkl'
df.to_pickle(pickle_file_path)
print(f"DataFrame saved to {pickle_file_path}")

df_loaded = pd.read_pickle(pickle_file_path)

DataFrame saved to /content/drive/My Drive/Colab/Final Project/df.pkl


In [16]:
df['symptom'].value_counts ()

Suicidal thoughts and attempts    26520
Anxiety                           24514
Loneliness                        11535
Somatic complaint                  8330
Self-loathing                      8115
Anhedonia                          5934
Control                            4369
Sleep problem                      3184
Sad mood                           2222
Worthlessness                      1805
Disordered eating                  1789
Anger                               555
Concentration deficit                10
Fatigue                               1
Name: symptom, dtype: int64

In [17]:
# List of symptoms to remove
symptoms_to_remove = ['Suicidal thoughts and attempts', 'Concentration deficit', 'Fatigue']

filtered_df = df[~df['symptom'].isin(symptoms_to_remove)]
filtered_df

Unnamed: 0,symptom,text
39,Loneliness,Only friend has been blanking me for what feel...
67,Anxiety,Study hall social anxiety bruh We had a study ...
72,Anxiety,Positive Thoughts For You - We Are Happy To Pu...
133,Anhedonia,Love Language Opposites In the process of divo...
172,Loneliness,2meirl42meirl4meirl
...,...,...
1968023,Control,"Mouse basics? Hi there, \r\n\r\nI've recently ..."
1969250,Control,[Serious] If a person has a serious life threa...
1969342,Control,What are the Peasants views on a free and inde...
1969582,Control,For some reason this makes me uncomfortable


In [18]:
filtered_df['symptom'].value_counts ()

Anxiety              24514
Loneliness           11535
Somatic complaint     8330
Self-loathing         8115
Anhedonia             5934
Control               4369
Sleep problem         3184
Sad mood              2222
Worthlessness         1805
Disordered eating     1789
Anger                  555
Name: symptom, dtype: int64

## Preprocessing

### Tokenize

In [19]:
def tokenize(text):
    # Initialize the tokenizer
    tokenizer = Tokenizer(preserve_case=False)
    tokenized_texts = [tokenizer.tokenize(text) for text in text]

    return tokenized_texts

In [20]:
# Tokenize whole and fetch control tokens
filtered_df['tokens'] = tokenize(filtered_df['text'])

"""
# Tokenize control
control_df = df[df['symptom'] == 'Control']
control_df['tokens'] = tokenize(control_df['text'])
"""

"\n# Tokenize control\ncontrol_df = df[df['symptom'] == 'Control']\ncontrol_df['tokens'] = tokenize(control_df['text'])\n"

In [21]:
tokenize_df = filtered_df[['symptom','tokens']]
tokenize_df

Unnamed: 0,symptom,tokens
39,Loneliness,"[only, friend, has, been, blanking, me, for, w..."
67,Anxiety,"[study, hall, social, anxiety, bruh, we, had, ..."
72,Anxiety,"[positive, thoughts, for, you, -, we, are, hap..."
133,Anhedonia,"[love, language, opposites, in, the, process, ..."
172,Loneliness,[2meirl42meirl4meirl]
...,...,...
1968023,Control,"[mouse, basics, ?, hi, there, ,, i've, recentl..."
1969250,Control,"[[, serious, ], if, a, person, has, a, serious..."
1969342,Control,"[what, are, the, peasants, views, on, a, free,..."
1969582,Control,"[for, some, reason, this, makes, me, uncomfort..."


In [22]:
tokenize_df['symptom'].value_counts ()

Anxiety              24514
Loneliness           11535
Somatic complaint     8330
Self-loathing         8115
Anhedonia             5934
Control               4369
Sleep problem         3184
Sad mood              2222
Worthlessness         1805
Disordered eating     1789
Anger                  555
Name: symptom, dtype: int64

In [23]:
control_tokenize_df = filtered_df[filtered_df['symptom'] == 'Control']
len(control_tokenize_df)

4369

### Stop Word

In [24]:
def stop_words(tokens):
    """
    Identify the top 100 words from a DataFrame of tokenized documents to use as stop words.
    """
    # Flatten the list of lists of tokens
    all_tokens = [token for tokens_list in tokenize_df['tokens'] for token in tokens_list]

    # Count freq
    token_freq = Counter(all_tokens)

    # Identify the top 100 most frequent words
    top_100_words = [word for word, freq in token_freq.most_common(100)]

    return top_100_words


In [25]:
control_stop_words = stop_words(control_tokenize_df)
#control_stop_words

In [26]:
# Remove stop words from the tokenize_df
def remove_stop_words(tokens, stop_words):
    return [token for token in tokens if token not in stop_words]

tokenize_df['tokens'] = tokenize_df['tokens'].apply(lambda tokens: remove_stop_words(tokens, control_stop_words))


In [27]:
clean_df = tokenize_df[['symptom','tokens']]
clean_df

Unnamed: 0,symptom,tokens
39,Loneliness,"[only, friend, blanking, feels, months, comple..."
67,Anxiety,"[study, hall, social, bruh, study, hall, gym, ..."
72,Anxiety,"[positive, thoughts, happy, publish]"
133,Anhedonia,"[love, language, opposites, process, divorce, ..."
172,Loneliness,[2meirl42meirl4meirl]
...,...,...
1968023,Control,"[mouse, basics, hi, recently, taken, sisters, ..."
1969250,Control,"[[, serious, ], person, serious, threatening, ..."
1969342,Control,"[peasants, views, free, independent, quebec, q..."
1969582,Control,"[reason, makes, uncomfortable]"


In [28]:
clean_df['symptom'].value_counts ()

Anxiety              24514
Loneliness           11535
Somatic complaint     8330
Self-loathing         8115
Anhedonia             5934
Control               4369
Sleep problem         3184
Sad mood              2222
Worthlessness         1805
Disordered eating     1789
Anger                  555
Name: symptom, dtype: int64

## Reddit Topics with LDA

 - Don't use MALLET (as the paper does), use some other LDA implementation.

Outline
1. Remove the top 100 most frequent words (already done in your previous steps).
2. Use LDA to identify topics in the posts, aiming to generate 200 topics.
3. Obtain the distribution of each topic for all posts in the dataset.

In [29]:
# sampled_df = clean_df.sample(n=3000, random_state=42)

In [31]:
# TODO: Your LDA code!
def perform_lda(clean_df, num_topics=200, alpha=5, workers=2):
    # Create a dictionary and corpus
    token_lists = clean_df['tokens'].tolist()
    dictionary = corpora.Dictionary(token_lists)
    corpus = [dictionary.doc2bow(tokens) for tokens in token_lists]

    # Train the LDA model using LdaMulticore
    lda_model = LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dictionary,
                             alpha=alpha, workers=workers)

    # Get topic vectors
    topic_vectors = []

    for doc in corpus:
        topic_distribution = lda_model.get_document_topics(doc, minimum_probability=0)
        doc_vector = [0] * num_topics # Initialize a vector of zeros for each topic
        for topic_num, prob in topic_distribution:
            doc_vector[topic_num] = prob # Assign the probability to the correct topic index
        topic_vectors.append(doc_vector)

    topic_df = pd.DataFrame(topic_vectors, columns=[f'topic_{i}' for i in range(num_topics)])

    # Concatenate the symptom labels with the topic probabilities
    lda_df = pd.concat([clean_df[['symptom']].reset_index(drop=True), topic_df], axis=1)

    return lda_model, lda_df

# Assuming sampled_df is your dataframe and it has 'tokens' and 'symptom' columns
lda_model, lda_df = perform_lda(clean_df, num_topics=200, alpha=5, workers=2)

In [32]:
lda_df

Unnamed: 0,symptom,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_190,topic_191,topic_192,topic_193,topic_194,topic_195,topic_196,topic_197,topic_198,topic_199
0,Loneliness,0.005018,0.005000,0.004991,0.005001,0.004995,0.004988,0.004990,0.005010,0.004991,...,0.005000,0.005011,0.004997,0.005002,0.005019,0.004998,0.005013,0.004990,0.005005,0.004981
1,Anxiety,0.005041,0.005012,0.004994,0.004963,0.004996,0.004994,0.004979,0.004982,0.004979,...,0.005070,0.004992,0.004989,0.004990,0.004994,0.004984,0.004978,0.005003,0.005025,0.005128
2,Anxiety,0.004999,0.005002,0.004999,0.005001,0.004998,0.005002,0.005000,0.005001,0.005000,...,0.005002,0.004997,0.005002,0.005001,0.004999,0.005001,0.004999,0.004999,0.005002,0.005001
3,Anhedonia,0.004911,0.004917,0.004912,0.005007,0.004926,0.005136,0.005090,0.004989,0.004964,...,0.004996,0.004949,0.004947,0.005097,0.004977,0.004971,0.004921,0.004957,0.005003,0.005091
4,Loneliness,0.005000,0.005000,0.005000,0.005000,0.005000,0.005000,0.005000,0.005000,0.005000,...,0.005000,0.005000,0.005000,0.005000,0.005000,0.005000,0.005000,0.005000,0.005000,0.005000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72347,Control,0.005165,0.004988,0.004927,0.004964,0.004977,0.005024,0.005061,0.004932,0.004948,...,0.004933,0.005049,0.004992,0.004970,0.004970,0.004966,0.005022,0.004952,0.004931,0.004985
72348,Control,0.004998,0.004996,0.004992,0.004987,0.005011,0.004999,0.004999,0.004998,0.004988,...,0.005016,0.005008,0.004997,0.004998,0.005002,0.005000,0.005005,0.005007,0.004999,0.004998
72349,Control,0.004989,0.005007,0.004979,0.004970,0.005007,0.005030,0.004981,0.004984,0.004971,...,0.004990,0.004981,0.004981,0.004981,0.004977,0.004989,0.004992,0.004985,0.005004,0.005041
72350,Control,0.005001,0.005002,0.005000,0.005002,0.004999,0.005000,0.005003,0.005003,0.005000,...,0.005000,0.004999,0.005000,0.005000,0.005001,0.004999,0.005001,0.005000,0.004999,0.004998


In [None]:
def train_rf_for_each_symptom(lda_df, symptom_list):
    # DataFrame to store AUC scores for each symptom
    auc_scores_df = pd.DataFrame(columns=['Symptom', 'Train AUC', 'Test AUC'])

    for symptom in symptom_list:
        print(f"Training model for symptom: {symptom}")

        symptom_df = lda_df[lda_df['symptom'] == symptom]
        control_df = lda_df[lda_df['symptom'] == 'Control']
        combined_df = pd.concat([symptom_df, control_df])

        y = (combined_df['symptom'] == symptom).astype(int)
        X = combined_df.drop(columns=['symptom']).values

        # Check if the symptom category has more than one class
        if np.unique(y).size > 1:
            rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=10, min_samples_leaf=4, n_jobs=-1)

            # Perform 5-fold cross-validation
            cv = KFold(n_splits=5, shuffle=True)
            results = cross_validate(rf_classifier, X=X, y=y, cv=cv, scoring='roc_auc', return_train_score=True)

            # Calculate average AUC scores
            train_auc = np.mean(results['train_score'])
            test_auc = np.mean(results['test_score'])

            auc_scores_df = auc_scores_df.append({'Symptom': symptom,'Train AUC': train_auc,'Test AUC': test_auc}, ignore_index=True)
        else:
            print(f"Skipping AUC calculation for '{symptom}' due to only one class present.")

    return auc_scores_df

symptom_list = ['Anger', 'Anhedonia', 'Anxiety', 'Disordered eating', 'Loneliness', 'Sad mood', 'Self-loathing', 'Sleep problem', 'Somatic complaint', 'Worthlessness']
lda_auc_scores_df = train_rf_for_each_symptom(lda_df, symptom_list)
lda_auc_scores_df


Training model for symptom: Anger
Training model for symptom: Anhedonia
Training model for symptom: Anxiety
Training model for symptom: Disordered eating
Training model for symptom: Loneliness
Training model for symptom: Sad mood
Training model for symptom: Self-loathing
Training model for symptom: Sleep problem
Training model for symptom: Somatic complaint
Training model for symptom: Worthlessness


Unnamed: 0,Symptom,Train AUC,Test AUC
0,Anger,0.997259,0.885057
1,Anhedonia,0.991816,0.905243
2,Anxiety,0.995128,0.814905
3,Disordered eating,0.993878,0.804873
4,Loneliness,0.998944,0.702759
5,Sad mood,0.998163,0.743941
6,Self-loathing,0.993611,0.798047
7,Sleep problem,0.992108,0.833361
8,Somatic complaint,0.996363,0.784881
9,Worthlessness,1.0,0.570013


## RoBERTa Embeddings

In [None]:
#roberta_sampled_df = filtered_df.sample(n=3000, random_state=42)

In [None]:
#roberta_sampled_df

Unnamed: 0,symptom,text,tokens
39,Loneliness,Only friend has been blanking me for what feel...,"[only, friend, has, been, blanking, me, for, w..."
67,Anxiety,Study hall social anxiety bruh We had a study ...,"[study, hall, social, anxiety, bruh, we, had, ..."
72,Anxiety,Positive Thoughts For You - We Are Happy To Pu...,"[positive, thoughts, for, you, -, we, are, hap..."
133,Anhedonia,Love Language Opposites In the process of divo...,"[love, language, opposites, in, the, process, ..."
172,Loneliness,2meirl42meirl4meirl,[2meirl42meirl4meirl]
...,...,...,...
1968023,Control,"Mouse basics? Hi there, \r\n\r\nI've recently ...","[mouse, basics, ?, hi, there, ,, i've, recentl..."
1969250,Control,[Serious] If a person has a serious life threa...,"[[, serious, ], if, a, person, has, a, serious..."
1969342,Control,What are the Peasants views on a free and inde...,"[what, are, the, peasants, views, on, a, free,..."
1969582,Control,For some reason this makes me uncomfortable,"[for, some, reason, this, makes, me, uncomfort..."


In [36]:
def get_roberta_layer_embeddings(dataframe, model_name='roberta-base', layer_num=9):
    # Load RoBERTa model and tokenizer
    tokenizer = RobertaTokenizer.from_pretrained(model_name)
    model = RobertaModel.from_pretrained(model_name, output_hidden_states=True)

    # If CUDA (GPU support) is available, move the model to the GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    def extract_layer_embeddings(text):
        inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)

        # Move input tensors to the same device as the model
        inputs = {key: value.to(device) for key, value in inputs.items()}

        # Get model outputs
        with torch.no_grad():
            outputs = model(**inputs)

        hidden_states = outputs.hidden_states
        layer = hidden_states[layer_num]

        # Identify indices of non-special tokens ([CLS] is at position 0 and [SEP] is at the end)
        input_ids = inputs['input_ids'].squeeze()  # Remove batch dimension
        token_indices = torch.where((input_ids != tokenizer.cls_token_id) & (input_ids != tokenizer.sep_token_id))[0]

        # Calculate the mean of embeddings for non-special tokens
        return layer[:, token_indices, :].mean(dim=1).squeeze().cpu().numpy()

    embeddings = []
    for _, row in dataframe.iterrows():
        text = ' '.join(row['tokens'])
        embeddings.append(extract_layer_embeddings(text))

    dataframe['roberta_layer_embeddings'] = embeddings

    return dataframe

roberta_df = get_roberta_layer_embeddings(filtered_df)
roberta_df

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Unnamed: 0,symptom,text,tokens,roberta_layer_embeddings
39,Loneliness,Only friend has been blanking me for what feel...,"[only, friend, has, been, blanking, me, for, w...","[0.13779949, -0.22853173, -0.13161923, -0.0517..."
67,Anxiety,Study hall social anxiety bruh We had a study ...,"[study, hall, social, anxiety, bruh, we, had, ...","[0.22733095, -0.3250484, 0.07622574, 0.1280205..."
72,Anxiety,Positive Thoughts For You - We Are Happy To Pu...,"[positive, thoughts, for, you, -, we, are, hap...","[0.125981, -0.103187196, -0.15969585, 0.113437..."
133,Anhedonia,Love Language Opposites In the process of divo...,"[love, language, opposites, in, the, process, ...","[0.10807705, -0.108680114, 0.14013788, -0.0339..."
172,Loneliness,2meirl42meirl4meirl,[2meirl42meirl4meirl],"[-0.0727954, -0.3522391, -0.040860772, 0.57268..."
...,...,...,...,...
1968023,Control,"Mouse basics? Hi there, \r\n\r\nI've recently ...","[mouse, basics, ?, hi, there, ,, i've, recentl...","[0.09425101, -0.08012942, 0.005038114, -0.2179..."
1969250,Control,[Serious] If a person has a serious life threa...,"[[, serious, ], if, a, person, has, a, serious...","[0.1475767, 0.037965745, 0.0763892, -0.1509939..."
1969342,Control,What are the Peasants views on a free and inde...,"[what, are, the, peasants, views, on, a, free,...","[0.09628822, 0.00863403, 0.14035712, 0.1642202..."
1969582,Control,For some reason this makes me uncomfortable,"[for, some, reason, this, makes, me, uncomfort...","[0.21933109, -0.0378911, -0.23542161, 0.218970..."


In [None]:
def train_rf_for_each_symptom(dataframe, symptom_list):
    # DataFrame to store AUC scores for each symptom
    auc_scores_df = pd.DataFrame(columns=['Symptom', 'Train AUC', 'Test AUC'])

    for symptom in symptom_list:
        print(f"Training model for symptom: {symptom}")

        # Create labels: 1 for the current symptom, 0 for all others
        y = (dataframe['symptom'] == symptom).astype(int).values
        X = np.stack(dataframe['roberta_layer_embeddings'].values)

        # Initialize Random Forest Classifier
        rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=10, min_samples_leaf=4, n_jobs=-1)

        # Perform 5-fold cross-validation
        cv = KFold(n_splits=5, shuffle=True, random_state=42)
        results = cross_validate(rf_classifier, X=X, y=y, cv=cv, scoring='roc_auc', return_train_score=True)

        # Calculate average AUC scores
        train_auc = np.mean(results['train_score'])
        test_auc = np.mean(results['test_score'])

        # Store results
        auc_scores_df = auc_scores_df.append({'Symptom': symptom,'Train AUC': train_auc,'Test AUC': test_auc}, ignore_index=True)

    return auc_scores_df

symptom_list = ['Anger', 'Anhedonia', 'Anxiety', 'Disordered eating', 'Loneliness', 'Sad mood', 'Self-loathing', 'Sleep problem', 'Somatic complaint', 'Worthlessness']
auc_scores_df = train_rf_for_each_symptom(roberta_df, symptom_list)
print(auc_scores_df)

Training model for symptom: Anger
Training model for symptom: Anhedonia
Training model for symptom: Anxiety
Training model for symptom: Disordered eating
Training model for symptom: Loneliness
Training model for symptom: Sad mood
Training model for symptom: Self-loathing
Training model for symptom: Sleep problem
Training model for symptom: Somatic complaint
Training model for symptom: Worthlessness
             Symptom  Train AUC  Test AUC
0              Anger   0.999948  0.877589
1          Anhedonia   0.996708  0.955641
2            Anxiety   0.946312  0.856849
3  Disordered eating   0.998779  0.922961
4         Loneliness   0.981171  0.903972
5           Sad mood   0.975876  0.884950
6      Self-loathing   0.970767  0.840128
7      Sleep problem   0.997330  0.953684
8  Somatic complaint   0.975946  0.898474
9      Worthlessness   0.914727  0.846817


# Main

In [33]:
def main(X, y, df, symptom):
  """
  Here's the basic structure of the main block! It should run
  5-fold cross validation with random forest to evaluate your RoBERTa and LDA
  performance.
  """
  rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=10, min_samples_leaf=4, n_jobs=-1)
  # cv = KFold(n_splits=5, shuffle=True)
  cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
  results = cross_validate(rf_classifier, X=X, y=y, cv=cv, scoring='roc_auc', return_train_score=True)

  # TODO: Print your training and testing scores!
  # Calculate average AUC scores
  train_auc = np.mean(results['train_score'])
  test_auc = np.mean(results['test_score'])

  df = df.append({'Symptom': symptom,'Train AUC': train_auc,'Test AUC': test_auc}, ignore_index=True)

  return df

In [34]:
# LDA
from sklearn.model_selection import GridSearchCV, KFold
def train_rf_for_each_symptom(lda_df, symptom_list):
    # DataFrame to store AUC scores for each symptom
    auc_scores_df = pd.DataFrame(columns=['Symptom', 'Train AUC', 'Test AUC'])

    for symptom in symptom_list:
        print(f"Training model for symptom: {symptom}")

        symptom_df = lda_df[lda_df['symptom'] == symptom]
        control_df = lda_df[lda_df['symptom'] == 'Control']
        combined_df = pd.concat([symptom_df, control_df])

        y = (combined_df['symptom'] == symptom).astype(int)
        X = combined_df.drop(columns=['symptom']).values

        # Check if the symptom category has more than one class
        if np.unique(y).size > 1:
            auc_scores_df = main(X, y, auc_scores_df, symptom)
        else:
            print(f"Skipping AUC calculation for '{symptom}' due to only one class present.")

    return auc_scores_df

symptom_list = ['Anger', 'Anhedonia', 'Anxiety', 'Disordered eating', 'Loneliness', 'Sad mood', 'Self-loathing', 'Sleep problem', 'Somatic complaint', 'Worthlessness']
lda_auc_scores_df = train_rf_for_each_symptom(lda_df, symptom_list)
print(lda_auc_scores_df)

Training model for symptom: Anger
Training model for symptom: Anhedonia
Training model for symptom: Anxiety
Training model for symptom: Disordered eating
Training model for symptom: Loneliness
Training model for symptom: Sad mood
Training model for symptom: Self-loathing
Training model for symptom: Sleep problem
Training model for symptom: Somatic complaint
Training model for symptom: Worthlessness
             Symptom  Train AUC  Test AUC
0              Anger   0.948942  0.780650
1          Anhedonia   0.921271  0.836802
2            Anxiety   0.865990  0.739456
3  Disordered eating   0.950534  0.835841
4         Loneliness   0.920592  0.681786
5           Sad mood   0.967323  0.706632
6      Self-loathing   0.919654  0.762070
7      Sleep problem   0.969836  0.861935
8  Somatic complaint   0.899346  0.753151
9      Worthlessness   0.968155  0.659192


In [37]:
pickle_file_path = '/content/drive/My Drive/Colab/Final Project/lda_auc_scores_df.pkl'
#pickle_file_path = 'lda_auc_scores_df.pkl'
lda_auc_scores_df.to_pickle(pickle_file_path)
print(f"DataFrame saved to {pickle_file_path}")

lda_loaded_df = pd.read_pickle(pickle_file_path)


DataFrame saved to /content/drive/My Drive/Colab/Final Project/lda_auc_scores_df.pkl


In [38]:
lda_loaded_df

Unnamed: 0,Symptom,Train AUC,Test AUC
0,Anger,0.948942,0.78065
1,Anhedonia,0.921271,0.836802
2,Anxiety,0.86599,0.739456
3,Disordered eating,0.950534,0.835841
4,Loneliness,0.920592,0.681786
5,Sad mood,0.967323,0.706632
6,Self-loathing,0.919654,0.76207
7,Sleep problem,0.969836,0.861935
8,Somatic complaint,0.899346,0.753151
9,Worthlessness,0.968155,0.659192


In [39]:
# RoBERTa
def train_rf_for_each_symptom(dataframe, symptom_list):
    # DataFrame to store AUC scores for each symptom
    auc_scores_df = pd.DataFrame(columns=['Symptom', 'Train AUC', 'Test AUC'])

    for symptom in symptom_list:
        print(f"Training model for symptom: {symptom}")

        # Create labels: 1 for the current symptom, 0 for all others
        y = (dataframe['symptom'] == symptom).astype(int).values
        X = np.stack(dataframe['roberta_layer_embeddings'].values)

        auc_scores_df = main(X, y, auc_scores_df, symptom)

    return auc_scores_df

symptom_list = ['Anger', 'Anhedonia', 'Anxiety', 'Disordered eating', 'Loneliness', 'Sad mood', 'Self-loathing', 'Sleep problem', 'Somatic complaint', 'Worthlessness']
roberta_auc_scores_df = train_rf_for_each_symptom(roberta_df, symptom_list)
print(roberta_auc_scores_df)

Training model for symptom: Anger
Training model for symptom: Anhedonia
Training model for symptom: Anxiety
Training model for symptom: Disordered eating
Training model for symptom: Loneliness
Training model for symptom: Sad mood
Training model for symptom: Self-loathing
Training model for symptom: Sleep problem
Training model for symptom: Somatic complaint
Training model for symptom: Worthlessness
             Symptom  Train AUC  Test AUC
0              Anger   0.999950  0.884188
1          Anhedonia   0.996785  0.955610
2            Anxiety   0.946076  0.856344
3  Disordered eating   0.998527  0.921890
4         Loneliness   0.981341  0.904190
5           Sad mood   0.976754  0.885370
6      Self-loathing   0.971009  0.839931
7      Sleep problem   0.997132  0.950495
8  Somatic complaint   0.975919  0.898153
9      Worthlessness   0.916362  0.848144


In [42]:
pickle_file_path = '/content/drive/My Drive/Colab/Final Project/roberta_auc_scores_df.pkl'
# pickle_file_path = 'roberta_auc_scores_df.pkl'
roberta_auc_scores_df.to_pickle(pickle_file_path)
print(f"DataFrame saved to {pickle_file_path}")

roberta_loaded_df = pd.read_pickle(pickle_file_path)

DataFrame saved to /content/drive/My Drive/Colab/Final Project/roberta_auc_scores_df.pkl


In [43]:
roberta_loaded_df

Unnamed: 0,Symptom,Train AUC,Test AUC
0,Anger,0.99995,0.884188
1,Anhedonia,0.996785,0.95561
2,Anxiety,0.946076,0.856344
3,Disordered eating,0.998527,0.92189
4,Loneliness,0.981341,0.90419
5,Sad mood,0.976754,0.88537
6,Self-loathing,0.971009,0.839931
7,Sleep problem,0.997132,0.950495
8,Somatic complaint,0.975919,0.898153
9,Worthlessness,0.916362,0.848144


In [49]:
final_df = pd.DataFrame()

final_df['Symptom'] = lda_loaded_df['Symptom']
final_df['LDA'] = lda_loaded_df['Test AUC']
final_df['RoBERTa'] = roberta_loaded_df['Test AUC']

print(final_df)

             Symptom       LDA   RoBERTa
0              Anger  0.780650  0.884188
1          Anhedonia  0.836802  0.955610
2            Anxiety  0.739456  0.856344
3  Disordered eating  0.835841  0.921890
4         Loneliness  0.681786  0.904190
5           Sad mood  0.706632  0.885370
6      Self-loathing  0.762070  0.839931
7      Sleep problem  0.861935  0.950495
8  Somatic complaint  0.753151  0.898153
9      Worthlessness  0.659192  0.848144


In [51]:
pickle_file_path = '/content/drive/My Drive/Colab/Final Project/final_df.pkl'
final_df.to_pickle(pickle_file_path)
print(f"DataFrame saved to {pickle_file_path}")

final_loaded_df = pd.read_pickle(pickle_file_path)
final_loaded_df

DataFrame saved to /content/drive/My Drive/Colab/Final Project/final_df.pkl


Unnamed: 0,Symptom,LDA,RoBERTa
0,Anger,0.78065,0.884188
1,Anhedonia,0.836802,0.95561
2,Anxiety,0.739456,0.856344
3,Disordered eating,0.835841,0.92189
4,Loneliness,0.681786,0.90419
5,Sad mood,0.706632,0.88537
6,Self-loathing,0.76207,0.839931
7,Sleep problem,0.861935,0.950495
8,Somatic complaint,0.753151,0.898153
9,Worthlessness,0.659192,0.848144
