# Mental Health Analysis

In [13]:
import pandas as pd
from pathlib import Path
import os
import re

import spacy

In [2]:
current_working_folder = Path.cwd()
current_folder_abs = os.path.abspath(current_working_folder)

data_folder = "reddit_mental_health_data"
data_filepath = f"./{data_folder}/addiction_2018_features_tfidf_256.csv"
addiction_18_df = pd.read_csv(os.path.join(current_folder_abs, data_filepath))
data_folder_path = os.path.join(current_folder_abs,f"./{data_folder}")

In [3]:
required_columns = addiction_18_df.columns[:4]
required_columns

Index(['subreddit', 'author', 'date', 'post'], dtype='object')

In [4]:
csv_files = [f for f in os.listdir(data_folder_path)if f.endswith('.csv')]

data_frames = []

for file in csv_files:
    file_path = os.path.join(data_folder_path, file)
    df = pd.read_csv(file_path)
    df_first_four_columns = df.iloc[:, :4]
    data_frames.append(df_first_four_columns)

mental_health_df = pd.concat(data_frames, ignore_index=True)

mental_health_df.head()

Unnamed: 0,subreddit,author,date,post
0,guns,TrickyWidget,2019/08/29,"Bersa Thunder 22: Long-term reliability? Hi, f..."
1,guns,Phonkboi,2019/08/29,Anybody have any experience with the diamondba...
2,guns,civildallas,2019/08/29,Old &amp; New // Colt 1911 &amp; MCX Rattler I...
3,guns,alphenos,2019/08/29,Start working in a gun shop next week. Need so...
4,guns,ThePrince_OfWhales,2019/08/29,Cheap targets for plinking? I'm headed to the ...


In [5]:
data_subreddits = [
    'EDAnonymous', 'addiction', 'alcoholism', 'adhd', 'anxiety', 'autism',
    'bipolarreddit', 'bpd', 'depression', 'healthanxiety', 'lonely', 'ptsd',
    'schizophrenia', 'socialanxiety', 'suicidewatch', 'mentalhealth', 'COVID19_support',
    'conspiracy', 'divorce', 'fitness', 'guns', 'jokes', 'legaladvice', 'meditation',
    'parenting', 'personalfinance', 'relationships', 'teaching'
]

subreddits_df = pd.DataFrame(data_subreddits, columns=['subreddit'])

mental_health_subreddits = [
    'EDAnonymous', 'addiction', 'alcoholism', 'adhd', 'anxiety', 'autism',
    'bipolarreddit', 'bpd', 'depression', 'healthanxiety', 'lonely', 'ptsd',
    'schizophrenia', 'socialanxiety', 'suicidewatch', 'mentalhealth', 'COVID19_support'
]

subreddits_df['class'] = subreddits_df['subreddit'].apply(lambda x: x if x in mental_health_subreddits else 'non_mental_health')

print(subreddits_df)


          subreddit              class
0       EDAnonymous        EDAnonymous
1         addiction          addiction
2        alcoholism         alcoholism
3              adhd               adhd
4           anxiety            anxiety
5            autism             autism
6     bipolarreddit      bipolarreddit
7               bpd                bpd
8        depression         depression
9     healthanxiety      healthanxiety
10           lonely             lonely
11             ptsd               ptsd
12    schizophrenia      schizophrenia
13    socialanxiety      socialanxiety
14     suicidewatch       suicidewatch
15     mentalhealth       mentalhealth
16  COVID19_support    COVID19_support
17       conspiracy  non_mental_health
18          divorce  non_mental_health
19          fitness  non_mental_health
20             guns  non_mental_health
21            jokes  non_mental_health
22      legaladvice  non_mental_health
23       meditation  non_mental_health
24        parenting  non_

In [6]:
mental_health_df['target'] = mental_health_df['subreddit'].str.strip().apply(lambda x:x if x in list(subreddits_df['class']) else 'non_mental_health')
mental_health_df['target'].unique()

array(['non_mental_health', 'socialanxiety', 'adhd', 'autism', 'ptsd',
       'suicidewatch', 'lonely', 'mentalhealth', 'alcoholism',
       'bipolarreddit', 'schizophrenia', 'depression', 'EDAnonymous',
       'addiction', 'healthanxiety', 'COVID19_support', 'anxiety', 'bpd'],
      dtype=object)

In [7]:
mental_health_df.head()

Unnamed: 0,subreddit,author,date,post,target
0,guns,TrickyWidget,2019/08/29,"Bersa Thunder 22: Long-term reliability? Hi, f...",non_mental_health
1,guns,Phonkboi,2019/08/29,Anybody have any experience with the diamondba...,non_mental_health
2,guns,civildallas,2019/08/29,Old &amp; New // Colt 1911 &amp; MCX Rattler I...,non_mental_health
3,guns,alphenos,2019/08/29,Start working in a gun shop next week. Need so...,non_mental_health
4,guns,ThePrince_OfWhales,2019/08/29,Cheap targets for plinking? I'm headed to the ...,non_mental_health


In [11]:
#DATA SET CREATION AND EMPIRICAL ANALYSIS FOR DETECTING SIGNS OF DEPRESSION FROM SOCIAL MEDIA POSTINGS

# Sampath, Kayalvizhi, and Thenmozhi Durairaj. "Data set creation and empirical analysis for detecting signs of depression from social media postings." 
# International Conference on Computational Intelligence in Data Science. Cham: Springer International Publishing, 2022.

# the text and title part are pre-processed by removing the non-ASCII characters and emoticons
# to get a clean data set

def remove_non_ascii(text):
    cleaned_text = ''.join([char for char in text if ord(char) < 128])
    cleaned_text = re.sub(r'[^\x00-\x7F]+', '', cleaned_text)
    return cleaned_text

mental_health_df['cleaned_post'] = mental_health_df['post'].apply(lambda x: remove_non_ascii(str(x)))
mental_health_df['cleaned_post'] = mental_health_df['cleaned_post'].apply(lambda x: x.replace("&amp;","&"))


In [12]:
mental_health_df.head()

Unnamed: 0,subreddit,author,date,post,target,cleaned_post
0,guns,TrickyWidget,2019/08/29,"Bersa Thunder 22: Long-term reliability? Hi, f...",non_mental_health,"Bersa Thunder 22: Long-term reliability? Hi, f..."
1,guns,Phonkboi,2019/08/29,Anybody have any experience with the diamondba...,non_mental_health,Anybody have any experience with the diamondba...
2,guns,civildallas,2019/08/29,Old &amp; New // Colt 1911 &amp; MCX Rattler I...,non_mental_health,Old & New // Colt 1911 & MCX Rattler If y'all ...
3,guns,alphenos,2019/08/29,Start working in a gun shop next week. Need so...,non_mental_health,Start working in a gun shop next week. Need so...
4,guns,ThePrince_OfWhales,2019/08/29,Cheap targets for plinking? I'm headed to the ...,non_mental_health,Cheap targets for plinking? I'm headed to the ...


In [None]:
# Natural Language Processing with Python and spaCy: A Practical Introduction by Yuli Vasiliev
spacy_model = spacy.load('en_core_web_sm')

def text_statistics(df, text_column):
    total_words = 0
    total_sentences = 0
    unique_words = set()
    total_words_without_stopwords = 0

    for text in df[text_column]:
        doc = spacy_model(str(text))

        total_sentences += len(list(doc.sents))
        
        # Tokenize words and count total words, unique words, and words without stopwords
        for token in doc:
            if token.is_alpha:  # Only consider alphabetic tokens (words)
                total_words += 1
                unique_words.add(token.text.lower())
                
                if not token.is_stop:  # Skip stop words
                    total_words_without_stopwords += 1

    stats = {
        "total_words": total_words,
        "unique_words_count": len(unique_words),
        "total_sentences": total_sentences,
        "total_words_without_stopwords": total_words_without_stopwords
    }
    return stats

In [None]:
stats = text_statistics(mental_health_df, 'cleaned_post')

print(f"Total Words: {stats['total_words']}")
print(f"Unique Words: {stats['unique_words_count']}")
print(f"Total Sentences: {stats['total_sentences']}")
print(f"Total Words without Stopwords: {stats['total_words_without_stopwords']}")

# Total Words: 198807402
# Unique Words: 239494
# Total Sentences: 12742645
# Total Words without Stopwords: 77943241

#595m 58.8s

Total Words: 198807402
Unique Words: 239494
Total Sentences: 12742645
Total Words without Stopwords: 77943241
