## Code for Constructing the Dataset for our CS 224U Project

### PRAW
1. Generated a preliminary list of users who have posted on r/Anxiety using ParseHub
2. Used the Reddit API through PRAW (we initially tried BeautifulSoup, but the Reddit API is easier) to fetch posts by those users
3. Future work: optimise the Python processing


### Three categories
1. Mental Illness Related Subreddits - ('men')
2. Anxiety Related Subreddits (subset of first group) - ('anx')
3. Unrelated Subreddits (mutually exclusive with the first group) - ('unr')

In [206]:
import requests
import csv
import time
from bs4 import BeautifulSoup
from collections import Counter
import praw
import pandas as pd
import datetime as dt #only if you want to analyze the date created feature
import json 
from types import SimpleNamespace
import sys

In [185]:
class SetEncoder(json.JSONEncoder):
    '''JSON encoder that writes Python sets as JSON arrays.'''

    def default(self, obj):
        '''Return a list for a set; defer to the base encoder for anything else.'''
        if not isinstance(obj, set):
            # The base implementation raises TypeError for unsupported types.
            return super().default(obj)
        return list(obj)

In [207]:
import os  # imported here so this cell is self-contained

# Authenticated Reddit client used by every fetch in this notebook.
# Credentials come from the environment so that secrets are never stored in
# the notebook; a missing variable fails immediately with a KeyError.
# NOTE(review): the credentials previously committed in this cell should be
# considered leaked and rotated.
reddit = praw.Reddit(client_id=os.environ['REDDIT_CLIENT_ID'],
                     client_secret=os.environ['REDDIT_CLIENT_SECRET'],
                     user_agent='Script',
                     username=os.environ['REDDIT_USERNAME'],
                     password=os.environ['REDDIT_PASSWORD'])

In [208]:
# Preliminary list of users gathered through ParseHub. The context manager
# closes the file even if parsing fails.
with open('dataset/dont-change.json') as f:
    data = json.load(f)['selection2']

# Strip the "u/" prefix from every scraped name. The list can still contain
# duplicates at this point; it is de-duplicated before being written to disk.
unique = [each['name'].replace("u/", "") for each in data]

In [188]:
def add_to_unique_users(sub):
    """Append the distinct authors of a subreddit's top posts to ``unique``.

    Requests ``top(limit=1000)`` for the subreddit named ``sub`` and skips
    posts whose ``author`` is None. Each author name found is added once per
    call to the module-level ``unique`` list.
    """
    top_posts = reddit.subreddit(sub).top(limit=1000)

    authors = set()
    for post in top_posts:
        if post.author is None:
            continue
        authors.add(post.author.name)

    unique.extend(authors)

# Grow the user pool with the top-post authors of these subreddits.
for sub in [
    'Anxietyhelp',
    'PanicAttack',
    'PanicParty',
    'panicdisorder',
    'AnxietyDepression',
    'socialanxiety',
    'HealthAnxiety',
]:
    add_to_unique_users(sub)

In [192]:
# Collapse duplicate user names and report how many distinct users remain.
unique = list(set(unique))
print(len(unique))

# Write the unique users to a file so later cells can reload them.
with open('dataset/unique-users-dont-change.json', 'w') as fp:
    json.dump(unique, fp)

4922


### Load Users

In [211]:
# LOAD DATA

# Reload the de-duplicated user names written to disk by the earlier cell.
# The context manager closes the file even if parsing fails.
with open('dataset/unique-users-dont-change.json') as f:
    unique = json.load(f)

# Report the size of the list that was just loaded (the previous version
# printed len(data), a variable left over from a different cell).
print(len(unique))

4922


In [197]:
# CATEGORY SPECIFIC VARs

# One Counter (posts per subreddit) and one list (collected post dicts) per category.
counter_men, counter_unr, counter_anx = Counter(), Counter(), Counter()
posts_men, posts_anx, posts_unr = [], [], []

# Bundle each category's tag, counter, result list and output file name so
# they can be reached as categories.<tag>.<field>.
categories = SimpleNamespace(
    unr=SimpleNamespace(
        type='unr',
        counter=counter_unr,
        result_list=posts_unr,
        file_name='posts-unr.json',
    ),
    men=SimpleNamespace(
        type='men',
        counter=counter_men,
        result_list=posts_men,
        file_name='posts-men.json',
    ),
    anx=SimpleNamespace(
        type='anx',
        counter=counter_anx,
        result_list=posts_anx,
        file_name='posts-anx.json',
    ),
)

In [198]:
def get_dict_from_submission(submission,  user):
    """Flatten a submission into a plain dict of the fields we keep.

    Args:
        submission: object exposing ``subreddit.display_name``, ``title``,
            ``selftext``, ``score``, ``url``, ``num_comments`` and ``created``.
        user: value stored under the ``'user'`` key.

    Returns:
        dict with keys ``sub``, ``title``, ``body``, ``num_u`` (score),
        ``url``, ``num_c`` (comment count), ``time`` and ``user``.

    Raises:
        Exception: whatever reading the submission's attributes raised. The
            notice is printed first and the original error is re-raised, so
            callers see the real cause rather than an UnboundLocalError from
            returning a variable that was never assigned.
    """
    try:
        return {
            'sub': submission.subreddit.display_name,
            'title': submission.title,
            'body': submission.selftext,
            'num_u': submission.score,
            'url': submission.url,
            'num_c': submission.num_comments,
            'time': submission.created,
            'user': user,
        }
    except Exception:
        print("Error has occured here")
        raise

def init_vars():
    """Return fresh per-user state: an empty result list and two zeroed counters."""
    user_result = []
    count_related = 0
    count_unrelated = 0
    return user_result, count_related, count_unrelated

def print_progress(user_count):
    """Print a progress line whenever ``user_count`` is a multiple of 150."""
    if user_count % 150 != 0:
        return
    print("Handling user: ", user_count)

def process_user(user):
    """Report progress for the current user and advance the global ``user_count``.

    ``user`` is accepted but not used by this function.
    """
    global user_count

    current = user_count
    print_progress(current)  # prints only on every 150th user
    user_count = current + 1
    
def dump_data(file_name, data):
    """Write ``data`` to ``file_name`` as JSON, overwriting any existing file.

    Args:
        file_name: path of the file to write.
        data: any JSON-serialisable value. Sets (at any nesting level) are
            written as JSON arrays through ``SetEncoder``.
    """
    # Encode exactly once. Pre-encoding a set with json.dumps and then passing
    # the resulting string to json.dump would write a quoted, escaped JSON
    # string instead of an array.
    with open(file_name, 'w') as fp:
        json.dump(data, fp, cls=SetEncoder)

def get_post_type(sub):
    """Return the category tag for the subreddit name ``sub``.

    SUBS_ANX is checked before SUBS_MEN, so a subreddit present in both lists
    is reported with the anxiety tag. Anything in neither list gets the
    unrelated tag.
    """
    if sub in SUBS_ANX:
        return categories.anx.type
    if sub in SUBS_MEN:
        return categories.men.type
    return categories.unr.type

def process_for_category(post, sub):
    """Record ``post`` under the category that the subreddit ``sub`` belongs to.

    Increments that category's per-subreddit counter and appends the post to
    its result list. SUBS_ANX is checked before SUBS_MEN; a subreddit in
    neither list counts as unrelated.
    """
    if sub in SUBS_ANX:
        bucket = categories.anx
    elif sub in SUBS_MEN:
        bucket = categories.men
    else:
        bucket = categories.unr

    bucket.counter[sub] += 1
    bucket.result_list.append(post)


In [199]:
# STATICS

TOTAL_USERS = 4500  # cap on users to process (only referenced by a commented-out slice)
LIM = 50            # submissions requested per user

# Mental-illness related subreddits (includes every entry of SUBS_ANX).
SUBS_MEN = [
    'selfharm', 'bipolar', 'CPTSD', 'BPD', 'MentalHealthSupport',
    'antidepressants', 'medical_advice', 'raisedbynarcissists',
    'GeneticCounseling', 'ADHDAccountability', 'Dissociation',
    'Anxiety', 'AnxietyDepression', 'socialanxiety', 'panicdisorder',
    'Anxietyhelp', 'PanicAttack', 'PanicParty', 'HealthAnxiety',
    'depression', 'SuicideWatch', 'OCD', 'TrueOffMyChest', 'mentalhealth',
    'offmychest', 'emotionalsupport', 'therapy', 'ADHD', 'AskDocs',
]

# Anxiety related subreddits.
SUBS_ANX = [
    'Anxietyhelp', 'PanicAttack', 'Anxiety', 'AnxietyDepression',
    'socialanxiety', 'HealthAnxiety', 'panicdisorder', 'PanicParty',
]


# RESULTS

results = {}             # user name -> list of that user's post dicts
results_user_names = []  # names of the users that made it into the results


# COUNTERS

result_user_count = 0  # total valid users collected by the end
result_post_count = 0  # total valid posts collected by the end


In [200]:
# Collect posts for every candidate user and keep only users who post both in
# mental-health related subreddits and in unrelated ones.

# Running count of users visited so far; incremented by process_user().
user_count = 0 

# get the submissions
# To cap the run, slice the list instead: list(set(unique))[0:TOTAL_USERS]
users_to_process = list(set(unique))

# One (listing, user) pair per user, each requested with submissions.new(limit=LIM).
users_submissions = [(reddit.redditor(user).submissions.new(limit=LIM), user) for user in users_to_process]
user_submissions_2 = [] # NOTE(review): never read in this cell; looks like a leftover of the workaround below

# main processing happens now:

for submissions, user in users_submissions:
    process_user(user) # print progress and increment user_count
    
    user_result, count_related, count_unrelated  = init_vars() # fresh per-user list and counters
    
    try: # reading the listing sometimes fails with HTTP errors (see NotFound/Forbidden in the output)
        submissions = list(submissions)
        submissions_2 = [] # copy kept for the second pass (original note: Praw iterator related bug)
        
        for submission in submissions: # quick pass to see if this user's posts are relevant
            submissions_2.append(submission)
            # Skip counting this post when:
            #   - its title or its body is empty, or
            #   - both counters are already above 5, or
            #   - its body is "[removed]".
            # The post has already been copied into submissions_2 above.

            if not (submission.title and submission.selftext) \
            or (count_related > 5 and count_unrelated > 5) \
            or submission.selftext == "[removed]": 
                continue 

            # 'anx' and 'men' both count as related; everything else is unrelated
            if get_post_type(submission.subreddit.display_name) in ['anx', 'men']: count_related += 1
            else: count_unrelated += 1

        is_relevant = False
        # do another, longer pass if the user has more than 2 related AND more than 2 unrelated posts
        if(count_related > 2 and count_unrelated > 2): 
            is_relevant = True
            try:
                # NOTE(review): this iterates every fetched submission, including
                # the ones the first pass skipped (empty or "[removed]" bodies).
                for submission in submissions_2:
                    # file the post under its category (counter + result list)
                    post = get_dict_from_submission(submission, user)
                    process_for_category(post, submission.subreddit.display_name)
                    # NOTE(review): second call builds a separate dict with the same fields as `post`
                    sub_dict = get_dict_from_submission(submission, user)
                    # add the post to user's results
                    user_result.append(sub_dict)
            except Exception:
                # posts handled before the failure stay recorded; the user is still added below
                print("Exception has occurred: ", sys.exc_info())
            
            # NOTE(review): always true here, since is_relevant was set a few lines above
            if(is_relevant):
                
                results_user_names.append(user) # add user to the list of final users used
                results[user] = (user_result) # add result to final result
                result_user_count += 1 # increment the user count
                result_post_count += len(user_result) # count every post kept for this user
    except Exception:
        # report only the exception class and move on to the next user
        print("Exception: ", sys.exc_info()[0])
        continue
    

Handling user:  0
Exception:  <class 'prawcore.exceptions.Forbidden'>
Exception:  <class 'prawcore.exceptions.NotFound'>
Exception:  <class 'prawcore.exceptions.NotFound'>
Handling user:  150
Exception:  <class 'prawcore.exceptions.Forbidden'>
Handling user:  300
Exception:  <class 'prawcore.exceptions.NotFound'>
Handling user:  450
Exception:  <class 'prawcore.exceptions.Forbidden'>
Exception:  <class 'prawcore.exceptions.Forbidden'>
Handling user:  600
Exception:  <class 'prawcore.exceptions.NotFound'>
Exception:  <class 'prawcore.exceptions.Forbidden'>
Handling user:  750
Handling user:  900
Handling user:  1050
Exception:  <class 'prawcore.exceptions.Forbidden'>
Exception:  <class 'prawcore.exceptions.Forbidden'>
Exception:  <class 'prawcore.exceptions.Forbidden'>
Handling user:  1200
Exception:  <class 'prawcore.exceptions.Forbidden'>
Exception:  <class 'prawcore.exceptions.Forbidden'>
Handling user:  1350
Exception:  <class 'prawcore.exceptions.NotFound'>
Handling user:  1500
Han

In [201]:
# Summary of the collection run: totals first, then per-category breakdowns.
print("Result user count: ", result_user_count)

print()
print("Result post count: ", result_post_count)

# Ten most frequent subreddits in each category.
print()
print("Counter of mental posts: ", categories.men.counter.most_common(10))
print("Counter of anx posts: ", categories.anx.counter.most_common(10))
print("Counter of unrelated posts: ", categories.unr.counter.most_common(10))

# Number of posts collected per category.
print()
print("Result of mental posts", len(categories.men.result_list))
print("Result of anx posts", len(categories.anx.result_list))
print("Result of unrelated posts", len(categories.unr.result_list))

Result user count:  1440

Result post count:  51996

Counter of mental posts:  [('depression', 1237), ('AskDocs', 833), ('OCD', 349), ('mentalhealth', 344), ('SuicideWatch', 332), ('offmychest', 324), ('ADHD', 235), ('raisedbynarcissists', 233), ('CPTSD', 212), ('bipolar', 212)]
Counter of anx posts:  [('Anxiety', 2813), ('HealthAnxiety', 2444), ('AnxietyDepression', 1063), ('socialanxiety', 679), ('PanicAttack', 624), ('Anxietyhelp', 545), ('panicdisorder', 269), ('PanicParty', 48)]
Counter of unrelated posts:  [('AskReddit', 1196), ('NoStupidQuestions', 278), ('Advice', 257), ('NoFap', 225), ('relationship_advice', 222), ('aww', 220), ('unpopularopinion', 190), ('Showerthoughts', 184), ('cats', 161), ('trees', 152)]

Result of mental posts 5025
Result of anx posts 8485
Result of unrelated posts 38486


In [205]:
# Per-user results: user name -> list of post dicts.
dump_data('dataset/reddit-data.json', results)

# Posts grouped by category.
dump_data('dataset/reddit-posts-mental.json', categories.men.result_list)
dump_data('dataset/reddit-posts-anxiety.json', categories.anx.result_list)
dump_data('dataset/reddit-posts-unrelated.json', categories.unr.result_list)

# Per-subreddit post counts for each category, most common first.
dump_data('dataset/reddit-counter-unrelated.txt', categories.unr.counter.most_common())
dump_data('dataset/reddit-counter-mental.txt', categories.men.counter.most_common())
dump_data('dataset/reddit-counter-anxiety.txt', categories.anx.counter.most_common())

# One-line summary of the per-category totals. The call previously lacked its
# closing parenthesis, which made the whole cell a SyntaxError. The text
# (including the runs of spaces between the parts) is kept as it was written.
dump_data('dataset/stats.txt',
          "Result of mental posts: {} "
          "        Result of anx posts: {} "
          "        Result of unrelated posts: {}"
          .format(len(categories.men.result_list),
                  len(categories.anx.result_list),
                  len(categories.unr.result_list)))

### Getting non-anxiety control group posts

In [245]:
# Running count of control-group posts visited so far.
unr_post_count = 0


def print_progress_posts(post_count):
    """Print a progress line whenever ``post_count`` is a multiple of 200."""
    if post_count % 200 != 0:
        return
    print("Handling post: ", post_count)

def process_post():
    """Report progress for the current post and advance the global ``unr_post_count``."""
    global unr_post_count

    current = unr_post_count
    print_progress_posts(current)  # prints only on every 200th post
    unr_post_count = current + 1

    
def get_dict_from_submission(submission):
    """Flatten a submission into a plain dict, taking the user from its author.

    This one-argument version rebinds the name of the earlier two-argument
    helper once this cell has run.

    Args:
        submission: object exposing ``subreddit.display_name``, ``title``,
            ``selftext``, ``score``, ``url``, ``num_comments``, ``created``
            and ``author``.

    Returns:
        dict with keys ``sub``, ``title``, ``body``, ``num_u`` (score),
        ``url``, ``num_c`` (comment count), ``time`` and ``user`` (author name).

    Raises:
        ValueError: if ``submission.author`` is None. The previous version
            swallowed every error and then failed with an UnboundLocalError
            on ``return``; callers that skip a post on any Exception behave
            as before.
    """
    author = submission.author
    if author is None:
        raise ValueError("submission has no author")

    return {
        'sub': submission.subreddit.display_name,
        'title': submission.title,
        'body': submission.selftext,
        'num_u': submission.score,
        'url': submission.url,
        'num_c': submission.num_comments,
        'time': submission.created,
        'user': author.name,
    }

control_group_counter = Counter()  # kept control-group posts per subreddit
countrol_group_posts = []          # kept control-group post dicts (name spelled as used elsewhere in the file)


def fetch_non_anxious_users_posts(sub):
    """Collect top posts of subreddit ``sub`` written by users outside ``unique``.

    Requests ``top(limit=1000)`` for the subreddit. Each post whose author is
    not in the module-level ``unique`` list is appended to
    ``countrol_group_posts`` and counted in ``control_group_counter``. Posts
    by users found in ``unique`` are reported and skipped.

    Args:
        sub: subreddit name.
    """
    submissions = reddit.subreddit(sub).top(limit=1000)

    # Build the lookup once per call: set membership is O(1), whereas testing
    # against the `unique` list scans the whole list for every post.
    anxious_users = set(unique)

    try: # because 404 HTTP errors in reading submissions sometimes
        for submission in submissions:
            try:
                process_post()
                post = get_dict_from_submission(submission)
                if post['user'] in anxious_users:
                    print("Found in anxiety", post['user'])
                    continue
                countrol_group_posts.append(post)
                control_group_counter[post['sub']] += 1
            except Exception:
                # Best effort: a post that cannot be converted is skipped.
                continue

    except Exception:
        print("Exception2: ", sys.exc_info()[0])



    



# [subreddit name, post count] pairs for unrelated subreddits, highest count first.
unrelated_subs = [
    ["AskReddit", 1196],
    ["NoStupidQuestions", 278],
    ["Advice", 257],
    ["NoFap", 225],
    ["relationship_advice", 222],
    ["aww", 220],
    ["unpopularopinion", 190],
    ["Showerthoughts", 184],
    ["cats", 161],
    ["trees", 152],
    ["ibs", 141],
    ["relationships", 140],
    ["BreakUps", 137],
    ["Drugs", 133],
    ["teenagers", 132],
    ["memes", 130],
    ["Christianity", 125],
    ["GERD", 121],
    ["stopdrinking", 119],
    ["rant", 116],
    ["lonely", 111],
    ["AnimalCrossing", 111],
    ["Dentistry", 107],
    ["SkincareAddiction", 104],
    ["tipofmytongue", 103],
    ["AmItheAsshole", 102],
    ["Vent", 98],
    ["dogs", 96],
    ["TooAfraidToAsk", 94],
    ["legaladvice", 93],
    ["CasualConversation", 92],
    ["sex", 89],
    ["dating_advice", 81],
    ["birthcontrol", 78],
    ["funny", 78],
    ["socialskills", 78],
    ["MadeOfStyrofoam", 78],
    ["techsupport", 77],
    ["personalfinance", 76],
    ["Dreams", 76],
    ["DecidingToBeBetter", 72],
    ["college", 71],
    ["pics", 70],
    ["buildapc", 70],
]

# Despite its name, this holds the first 20 entries of the list.
unrelated_subs_top_30 = unrelated_subs[:20]

# Fetch control-group posts from each selected subreddit (sub[0] is the name).
for sub in unrelated_subs_top_30:
    fetch_non_anxious_users_posts(sub[0])



Handling post:  0
Handling post:  200
Handling post:  400
Handling post:  600
Found in anxiety formalde_heidi
Handling post:  800
Handling post:  1000
Handling post:  1200
Handling post:  1400
Handling post:  1600
Handling post:  1800
Found in anxiety 20-CharactersAllowed
Found in anxiety FireFromTonsOfLiars
Found in anxiety treedy99
Handling post:  2000
Found in anxiety cubingtothegame
Found in anxiety Alex1965
Handling post:  2200
Handling post:  2400
Handling post:  2600
Handling post:  2800
Handling post:  3000
Handling post:  3200
Handling post:  3400
Found in anxiety _SupaHotFirenze
Handling post:  3600
Found in anxiety Spaceboy1598
Handling post:  3800
Found in anxiety qsauce6
Handling post:  4000
Handling post:  4200
Handling post:  4400
Handling post:  4600
Handling post:  4800
Handling post:  5000
Found in anxiety mofifa16
Handling post:  5200
Found in anxiety P00ld3ad
Handling post:  5400
Handling post:  5600
Handling post:  5800
Found in anxiety thequinquangle
Found in anxi

In [246]:
# Show how many control-group posts were kept for each subreddit.
print(control_group_counter)

Counter({'aww': 968, 'cats': 967, 'memes': 955, 'trees': 952, 'stopdrinking': 949, 'AskReddit': 946, 'BreakUps': 944, 'ibs': 941, 'teenagers': 935, 'Showerthoughts': 916, 'Drugs': 915, 'NoStupidQuestions': 913, 'Christianity': 913, 'rant': 911, 'Advice': 903, 'NoFap': 893, 'GERD': 891, 'unpopularopinion': 857, 'relationships': 833, 'relationship_advice': 816})


In [247]:
# Persist the control-group posts and their per-subreddit counts (most common first).
dump_data('dataset/reddit-control.json', countrol_group_posts)
dump_data('dataset/reddit-control-counter.txt', control_group_counter.most_common())