In [5]:
#!pip install -q snscrape
#!pip install -q pandas

import snscrape.modules.twitter as sntwitter
import pandas as pd

import time

In [6]:
#parameters to query twitter
max_limit_tweets = 21000 #1mil tweets will take about 22 hours
language = 'en' #only tweets with english language

In [7]:
#function to scrape based on passed in query -> once finished, save all to csv
def scrape_tweets(query):
    start_time = time.time()

    tweets = []
    count = 0

    for tweet in sntwitter.TwitterSearchScraper(query).get_items():
        tweets.append(tweet)
        count += 1
        
        if count >= max_limit_tweets:
            break
    
    return tweets

In [8]:
#scrape data and add labels to it based on the hashtag

#anxiety
anxiety_hashtags = ['#anxietyhelp', '#anxietyattack', '#anxietydisorder', '#anxietysupport', '#fuckanxiety', '#anxietyawareness']

#depression
depression_hashtags = ['#depressionhelp', '#fuckdepression', '#depresion', '#depressionrecovery']

#bipolar
bipolar_hashtags = ['#bipolarhelp', '#bipolarawareness', '#bipolardisorder', '#bipolarrecovery', '#bippolar', '#bipolar1', 'bipolar2', '#bpd']

#ptsd
ptsd_hashtags = ['#ptsdhelp', '#ptsdawareness', '#ptsdrecovery', '#ptsdsupport', '#ptsdtherapy', '#cptsd', '#fuckptsd', 'ptsdsucks']

#eating disorders
eating_disorders_hashtags = ['#eatingdisorder', '#eatingdisorderhelp', '#anorexiahelp', '#bulimiahelp', '#ednos', '#ednoshelp', '#edrecovery', '#edawareness']

#neutral data
neutral_hashtags = ['#music', '#food', '#basketball', '#science', '#nature']

labels = {'anxiety': 0, 'depression': 1, 'bipolar': 2, 'ptsd': 3, 'eatingdisorders': 4, 'neutral': 5}
query_hashtags = {'anxiety': anxiety_hashtags, 'depression': depression_hashtags, 'bipolar': bipolar_hashtags, 'ptsd': ptsd_hashtags, 'eatingdisorders': eating_disorders_hashtags, 'neutral': neutral_hashtags}

queries = []
for label,hashtags in query_hashtags.items():
    curr_query = ''
    for hashtag in hashtags:
        curr_query = curr_query + hashtag + ' OR '
    curr_query = curr_query[:-4] #remove last OR
    queries.append(curr_query)

for idx, query in enumerate(queries):
    #filter replies, media and URLs
    queries[idx] = ('({}) lang:{} -filter:links -filter:replies').format(query, language)

#adjust each query to prevent tweets with the following hashtags: '#mentalhealth', '#anxiety', '#depression', '#bipolar', '#ptsd', '#anorexia', '#bulimia'
#to prevent duplicates from expert base + to ensure higher quality tweets (by ignoring most obvious hashtags)
for idx, query in enumerate(queries):
    queries[idx] = query + ' -#mentalhealth -#anxiety -#depression -#bipolar -#ptsd -#anorexia -#bulimia'

In [9]:
#scrape tweets
anxiety_tweets = scrape_tweets(queries[0])
anxiety_df = pd.DataFrame(anxiety_tweets)
anxiety_df['label'] = labels['anxiety']
anxiety_df.to_csv('anxiety_tweets.csv', index=False)
print('Done: anxiety', len(anxiety_df))

depression_tweets = scrape_tweets(queries[1])
depression_df = pd.DataFrame(depression_tweets)
depression_df['label'] = labels['depression']
depression_df.to_csv('depression_tweets.csv', index=False)
print('Done: depression', len(depression_df))

bipolar_tweets = scrape_tweets(queries[2])
bipolar_df = pd.DataFrame(bipolar_tweets)
bipolar_df['label'] = labels['bipolar']
bipolar_df.to_csv('bipolar_tweets.csv', index=False)
print('Done: bipolar', len(bipolar_df))

ptsd_tweets = scrape_tweets(queries[3])
ptsd_df = pd.DataFrame(ptsd_tweets)
ptsd_df['label'] = labels['ptsd']
ptsd_df.to_csv('ptsd_tweets.csv', index=False)
print('Done: ptsd', len(ptsd_df))

eatingdisorders_tweets = scrape_tweets(queries[4])
eatingdisorders_df = pd.DataFrame(eatingdisorders_tweets)
eatingdisorders_df['label'] = labels['eatingdisorders']
eatingdisorders_df.to_csv('eatingdisorders_tweets.csv', index=False)
print('Done: eatingdisorders', len(eatingdisorders_df))

'''neutral_tweets = scrape_tweets(queries[5])
neutral_df = pd.DataFrame(neutral_tweets)
neutral_df['label'] = labels['neutral']
neutral_df.to_csv('neutral_tweets.csv', index=False)
print('Done: neutral', len(neutral_df))'''

Done: anxiety 21000
Done: depression 21000
Done: bipolar 21000
Done: ptsd 17745
Done: eatingdisorders 21000


"neutral_tweets = scrape_tweets(queries[5])\nneutral_df = pd.DataFrame(neutral_tweets)\nneutral_df['label'] = labels['neutral']\nneutral_df.to_csv('neutral_tweets.csv', index=False)\nprint('Done: neutral')"

In [13]:
#read in all csvs
anxiety_df = pd.read_csv('anxiety_tweets.csv')
depression_df = pd.read_csv('depression_tweets.csv')
#bipolar_df = pd.read_csv('bipolar_tweets.csv')
#ptsd_df = pd.read_csv('ptsd_tweets.csv')
#eatingdisorders_df = pd.read_csv('eatingdisorders_tweets.csv')
neutral_df = pd.read_csv('neutral_tweets.csv')

#drop duplicates
anxiety_df = anxiety_df.drop_duplicates()
depression_df = depression_df.drop_duplicates()
#bipolar_df = bipolar_df.drop_duplicates()
#ptsd_df = ptsd_df.drop_duplicates()
#eatingdisorders_df = eatingdisorders_df.drop_duplicates()
neutral_df = neutral_df.drop_duplicates()

#change neutral label to 3
neutral_df['label'] = 2

#combine all dataframes and save to csv
combined_df = pd.concat([anxiety_df, depression_df, neutral_df])
#print number of tweets per label
print(combined_df['label'].value_counts())
combined_df.to_csv('all_train_test_twitter.csv', index=False)

0    21000
1    21000
2    21000
Name: label, dtype: int64


General Pre-Processing Steps:
- Remove newlines
- Remove URLs
- Remove punctuation (for twitter only)
- Replace all instances of 3 or more letters with 2 letters
- Remove all instances of 3 or more repeated subsequent words (ex: "I love love love love you")
- Remove html tags
- Remove non-ascii characters
- Handling Bullet Points or similar
- Replace numbers with \<num\> token


Twitter Specific Pre-Processing Steps:
- Remove RT
- Remove @mentions and usernames for privacy
- Remove hashtags (remove hashtags (both symbol and respective word) for hashtags used in querying) (keep hashtags not used for querying (remove only the hashtag symbol but keep the word))

Reddit Specific Pre-Processing Steps:
- Only keep english text (twitter data was already filtered to english, reddit wasn't)
- Remove user mentions
- Remove "Removed Posts" Posts


Processing Steps we also can/need to do for Non-Bert Models:
- Remove stopwords (except for maybe "not" and "no")
- Handling Clitics
- Handling Ellipses
- Having a minimum word frequency and replacing words that occur less than that with \<unk\> token
- Replace emojis/emoticons with \<emoji\> token or remove them

Post-Processing for Bert Step 2 and other model
- Only use posts/tweets that pass a pre-trained sentiment classifier
- Remove tweets that do not have a minumum number of replies/likes/retweets/followers/etc
- Remove posts that do not have a minimum number of score (upvotes - downvotes) or number of comments