# Using Reddit API via Pushshift (psaw)

In [1]:
from psaw import PushshiftAPI
import pandas as pd
import datetime as dt
from os.path import join


api = PushshiftAPI()

In [2]:
src = "../data"

## Get data from API

In [3]:
def get_pushshift_data(query, subreddit, api, after, limit = 500):

    gen = api.search_comments(q = query, subreddit = subreddit, after = after)

    max_response_cache = limit
    cache = []

    for c in gen:
        cache.append(c)
        
        if limit is not None: 
            # Omit this test to actually return all results. Wouldn't recommend it though: could take a while, but you do you.
            if len(cache) >= max_response_cache:
                break

    # If you really want to: pick up where we left off to get the rest of the results.
    if False:
        for c in gen:
            cache.append(c)
            
    df = pd.DataFrame([thing.d_ for thing in cache])
    
    return(df)
            


In [4]:
diabetes_terms =  ["diabetes",
                   "Diabetes",
                   '"I was just diagnosed with diabetes"',
                   '"today I was diagnosed with diabetes"',
                   '"I just learned I have diabetes"',
                   '"learned I got diabetes"',
                   '"heard I got diabetes"',
                   '"learned I have diabetes"',
                   '"heard I have diabetes"',
                   '"I was recently diagnosed with diabetes"',
                   '"I recently learned I have diabetes"',
                   '"I recently learned that I have diabetes"',
                   '"new diabetic"',
                   '"New diabetic"',
                   '"NEW DIABETIC"']

diabetes_query = '|'.join(diabetes_terms)

print(diabetes_query)

diabetes|Diabetes|"I was just diagnosed with diabetes"|"today I was diagnosed with diabetes"|"I just learned I have diabetes"|"learned I got diabetes"|"heard I got diabetes"|"learned I have diabetes"|"heard I have diabetes"|"I was recently diagnosed with diabetes"|"I recently learned I have diabetes"|"I recently learned that I have diabetes"|"new diabetic"|"New diabetic"|"NEW DIABETIC"


In [5]:
start_epoch=int(dt.datetime(2020, 1, 1).timestamp()) # Set filtering date to 1st of January 2020

df = get_pushshift_data(query = diabetes_query,
                        subreddit = "diabetes",
                        api = api,
                        after = start_epoch,
                        limit = None)

print("Found", len(df), "posts.")
print(df.head())

Found 19103 posts.
  all_awardings associated_award          author  \
0            []             None    crappysurfer   
1            []             None        adam_mmm   
2            []             None  NebrasketballN   
3            []             None    badnewsblair   
4            []             None       Lausannea   

  author_flair_background_color author_flair_css_class  \
0                                                   T1   
1                          None                   None   
2                          None                   None   
3                                                   T2   
4                       #0079d3           user-type-15   

                               author_flair_richtext  \
0                    [{'e': 'text', 't': 'T1 1996'}]   
1                                                 []   
2                                                 []   
3  [{'e': 'text', 't': 'T2, 2015, Metformin/Lower...   
4  [{'e': 'text', 't': 'LADA/1.5 dx 201

## Identify and remove anniversaries

In [6]:
# many users post about a diagnosis anniversary, remove these tweets as well
df['recent'] = df['body'].apply(lambda x: ('years ago' not in x) and \
                                              ('yrs ago' not in x) and \
                                              ('year ago' not in x) and \
                                              ('YEARS AGO' not in x) and \
                                              ('years today' not in x) and \
                                              ('flashback') not in x)


past = df[df['recent'] == False].copy()
recent = df[df['recent'] == True].copy()

print('{} comments from past diabetes diagnoses'.format(len(past)))
print('{} comments from recent diabetes diagnoses'.format(len(recent)))

566 comments from past diabetes diagnoses
18537 comments from recent diabetes diagnoses


## Export data

In [7]:
recent.to_csv(join(src, 'reddit_diagnosed_diabetes_clean.csv'), index=False)

user_list = list(recent['author'].unique())
print('Saving {} usernames.'.format(len(user_list)))

with open(join(src, "reddit_diagnosed_user_IDs.txt"), "w") as outfile:
    for username in user_list:
        outfile.write("%s\n" % username)

Saving 5408 usernames.


In [9]:
diagnosis_dates = df[['author', 'created_utc', 'id']]\
    .sort_values(by=['author', 'created_utc'])\
    .reset_index(drop=True)\
    .drop_duplicates(subset=['author'])
    
diagnosis_dates['created_dt'] = diagnosis_dates['created_utc'].apply(lambda x: dt.datetime.fromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S')
)

diagnosis_dates.to_csv(join(src, 'reddit_user_diagnosis_dates.csv'), index=False)