In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import time
import requests
import json

In [2]:
# Sample Pushshift query parameters, used only to preview the request URL format.
kind, subreddit, size = 'submission', 'OSHA', 500
# Bare last expression: the notebook renders the assembled URL as this cell's output.
f'https://api.pushshift.io/reddit/search/{kind}/?subreddit={subreddit}&size={size}'

'https://api.pushshift.io/reddit/search/submission/?subreddit=OSHA&size=500'

In [3]:
# Submission columns of interest. The comment-count column is 'num_comments'
# (the same name used by the default `subfield` list of query_pushshift).
field = ['selftext', 'author', 'title', 'created_utc', 'num_comments', 'is_self', 'subreddit']

In [4]:
# Brian Collins' webscraping function 
def query_pushshift(subreddit,
                    kind='submission',
                    skip=30,
                    times=6,
                    subfield = ['selftext', 'author', 'title', 'created_utc', 'num_comments', 'is_self', 'subreddit'],
                    comfield = ['body', 'created_utc'],
                    size=500):
    """Download items from one subreddit through the Pushshift search API.

    One GET request is sent for every ``x`` in ``range(1, times)`` (so
    ``times - 1`` requests in total), each asking for ``after={skip * x}d``.
    Every JSON ``data`` payload is turned into a DataFrame and all of them are
    concatenated. Each requested URL and the final frame's shape are printed.

    Parameters
    ----------
    subreddit : str
        Subreddit name placed in the query string.
    kind : str
        Endpoint segment of the URL: ``'submission'`` or ``'comment'``.
    skip : int
        Number of days per step; request ``x`` uses ``after={skip * x}d``.
    times : int
        Exclusive upper bound of the step multiplier. The default ``6`` yields
        five requests. Must be at least 2, otherwise nothing is fetched and
        ``pd.concat`` raises ``ValueError``.
    subfield : list of str
        Columns kept when ``kind == 'submission'``. Must contain ``'is_self'``
        and ``'created_utc'`` because both are read afterwards.
    comfield : list of str
        Column names intended for comment queries. NOTE: this argument is not
        applied; for any ``kind`` other than ``'submission'`` every column
        returned by the API is kept. It is retained so existing calls keep
        working.
    size : int
        Value of the ``size`` query parameter. It used to be read from a
        notebook-level global; the default matches the value that global had.

    Returns
    -------
    pandas.DataFrame
        The combined rows plus a ``timestamp`` column holding the UTC calendar
        date of ``created_utc``. For submissions the frame is restricted to
        ``subfield``, de-duplicated, and filtered to rows where ``is_self`` is
        True. The index is not reset, so labels from the individual pages are
        preserved.

    Raises
    ------
    AssertionError
        If any response has a status code other than 200.
    """
    stem = f'https://api.pushshift.io/reddit/search/{kind}/?subreddit={subreddit}&size={size}' # base url

    pages = []  # one DataFrame per request

    for x in range(1, times):
        URL = f'{stem}&after={skip * x}d'
        print(URL)
        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.get(URL, timeout=30)
        assert response.status_code == 200, f'Pushshift returned HTTP {response.status_code} for {URL}'
        pages.append(pd.DataFrame.from_dict(response.json()['data']))
        time.sleep(2)  # pause between requests so the API is not overwhelmed

    full = pd.concat(pages, sort=False)

    if kind == 'submission':
        # Identical rows can come back from more than one request, hence the de-duplication.
        full = full[list(subfield)].drop_duplicates()
        full = full.loc[full['is_self'] == True]

    # Convert in UTC explicitly: a local-time conversion would make the date
    # depend on the timezone of the machine running the notebook.
    # .assign returns a new frame, so no column is written onto a filtered slice.
    full = full.assign(
        timestamp=pd.to_datetime(full['created_utc'], unit='s', utc=True).dt.date
    )

    print(full.shape)

    return full

In [5]:
# Scrape r/twilightzone with the function's default arguments and keep the returned DataFrame.
twilight_zone = query_pushshift(subreddit='twilightzone')

https://api.pushshift.io/reddit/search/submission/?subreddit=twilightzone&size=500&after=30d
https://api.pushshift.io/reddit/search/submission/?subreddit=twilightzone&size=500&after=60d
https://api.pushshift.io/reddit/search/submission/?subreddit=twilightzone&size=500&after=90d
https://api.pushshift.io/reddit/search/submission/?subreddit=twilightzone&size=500&after=120d
https://api.pushshift.io/reddit/search/submission/?subreddit=twilightzone&size=500&after=150d
(422, 8)


In [6]:
# Scrape r/comicbooks with the function's default arguments and keep the returned DataFrame.
comicbooks = query_pushshift(subreddit='comicbooks')

https://api.pushshift.io/reddit/search/submission/?subreddit=comicbooks&size=500&after=30d
https://api.pushshift.io/reddit/search/submission/?subreddit=comicbooks&size=500&after=60d
https://api.pushshift.io/reddit/search/submission/?subreddit=comicbooks&size=500&after=90d
https://api.pushshift.io/reddit/search/submission/?subreddit=comicbooks&size=500&after=120d
https://api.pushshift.io/reddit/search/submission/?subreddit=comicbooks&size=500&after=150d
(888, 8)


In [7]:
# Write the scraped r/twilightzone frame to disk as CSV text (the target path has no file extension).
twilight_zone.to_csv(path_or_buf='./twilight_zone_raw')

In [8]:
# Write the scraped r/comicbooks frame to disk as CSV text (the target path has no file extension).
comicbooks.to_csv(path_or_buf='./comicbooks_raw')