In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import time
import requests
import json

In [2]:
# Sample query settings: endpoint kind, subreddit name, and results requested per call.
kind, subreddit, size = 'submission', 'OSHA', 1000
# Preview of the Pushshift search URL these settings produce (displayed as the cell output).
f'https://api.pushshift.io/reddit/search/{kind}/?subreddit={subreddit}&size={size}'

'https://api.pushshift.io/reddit/search/submission/?subreddit=OSHA&size=1000'

In [3]:
# Submission fields of interest. 'num_comments' matches the column name used by
# query_pushshift's `subfield` default (the previous 'cum_comments' was a typo).
field = ['selftext', 'author', 'title', 'created_utc', 'num_comments', 'is_self', 'subreddit']

In [4]:
# Modified from Brian Collins' web-scraper given to the class
def query_pushshift(subreddit, # subreddit name
                    kind='submission', # can be either a submission or a comment
                    skip=8, # number of days in time period
                    times=100, # exclusive upper bound of the period counter (times - 1 requests are made)
                    subfield = ['selftext', 'author', 'title', 'created_utc', 'num_comments', 'is_self', 'subreddit'],
                    # fields under submission
                    comfield = ['body', 'created_utc'], # fields for comments
                    size=1000): # number of results requested per API call
    """Scrape a subreddit through the Pushshift search API into one DataFrame.

    One request is issued for every ``x`` in ``range(1, times)``, each with
    ``after={skip * x}d`` appended to the base URL, so ``times - 1`` requests
    are made in total. The per-request frames are concatenated, and a
    ``timestamp`` column holding the UTC calendar date of ``created_utc`` is
    appended.

    Parameters
    ----------
    subreddit : str
        Subreddit name inserted into the query string.
    kind : str
        Endpoint segment of the URL. When it equals ``'submission'`` the
        result is reduced to ``subfield``, de-duplicated, and filtered to rows
        whose ``is_self`` is True; any other value skips that post-processing.
    skip : int
        Number of days between successive ``after`` offsets.
    times : int
        Exclusive upper bound of the period counter.
    subfield : list of str
        Columns kept for submissions. The default list is never mutated.
    comfield : list of str
        Accepted for interface compatibility; not used by the function body.
    size : int
        Value of the ``size`` query parameter. Previously this was read from a
        module-level variable named ``size``; it is now an explicit argument
        so the function works on a fresh kernel.

    Returns
    -------
    pandas.DataFrame
        The combined (and, for submissions, filtered) rows plus ``timestamp``.

    Raises
    ------
    requests.HTTPError
        If any response has a status code other than 200.
    ValueError
        Raised by ``pd.concat`` when no request was made (``times <= 1``).
    """
    stem = f'https://api.pushshift.io/reddit/search/{kind}/?subreddit={subreddit}&size={size}' # base url

    my_list = [] # one dataframe per request

    for x in range(1, times): # x = 1 .. times - 1
        URL = f'{stem}&after={skip * x}d' # new url for each time period
        print(URL) # progress log
        response = requests.get(URL)
        # Explicit check instead of `assert`, which is removed when Python runs with -O.
        if response.status_code != 200:
            raise requests.HTTPError(
                f'Pushshift returned HTTP {response.status_code} for {URL}',
                response=response)
        mine = response.json()['data'] # list of records returned by the API
        my_list.append(pd.DataFrame.from_dict(mine))
        time.sleep(2) # pause between requests so as not to overwhelm the api and servers

    full = pd.concat(my_list, sort=False) # combines all the dataframes from each scrape into one df

    if kind == 'submission': # keep wanted columns, drop repeated rows, keep self posts only
        full = full[list(subfield)]
        full = full.drop_duplicates()
        full = full.loc[full['is_self'] == True]

    # created_utc is an epoch in seconds; convert in UTC so the resulting date
    # does not depend on the timezone of the machine running the notebook.
    timestamp = pd.to_datetime(full['created_utc'], unit='s').dt.date

    # assign() returns a new frame, so no column is written onto a filtered
    # slice (avoids SettingWithCopyWarning).
    full = full.assign(timestamp=timestamp)

    print(full.shape) # prints shape of the final dataframe

    return full

**Note:** To balance our classes, we scraped each subreddit with different `skip` and `times` settings. The classes could instead have been balanced after the fact using SMOTE or a similar resampling method, but we chose to keep the data as close to the original as possible, hence the difference in parameters.

In [5]:
# Scrape r/twilightzone in 8-day steps with times=100 (the loop runs range(1, times), i.e. 99 requests).
# skip/times are passed explicitly so this cell does not depend on which
# definition of query_pushshift (and therefore which defaults) is live in the kernel.

twilight_zone = query_pushshift('twilightzone', skip=8, times=100) # queries and saves scraped text data into dataframe

https://api.pushshift.io/reddit/search/submission/?subreddit=twilightzone&size=1000&after=8d
https://api.pushshift.io/reddit/search/submission/?subreddit=twilightzone&size=1000&after=16d
https://api.pushshift.io/reddit/search/submission/?subreddit=twilightzone&size=1000&after=24d
https://api.pushshift.io/reddit/search/submission/?subreddit=twilightzone&size=1000&after=32d
https://api.pushshift.io/reddit/search/submission/?subreddit=twilightzone&size=1000&after=40d
https://api.pushshift.io/reddit/search/submission/?subreddit=twilightzone&size=1000&after=48d
https://api.pushshift.io/reddit/search/submission/?subreddit=twilightzone&size=1000&after=56d
https://api.pushshift.io/reddit/search/submission/?subreddit=twilightzone&size=1000&after=64d
https://api.pushshift.io/reddit/search/submission/?subreddit=twilightzone&size=1000&after=72d
https://api.pushshift.io/reddit/search/submission/?subreddit=twilightzone&size=1000&after=80d
https://api.pushshift.io/reddit/search/submission/?subreddit=

https://api.pushshift.io/reddit/search/submission/?subreddit=twilightzone&size=1000&after=704d
https://api.pushshift.io/reddit/search/submission/?subreddit=twilightzone&size=1000&after=712d
https://api.pushshift.io/reddit/search/submission/?subreddit=twilightzone&size=1000&after=720d
https://api.pushshift.io/reddit/search/submission/?subreddit=twilightzone&size=1000&after=728d
https://api.pushshift.io/reddit/search/submission/?subreddit=twilightzone&size=1000&after=736d
https://api.pushshift.io/reddit/search/submission/?subreddit=twilightzone&size=1000&after=744d
https://api.pushshift.io/reddit/search/submission/?subreddit=twilightzone&size=1000&after=752d
https://api.pushshift.io/reddit/search/submission/?subreddit=twilightzone&size=1000&after=760d
https://api.pushshift.io/reddit/search/submission/?subreddit=twilightzone&size=1000&after=768d
https://api.pushshift.io/reddit/search/submission/?subreddit=twilightzone&size=1000&after=776d
https://api.pushshift.io/reddit/search/submission/

In [6]:
# Modified from Brian Collins' web-scraper given to the class
# NOTE: this re-definition replaces any earlier query_pushshift in the kernel;
# it differs only in its default `skip` and `times` values.
def query_pushshift(subreddit, # subreddit name
                    kind='submission', # can be either a submission or a comment
                    skip=6, # number of days in time period
                    times=10, # exclusive upper bound of the period counter (times - 1 requests are made)
                    subfield = ['selftext', 'author', 'title', 'created_utc', 'num_comments', 'is_self', 'subreddit'],
                    # fields under submission
                    comfield = ['body', 'created_utc'], # fields for comments
                    size=1000): # number of results requested per API call
    """Scrape a subreddit through the Pushshift search API into one DataFrame.

    One request is issued for every ``x`` in ``range(1, times)``, each with
    ``after={skip * x}d`` appended to the base URL, so ``times - 1`` requests
    are made in total. The per-request frames are concatenated, and a
    ``timestamp`` column holding the UTC calendar date of ``created_utc`` is
    appended.

    Parameters
    ----------
    subreddit : str
        Subreddit name inserted into the query string.
    kind : str
        Endpoint segment of the URL. When it equals ``'submission'`` the
        result is reduced to ``subfield``, de-duplicated, and filtered to rows
        whose ``is_self`` is True; any other value skips that post-processing.
    skip : int
        Number of days between successive ``after`` offsets.
    times : int
        Exclusive upper bound of the period counter.
    subfield : list of str
        Columns kept for submissions. The default list is never mutated.
    comfield : list of str
        Accepted for interface compatibility; not used by the function body.
    size : int
        Value of the ``size`` query parameter. Previously this was read from a
        module-level variable named ``size``; it is now an explicit argument
        so the function works on a fresh kernel.

    Returns
    -------
    pandas.DataFrame
        The combined (and, for submissions, filtered) rows plus ``timestamp``.

    Raises
    ------
    requests.HTTPError
        If any response has a status code other than 200.
    ValueError
        Raised by ``pd.concat`` when no request was made (``times <= 1``).
    """
    stem = f'https://api.pushshift.io/reddit/search/{kind}/?subreddit={subreddit}&size={size}' # base url

    my_list = [] # one dataframe per request

    for x in range(1, times): # x = 1 .. times - 1
        URL = f'{stem}&after={skip * x}d' # new url for each time period
        print(URL) # progress log
        response = requests.get(URL)
        # Explicit check instead of `assert`, which is removed when Python runs with -O.
        if response.status_code != 200:
            raise requests.HTTPError(
                f'Pushshift returned HTTP {response.status_code} for {URL}',
                response=response)
        mine = response.json()['data'] # list of records returned by the API
        my_list.append(pd.DataFrame.from_dict(mine))
        time.sleep(2) # pause between requests so as not to overwhelm the api and servers

    full = pd.concat(my_list, sort=False) # combines all the dataframes from each scrape into one df

    if kind == 'submission': # keep wanted columns, drop repeated rows, keep self posts only
        full = full[list(subfield)]
        full = full.drop_duplicates()
        full = full.loc[full['is_self'] == True]

    # created_utc is an epoch in seconds; convert in UTC so the resulting date
    # does not depend on the timezone of the machine running the notebook.
    timestamp = pd.to_datetime(full['created_utc'], unit='s').dt.date

    # assign() returns a new frame, so no column is written onto a filtered
    # slice (avoids SettingWithCopyWarning).
    full = full.assign(timestamp=timestamp)

    print(full.shape) # prints shape of the final dataframe

    return full

In [7]:
# Scrape r/comicbooks in 6-day steps with times=10 (the loop runs range(1, times), i.e. 9 requests).
# skip/times are passed explicitly so this cell does not depend on which
# definition of query_pushshift (and therefore which defaults) is live in the kernel.

comicbooks = query_pushshift('comicbooks', skip=6, times=10) # queries and saves scraped text data into dataframe

https://api.pushshift.io/reddit/search/submission/?subreddit=comicbooks&size=1000&after=6d
https://api.pushshift.io/reddit/search/submission/?subreddit=comicbooks&size=1000&after=12d
https://api.pushshift.io/reddit/search/submission/?subreddit=comicbooks&size=1000&after=18d
https://api.pushshift.io/reddit/search/submission/?subreddit=comicbooks&size=1000&after=24d
https://api.pushshift.io/reddit/search/submission/?subreddit=comicbooks&size=1000&after=30d
https://api.pushshift.io/reddit/search/submission/?subreddit=comicbooks&size=1000&after=36d
https://api.pushshift.io/reddit/search/submission/?subreddit=comicbooks&size=1000&after=42d
https://api.pushshift.io/reddit/search/submission/?subreddit=comicbooks&size=1000&after=48d
https://api.pushshift.io/reddit/search/submission/?subreddit=comicbooks&size=1000&after=54d
(1797, 8)


In [8]:
# Persist the scraped r/twilightzone frame to './twilight_zone_raw' in CSV format.
# Note the target file name has no .csv extension.
twilight_zone.to_csv('./twilight_zone_raw') # saving submissions to csv

In [9]:
# Persist the scraped r/comicbooks frame to './comicbooks_raw' in CSV format.
# Note the target file name has no .csv extension.
comicbooks.to_csv('./comicbooks_raw') # saving submissions to csv