In [15]:
import pandas as pd
import requests
import datetime
import logging
import boto3
from botocore.exceptions import ClientError
from bs4 import BeautifulSoup

In [3]:
def create_bucket(bucket_name, region=None):

    """Create an S3 bucket, optionally in a specific region
    param bucket_name: Bucket to create
    param region: Region name to create the bucket in, e.g. 'eu-west-1'.
        When None, the client is built without an explicit region and no
        location constraint is sent with the request.
    return: True if bucket created, else False (the ClientError is logged)
    """

    try:
        if region is None:
            s3_client = boto3.client('s3')
        else:
            s3_client = boto3.client('s3', region_name=region)

        # 'us-east-1' is S3's default location and must NOT be passed as a
        # LocationConstraint: the service rejects it as an invalid constraint,
        # so for that region the request is sent without a configuration.
        if region is None or region == 'us-east-1':
            s3_client.create_bucket(Bucket=bucket_name)
        else:
            s3_client.create_bucket(
                Bucket=bucket_name,
                CreateBucketConfiguration={'LocationConstraint': region},
            )
    except ClientError as e:
        logging.error(e)
        return False
    return True

In [94]:
def date_interval_list(start_date, end_date, interval):

    """Return the dates from start_date up to end_date, `interval` days apart.

    Parameters:
    # start_date - first date of the range, formatted 'yy-mm-dd' (e.g. '20-01-01');
    #              it is always the first element of the result
    # end_date   - upper bound of the range, formatted 'yy-mm-dd'; it is only
    #              included when it falls exactly on a step from start_date
    # interval   - positive whole number of days between consecutive dates
    #              (an int or a numeric string such as '5')

    Returns a list of 'yyyy-mm-dd' strings, e.g.
    date_interval_list('20-01-01', '20-01-10', 2) gives
    ['2020-01-01', '2020-01-03', '2020-01-05', '2020-01-07', '2020-01-09']

    Raises ValueError if interval is not a positive whole number of days or
    if a date does not match the 'yy-mm-dd' format.
    """

    # Coerce once so the stop condition and the step always use the same value.
    step_days = int(interval)
    if step_days <= 0:
        # A zero or negative step would never reach end_date (endless loop).
        raise ValueError('interval must be a positive number of days, got ' + repr(interval))
    step = datetime.timedelta(days=step_days)

    current = datetime.datetime.strptime(str(start_date), "%y-%m-%d")
    last_allowed = datetime.datetime.strptime(str(end_date), "%y-%m-%d")

    date_list = [str(current.date())]
    # Only take another step while it still lands on or before end_date.
    while current + step <= last_allowed:
        current += step
        date_list.append(str(current.date()))

    return date_list

In [5]:
def push_shift_submissions(subreddit, start, end, limit, max_retries=5, timeout=30):

    """Scrape subreddit submissions from the Pushshift API
    Parameters:
    # subreddit   - (string) subreddit name which you are looking for submissions
    # start/end   - (string) start and end date as timestamp or 'yyyy-mm-dd'
    # limit       - (string) maximum number of submissions to be returned
    # max_retries - (int) how many times the request is attempted before giving up
    # timeout     - (seconds) passed to requests.get so a stalled connection
    #               counts as a failed attempt instead of hanging forever
    Returns:
    # the list stored under the 'data' key of the JSON response, or None if
    # every attempt failed
    """

    # create url using parameters given
    url = ('https://api.pushshift.io/reddit/submission/search/?subreddit=' + str(subreddit)
           + '&after=' + str(start) + '&before=' + str(end) + '&limit=' + str(limit))
    print(url)

    # on failure retry up to max_retries times
    for attempt in range(1, max_retries + 1):

        # reset per attempt so a failed request never reports a stale status
        status_code = None
        try:
            r = requests.get(url, timeout=timeout)
            status_code = r.status_code
            if status_code == 200:
                # if response 200 is given return data as json format
                submissions = r.json()['data']
                print('status code = ' + str(status_code),
                      '\nrequest successful.. list of dict objects returned',
                      '\n' + 'total submissions retrieved: ' + str(len(submissions)))
                return submissions
        except (requests.exceptions.RequestException, ValueError, KeyError) as e:
            # RequestException: connection/timeout problems;
            # ValueError: body is not valid JSON; KeyError: no 'data' key.
            logging.warning(e)

        print('status code = ' + str(status_code) +
              ', failed to retrieve data, number of retries attempted: ' + str(attempt))

    # all attempts failed
    return None
    

In [6]:
# Pull up to 1000 r/RoastMe submissions posted between 2020-02-28 and today.
data = push_shift_submissions(
    subreddit='RoastMe',
    start='2020-02-28',
    end=str(datetime.date.today()),
    limit='1000',
)

https://api.pushshift.io/reddit/submission/search/?subreddit=RoastMe&after=2020-02-28&before=2020-02-29&limit=1000
status code = 200 
request successful.. list of dict objects returned 
total submissions retrieved: 225


In [70]:
# Fetch the comments of a single submission: take the `id` value from a
# submission returned by the submissions API and pass it as the `link_id`
# parameter of the comments API url.
r2 = requests.get('https://api.pushshift.io/reddit/comment/search/?link_id=falv9w&limit=1000',
                  timeout=30)
# Stop here on a non-2xx reply instead of failing later while parsing the body.
r2.raise_for_status()

# keep only the text of each comment
comment_list = [comment['body'] for comment in r2.json()['data']]

<Response [200]>

In [95]:
# Example: every 2nd day from 2020-01-01 up to (at most) 2020-01-10.
date_interval_list(start_date='20-01-01', end_date='20-01-10', interval=2)

['2020-01-01', '2020-01-03', '2020-01-05', '2020-01-07', '2020-01-09']