In [1]:
from datetime import datetime, timedelta, timezone

In [2]:
import time

In [3]:
import requests

In [4]:
# PUSHSHIFT SCRAPING FUNCTIONS

In [5]:
def get_posts_from_pushshift_by_day(subreddit, start_pull, end_pull, timeout=30):
    """Fetch the submissions of one subreddit inside a time window from Pushshift.

    Args:
        subreddit: Subreddit name, sent as the ``subreddit`` query parameter.
        start_pull: ``datetime`` for the start of the window, sent as ``after``.
        end_pull: ``datetime`` for the end of the window, sent as ``before``.
        timeout: Seconds to wait on the server before the request is abandoned.

    Returns:
        The value stored under the ``'data'`` key of the JSON response.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx HTTP status.
        KeyError: If the response JSON has no ``'data'`` key.

    Note:
        ``datetime.timestamp`` interprets naive datetimes as local time, so
        pass timezone-aware values when the window is meant to be in UTC.
    """
    # The window bounds are sent as integer Unix timestamps.
    params = {
        'subreddit': subreddit,
        'after': int(start_pull.timestamp()),
        'before': int(end_pull.timestamp()),
    }
    url = "https://api.pushshift.io/reddit/search/submission/"

    # Let requests build and URL-encode the query string. Without a timeout a
    # stalled connection would block the caller indefinitely.
    response = requests.get(url, params=params, timeout=timeout)

    # Surface HTTP errors directly instead of failing later on a missing
    # 'data' key or an unparsable error page.
    response.raise_for_status()

    return response.json()['data']

In [6]:
def scrape_range(subreddit, start, end, max_tries=8):
    """Scrape a subreddit one day at a time, retrying days whose fetch fails.

    The range is walked in 24-hour windows beginning at ``start``. A window is
    scraped only when its last second falls strictly before ``end``.

    Args:
        subreddit: Subreddit name passed through to the per-day fetch.
        start: ``datetime`` at which the first daily window begins.
        end: ``datetime`` bounding the range.
        max_tries: Attempts allowed for each day before that day is skipped.

    Returns:
        dict mapping each window's start ``datetime`` to the posts fetched for
        it. A day whose attempts all failed has no entry.
    """
    one_day = timedelta(days=1)
    one_second = timedelta(seconds=1)
    daily_postlist = {}

    current_start = start
    current_end = current_start + one_day - one_second
    while current_end < end:
        date_string = current_start.strftime('%Y-%m-%d %H:%M:%S')

        current_tries = 0
        while current_tries < max_tries:
            # Linear backoff: no wait before the first attempt, then 1s, 2s, ...
            time.sleep(current_tries)
            try:
                daily_postlist[current_start] = get_posts_from_pushshift_by_day(
                    subreddit, current_start, current_end
                )
            except Exception as exc:
                # Catch Exception rather than everything so that a keyboard
                # or kernel interrupt still stops the scrape.
                current_tries += 1
                print(f'pushshift call failed for {date_string}, retrying {current_tries} / {max_tries} ({exc!r})')
            else:
                print(f"pushshift scraped for {date_string}")
                break
        else:
            # Reached only when the retry loop ended without a break, i.e.
            # every attempt for this day failed.
            print(f'pushshift gave up on {date_string} after {max_tries} tries; day skipped')

        current_start = current_start + one_day
        current_end = current_start + one_day - one_second

    return daily_postlist

In [7]:
# GET PUSHSHIFT POSTS

In [8]:
# Name of the subreddit to scrape; passed to scrape_range.
subreddit = 'newworldgame'

In [9]:
# Start of the first daily window, as a timezone-aware UTC datetime.
start = datetime(2021, 4, 1, tzinfo=timezone.utc)  

In [10]:
# Bound of the scrape range (UTC). scrape_range only scrapes a day whose last
# second is strictly before this value, so with the start above it covers
# 2021-04-01 only.
end = datetime(2021, 4, 2, tzinfo=timezone.utc)  

In [19]:
# Run the scrape. The result maps each day's start datetime to the list of
# posts fetched for that day.
postlist_daydict = scrape_range(subreddit, start, end)

In [12]:
# REVIEW PUSHSHIFT DATA

In [18]:
# Print a per-day header, then one "title | id" line per post, while tallying
# the number of posts seen across all days in total_count.
total_count = 0
for key, day_posts in postlist_daydict.items():
    print(f'{key.date()} with {len(day_posts)} entries')
    print()
    for post in day_posts:
        total_count += 1
        print(f"{post['title']} | {post['id']}")