In [1]:
import pandas as pd
import requests
import time

In [2]:
url = 'https://api.pushshift.io/reddit/search/submission'

In [3]:
start_utc = 1587419679  # Start collecting posts at this time and work back from there

In [4]:
# This function takes a subreddit title , the start time, and the desired number of posts,
# and adds those posts to keep_lst, which is then returned by the function

def get_subreddit_posts(subreddit, before_utc, num_posts):
    
    # This list will hold posts that have text in them
    keep_lst = []
    
    sent = 0
    good_post_counter = 1
    
    # don't get more than the specified amount of posts and don't send more than 50 requests
    while good_post_counter < num_posts and sent < 50:

        params = {
        'subreddit': subreddit,
        'before': before_utc,
        'size': 100
        }

        # Send a request, if the request fails then try it again 10 sec later. If
        # it fails 5 times in a row, stop the execution of the program. 
        
        request_successful = False
        request_counter = 1
        while request_successful == False:
            
                try:
                    res = requests.get(url, params)
                    request_successful = True
                    
                except:
                    
                    if request_counter < 5:
                        print(f'Error: Request: {sent + 1} failed, repeating request...')
                        
                        # wait 10 sec and try again
                        time.sleep(10)
                        
                    else:
                        print(f'Request {sent + 1} sent {request_counter} times, all failed')
                        break
                        
                    request_counter += 1


        # Print the status code from the request
        print(f'Request {sent + 1}: ', res.status_code)

        # Convert the json response to a dictionary
        data = res.json()
        
        cur_posts = data['data']
        
        # take each item (post) in cur_posts and add it to keep_lst if if contains text in 'selftext'
        for item in cur_posts:
            
            # break if the specified number of posts have already been collected
            if good_post_counter > num_posts:
                break
            
            # Filter out posts that have no text or have been removed/deleted, add the good posts to keep_lst
            try:
                if ((len(item['selftext']) > 0) and (item['selftext'] != ['removed']) and 
                    ('deleted' not in item['selftext']) and ('[removed]' not in item['selftext'])):

                    post = {'subreddit': item['subreddit'], 'id: ': item['id'], 
                            'title': item['title'], 'selftext: ': item['selftext']}
                    
                    keep_lst.append(post)
                    good_post_counter += 1
                    
            except:
                print('There was an issue retreiving "selftext" from the post at: ', item['url'])

        print('total posts retreived: ', good_post_counter, '  request: ', (sent + 1), '\n\n')

        # Set prev_utc to the utc of the last post retreived
        before_utc = cur_posts[-1]['created_utc']

        sent += 1

        # delay 10 sec in between requests
        time.sleep(10)
        
    return keep_lst

In [5]:
# Get 1000 posts from the 'MTB' subreddit
mtb_posts = get_subreddit_posts('MTB', start_utc, 1000)

Error: Request: 1 failed, repeating request...
Request 1:  200
total posts retreived:  45   request:  1 


Request 2:  200
There was an issue retreiving "selftext" from the post at:  https://www.reddit.com/r/MTB/comments/g4017o/which_bikeframe_has_the_best_upgrade_potential/
total posts retreived:  80   request:  2 


Request 3:  200
total posts retreived:  120   request:  3 


Request 4:  200
There was an issue retreiving "selftext" from the post at:  https://www.reddit.com/r/MTB/comments/g2u3sk/trail_bike_recommendations_specialized_v_santa/
total posts retreived:  151   request:  4 


Request 5:  200
There was an issue retreiving "selftext" from the post at:  https://www.reddit.com/r/MTB/comments/g27dv0/good_deal_on_used_2019_fathom/
There was an issue retreiving "selftext" from the post at:  https://www.reddit.com/r/MTB/comments/g27c90/good_deal_for_2019_fathom/
There was an issue retreiving "selftext" from the post at:  https://www.reddit.com/r/MTB/comments/g271y5/used_2019_giant_

In [6]:
# Get 1000 posts from the 'gravelcycling' subreddit
gravel_posts = get_subreddit_posts('gravelcycling', start_utc, 1000)

Request 1:  200
total posts retreived:  26   request:  1 


Request 2:  200
total posts retreived:  49   request:  2 


Request 3:  200
total posts retreived:  83   request:  3 


Request 4:  200
total posts retreived:  98   request:  4 


Request 5:  200
total posts retreived:  127   request:  5 


Request 6:  200
total posts retreived:  148   request:  6 


Request 7:  200
total posts retreived:  173   request:  7 


Request 8:  200
total posts retreived:  205   request:  8 


Request 9:  200
total posts retreived:  239   request:  9 


Request 10:  200
total posts retreived:  282   request:  10 


Request 11:  200
total posts retreived:  324   request:  11 


Request 12:  200
total posts retreived:  362   request:  12 


Request 13:  200
total posts retreived:  397   request:  13 


Request 14:  200
total posts retreived:  424   request:  14 


Request 15:  200
total posts retreived:  458   request:  15 


Request 16:  200
total posts retreived:  489   request:  16 


Request 17:  2

In [7]:
posts = []
for post in mtb_posts + gravel_posts:
    posts.append(post)

In [8]:
# Convert posts into a dataframe
df = pd.DataFrame(posts)

In [9]:
df

Unnamed: 0,subreddit,id:,title,selftext:
0,MTB,g51oh2,Question About Suntour Aion 34,I just ordered this fork from CRC: https://www...
1,MTB,g512qr,Beginner needs advice for new wheels,A few years ago my mountain bike's wheels wher...
2,MTB,g50qzr,Second hand MTB: Scott Scale 80. Opinions needed,Looking to get into some light Mtbing as a fun...
3,MTB,g50b6w,Rear shock options for 2018 Stumpjumper?,I have this bike [https://www.specialized.com...
4,MTB,g503j8,Anyone riding the Nero R or the Selva R? How i...,Thinking about buying the Nero R used as an up...
...,...,...,...,...
1995,gravelcycling,atajw2,Gravel fun!,I got into gravel riding earlier this month wh...
1996,gravelcycling,asr2gv,Ohio Gravel Race Series,The Ohio Gravel Race Series is a point based s...
1997,gravelcycling,arq37z,"Wheel suggestions , nothing fancy",I’m looking to swap the wheels on my Specializ...
1998,gravelcycling,aqyhw2,Washington Gravel Grinders??,Any Washington (State) gravel riders on here? ...


In [10]:
# Read the df to a csv file
df.to_csv('../data/reddit_posts.csv', index = False)