# Scraping subreddit post data

## Imports

In [129]:
#Imports
import pandas as pd
import numpy as np
import requests
import time

# Let pandas display up to 400 characters per cell so long text fields are not cut off
pd.set_option('display.max_colwidth', 400)

## Scraping Subreddit Posts using the Pushshift API

### Reddit Scraping Function

In [137]:
def get_posts(subreddit, post_amount, before=1651204800, batch_size=100, pause=60):
    """
    Retrieve submissions from a subreddit through the Pushshift search API.

    Posts are requested in batches. Each batch asks only for posts created before
    the last post of the previous batch (this assumes the API returns every batch
    newest-first, so the last element is the oldest one).

    Arguments:
        subreddit: subreddit to retrieve posts from
        post_amount: total number of posts to request from the subreddit
        before: Unix timestamp; only posts created before it are requested.
            Defaults to 1651204800 (April 29th, 2022 at 12am US Eastern, 04:00 UTC)
        batch_size: maximum number of posts requested per API call
        pause: seconds to wait between two consecutive API calls
    Output:
        pandas.core.frame.DataFrame with one row per post returned by the API and
        one column per post attribute. It has fewer than post_amount rows when the
        API returns short or empty batches, and is empty when nothing was returned.
        Row labels restart at 0 for every batch; call reset_index(drop=True) for a
        unique index.
    Raises:
        ValueError: if batch_size is smaller than 1
        requests.HTTPError: if the API answers with an error status code
    """
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')

    url = 'https://api.pushshift.io/reddit/search/submission' # Base url for pushshift
    print(f'retrieving {post_amount} recent posts from r/{subreddit}') # Message confirming parameters

    timestamp = before
    batches = [] # one DataFrame per API call, concatenated once at the end
    for start in range(0, post_amount, batch_size):
        if start:
            time.sleep(pause) # throttle between calls; no wait before the first or after the last
        parameters = {
            'subreddit': subreddit,
            'size': min(batch_size, post_amount - start), # last batch only asks for the remainder
            'before': timestamp
        }
        res = requests.get(url, params=parameters, timeout=30) # timeout so a stalled connection cannot hang the scrape
        res.raise_for_status() # fail loudly instead of parsing an error page
        posts = res.json()['data']
        if not posts:
            break # nothing older is available; keep what has been collected so far
        batches.append(pd.DataFrame(posts))
        timestamp = posts[-1]['created_utc'] # oldest post of this batch anchors the next request

    if not batches:
        return pd.DataFrame()
    return pd.concat(batches)

### Calling the Scraping Function on r/natureisfuckinglit and r/natureismetal

#### Note: the following code block takes about 1 hour and 40 minutes to run (50 requests per subreddit at one request per minute). The data has already been collected and stored in subreddit_data.csv in the data folder, so skip this cell unless the data needs to be re-collected.

In [138]:
# Scrape the two subreddits one after the other, keeping each result in its own DataFrame
natureisfuckinglit = get_posts(subreddit='natureisfuckinglit', post_amount=5000)
natureismetal = get_posts(subreddit='natureismetal', post_amount=5000)

retrieving 5000 recent posts from r/natureisfuckinglit
retrieving 5000 recent posts from r/natureismetal


In [139]:
# (rows, columns) of the raw r/natureisfuckinglit scrape; the row count can fall short of the 5000 requested if the API returns fewer posts than asked for
natureisfuckinglit.shape

(4997, 79)

In [140]:
# (rows, columns) of the raw r/natureismetal scrape, to compare against the 5000 posts requested
natureismetal.shape

(5000, 83)

### Pruning irrelevant information from the DataFrames

In [141]:
# Keep only the subreddit label and the two text fields, then renumber the rows from 0
natureisfuckinglit = (
    natureisfuckinglit
    .loc[:, ['subreddit', 'selftext', 'title']]
    .reset_index(drop=True)
)
natureisfuckinglit.head(3)

Unnamed: 0,subreddit,selftext,title
0,NatureIsFuckingLit,,A centipede that lives in my house
1,NatureIsFuckingLit,,The Most Beautiful and Naturally White Animals in the World
2,NatureIsFuckingLit,,"🔥 kissing camels, Desert, Algeria 🐪"


In [142]:
# Keep only the subreddit label and the two text fields, then renumber the rows from 0
natureismetal = (
    natureismetal
    .loc[:, ['subreddit', 'selftext', 'title']]
    .reset_index(drop=True)
)
natureismetal.head(3)

Unnamed: 0,subreddit,selftext,title
0,natureismetal,,Suddenly mini golf wasn’t so fun anymore
1,natureismetal,,Crow dissects baby bird in front of parents
2,natureismetal,,Last stand for lone wolf


### Combining the dataframes vertically into one DataFrame

In [143]:
# Stack the two per-subreddit frames on top of each other and renumber the rows from 0
combined_data = pd.concat(
    objs=[natureismetal, natureisfuckinglit],
    ignore_index=True,
)

In [144]:
# Sanity check: the row count should equal the sum of the two per-subreddit frames, with the 3 kept columns
combined_data.shape

(9997, 3)

In [145]:
# Number of rows contributed by each subreddit
combined_data['subreddit'].value_counts()

natureismetal         5000
NatureIsFuckingLit    4997
Name: subreddit, dtype: int64

In [148]:
# Frequency of each distinct selftext value, to see how many posts carry any body text at all
combined_data['selftext'].value_counts()

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        9695
[removed]                                                  

### Writing the contents of the combined DataFrame into a csv

In [150]:
# Save the combined posts to disk without writing the row index as a column
combined_data.to_csv(path_or_buf='../data/subreddit_data.csv', index=False)