# Gathering data from Reddit API
Starting with required libraries

In [1]:
import requests
import time
import pandas as pd

Checking out the details of the JSON file, acquired by adding the .json extension to the end of the subreddit URL.

In [2]:
#the subreddit address with .json appended, which returns the listing as JSON
url = 'https://www.reddit.com/r/science/.json'

In [3]:
#create my own unique User-agent header, sent with every request, so that I do
#not get a status error
#NOTE(review): presumably Reddit throttles the default python-requests agent -- confirm
headers = {'User-agent': 'Jingle Bells 2.0'}

In [4]:
#fetch the first page of the listing; the timeout (in seconds) keeps a stalled
#connection from hanging the kernel indefinitely
res = requests.get(url, headers=headers, timeout=30)

In [5]:
#200 means we are good to go. A 4xx code means it's a problem on the user's end
#and a 5xx code means there is a problem on reddit's end
res.status_code

200

In [6]:
#parse the JSON body of the response into Python objects
#(a dictionary here, as the .keys() calls in the next cells show)
the_json = res.json()

In [7]:
#list the top-level keys of the response in alphabetical order
#(iterating a dictionary yields its keys)
sorted(the_json)

['data', 'kind']

In [8]:
#list the keys nested under 'data' in alphabetical order
#(iterating a dictionary yields its keys)
sorted(the_json['data'])

['after', 'before', 'children', 'dist', 'modhash']

Begin with an empty list for the posts and no `after` token, so the first request is sent with an empty parameter dictionary.  Run this cell once to get the first 100 posts.

In [9]:
#collect the first batch of posts, one listing page per request
posts = []
#pagination cursor: the 'after' value returned with the previous page
after = None
#start with the science posts
url = 'https://www.reddit.com/r/science/.json'
for i in range(4):
    #the first request has no cursor; later requests ask for the page after it
    params = {} if after is None else {'after': after}
    #the timeout (in seconds) keeps a stalled connection from hanging the kernel
    res = requests.get(url, params=params, headers=headers, timeout=30)

    #in case there is a status error we will break the for loop and see
    #what kind of error it is
    if res.status_code != 200:
        print(res.status_code)
        break

    the_json = res.json()
    posts.extend(the_json['data']['children'])
    after = the_json['data']['after']

    #no cursor means there is no further page; another request would start
    #over from the first page and only add posts we already have
    if after is None:
        print('Reached the end of the listing')
        break
    #pause between requests
    time.sleep(1)
        
        

In [10]:
#make sure there are posts in the list: the count should be above zero
#after the fetch loop in the previous cell
len(posts)

100

Once the list has begun populating, run this for loop so that it doesn't write over what is already in the list.  I can run this cell multiple times to continue gathering posts.

In [22]:
#continue paging from wherever the previous fetch stopped. `posts` and `after`
#carry over from earlier cells, so re-running this cell appends further pages
#instead of overwriting what is already in the list
url = 'https://www.reddit.com/r/science/.json'
for i in range(4):
    #with no cursor the request starts from the first page of the listing
    params = {} if after is None else {'after': after}
    #the timeout (in seconds) keeps a stalled connection from hanging the kernel
    res = requests.get(url, params=params, headers=headers, timeout=30)

    #on a status error, show the code and stop so it can be inspected
    if res.status_code != 200:
        print(res.status_code)
        break

    the_json = res.json()
    posts.extend(the_json['data']['children'])
    after = the_json['data']['after']

    #no cursor means there is no further page; stop here rather than
    #immediately starting over and re-collecting posts we already have
    if after is None:
        print('Reached the end of the listing')
        break
    #pause between requests
    time.sleep(1)

These two lines of code show how many total posts are in the list and how many unique (non-duplicated) posts there are.  Once the first number is larger than the second, the requests have started returning posts we already collected, so we know we've gotten all the unique posts we can for that day.

In [24]:
#total number of collected posts, duplicates included
display(len(posts))
#number of distinct posts, told apart by each post's 'name' field
display(len({post['data']['name'] for post in posts}))

787

688

# Create a dataframe with the populated posts

In [25]:
import numpy as np

In [26]:
#put all of the posts into a list
title_lst = [p['data']['title'] for p in posts]
#create a dictionary to eventually make into a dataframe
data_dict = {
    'posts':title_lst,
    #this labels all the posts as a 1 for science.
    #all the comedy posts will have a zero under the science column
    'science': np.full(len(title_lst), 1)
}

post_df = pd.DataFrame.from_dict(data_dict, orient='columns')

Dropping all of the duplicated posts

In [27]:
#keep only the first occurrence of each row, so every post in the
#resulting dataframe is unique
unique_post_df = post_df.drop_duplicates()

In [28]:
#preview the first rows of the de-duplicated dataframe
unique_post_df.head()

Unnamed: 0,posts,science
0,Analysing data about cannabis use among more t...,1
1,"Breeding bees with ""clean genes"" could help pr...",1
2,New research suggests that megaliths — monumen...,1
3,Eleven spiders from the Cretaceous period have...,1
4,Teachers’ helping behaviors leads to better st...,1


Saving the science data

In [None]:
#write the de-duplicated posts to disk as CSV, leaving out the index column.
#note that the target file name has no .csv extension
#NOTE(review): presumably the ./datasets directory must already exist -- confirm
unique_post_df.to_csv('./datasets/unique_sciencedata', index=False)

# Repeat the process for comedy posts