# Imports

In [1]:
import warnings
from pathlib import Path

import pandas as pd
import requests

# Helper Functions

In [2]:
def json_to_dict(url, **kwargs):
    """
    Fetch a JSON endpoint and return its 'data' payload as a DataFrame.

    Despite the name, the result is a pandas DataFrame, not a dict.

    Parameters
    ----------
    url : str
        Address the GET request is sent to.
    **kwargs
        params : dict, optional
            Query-string parameters for the request. When omitted, the
            request is sent without a query string.
        timeout : float, optional
            Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        Built from the value stored under the 'data' key of the JSON body.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (4xx/5xx).
    KeyError
        If the JSON body has no 'data' key.
    """
    # .get() so that calling without `params` no longer raises KeyError.
    params = kwargs.get('params')
    # Without a timeout a stalled connection would block the notebook forever.
    res = requests.get(url, params=params, timeout=kwargs.get('timeout', 30))
    # Fail loudly on error statuses instead of trying to parse an error page.
    res.raise_for_status()
    return pd.DataFrame(res.json()['data'])

In [3]:
def get_x_days_posts(url, subreddit, days, size=500):
    """
    Collect posts from the last `days` days, issuing one request per day.

    For every i in 1..days a request is sent with 'before' = '{i-1}d' and
    'after' = '{i}d' (presumably the API reads these as "i-1 days ago" and
    "i days ago" -- confirm against the API docs). Days whose request fails
    are skipped; a single RuntimeWarning at the end reports how many.

    Parameters
    ----------
    url : str
        Search endpoint passed on to `json_to_dict`.
    subreddit : str
        Value sent as the 'subreddit' query parameter.
    days : int
        Number of one-day windows to request. Values <= 0 send no request.
    size : int, optional
        Value sent as the 'size' query parameter (default 500, as before).

    Returns
    -------
    pandas.DataFrame
        All daily results stacked row-wise (each day's own row index is
        kept, so index labels repeat). Empty if nothing was retrieved.
    """
    frames = []
    failed_days = []

    for i in range(1, days + 1):
        try:
            df = json_to_dict(url, params={
                'subreddit': subreddit,
                'before': f'{i-1}d',
                'after': f'{i}d',
                'size': size
            })
        except Exception:
            # Best effort: one bad day must not abort the whole run.
            # `Exception` (not a bare except) lets Ctrl-C / kernel interrupt through.
            failed_days.append(i)
            continue

        if not df.empty:
            frames.append(df)

    if failed_days:
        warnings.warn(
            f'{len(failed_days)} of {days} daily requests failed and were '
            f'skipped (first failing days: {failed_days[:10]})',
            RuntimeWarning,
        )

    # pd.concat raises on an empty list, so handle "nothing retrieved" explicitly.
    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop (quadratic copying).
    return pd.concat(frames, axis=0)
        
    

# Brainstorming Potential Subreddit Comparisons and Initial Request

| pairs              	|                    	|
|--------------------	|--------------------	|
| me_irl             	| 2meirl4meirl       	|
| whitepeopletwitter 	| blackpeopletwitter 	|
| jokes              	| dadjokes           	|
| moviedetails       	| shittymoviedetails 	|

In [4]:
# Root of the Pushshift search API; an endpoint name is appended to it.
base_url = 'https://api.pushshift.io/reddit/search/'

# Endpoint names. NOTE: `subreddit` holds the *endpoint* ('submission'),
# not the name of a subreddit.
subreddit, comment = 'submission', 'comment'

# Query for one trial request against r/shittymoviedetails
# (presumably the window between 2 days and 1 day ago -- confirm with the API docs).
params = dict(
    subreddit='shittymoviedetails',
    before='1d',
    after='2d',
    size=500,
)

In [5]:
# "smd" is simply short for "shitty movie details".
smd = json_to_dict(f'{base_url}{subreddit}', params=params)

In [6]:
# (rows, columns) returned by the single trial request.
smd.shape

(37, 71)

In [7]:
# Preview the first rows of the trial request.
smd.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_is_blocked,author_patreon_flair,...,url,url_overridden_by_dest,whitelist_status,wls,link_flair_template_id,link_flair_text,media_metadata,author_flair_background_color,author_flair_text_color,removed_by_category
0,[],False,SnowySergal,,[],,text,t2_1a37frbn,False,False,...,https://i.redd.it/o4dzkwai9h581.png,https://i.redd.it/o4dzkwai9h581.png,some_ads,7,,,,,,
1,[],False,Stony_Hawk,,[],,text,t2_1v5p7iz1,False,False,...,https://i.redd.it/q334kezgdh581.jpg,https://i.redd.it/q334kezgdh581.jpg,some_ads,7,,,,,,
2,[],False,RutabagaMundane5354,,[],,text,t2_8s834k49,False,False,...,https://i.redd.it/fb06t9s9jh581.jpg,https://i.redd.it/fb06t9s9jh581.jpg,some_ads,7,22f83518-96c7-11e7-a9d8-0e93623777de,default,,,,
3,[],False,nem_v_39,,[],,text,t2_b1ozx31l,False,False,...,https://i.redd.it/b2pu2blvmh581.gif,https://i.redd.it/b2pu2blvmh581.gif,some_ads,7,3a64c3c6-82f1-11e8-b314-0e08f82792dc,Turd,,,,
4,[],False,ALV1DA,,[],,text,t2_h55dlv98,False,False,...,https://i.redd.it/467hw7hath581.jpg,https://i.redd.it/467hw7hath581.jpg,some_ads,7,,,,,,


In [8]:
# Select the 'title' column; the trailing semicolon suppresses the cell output.
smd['title'];

Since we can only get 100 submissions per request, we will have to issue many requests (one per day) to build up a large enough dataset.

# Data Acquisition

In [21]:
%%time

smd_df = get_x_days_posts(url = base_url + subreddit, 
                 subreddit='shittymoviedetails',
                 days = 900);

Wall time: 3min 2s


In [22]:
# (rows, columns) of the aggregated r/shittymoviedetails frame.
smd_df.shape

(7680, 87)

In [23]:
# Column labels of the aggregated r/shittymoviedetails frame.
smd_df.columns

Index(['all_awardings', 'allow_live_comments', 'author',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text',
       'author_flair_type', 'author_fullname', 'author_is_blocked',
       'author_patreon_flair', 'author_premium', 'awarders', 'can_mod_post',
       'contest_mode', 'created_utc', 'domain', 'full_link', 'gildings', 'id',
       'is_created_from_ads_ui', 'is_crosspostable', 'is_meta',
       'is_original_content', 'is_reddit_media_domain', 'is_robot_indexable',
       'is_self', 'is_video', 'link_flair_background_color',
       'link_flair_richtext', 'link_flair_text_color', 'link_flair_type',
       'locked', 'media_only', 'no_follow', 'num_comments', 'num_crossposts',
       'over_18', 'parent_whitelist_status', 'permalink', 'pinned',
       'post_hint', 'preview', 'pwls', 'retrieved_on', 'score', 'selftext',
       'send_replies', 'spoiler', 'stickied', 'subreddit', 'subreddit_id',
       'subreddit_subscribers', 'subreddit_type', 'suggested_sort'

In [24]:
# Number of distinct titles (as a 1-tuple); compare with the row count to spot repeated titles.
smd_df['title'].unique().shape

(7524,)

In [25]:
%%time

md_df = get_x_days_posts(url = base_url + subreddit, 
                 subreddit='moviedetails',
                 days = 900)

Wall time: 2min 28s


In [26]:
# (rows, columns) of the aggregated r/moviedetails frame.
md_df.shape

(7186, 89)

The difference in column count (87 vs. 89) comes from the columns that appear in only one of the two frames, listed below:

In [27]:
# Columns present in smd_df but absent from md_df, in smd_df's column order.
smd_df.columns[~smd_df.columns.isin(md_df.columns)].tolist()

[]

In [28]:
# Columns present in md_df but absent from smd_df, in md_df's column order.
md_df.columns[~md_df.columns.isin(smd_df.columns)].tolist()

['link_flair_css_class', 'poll_data']

In [29]:
# Replace the 'subreddit' column with the class label: ShittyMovieDetails = 1.
smd_df['subreddit'] = 1

In [30]:
# Replace the 'subreddit' column with the class label: MovieDetails = 0.
md_df['subreddit'] = 0

subreddit:

    ShittyMovieDetails = 1
    MovieDetails = 0

In [31]:
# Stack both labelled frames row-wise; columns that exist on only one side
# are kept and filled with NaN for the other side's rows.
data = pd.concat(objs=[smd_df, md_df], axis='index')

# Export

In [32]:
# to_csv does not create missing directories, so make sure ./data exists
# before writing (a fresh checkout would otherwise fail here).
output_path = Path('./data/subreddit_posts.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

# index=False: the row index is not written to the file.
data.to_csv(output_path, index=False)