# Project 3.01 - Data Collection

## Table of Contents

- [Functions](#Functions)

- [Imports](#Imports)

- [Data Pulls](#Data-Pulls)    



## Functions 

In [1]:
def pmaw_posts(subreddit, beg_date, end_date):
    '''Using the pmaw pushshift api multithread wrapper created by Matt Podolak, we can pull specific dates and results. 
    This function allows for a api search and subsequent dataframe being created that houses the data'''
    posts = api.search_submissions(subreddit= subreddit, max_results_per_request = 150000, safe_exit=True, 
                                   mem_safe=True, before = end_date, after = beg_date)
    create_list = [p for p in posts]
    post_df = pd.DataFrame(create_list)
    post_df = post_df[['created_utc','id', 'title', 'is_video', 'num_comments', 'subreddit', 'gildings']]
    post_df.to_csv('./Data/posts.csv', index = False)
    return post_df

In [2]:
def pmaw_comments(subreddit, limit, end_date):
    '''Using the pmaw pushshift api multithread wrapper created by Matt Podolak, we can pull specific dates and results. 
    This function allows for a api search and subsequent dataframe being created that houses the data'''
    comments = api.search_comments(subreddit= subreddit, limit=10000, before = end_date)
    create_list = [c for c in comments]
    comment_df = pd.DataFrame(create_list)
    comment_df = comment_df[['created_utc','id', 'author', 'body', 'score', 'gildings', 'subreddit', 'permalink']]
    comment_df.to_csv('./Data/comments.csv', index = False)
    return comment_df

In [3]:
def get_posts(sub,itr):
    '''The PMAW pushift package pulled incomplete data, so the function below
    pulls all post data starting with the end date, given a specified amount 
    of iterations. It also exports a csv.'''
    #column_names = ["subreddit", "title", "selftext", 'created_utc', 'url']
    post_url = 'https://api.pushshift.io/reddit/search/submission'
    params = {'subreddit': sub,'size' : 100, 'after': end}
    res_post = requests.get(post_url, params)
    posts = res_post.json()['data']
    df_post = pd.DataFrame(posts)#[["subreddit", "title", "selftext", 'created_utc', 'url']]
    
    for i in range(itr):
        time.sleep(5)
        params = {'subreddit': sub,'size': 100,
        'before': df_post['created_utc'].tail(1)}
        res_post = requests.get(post_url, params)
        posts = res_post.json()['data']
        df = pd.DataFrame(posts)#[['subreddit', 'title', 'selftext', 'created_utc', 'url']]
        merge = [df_post, df]
        df_post = pd.concat(merge, axis = 0) 
        
    df_post.to_csv('./Data/' + sub + '.csv', index = False)
    print("Dataframe Exported")
    return df_post

In [4]:
'''This function is a build-in-progress function, that is to be left for future development in pulling post comments'''

def get_comments(sub, itr):
    '''The PMAW pushift package pulled incomplete data, so the function below
    pulls all comments starting with the end date, given a specified amount of 
    iterations. It also exports a csv.'''
    column_names = ['created_utc','author', 'author', 'body', 'title', 'score', 'subreddit', 'permalink']
    post_url = 'https://api.pushshift.io/reddit/search/comment'
    params = {'subreddit': sub,'size' : 100, 'after': end}
    res_post = requests.get(post_url, params)
    posts = res_post.json()['data']
    df_comm = pd.DataFrame(posts)[['created_utc','author', 'author', 'body', 'score', 'subreddit', 'permalink']]
    for i in range(itr):
        time.sleep(5)
        params = {'subreddit': sub,'size': 100,
        'before': df_comm['created_utc'].iloc[-1]}
        res_comm = requests.get(post_url, params)
        comm = res_comm.json()['data']
        res_comm.status_code
        df = pd.DataFrame(comm)[[column_names]]
        merge = [df, df_comm]
        df_comm = pd.concat(merge, axis = 0) 
    df_comm.to_csv('./Data/' + sub + '_comments.csv', index = False)
    print("Dataframe Exported")
    return df_comm
    


## Imports

In [5]:
#Import API packages
import pandas as pd
import requests
import time
import datetime as dt
from pmaw import PushshiftAPI
api = PushshiftAPI()

## Data Pulls

In [6]:
#URL for pushift API
post_url = 'https://api.pushshift.io/reddit/search/submission'
comment_url = 'https://api.pushshift.io//reddit/search/comment'

In [7]:
#Select End Date
end = int(dt.datetime(2021,9,10,0,0).timestamp())

In [12]:
#get_posts(subreddit, iterations)

get_posts('economics', 100)

Dataframe Exported


Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_is_blocked,author_patreon_flair,...,author_flair_background_color,author_flair_text_color,gallery_data,media_metadata,author_cakeday,distinguished,suggested_sort,gilded,crosspost_parent,crosspost_parent_list
0,[],False,davesmith001,,[],,text,t2_4dgpszrg,False,False,...,,,,,,,,,,
1,[],False,treyday22,,[],,text,t2_9970p,False,False,...,,,,,,,,,,
2,[],False,davesmith001,,[],,text,t2_4dgpszrg,False,False,...,,,,,,,,,,
3,[],False,Ituglobal,,[],,text,t2_6rjw5,False,False,...,,,,,,,,,,
4,[],False,Dardaniela,,[],,text,t2_efe3eupp,False,False,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,[],False,Monacegoose,,[],,text,t2_3o73s8bg,,False,...,,,,,,,,,,
96,[],False,Relevant_Quit_8355,,[],,text,t2_9099lond,,False,...,,,,,,,,,,
97,[],False,DoremusJessup,,[],,text,t2_612zd,,False,...,,,,,,,,,,
98,[],False,Helmidoric_of_York,,[],,text,t2_52hlhcnz,,False,...,,,,,,,,,,


In [9]:
#get_posts(subreddit, iterations)

get_posts('finance', 100)

Dataframe Exported


Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_is_blocked,author_patreon_flair,...,author_flair_background_color,author_flair_text_color,crosspost_parent,crosspost_parent_list,is_gallery,author_flair_template_id,author_cakeday,gallery_data,media_metadata,distinguished
0,[],False,kmuinnovation,,[],,text,t2_42xdt,False,False,...,,,,,,,,,,
1,[],False,sillychillly,,[],,text,t2_2c3y1k8m,False,False,...,,,,,,,,,,
2,[],False,kmuinnovation,,[],,text,t2_42xdt,False,False,...,,,,,,,,,,
3,[],False,kmuinnovation,,[],,text,t2_42xdt,False,False,...,,,,,,,,,,
4,[],False,kmuinnovation,,[],,text,t2_42xdt,False,False,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,[],False,kmuinnovation,,[],,text,t2_42xdt,,False,...,,,,,,,,,,
96,[],False,kmuinnovation,,[],,text,t2_42xdt,,False,...,,,,,,,,,,
97,[],False,tacunningham00,,[],,text,t2_3zodhf9b,,False,...,,,,,,,,,,
98,[],False,Rdock87,,[],,text,t2_17tnl0if,,False,...,,,,,,,,,,


In [10]:
#get_comments(subreddit, iterations)

#get_comments('Finance', 100)

In [11]:
#get_comments(subreddit, iterations)

#get_comments('Finance', 100)