In [1]:
#Import libraries
import pandas as pd
import numpy as np
import datetime as dt
import time
import requests
import json
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
import re
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [2]:
#Use this function that Brian so graceously gave me that uses reddit's API to bring in subreddit posts and comments.
#subreddit says what subreddit you want to bring in and kind is if you are looking for posts or comments.
#skip is the amount of days you are searching for and times is the amount of times you are searching for data
#They come in as a dataframe only if the response code is 200
#Have sleep set as 2 seconds so there is a second wait in between each time you grab data
def query_pushshift(subreddit, kind='submission', skip=30, times=10, 
                    subfield = ['title', 'selftext', 'subreddit', 'created_utc', 'author', 'num_comments', 'score', 'is_self'],
                    comfields = ['body', 'score', 'created_utc']):

    #Size is trying to pull 500 results per pull
    stem = "https://api.pushshift.io/reddit/search/{}/?subreddit={}&size=500".format(kind, subreddit)
    mylist = []
    
    for x in range(1, times):
        
        URL = "{}&after={}d".format(stem, skip * x)
        print(URL)
        response = requests.get(URL)
        assert response.status_code == 200
        mine = response.json()['data']
        df = pd.DataFrame.from_dict(mine)
        mylist.append(df)
        time.sleep(2)
        
    full = pd.concat(mylist, sort=False)
    
    if kind == "submission":
        
        full = full[subfield]
        
        full = full.drop_duplicates()
        
        full = full.loc[full['is_self'] == True]
        
    def get_date(created):
        return dt.date.fromtimestamp(created)
    
    _timestamp = full["created_utc"].apply(get_date)
    
    full['timestamp'] = _timestamp

    print(full.shape)
    
    return full

In [3]:
df_curb_comment = query_pushshift('curb', kind='comment', times=10)

https://api.pushshift.io/reddit/search/comment/?subreddit=curb&size=500&after=30d
https://api.pushshift.io/reddit/search/comment/?subreddit=curb&size=500&after=60d
https://api.pushshift.io/reddit/search/comment/?subreddit=curb&size=500&after=90d
https://api.pushshift.io/reddit/search/comment/?subreddit=curb&size=500&after=120d
https://api.pushshift.io/reddit/search/comment/?subreddit=curb&size=500&after=150d
https://api.pushshift.io/reddit/search/comment/?subreddit=curb&size=500&after=180d
https://api.pushshift.io/reddit/search/comment/?subreddit=curb&size=500&after=210d
https://api.pushshift.io/reddit/search/comment/?subreddit=curb&size=500&after=240d
https://api.pushshift.io/reddit/search/comment/?subreddit=curb&size=500&after=270d
(4500, 33)


In [4]:
df_curb_comment.to_csv('./datasets/curb_comment.csv')

In [5]:
df_seinfeld_comment = query_pushshift('seinfeld', kind='comment', times=10)

https://api.pushshift.io/reddit/search/comment/?subreddit=seinfeld&size=500&after=30d
https://api.pushshift.io/reddit/search/comment/?subreddit=seinfeld&size=500&after=60d
https://api.pushshift.io/reddit/search/comment/?subreddit=seinfeld&size=500&after=90d
https://api.pushshift.io/reddit/search/comment/?subreddit=seinfeld&size=500&after=120d
https://api.pushshift.io/reddit/search/comment/?subreddit=seinfeld&size=500&after=150d
https://api.pushshift.io/reddit/search/comment/?subreddit=seinfeld&size=500&after=180d
https://api.pushshift.io/reddit/search/comment/?subreddit=seinfeld&size=500&after=210d
https://api.pushshift.io/reddit/search/comment/?subreddit=seinfeld&size=500&after=240d
https://api.pushshift.io/reddit/search/comment/?subreddit=seinfeld&size=500&after=270d
(4500, 30)


In [6]:
df_seinfeld_comment.to_csv('./datasets/seinfeld_comment.csv')

In [7]:
df_curb_post_all = query_pushshift('curb', times=100)

https://api.pushshift.io/reddit/search/submission/?subreddit=curb&size=500&after=30d
https://api.pushshift.io/reddit/search/submission/?subreddit=curb&size=500&after=60d
https://api.pushshift.io/reddit/search/submission/?subreddit=curb&size=500&after=90d
https://api.pushshift.io/reddit/search/submission/?subreddit=curb&size=500&after=120d
https://api.pushshift.io/reddit/search/submission/?subreddit=curb&size=500&after=150d
https://api.pushshift.io/reddit/search/submission/?subreddit=curb&size=500&after=180d
https://api.pushshift.io/reddit/search/submission/?subreddit=curb&size=500&after=210d
https://api.pushshift.io/reddit/search/submission/?subreddit=curb&size=500&after=240d
https://api.pushshift.io/reddit/search/submission/?subreddit=curb&size=500&after=270d
https://api.pushshift.io/reddit/search/submission/?subreddit=curb&size=500&after=300d
https://api.pushshift.io/reddit/search/submission/?subreddit=curb&size=500&after=330d
https://api.pushshift.io/reddit/search/submission/?subred

https://api.pushshift.io/reddit/search/submission/?subreddit=curb&size=500&after=2880d
https://api.pushshift.io/reddit/search/submission/?subreddit=curb&size=500&after=2910d
https://api.pushshift.io/reddit/search/submission/?subreddit=curb&size=500&after=2940d
https://api.pushshift.io/reddit/search/submission/?subreddit=curb&size=500&after=2970d
(1062, 9)


In [8]:
df_curb_post_all.to_csv('./datasets/curb_post_all.csv')

In [9]:
df_seinfeld_post_all = query_pushshift('seinfeld', times=13)

https://api.pushshift.io/reddit/search/submission/?subreddit=seinfeld&size=500&after=30d
https://api.pushshift.io/reddit/search/submission/?subreddit=seinfeld&size=500&after=60d
https://api.pushshift.io/reddit/search/submission/?subreddit=seinfeld&size=500&after=90d
https://api.pushshift.io/reddit/search/submission/?subreddit=seinfeld&size=500&after=120d
https://api.pushshift.io/reddit/search/submission/?subreddit=seinfeld&size=500&after=150d
https://api.pushshift.io/reddit/search/submission/?subreddit=seinfeld&size=500&after=180d
https://api.pushshift.io/reddit/search/submission/?subreddit=seinfeld&size=500&after=210d
https://api.pushshift.io/reddit/search/submission/?subreddit=seinfeld&size=500&after=240d
https://api.pushshift.io/reddit/search/submission/?subreddit=seinfeld&size=500&after=270d
https://api.pushshift.io/reddit/search/submission/?subreddit=seinfeld&size=500&after=300d
https://api.pushshift.io/reddit/search/submission/?subreddit=seinfeld&size=500&after=330d
https://api.p

In [10]:
df_seinfeld_post_all.to_csv('./datasets/seinfeld_post_all.csv')