In [176]:
import json 
import requests 
import os 
import math
import pandas as pd
from tqdm import tqdm_notebook as tqdm

In [5]:
# Reload the merged post tables from disk. These two files are the ones written by
# the "Merge all separate CSVs" cells further down, so they must already exist.
fortniteCompetitive_df = pd.read_csv('data/fortniteCompetitive.csv')
fortniteBR_df = pd.read_csv('data/fortniteBR.csv')

### Post Scraper

In [82]:
# Subreddit to scrape; also used as the filename prefix of every saved page.
SUBREDDIT = 'FortniteCompetitive'
# Endpoint queried by scrape() for submissions.
url = 'https://api.pushshift.io/reddit/search/submission/'
# Fields kept from each submission record before a page is written to disk.
columns = ['author', 'created_utc', 'id', 'num_comments', 'permalink', 'score', 'title', 'selftext', 'subreddit']
# Starting page counter; only referenced by the commented-out checkpoint bootstrap cell.
count = 0

In [83]:
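# Uncomment and run once to (re)start scraping from scratch: this writes a fresh
# checkpoint.txt holding the starting `before` timestamp and a page counter of 0.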
# checkpoint = {'date' : 1561584036, 
#               'count': count}

# with open('checkpoint.txt', 'w') as outfile:
#     json.dump(checkpoint, outfile)

In [84]:
# Load the resume state: 'date' seeds the `before` cursor of the request params and
# 'count' is the number scrape() gives to the next saved page.
with open('checkpoint.txt') as file: 
    checkpoint = json.load(file)

In [85]:
# Query parameters for the submission search. scrape() overwrites 'before' after
# every saved page so that the next request continues from the last post seen.
params={'size':'500', 
        'subreddit': SUBREDDIT, 
        'num_comments':'>10', 
        'before' : checkpoint['date']}

In [34]:
def scrape(params):
    """Page through submission search results and save each page as a CSV.

    Repeatedly requests ``url`` with ``params``. Every non-empty page is
    reduced to ``columns`` and written to ``data/<SUBREDDIT><count>``; the
    ``before`` cursor is then moved to the ``created_utc`` of the last post on
    the page and progress is persisted to ``checkpoint.txt`` so an interrupted
    run can be resumed. The loop ends when a page comes back empty.

    Relies on the module-level ``url``, ``columns``, ``SUBREDDIT`` and
    ``checkpoint`` names. Both ``checkpoint`` and ``params`` are mutated in
    place.

    Parameters
    ----------
    params : dict
        Query parameters for the search request. ``params['before']`` is
        overwritten after every saved page.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status code.
    """
    count = checkpoint['count']

    while True:
        response = requests.get(url, params=params)
        print('Status code: ' + str(response.status_code))
        # Surface HTTP failures directly; decoding an error page as JSON would
        # otherwise fail with an unhelpful JSONDecodeError.
        response.raise_for_status()

        posts = response.json()['data']
        length = len(posts)
        print('Data length: ' + str(length))

        if length == 0:
            print('Scraping finished.')
            break

        # reindex (rather than df[columns]) keeps the column layout stable and
        # fills NaN when a whole page lacks one of the expected fields.
        df = pd.DataFrame(posts).reindex(columns=columns)

        filename = SUBREDDIT + str(count)
        path = 'data/' + filename
        df.to_csv(path)
        print('File named: ' + filename + ' saved')

        count = count + 1
        checkpoint['count'] = count

        # The last record of the page becomes the new cursor
        # (presumably the earliest post of the page -- TODO confirm API ordering).
        checkpoint['date'] = posts[-1]['created_utc']
        params['before'] = checkpoint['date']

        # Persist after every page so a crash loses at most the current page.
        with open('checkpoint.txt', 'w') as outfile:
            json.dump(checkpoint, outfile)

In [91]:
# Run the post scraper; it continues from the checkpoint loaded above and stops
# once the API returns an empty page.
scrape(params)

Status code: 200
Data length: 0


#### Merge all separate CSVs

In [21]:
def pull_csvs(subreddit, count):
    """Load the numbered per-page CSVs of one subreddit and stack them.

    Reads ``data/<subreddit>0`` and then ``data/<subreddit>1`` up to
    ``data/<subreddit><count - 1>`` and concatenates them in that order. File 0
    is always read, even when ``count`` is smaller than 1.

    Parameters
    ----------
    subreddit : str
        Filename prefix of the saved pages.
    count : int
        Number of saved pages (files are numbered ``0 .. count - 1``).

    Returns
    -------
    pandas.DataFrame
        All pages stacked row-wise; the per-file row index is kept as is.
    """
    path = 'data/'
    # The first 215 FortNiteBR pages are addressed through an extra '\data\'
    # segment. Written with explicit escapes: '\d' is not a valid escape sequence.
    legacy_prefix = '\\data\\'

    frames = []
    for csv in [0] + list(range(1, count)):
        filename = subreddit + str(csv)
        if subreddit == 'FortNiteBR' and csv <= 214:
            filename = legacy_prefix + filename
        frames.append(pd.read_csv(path + filename))

    # One concat at the end avoids re-copying the growing frame for every file.
    return pd.concat(frames)

In [26]:
def remove_csvs(subreddit, count):
    """Delete the numbered per-page CSVs of one subreddit.

    Removes ``data/<subreddit>0`` up to ``data/<subreddit><count - 1>``.

    Parameters
    ----------
    subreddit : str
        Filename prefix of the saved pages.
    count : int
        Number of saved pages to delete.

    Returns
    -------
    str
        A ``'Removed <count> files'`` confirmation message.

    Raises
    ------
    FileNotFoundError
        If one of the expected files does not exist; earlier files have
        already been deleted at that point.
    """
    path = 'data/'
    # The first 215 FortNiteBR pages are addressed through an extra '\data\'
    # segment. Written with explicit escapes: '\d' is not a valid escape sequence.
    legacy_prefix = '\\data\\'

    for csv in range(count):
        filename = subreddit + str(csv)
        if subreddit == 'FortNiteBR' and csv <= 214:
            filename = legacy_prefix + filename
        os.remove(path + filename)

    return 'Removed ' + str(count) + ' files'

In [22]:
# Stack the per-page files of each subreddit (63 and 430 pages respectively)
# into one table per subreddit.
fortniteCompetitive_df = pull_csvs('FortniteCompetitive', 63)
fortniteBR_df = pull_csvs('FortNiteBR', 430)

# Persist the merged tables; these are the files the notebook reloads at the top.
# NOTE(review): to_csv also writes the row index here, so it comes back as an
# extra unnamed column when the file is read again -- confirm this is intended.
fortniteCompetitive_df.to_csv('data/fortniteCompetitive.csv')
fortniteBR_df.to_csv('data/fortniteBR.csv')

In [27]:
# Delete the 63 per-page FortniteCompetitive files; run only after the merged
# data/fortniteCompetitive.csv has been written.
remove_csvs('FortniteCompetitive', 63)

'Removed 63 files'

In [28]:
# Delete the 430 per-page FortNiteBR files; run only after the merged
# data/fortniteBR.csv has been written.
remove_csvs('FortNiteBR', 430)

'Removed 430 files'

### Comment Scraper

In [213]:
# Endpoint that lists the comment ids of one submission (the post id is appended).
url_ids = 'https://api.pushshift.io/reddit/submission/comment_ids/'
# Endpoint that returns comment records for a set of comment ids.
url_comments = 'https://api.pushshift.io/reddit/comment/search'
# Number of posts whose comments are collected into one saved file.
BATCH_SIZE = 50
# NOTE(review): not referenced by any code visible in this notebook.
BATCH_NO = 0
# Fields kept from every comment record.
COLUMNS = ['author', 'body', 'created_utc', 'id', 'parent_id', 'score', 'subreddit', 'permalink']

In [214]:
def scrape_comments(id, chunk_size=500):
    """Fetch all comments of one submission as a DataFrame.

    First asks ``url_ids`` for the comment ids of the post, then requests the
    comment records from ``url_comments`` in chunks of ``chunk_size`` ids.

    Parameters
    ----------
    id : str
        Submission id; appended to ``url_ids``.
    chunk_size : int, optional
        Maximum number of comment ids sent per request (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per comment with exactly the ``COLUMNS`` columns. Empty (but
        with those columns) when the post has no comments.

    Raises
    ------
    requests.HTTPError
        If either endpoint answers with a 4xx/5xx status code.
    """
    response = requests.get(url_ids + id)
    # Fail on HTTP errors here instead of with a JSONDecodeError when the body
    # is not JSON.
    response.raise_for_status()
    ids = response.json()['data']

    frames = []
    for lo in range(0, len(ids), chunk_size):
        # Slice into a separate name so the full id list stays intact for the
        # following chunks.
        chunk = ids[lo:lo + chunk_size]
        response = requests.get(url_comments, params={'ids': chunk})
        response.raise_for_status()
        comments = response.json()['data']

        # reindex tolerates an empty or partial result where plain column
        # selection would raise KeyError.
        frames.append(pd.DataFrame(comments).reindex(columns=COLUMNS))

    if not frames:
        return pd.DataFrame(columns=COLUMNS)
    return pd.concat(frames)

In [215]:
def handle_batch(batch):
    """Collect the comments of every post in ``batch`` into a single frame.

    Each post id is passed to ``scrape_comments`` (with a progress bar) and the
    resulting frames are appended, in order, to a frame that starts out empty
    with the ``COLUMNS`` layout.
    """
    combined = pd.DataFrame(columns=COLUMNS)

    for post_id in tqdm(batch):
        combined = pd.concat([combined, scrape_comments(post_id)])

    return combined

In [216]:
def get_all_comments(posts):
    """Scrape the comments of every post in ``posts``, batch by batch.

    Posts are processed in groups of ``BATCH_SIZE``. Each group is written to
    ``data/<subreddit>Comments<batch number>`` and the next batch number is
    stored in ``comment_checkpoint.txt`` so an interrupted run resumes where it
    stopped.

    The checkpoint also records which subreddit it belongs to. A checkpoint
    recorded for a different subreddit is discarded, so scraping a second
    subreddit starts at batch 0 instead of inheriting the first one's
    position. Checkpoints without that field are accepted as they are.

    Parameters
    ----------
    posts : pandas.DataFrame
        Must provide ``'id'`` and ``'subreddit'`` columns; the subreddit name
        is taken from the first row. An empty frame is a no-op.
    """
    if len(posts) == 0:
        return

    ids = posts['id']
    # Positional access: the frame's index need not start at 0 or be unique.
    subreddit = posts['subreddit'].iloc[0]

    # Resume from the previous position, or start a new checkpoint.
    checkpoint = None
    if os.path.exists('comment_checkpoint.txt'):
        with open('comment_checkpoint.txt') as file:
            checkpoint = json.load(file)
        if checkpoint.get('subreddit', subreddit) != subreddit:
            checkpoint = None
    if checkpoint is None:
        checkpoint = {'batch_no': 0, 'subreddit': subreddit}
        with open('comment_checkpoint.txt', 'w') as outfile:
            json.dump(checkpoint, outfile)

    # Filename stem of the saved CSVs; the batch number is appended.
    path = 'data/' + subreddit + 'Comments'

    # Iterate over the remaining batches of posts.
    first, total = checkpoint['batch_no'], math.ceil(len(ids) / BATCH_SIZE)
    for i in range(first, total):
        lo = i * BATCH_SIZE
        hi = min(len(ids), BATCH_SIZE * (i + 1))
        batch = ids.iloc[lo:hi]

        df = handle_batch(batch)
        df.to_csv(path + str(i))
        print('Saved: ' + path + str(i))

        # Record progress only after the batch file has been written.
        checkpoint['batch_no'] = i + 1
        checkpoint['subreddit'] = subreddit
        with open('comment_checkpoint.txt', 'w') as outfile:
            json.dump(checkpoint, outfile)

In [222]:
# Scrape the comments of every FortniteCompetitive post; resumes from
# comment_checkpoint.txt when that file exists.
get_all_comments(fortniteCompetitive_df)

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments424


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments425


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments426


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments427


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments428


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments429


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments430


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments431


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments432


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments433


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments434


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments435


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments436


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments437


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments438


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments439


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments440


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments441


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments442


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments443


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments444


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments445


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments446


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments447


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments448


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments449


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments450


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments451


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments452


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments453


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments454


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saved: data/FortniteCompetitiveComments455


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
# Scrape the comments of every FortNiteBR post (this cell has not been executed yet).
get_all_comments(fortniteBR_df)