In [102]:
import os 
import math
import json 
import time
import requests 
import requests.auth
import asyncio
import aiohttp
from aiohttp import ClientSession
import pandas as pd
from tqdm import tqdm_notebook as tqdm

In [2]:
# Reload the merged post tables that the "Merge all separate CSVs" section
# writes to data/fortniteCompetitive.csv and data/fortniteBR.csv.
fortniteCompetitive_df = pd.read_csv('data/fortniteCompetitive.csv')
fortniteBR_df = pd.read_csv('data/fortniteBR.csv')

### Post Scraper

In [82]:
# Subreddit to scrape; also the filename prefix of every page scrape() saves.
SUBREDDIT = 'FortniteCompetitive'
# Submission-search endpoint queried by scrape().
url = 'https://api.pushshift.io/reddit/search/submission/'
# Post fields kept, in this order, in every saved page.
columns = ['author', 'created_utc', 'id', 'num_comments', 'permalink', 'score', 'title', 'selftext', 'subreddit']
# Starting chunk number; only the commented-out checkpoint bootstrap reads this
# global (scrape() takes its counter from the checkpoint instead).
count = 0

In [83]:
# Uncomment if starting from the beginning
# checkpoint = {'date' : 1561584036, 
#               'count': count}

# with open('checkpoint.txt', 'w') as outfile:
#     json.dump(checkpoint, outfile)

In [84]:
# Load the resume point ('date' cursor and 'count' of saved pages) used by
# params and scrape(). checkpoint.txt must already exist; the commented-out
# cell above creates it for a first run.
with open('checkpoint.txt') as file: 
    checkpoint = json.load(file)

In [85]:
# Query sent by scrape(); 'before' starts at the checkpointed cursor and is
# overwritten by scrape() after every saved page.
params={'size':'500', 
        'subreddit': SUBREDDIT, 
        'num_comments':'>10', 
        'before' : checkpoint['date']}

In [34]:
def scrape(params):
    """Page backwards through the submission search and save each page as a CSV.

    Every request asks for posts created before ``params['before']``. Each
    non-empty page is written to ``data/<SUBREDDIT><count>``; the cursor and
    the page counter are then stored in ``checkpoint.txt`` so an interrupted
    run can resume. The loop ends when a page comes back empty.

    Uses the notebook globals ``url``, ``columns``, ``SUBREDDIT`` and
    ``checkpoint``.

    Args:
        params: Query parameters for the search endpoint. ``params['before']``
            is overwritten in place as the cursor advances.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the API does not answer within 60 seconds.
    """
    count = checkpoint['count']

    while True:
        # A request without a timeout can hang the notebook indefinitely.
        response = requests.get(url, params=params, timeout=60)
        print('Status code: ' + str(response.status_code))
        # Fail with the HTTP error itself rather than on a body without 'data'.
        response.raise_for_status()
        posts = response.json()['data']
        length = len(posts)
        print('Data length: ' + str(length))

        if length == 0:
            print('Scraping finished.')
            break

        # reindex keeps the fixed column layout (filled with NaN) even when a
        # field is absent from every post on the page; df[columns] would raise.
        df = pd.DataFrame(posts).reindex(columns=columns)

        filename = SUBREDDIT + str(count)
        path = 'data/' + filename
        df.to_csv(path)
        print('File named: ' + filename + ' saved')

        count = count + 1
        checkpoint['count'] = count

        # Move the cursor to the oldest post of this page, whatever order the
        # page was returned in.
        checkpoint['date'] = min(post['created_utc'] for post in posts)
        params['before'] = checkpoint['date']

        with open('checkpoint.txt', 'w') as outfile:
            json.dump(checkpoint, outfile)

In [91]:
# Run (or resume) the post scrape; a first page of length 0 means there is
# nothing left before the checkpointed date.
scrape(params)

Status code: 200
Data length: 0


#### Merge all separate CSVs

In [188]:
def pull_csvs(subreddit, count):
    """Read the numbered chunk files of a subreddit into one DataFrame.

    Chunks are read from ``data/<subreddit><i>`` for ``i`` in ``0 .. count - 1``.
    Chunk 0 is always read, even when ``count`` is smaller than 1.

    Args:
        subreddit: Filename prefix of the chunk files.
        count: Number of chunk files to read.

    Returns:
        The chunks stacked in order. Each chunk keeps its own row index, so
        index labels repeat across chunks.
    """
    path = 'data/'
    # 'FortNiteBR' chunks 0-214 are looked up under a literal '\data\' name
    # prefix. The backslashes are escaped explicitly: '\d' is not a valid
    # escape sequence and only works by accident (with a warning).
    legacy_prefix = '\\data\\'

    frames = []
    for csv in range(max(count, 1)):
        filename = subreddit + str(csv)
        if subreddit == 'FortNiteBR' and csv <= 214:
            filename = legacy_prefix + filename
        frames.append(pd.read_csv(path + filename))

    # Concatenate once: growing the frame inside the loop re-copies all rows
    # read so far on every iteration.
    return pd.concat(frames)

In [189]:
def remove_csvs(subreddit, count, start=0):
    """Delete the chunk files ``data/<subreddit><i>`` for ``i`` in ``start .. count - 1``.

    Args:
        subreddit: Filename prefix of the chunk files.
        count: One past the last chunk number to delete.
        start: First chunk number to delete.

    Returns:
        A message stating how many files were removed.

    Raises:
        OSError: If one of the files cannot be removed (e.g. it does not exist).
    """
    path = 'data/'
    # 'FortNiteBR' chunks 0-214 are looked up under a literal '\data\' name
    # prefix; backslashes escaped explicitly ('\d' is not a valid escape).
    legacy_prefix = '\\data\\'

    removed = 0
    for csv in range(start, count):
        filename = subreddit + str(csv)
        if subreddit == 'FortNiteBR' and csv <= 214:
            filename = legacy_prefix + filename

        os.remove(path + filename)
        removed += 1

    # Report what was actually deleted; `count` alone is wrong when start > 0.
    return 'Removed ' + str(removed) + ' files'

In [22]:
# Merge the per-page post files (chunks 0-62 and 0-429; counts are hardcoded
# to what was scraped) into one frame per subreddit.
fortniteCompetitive_df = pull_csvs('FortniteCompetitive', 63)
fortniteBR_df = pull_csvs('FortNiteBR', 430)

# Persist the merged frames; these are the files loaded at the top of the notebook.
fortniteCompetitive_df.to_csv('data/fortniteCompetitive.csv')
fortniteBR_df.to_csv('data/fortniteBR.csv')

In [27]:
# Delete the per-page FortniteCompetitive files now that they are merged.
remove_csvs('FortniteCompetitive', 63)

'Removed 63 files'

In [28]:
# Delete the per-page FortNiteBR files now that they are merged.
remove_csvs('FortNiteBR', 430)

'Removed 430 files'

### Comment Scraper

In [141]:
# Comments endpoint; the post ID is passed through the 'article' query parameter.
url_comments = 'https://oauth.reddit.com/r/fortniteCompetitive/comments/article'
# Number of posts fetched concurrently per batch (one output file per batch).
BATCH_SIZE = 20
BATCH_NO = 0
# Comment fields copied into every output row by json2df().
COLUMNS = ['author', 'body', 'created_utc', 'id', 'parent_id', 'score', 'subreddit', 'permalink']

# No token is stored here: set_auth() writes a freshly requested
# 'Authorization' header into the dict it is given.
base_headers = {"User-Agent": "ChangeMeClient/0.1 by YourUsername"}
# Pass base_headers (not the not-yet-defined `headers`).
# NOTE: the get_auth / set_auth cells further down must have been run first.
headers = set_auth(base_headers)
# Query parameters shared by check_limit() and scrape_comment().
params = {
    'article':'',
    'context':8,
    'showedits':'True',
    'showmore':'False',
    'limit' : 1000,
    'sort':'confidence',
    'threaded':'True',
    'truncate':50
}

In [154]:
def get_checkpoint():
    """Return the comment-scrape checkpoint, creating the file on first use.

    Returns:
        The dict stored in ``comment_checkpoint.txt``; when that file does not
        exist yet, ``{'batch_no': 0}`` is written to it and returned.
    """
    if not os.path.exists('comment_checkpoint.txt'):
        # First run: start at batch 0 and persist that immediately.
        fresh = {'batch_no' : 0}
        with open('comment_checkpoint.txt', 'w') as sink:
            json.dump(fresh, sink)
        return fresh

    # Resume from the previously saved position.
    with open('comment_checkpoint.txt') as source:
        return json.load(source)

In [155]:
def get_auth():
    """Request an OAuth2 access token from Reddit with the password grant.

    Credentials are read from the environment so that no secret lives in the
    notebook: ``REDDIT_CLIENT_ID``, ``REDDIT_CLIENT_SECRET``,
    ``REDDIT_USERNAME`` and ``REDDIT_PASSWORD``.

    Returns:
        The ``access_token`` string of the token response.

    Raises:
        RuntimeError: If a credential variable is unset or empty, or the
            response carries no access token.
        requests.HTTPError: If the token endpoint answers with a 4xx/5xx status.
    """
    names = ('REDDIT_CLIENT_ID', 'REDDIT_CLIENT_SECRET', 'REDDIT_USERNAME', 'REDDIT_PASSWORD')
    missing = [name for name in names if not os.environ.get(name)]
    if missing:
        raise RuntimeError('Missing Reddit credentials; set environment variable(s): ' + ', '.join(missing))
    client_id, client_secret, username, password = (os.environ[name] for name in names)

    client_auth = requests.auth.HTTPBasicAuth(client_id, client_secret)
    post_data = {"grant_type": "password", "username": username, "password": password}
    headers = {"User-Agent": "ChangeMeClient/0.1 by YourUsername"}
    response = requests.post("https://www.reddit.com/api/v1/access_token", auth=client_auth, data=post_data, headers=headers, timeout=30)
    response.raise_for_status()
    r = response.json()
    # Surface a rejected login as a readable error instead of a bare KeyError.
    if 'access_token' not in r:
        raise RuntimeError('Reddit did not return an access token: ' + str(r.get('error', r)))
    return r['access_token']

In [156]:
def set_auth(headers):
    """Put a freshly requested bearer token into ``headers`` and return that same dict."""
    token = get_auth()
    headers.update({'Authorization': 'bearer ' + token})
    return headers

In [178]:
# Deliberately synchronous: this call is meant to block.
def check_limit(id):
    """Probe the comments endpoint and report the rate-limit state.

    Sends one request for post ``id`` (written into the shared ``params``) and
    reads the rate-limit headers of the reply.

    Returns:
        ``(remaining, reset)`` as floats, taken from the
        ``x-ratelimit-remaining`` and ``x-ratelimit-reset`` response headers.
    """
    params.update(article=id)
    reply = requests.get(url_comments, headers=headers, params=params)
    # Look both headers up before converting either of them.
    remaining, reset = (reply.headers[name] for name in ('x-ratelimit-remaining', 'x-ratelimit-reset'))
    return float(remaining), float(reset)

In [179]:
def json2df(response):
    """Flatten a comments response into a DataFrame with one row per comment.

    The first element of ``response`` (the post itself) is skipped; everything
    after it is walked breadth-first. Each node whose payload has a ``'body'``
    key contributes one row made of the ``COLUMNS`` fields. Nested comments are
    reached through a payload's ``'children'`` list or through
    ``payload['replies']['data']['children']``.

    Args:
        response: Parsed JSON of the comments endpoint (a list). Not modified.

    Returns:
        A DataFrame of the collected comments in breadth-first order; empty
        when no comment was found.
    """
    queue = list(response[1:])  # everything after the post entry
    comments = []

    # Walk the list by position instead of pop(0): no O(n) shift per item, and
    # the list may keep growing while it is being read.
    position = 0
    while position < len(queue):
        node = queue[position]
        position += 1

        # Skip anything that is not a {'data': {...}} wrapper (e.g. a bare ID
        # string in a children list) instead of consuming the next entry in its
        # place, which would lose that entry or fail on an exhausted queue.
        comment = node.get('data') if isinstance(node, dict) else None
        if not isinstance(comment, dict):
            continue

        # append new comment as a dict to list
        if 'body' in comment:
            comments.append({k: comment[k] for k in COLUMNS})

        # queue children / replies of the current node
        if 'children' in comment:
            queue.extend(comment['children'])
        elif 'replies' in comment:
            replies = comment['replies']
            if replies:  # empty (falsy) when the comment has no replies
                queue.extend(replies['data']['children'])
        else:
            print('error')

    return pd.DataFrame(comments)

In [180]:
async def scrape_comment(id, session):
    """Fetch the comments of one post and return them as a DataFrame.

    Args:
        id: Post ID, sent as the ``article`` query parameter.
        session: Open aiohttp client session used for the request.

    Returns:
        The result of ``json2df`` applied to the decoded JSON response.

    Raises:
        aiohttp.ClientResponseError: On a 4xx/5xx status (from ``raise_for_status``).
    """
    # Work on a private copy: handle_batch() runs many of these concurrently,
    # and writing the ID into the shared ``params`` dict would let one
    # coroutine overwrite another's article before its request is built.
    request_params = dict(params, article=id)
    async with session.get(url_comments, headers=headers, params=request_params) as resp:
        resp.raise_for_status()
        response = await resp.json()
        return json2df(response)

In [184]:
async def handle_batch(batch):
    """Scrape the comments of every post in ``batch`` concurrently.

    The remaining rate-limit budget is probed first with ``check_limit``; when
    fewer than ``BATCH_SIZE`` requests remain, the coroutine waits for the
    reported reset time before sending anything.

    Args:
        batch: Iterable of post IDs (must not be empty).

    Returns:
        One DataFrame holding the comments of all posts in the batch.
    """
    # Fixed post ID, used only to obtain a response carrying the rate-limit headers.
    limit, time_remaining = check_limit('8yrb5e')
    if limit < BATCH_SIZE:
        print('API LIMIT REACHED. \nSleeping for: ' + str(time_remaining))
        # Wait without freezing the event loop (time.sleep would block it).
        await asyncio.sleep(time_remaining)

    async with ClientSession() as session:
        requests_in_flight = [scrape_comment(post_id, session) for post_id in batch]
        frames = await asyncio.gather(*requests_in_flight)

    return pd.concat(frames)

In [185]:
async def get_all_comments(posts):
    """Scrape the comments of every post in ``posts``, writing one CSV per batch.

    Posts are processed in batches of ``BATCH_SIZE``; batch ``i`` is saved to
    ``data/<subreddit>Comments<i>``. After each batch the position is written
    to ``comment_checkpoint.txt`` so that an interrupted run resumes there.

    Args:
        posts: DataFrame with at least the columns ``'id'`` and ``'subreddit'``;
            the subreddit name is taken from the first row.

    Returns:
        None. Nothing is done for an empty ``posts``.
    """
    if len(posts) == 0:
        return

    ids = posts['id']
    # Positional access: after pd.concat the label 0 may occur many times (or
    # not at all), so posts['subreddit'][0] need not be a single string.
    subreddit = posts['subreddit'].iloc[0]
    checkpoint = get_checkpoint()

    # The checkpoint file is shared by all subreddits. If it was written for a
    # different one, start over instead of silently skipping batches. A file
    # without a 'subreddit' entry is trusted as belonging to this run.
    if checkpoint.get('subreddit', subreddit) != subreddit:
        checkpoint = {'batch_no': 0}
    checkpoint['subreddit'] = subreddit

    #setup filename structure for saved csvs
    path = 'data/' + subreddit + 'Comments'

    #iterate through posts to get all comments
    x, y = checkpoint['batch_no'], math.ceil(len(ids) / BATCH_SIZE)
    for i in tqdm(range(x, y)):
        lo = i * BATCH_SIZE
        hi = min(len(ids), BATCH_SIZE * (i + 1))
        batch = ids.iloc[lo:hi]

        df = await handle_batch(batch)
        df.to_csv(path + str(i))

        # Persist progress only after the batch file has been written.
        checkpoint['batch_no'] = i + 1
        with open('comment_checkpoint.txt', 'w') as outfile:
            json.dump(checkpoint, outfile)

In [186]:
# Scrape (or resume scraping) the comments of all FortniteCompetitive posts.
await get_all_comments(fortniteCompetitive_df)

HBox(children=(IntProgress(value=0, max=31), HTML(value='')))

API LIMIT REACHED. 
 Sleeping for: 230.0
API LIMIT REACHED. 
 Sleeping for: 542.0


In [45]:
# Scrape the comments of all FortNiteBR posts.
# NOTE(review): url_comments is hardcoded to r/fortniteCompetitive and the
# checkpoint file is shared with the run above — confirm both before running.
await get_all_comments(fortniteBR_df)

<coroutine object get_all_comments at 0x1114eb448>

#### Handling CSVs

In [190]:
# Merge the per-batch comment files (batches 0-1574; count is hardcoded),
# save the merged frame, then delete the per-batch files.
fortniteCompetitiveComments_df = pull_csvs('FortniteCompetitiveComments', 1575)
fortniteCompetitiveComments_df.to_csv('data/fortniteCompComments')
remove_csvs('FortniteCompetitiveComments', 1575)

'Removed 1575 files'

In [191]:
# Total number of comment rows after merging.
len(fortniteCompetitiveComments_df)

843019

In [195]:
# Inspect the parent_id column of the merged comments. The index restarts
# within the frame because pull_csvs keeps each batch file's own row index.
fortniteCompetitiveComments_df['parent_id']

0       t3_c5vyi4
1       t3_c5vyi4
2       t3_c5vyi4
3       t3_c5vyi4
4       t3_c5vyi4
5       t3_c5vyi4
6       t3_c5vyi4
7       t3_c5vyi4
8       t3_c5vyi4
9       t3_c5vyi4
10     t1_es4jmgr
11     t1_es4k7vi
12     t1_es4k7vi
13     t1_es4fe68
14     t1_es4rq13
15     t1_es4rq13
16     t1_es5xcqh
17     t1_es4ffxu
18     t1_es4fmjy
19     t1_es4jsgi
20     t1_es4kq41
21      t3_c5vaq9
22      t3_c5vaq9
23      t3_c5vaq9
24      t3_c5vaq9
25      t3_c5vaq9
26      t3_c5vaq9
27     t1_es4bamp
28     t1_es4aslr
29     t1_es4dwjw
          ...    
530    t1_e08do8m
531    t1_e08do8m
532    t1_e08g932
533    t1_e08f4ke
534    t1_e099vhr
535    t1_e09bpke
536    t1_e08j9tt
537    t1_e08ipk8
538    t1_e09djgi
539    t1_e08fvkf
540    t1_e08fvkf
541    t1_e09axlj
542    t1_e08tlun
543    t1_e08gp1r
544    t1_e09au68
545    t1_e08j68p
546    t1_e08j68p
547     t3_8p3q6f
548     t3_8p3q6f
549     t3_8p3q6f
550     t3_8p3q6f
551     t3_8p3q6f
552     t3_8p3q6f
553     t3_8p3q6f
554     t3