In [53]:
import os 
import math
import json 
import requests 
import requests.auth
import asyncio
import aiohttp
from aiohttp import ClientSession
import aiofiles
import pandas as pd
from tqdm import tqdm_notebook as tqdm

In [2]:
fortniteCompetitive_df = pd.read_csv('data/fortniteCompetitive.csv')
fortniteBR_df = pd.read_csv('data/fortniteBR.csv')

### Post Scraper

In [82]:
SUBREDDIT = 'FortniteCompetitive'
url = 'https://api.pushshift.io/reddit/search/submission/'
columns = ['author', 'created_utc', 'id', 'num_comments', 'permalink', 'score', 'title', 'selftext', 'subreddit']
count = 0

In [83]:
# Uncomment if starting from the beginning
# checkpoint = {'date' : 1561584036, 
#               'count': count}

# with open('checkpoint.txt', 'w') as outfile:
#     json.dump(checkpoint, outfile)

In [84]:
with open('checkpoint.txt') as file: 
    checkpoint = json.load(file)

In [85]:
params={'size':'500', 
        'subreddit': SUBREDDIT, 
        'num_comments':'>10', 
        'before' : checkpoint['date']}

In [34]:
def scrape(params):
    response = [1]
    count = checkpoint['count']
    
    while True:
        response = requests.get(url, params=params)
        print('Status code: ' + str(response.status_code))
        response = response.json()
        length = len(response['data'])
        print('Data length: ' + str(length))
        
        if length == 0: 
            print('Scraping finished.')
            break
        
        df = pd.DataFrame(response['data'])
        df = df[columns]
        
        filename = SUBREDDIT + str(count)
        path = 'data/' + filename 
        df.to_csv(path)
        print('File named: ' + filename + ' saved')
        
        count = count + 1
        checkpoint['count'] = count
        
        earliest = length  - 1
        checkpoint['date'] = response['data'][earliest]['created_utc']
        params['before'] = checkpoint['date']
        
        with open('checkpoint.txt', 'w') as outfile:
            json.dump(checkpoint, outfile)

In [91]:
scrape(params)

Status code: 200
Data length: 0


#### Merge all separate CSVs

In [21]:
def pull_csvs(subreddit, count): 
    path = 'data/'    
    filename = subreddit + str(0)
    
    if subreddit == 'FortNiteBR': 
        filename = '\data\\' + filename 
        
    master = pd.read_csv(path + filename)
    
    for csv in range(1, count): 
        filename = subreddit + str(csv)
        if (subreddit == 'FortNiteBR') & (csv <= 214): 
            filename = '\data\\' + filename 
        
        df = pd.read_csv(path + filename)
        master = pd.concat([master, df])

    return master

In [73]:
def remove_csvs(subreddit, count): 
    path = 'data/' 
    
    for csv in range(0, count): 
        filename = subreddit + str(csv)
        
        if (subreddit == 'FortNiteBR') & (csv <= 214): 
            filename = '\data\\' + filename 
        
        os.remove(path + filename)
        
    return 'Removed ' + str(count) + ' files'

In [22]:
fortniteCompetitive_df = pull_csvs('FortniteCompetitive', 63)
fortniteBR_df = pull_csvs('FortNiteBR', 430)

fortniteCompetitive_df.to_csv('data/fortniteCompetitive.csv')
fortniteBR_df.to_csv('data/fortniteBR.csv')

In [27]:
remove_csvs('FortniteCompetitive', 63)

'Removed 63 files'

In [28]:
remove_csvs('FortNiteBR', 430)

'Removed 430 files'

### Comment Scraper

In [66]:
url_comments = 'https://oauth.reddit.com/r/fortniteCompetitive/comments/article'
BATCH_SIZE = 20
BATCH_NO = 0
COLUMNS = ['author', 'body', 'created_utc', 'id', 'parent_id', 'score', 'subreddit', 'permalink']
headers = {"Authorization": "bearer 72466118-BGb8Kd7V0GF973kFmdLR0aepCdY", "User-Agent": "ChangeMeClient/0.1 by YourUsername"}
params = {
    'article':'',
    'context':8,
    'showedits':'True',
    'showmore':'False',
    'limit' : 1000,
    'sort':'confidence',
    'threaded':'True',
    'truncate':50
}

In [63]:
def get_auth():
    client_auth = requests.auth.HTTPBasicAuth('NELdoctFZ_tqVw', 'JgSF4SwJdrCsnuOSggxFoMIbzsA')
    post_data = {"grant_type": "password", "username": "jeromeco", "password": "Skicat12"}
    headers = {"User-Agent": "ChangeMeClient/0.1 by YourUsername"}
    response = requests.post("https://www.reddit.com/api/v1/access_token", auth=client_auth, data=post_data, headers=headers)
    r = response.json()
    return r['access_token']

In [64]:
print(get_auth())

72466118-BGb8Kd7V0GF973kFmdLR0aepCdY


In [59]:
async def scrape_comment(id, session): 
    params['article'] = id
    async with session.get(url_comments, headers=headers, params=params) as resp:
        resp.raise_for_status()
        response = await resp.json()
        return json2df(response)

In [60]:
async def handle_batch(batch):        
    async with ClientSession() as session:
        dfs = []
        
        for id in tqdm(batch): 
            dfs.append(scrape_comment(id, session))
        
        master = await asyncio.gather(*dfs)
        df = pd.concat(master)
    
    return df

In [61]:
async def get_all_comments(posts): 
    ids = posts['id']
    subreddit = posts['subreddit'][0]
    
    #resume scraping from prev. location or start new 
    if (os.path.exists('comment_checkpoint.txt')): 
        with open('comment_checkpoint.txt') as file: 
            checkpoint = json.load(file)
    else: 
        checkpoint = {'batch_no' : 0}
        with open('comment_checkpoint.txt', 'w') as outfile: 
            json.dump(checkpoint, outfile)
    
    #setup filename structure for saved csvs
    path = 'data/' + subreddit +'Comments'
    
    #iterate through posts to get all comments
    x, y = checkpoint['batch_no'], math.ceil(len(ids) / BATCH_SIZE)
    for i in range(x, y): 
        lo = i * BATCH_SIZE
        hi = min(len(ids), BATCH_SIZE * (i + 1)) 
        batch = ids[lo:hi]
        df = await handle_batch(batch)
        print('done')
        df.to_csv(path + str(i))
        print('Saved: ' + path + str(i))
        
        checkpoint['batch_no'] = i + 1
        with open('comment_checkpoint.txt', 'w') as outfile: 
            json.dump(checkpoint, outfile)

In [75]:
await get_all_comments(fortniteCompetitive_df)

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

c4tv2u
c4tofk
c4tee9
c4sn9n
c4sc2j
c4s9sj
c4rvop
c4rryu
c4rmcr
c4quzn
c4qmnx
c4qm56
c4q8bu
c4p5kt
c4okco
c4oe8d
c4oc1h
c4ntsz
c4nmgx
c4n322

done
Saved: data/FortniteCompetitiveComments12


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

c4m3k3
c4lxce
c4lsiq
c4lo2r
c4liov
c4lhav
c4kn1g
c4kfqr
c4kf03
c4k0pl
c4jomn
c4jo4l
c4jjnv
c4jfaz
c4j75h
c4j5nv
c4j139
c4iowe
c4ikb9
c4if9u

done
Saved: data/FortniteCompetitiveComments13


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

c4i4jd
c4hz7n
c4hpie
c4hhbj
c4h5qw
c4h4bb
c4gla6
c4fokx
c4fluw
c4fkbs
c4fjy9
c4ez6l
c4elm1
c4egqc
c4ecfp
c4eafu
c4e34v
c4dwyh
c4dcai
c4d2ft

done
Saved: data/FortniteCompetitiveComments14


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

c4cpoy
c4cngj
c4cjjg
c4ccb8
c4c1il
c4buz4
c4bo0f
c4bc6y
c4b7pu
c4b4wd
c4b3dv
c4b01h
c4apjc
c4a858
c4a592
c4a3kg
c49zic
c49ydi
c49kcj
c49k4h

done
Saved: data/FortniteCompetitiveComments15


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

c49d4p
c493hg
c48x2u
c48luo
c48jmt
c48j0b
c48dtf
c48bcs
c48723
c4814x
c47x6h
c47prw
c47ai3
c479lm
c476o1
c472ap
c46uyq
c46rq5
c46qqc
c46hpn

done
Saved: data/FortniteCompetitiveComments16


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

c46emc
c45tkg
c45t82
c45ls5
c45gfe
c45foz
c44ydv
c44mn7
c43wz8
c43s3w
c43puf
c43n73
c43ltk
c43fwb
c43eec
c434zk
c42xao
c42c9f
c428dj
c421lf

done
Saved: data/FortniteCompetitiveComments17


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

c4206a
c41tq1
c417zb
c414bo
c413wh
c413ax
c40qce
c40o4z
c40nn5
c40d5q
c3zm06
c3zfsq
c3zb2t
c3zavr
c3z9u0
c3yfj5
c3y3a1
c3xji7
c3xcss
c3x90e

done
Saved: data/FortniteCompetitiveComments18


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

c3x4zq
c3x1qb
c3x1pm
c3wyav
c3wwvc
c3ww8x
c3w7ez
c3w3q0
c3vzr7
c3vxbc
c3vpm6
c3v8nl
c3v5tn
c3v5by
c3v1gt
c3v0ri
c3utms
c3uo8i
c3um6v
c3uka1



CancelledError: 

In [45]:
await get_all_comments(fortniteBR_df)

<coroutine object get_all_comments at 0x1114eb448>

In [192]:
def list_to_string(x):
    returnString = ''
    for item in x: 
        returnString = returnString + str(item) + ','
        
    return returnString[:-1]
        

In [3]:
import requests.auth
client_auth = requests.auth.HTTPBasicAuth('NELdoctFZ_tqVw', 'JgSF4SwJdrCsnuOSggxFoMIbzsA')
post_data = {"grant_type": "password", "username": "jeromeco", "password": "Skicat12"}
headers = {"User-Agent": "ChangeMeClient/0.1 by YourUsername"}
response = requests.post("https://www.reddit.com/api/v1/access_token", auth=client_auth, data=post_data, headers=headers)
response.json()

{'access_token': '72466118-Gf3siQQrQKpEQG0UIt2gNEXmMlI',
 'token_type': 'bearer',
 'expires_in': 3600,
 'scope': '*'}

In [4]:
headers = {"Authorization": "bearer 72466118-Gf3siQQrQKpEQG0UIt2gNEXmMlI", "User-Agent": "ChangeMeClient/0.1 by YourUsername"}

In [5]:
params = {
    'article':'9jb0y4',
    'context':8,
    'showedits':True,
    'showmore':False,
    'limit' : 1000,
    'sort':'confidence',
    'threaded':True,
    'truncate':50
}
response = requests.get("https://oauth.reddit.com/r/fortniteCompetitive/comments/article", headers=headers, params=params)

In [23]:
r = response.json()
COLUMNS = ['author', 'body', 'created_utc', 'id', 'parent_id', 'score', 'subreddit', 'permalink']

In [35]:
def json2df(response): 
    comment_queue = response[:] 
    post = comment_queue.pop(0) # Seed with top-level
    comments = []

    while comment_queue:
        #get comment of queue
        try : 
            comment = comment_queue.pop(0)
            comment = comment['data']
        except: 
            comment = comment_queue.pop(0)

        #append new comment as a dict to list 
        if 'body' in comment:
            new_comment = {k: comment[k] for k in COLUMNS}    
            comments.append(new_comment)

        #get children / replies of current comment
        if 'children' in comment: 
            comment = comment['children']
            comment_queue.extend(comment)
        elif 'replies' in comment: 
            if len(comment['replies']) > 0: 
                comment = comment['replies']['data']['children']
                comment_queue.extend(comment)
        else: 
            print('error')
    
    return pd.DataFrame(comments)