In [1]:
import os 
import math
import json 
import requests 
import asyncio
import aiohttp
import aiofiles
import pandas as pd
from tqdm import tqdm_notebook as tqdm

In [2]:
fortniteCompetitive_df = pd.read_csv('data/fortniteCompetitive.csv')
fortniteBR_df = pd.read_csv('data/fortniteBR.csv')

### Post Scraper

In [82]:
SUBREDDIT = 'FortniteCompetitive'
url = 'https://api.pushshift.io/reddit/search/submission/'
columns = ['author', 'created_utc', 'id', 'num_comments', 'permalink', 'score', 'title', 'selftext', 'subreddit']
count = 0

In [83]:
# Uncomment if starting from the beginning
# checkpoint = {'date' : 1561584036, 
#               'count': count}

# with open('checkpoint.txt', 'w') as outfile:
#     json.dump(checkpoint, outfile)

In [84]:
with open('checkpoint.txt') as file: 
    checkpoint = json.load(file)

In [85]:
params={'size':'500', 
        'subreddit': SUBREDDIT, 
        'num_comments':'>10', 
        'before' : checkpoint['date']}

In [34]:
def scrape(params):
    response = [1]
    count = checkpoint['count']
    
    while True:
        response = requests.get(url, params=params)
        print('Status code: ' + str(response.status_code))
        response = response.json()
        length = len(response['data'])
        print('Data length: ' + str(length))
        
        if length == 0: 
            print('Scraping finished.')
            break
        
        df = pd.DataFrame(response['data'])
        df = df[columns]
        
        filename = SUBREDDIT + str(count)
        path = 'data/' + filename 
        df.to_csv(path)
        print('File named: ' + filename + ' saved')
        
        count = count + 1
        checkpoint['count'] = count
        
        earliest = length  - 1
        checkpoint['date'] = response['data'][earliest]['created_utc']
        params['before'] = checkpoint['date']
        
        with open('checkpoint.txt', 'w') as outfile:
            json.dump(checkpoint, outfile)

In [91]:
scrape(params)

Status code: 200
Data length: 0


#### Merge all separate CSVs

In [21]:
def pull_csvs(subreddit, count): 
    path = 'data/'    
    filename = subreddit + str(0)
    
    if subreddit == 'FortNiteBR': 
        filename = '\data\\' + filename 
        
    master = pd.read_csv(path + filename)
    
    for csv in range(1, count): 
        filename = subreddit + str(csv)
        if (subreddit == 'FortNiteBR') & (csv <= 214): 
            filename = '\data\\' + filename 
        
        df = pd.read_csv(path + filename)
        master = pd.concat([master, df])

    return master

In [26]:
def remove_csvs(subreddit, count): 
    path = 'data/' 
    
    for csv in range(0, count): 
        filename = subreddit + str(csv)
        
        if (subreddit == 'FortNiteBR') & (csv <= 214): 
            filename = '\data\\' + filename 
        
        os.remove(path + filename)
        
    return 'Removed ' + str(count) + ' files'

In [22]:
fortniteCompetitive_df = pull_csvs('FortniteCompetitive', 63)
fortniteBR_df = pull_csvs('FortNiteBR', 430)

fortniteCompetitive_df.to_csv('data/fortniteCompetitive.csv')
fortniteBR_df.to_csv('data/fortniteBR.csv')

In [27]:
remove_csvs('FortniteCompetitive', 63)

'Removed 63 files'

In [28]:
remove_csvs('FortNiteBR', 430)

'Removed 430 files'

### Comment Scraper

In [7]:
url_ids = 'https://api.pushshift.io/reddit/submission/comment_ids/'
url_comments = 'https://api.pushshift.io/reddit/comment/search'
BATCH_SIZE = 50
BATCH_NO = 0
COLUMNS = ['author', 'body', 'created_utc', 'id', 'parent_id', 'score', 'subreddit', 'permalink']

In [202]:
async def scrape_comments(id, session):  
    ids = requests.get(url_ids + id).json()['data']
    comments_df = pd.DataFrame(columns=COLUMNS)
    length = len(ids)
    
    for i in range(math.ceil(length / 500)):  
        lo = i * 500
        hi = min(length, 500 * (i + 1)) 

        ids = ids[lo:hi]
        ids = list_to_string(ids)
        
        params = {'ids' : ids}

        async with session.get(url_comments, params=params) as resp:
            resp.raise_for_status()
            comments = (await resp.json())['data']
            df = pd.DataFrame(comments)
            df = df[COLUMNS]
            comments_df = pd.concat([comments_df, df])

    return comments_df

In [206]:
async def handle_batch(batch):        
    async with ClientSession() as session:
        dfs = []
        
        for id in tqdm(batch): 
            print(id)
            dfs.append(scrape_comments(id, session))
        
        master = await asyncio.gather(*dfs)
        df = pd.concat(master)
    
    return df

In [207]:
async def get_all_comments(posts): 
    ids = posts['id']
    subreddit = posts['subreddit'][0]
    
    #resume scraping from prev. location or start new 
    if (os.path.exists('comment_checkpoint.txt')): 
        with open('comment_checkpoint.txt') as file: 
            checkpoint = json.load(file)
    else: 
        checkpoint = {'batch_no' : 0}
        with open('comment_checkpoint.txt', 'w') as outfile: 
            json.dump(checkpoint, outfile)
    
    #setup filename structure for saved csvs
    path = 'data/' + subreddit +'Comments'
    
    #iterate through posts to get all comments
    x, y = checkpoint['batch_no'], math.ceil(len(ids) / BATCH_SIZE)
    for i in range(x, y): 
        lo = i * BATCH_SIZE
        hi = min(len(ids), BATCH_SIZE * (i + 1)) 
        batch = ids[lo:hi]
        print('why')
        df = await handle_batch(batch)
        print('done')
        df.to_csv(path + str(i))
        print('Saved: ' + path + str(i))
        
        checkpoint['batch_no'] = i + 1
        with open('comment_checkpoint.txt', 'w') as outfile: 
            json.dump(checkpoint, outfile)

In [71]:
get_all_comments(fortniteBR_df)

<coroutine object get_all_comments at 0x1216b17c8>

In [192]:
def list_to_string(x):
    returnString = ''
    for item in x: 
        returnString = returnString + str(item) + ','
        
    return returnString[:-1]
        

In [3]:
import requests.auth
client_auth = requests.auth.HTTPBasicAuth('NELdoctFZ_tqVw', 'JgSF4SwJdrCsnuOSggxFoMIbzsA')
post_data = {"grant_type": "password", "username": "jeromeco", "password": "Skicat12"}
headers = {"User-Agent": "ChangeMeClient/0.1 by YourUsername"}
response = requests.post("https://www.reddit.com/api/v1/access_token", auth=client_auth, data=post_data, headers=headers)
response.json()

{'access_token': '72466118-Gf3siQQrQKpEQG0UIt2gNEXmMlI',
 'token_type': 'bearer',
 'expires_in': 3600,
 'scope': '*'}

In [4]:
headers = {"Authorization": "bearer 72466118-Gf3siQQrQKpEQG0UIt2gNEXmMlI", "User-Agent": "ChangeMeClient/0.1 by YourUsername"}

In [5]:
params = {
    'article':'9jb0y4',
    'context':8,
    'showedits':True,
    'showmore':False,
    'limit' : 1000,
    'sort':'confidence',
    'threaded':True,
    'truncate':50
}
response = requests.get("https://oauth.reddit.com/r/fortniteCompetitive/comments/article", headers=headers, params=params)

In [23]:
r = response.json()
COLUMNS = ['author', 'body', 'created_utc', 'id', 'parent_id', 'score', 'subreddit', 'permalink']

In [35]:
def json2df(response): 
    comment_queue = response[:] 
    post = comment_queue.pop(0) # Seed with top-level
    comments = []

    while comment_queue:
        #get comment of queue
        try : 
            comment = comment_queue.pop(0)
            comment = comment['data']
        except: 
            comment = comment_queue.pop(0)

        #append new comment as a dict to list 
        if 'body' in comment:
            new_comment = {k: comment[k] for k in COLUMNS}    
            comments.append(new_comment)

        #get children / replies of current comment
        if 'children' in comment: 
            comment = comment['children']
            comment_queue.extend(comment)
        elif 'replies' in comment: 
            if len(comment['replies']) > 0: 
                comment = comment['replies']['data']['children']
                comment_queue.extend(comment)
        else: 
            print('error')
    
    return pd.DataFrame(comments)

In [36]:
json2df(r)

Unnamed: 0,author,body,created_utc,id,parent_id,permalink,score,subreddit
0,SETTLEDOWNSIR,The worst part is how a build fight against a ...,1538037000.0,e6q0mzg,t3_9jb0y4,/r/FortniteCompetitive/comments/9jb0y4/bouncer...,472,FortniteCompetitive
1,larrylime,Bouncers were possibly my favorite non-weapon ...,1538043000.0,e6q3amj,t3_9jb0y4,/r/FortniteCompetitive/comments/9jb0y4/bouncer...,320,FortniteCompetitive
2,ALLST6R,"And if you end up too high, you have a lot les...",1538051000.0,e6q880k,t1_e6q0mzg,/r/FortniteCompetitive/comments/9jb0y4/bouncer...,70,FortniteCompetitive
3,pudniskool,Literally the reason they were vaulted,1538056000.0,e6qdoql,t1_e6q0mzg,/r/FortniteCompetitive/comments/9jb0y4/bouncer...,110,FortniteCompetitive
4,kikkansson,isn't it a further skill incentive that you mi...,1538061000.0,e6qiy8m,t1_e6q0mzg,/r/FortniteCompetitive/comments/9jb0y4/bouncer...,9,FortniteCompetitive
5,deanresin,That is the best part. The game is best the m...,1538064000.0,e6qmudu,t1_e6q0mzg,/r/FortniteCompetitive/comments/9jb0y4/bouncer...,5,FortniteCompetitive
6,[deleted],[deleted],1538047000.0,e6q5ozx,t1_e6q0mzg,/r/FortniteCompetitive/comments/9jb0y4/bouncer...,2,FortniteCompetitive
7,eaglessoar,&gt; nothing else to prevent fall damage\n\nSh...,1538086000.0,e6rffjd,t1_e6q0mzg,/r/FortniteCompetitive/comments/9jb0y4/bouncer...,1,FortniteCompetitive
8,DiamondHyena,ya man that's the point. You should be punishe...,1538155000.0,e6t2pq3,t1_e6q0mzg,/r/FortniteCompetitive/comments/9jb0y4/bouncer...,1,FortniteCompetitive
9,LeoPier0,And what if you’re the good player of the situ...,1538042000.0,e6q327t,t1_e6q0mzg,/r/FortniteCompetitive/comments/9jb0y4/bouncer...,-16,FortniteCompetitive
