# AVISO: Existe uma versão 3. Atualizar.

In [None]:
# Mount Google Drive at /content/drive; the BASEDIR path configured in the
# next cell lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Root of the shared project folder on the mounted Drive.
BASEDIR = '/content/drive/My Drive/MentalHealthShared/'

# Directory where the pickled caches of this notebook are read and written.
DATADIR = BASEDIR + 'data/no_thx_2017/'
# Alternative data directories (uncomment one to switch):
#DATADIR=BASEDIR+'data/'
# DATADIR=BASEDIR+'data/data_2020/'

In [None]:
!pip install -q ratelimit

In [None]:
import requests
import datetime
from timeit import default_timer
import pickle
import itertools
import asyncio
from concurrent.futures import ThreadPoolExecutor
import random
import time
from ratelimit import limits, sleep_and_retry
import os

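First we will get all Reddit submissions made to a given subreddit within the chosen date range. Each request returns only a limited number of results (we ask for up to 500), so we page through them in ascending order of creation time: every new request asks for the submissions created after the last one we received. For each submission we keep its id, author, text (`selftext`), creation time and number of comments.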

In [None]:
# Time window of the crawl as epoch seconds: [2017-01-01, 2018-01-01).
# NOTE(review): these are naive datetimes, so .timestamp() interprets them in
# the runtime's local timezone -- confirm that is the intended boundary.
dt_curr = int(datetime.datetime(2017, 1, 1, 0, 0).timestamp())
dt_last = int(datetime.datetime(2018, 1, 1, 0, 0).timestamp())

# Pushshift submission search endpoint (the stray leading space was removed).
post_endpoint = 'https://api.pushshift.io/reddit/search/submission/'

# Query parameters shared by every page request. 'after' is advanced by the
# crawl loop as pages are consumed; 'subreddit' is added by the crawl cell.
post_parameters = {
    'after': dt_curr,
    'before': dt_last,
    'fields': ('author','selftext','id','created_utc','num_comments'),
    'size': 500,
    'sort': 'asc'
}

In [None]:
# subreddit = 'whatisthisthing'
# outname=DATADIR + subreddit+'_post2data_no_thx_2017.pkl'
# if os.path.exists(outname):
#   with open(outname,'rb') as infile:
#     post2data = pickle.load(infile)

In [None]:
print('Collecting posts...')

# Subreddit to crawl; it is also used to build the cache file names.
subreddit = 'movies'
post_parameters['subreddit'] = subreddit

# Maps submission id -> remaining submission fields.
post2data = {}
# Size of the last page received; the crawl loop below keeps going while it is 100.
nresults = 100


@sleep_and_retry
@limits(calls=6, period=4)
def fetch(session, post_parameters):
    """Request one page of submissions, retrying on non-200 responses.

    Args:
        session: a ``requests.Session`` used to issue the GET request.
        post_parameters: dict of query parameters sent to ``post_endpoint``;
            its ``'after'`` entry is only read here, for the error message.

    Returns:
        The first response whose status code is 200.

    Raises:
        RuntimeError: if none of the attempts returned status code 200
            (previously the function fell through and returned ``None``).
    """
    max_attempts = 100
    for attempt in range(max_attempts):
        response = session.get(post_endpoint, params=post_parameters)
        if response.status_code == 200:
            return response
        print('Error at timestamp {}. Retrying...'.format(post_parameters['after']))
        # The retries happen inside the rate-limited call, so the decorator
        # does not throttle them; back off briefly, as the other fetch
        # helpers in this notebook do.
        if attempt < max_attempts - 1:
            time.sleep(2 * random.random())
    raise RuntimeError(
        'Giving up after {} failed attempts (last status code {}).'.format(
            max_attempts, response.status_code))

# Cache file for this subreddit's submissions; delete it to force a new crawl.
outname = DATADIR + subreddit + '_post2data_no_thx_2017.pkl'
if os.path.exists(outname):
    # NOTE: pickle.load can execute arbitrary code; only load cache files
    # that were written by this notebook.
    with open(outname, 'rb') as infile:
        post2data = pickle.load(infile)
else:
    # Always start from the beginning of the window. 'after' is advanced in
    # place below, so without this reset a rerun (e.g. for another subreddit)
    # would start at the end of the previous crawl and cache an empty result.
    post_parameters['after'] = dt_curr

    # One session for the whole crawl so the connection is reused.
    with requests.Session() as session:
        while True:
            print('Getting posts created after', datetime.datetime.fromtimestamp(post_parameters['after']))
            response = fetch(session, post_parameters)
            if response is None:
                raise RuntimeError('Could not fetch posts after timestamp {}'.format(post_parameters['after']))

            # Parse the payload once.
            posts = response.json()['data']
            nresults = len(posts)
            # Stop on an empty page instead of assuming a fixed page size:
            # the request asks for 500 results, so comparing against 100
            # would end the crawl early whenever the server honours 'size'.
            if nresults == 0:
                break

            for post in posts:
                post2data[post.pop('id')] = post

            # Results are sorted ascending, so the last post is the newest.
            next_after = posts[-1]['created_utc']
            if next_after <= post_parameters['after']:
                # No progress was made; stop rather than loop forever.
                break
            post_parameters['after'] = next_after

    with open(outname, 'wb') as outfile:
        pickle.dump(post2data, outfile)

Collecting posts...
Getting posts created after 2017-01-01 00:00:00
Getting posts created after 2017-01-01 04:55:13
Getting posts created after 2017-01-01 08:50:47
Getting posts created after 2017-01-01 12:08:37
Getting posts created after 2017-01-01 16:05:54
Getting posts created after 2017-01-01 18:47:23
Getting posts created after 2017-01-01 22:08:04
Getting posts created after 2017-01-02 01:23:25
Getting posts created after 2017-01-02 04:53:01
Getting posts created after 2017-01-02 08:14:53
Getting posts created after 2017-01-02 11:52:08
Getting posts created after 2017-01-02 14:52:12
Getting posts created after 2017-01-02 17:40:06
Getting posts created after 2017-01-02 20:11:15
Getting posts created after 2017-01-02 23:11:06
Getting posts created after 2017-01-03 02:43:02
Getting posts created after 2017-01-03 06:25:01
Getting posts created after 2017-01-03 09:41:19
Getting posts created after 2017-01-03 12:26:14
Getting posts created after 2017-01-03 14:48:53
Getting posts create

Now we will collect the ids of the comments associated with each post.

In [None]:
import time
# Endpoint that lists the comment ids of one submission (the submission id is appended).
base_url = 'https://api.pushshift.io/reddit/submission/comment_ids/'

# Only submissions that report at least one comment need to be queried.
post_ids = [pid for pid, data in post2data.items() if data['num_comments'] > 0]
nposts = len(post_ids)

# Maps submission id -> list of its comment ids.
post2comments = {}


In [None]:
# def fetch(session, i):
#     post_id = post_ids[i]
#     if i%10 == 0:
#         print('(Elapsed {}s) Processing post # {} of {}'.format(int(default_timer()-start_time),i,nposts) )
#     for attempts in range(100):
#         response = session.get(base_url+post_id)
#         if response.status_code == 200:
#             break
#         else:
#             print('Error (too many requests). Retrying after some random amount of time.')
#             time.sleep(random.random())
            
#     return post_id,response
# start_time = default_timer()

In [None]:
print('Collecting comment ids...')
# Reference point for the "Elapsed" figure printed by fetch() below.
start_time = default_timer()

@sleep_and_retry
@limits(calls=6, period=4)
def fetch(session, i):
    """Fetch the comment-id listing for the i-th entry of ``post_ids``.

    Prints a progress line for every tenth index. The request is attempted
    up to 100 times; each non-200 response is followed by a random pause of
    up to two seconds.

    Returns:
        A ``(post_id, response)`` tuple. ``response`` is the last response
        received, which may still be a non-200 one if every attempt failed.
    """
    post_id = post_ids[i]
    if not i % 10:
        elapsed = int(default_timer() - start_time)
        print('(Elapsed {}s) Processing post # {} of {}'.format(elapsed, i, nposts))

    for _ in range(100):
        response = session.get(base_url + post_id)
        if response.status_code == 200:
            break
        print('Error (too many requests). Retrying after some random amount of time.')
        time.sleep(2 * random.random())

    return post_id, response


async def get_data_asynchronous():
    """Fetch the comment ids of every submission in ``post_ids`` concurrently.

    Each index of ``post_ids`` is handed to the module-level ``fetch`` on a
    pool of six worker threads. Successful responses are stored in the
    module-level ``post2comments`` dict (submission id -> list of comment
    ids); failed ones are reported on stdout and skipped.
    """
    with ThreadPoolExecutor(max_workers=6) as executor:
        # Close the session when done; it used to be left open.
        with requests.Session() as session:
            loop = asyncio.get_event_loop()

            # One executor job per submission index.
            tasks = [
                loop.run_in_executor(executor, fetch, session, ind)
                for ind in range(len(post_ids))
            ]

            # Wait for all jobs, then collect their results.
            for post_id, response in await asyncio.gather(*tasks):
                if response.status_code != 200:
                    print('Error at url {}'.format(response.url))
                else:
                    post2comments[post_id] = response.json()['data']

# Cache file holding the submission id -> comment ids mapping for this subreddit.
outname = DATADIR + subreddit + '_post2comments_no_thx_2017.pkl'
if not os.path.exists(outname):
    # No cache yet: run the concurrent download to completion, then store the result.
    loop = asyncio.get_event_loop()
    future = asyncio.ensure_future(get_data_asynchronous())
    loop.run_until_complete(future)
    with open(outname, 'wb') as outfile:
        pickle.dump(post2comments, outfile)
else:
    # Reuse the previously downloaded mapping.
    with open(outname, 'rb') as infile:
        post2comments = pickle.load(infile)

In [None]:
# Flatten the per-submission id lists into one list of comment ids.
comment_ids = [cid for ids in post2comments.values() for cid in ids]
print(len(comment_ids))
print(len(post2comments))

108751
13815


Now we will collect the comments.

In [None]:
# Pushshift comment search endpoint.
comment_endpoint='https://api.pushshift.io/reddit/comment/search'
# Unused parameter dict kept for reference; the next cell writes the same
# sort/fields settings directly into the query string instead.
# comment_parameters = {
#     'fields': ('author','body','link_id','parent_id','id','created_utc'),
#     'sort': 'asc'
# }

In [None]:
print('Collecting comments...')

# Maps comment id -> remaining comment fields.
comment2data = {}
ncomments = len(comment_ids)

# Query prefix; a comma-separated batch of comment ids is appended per request.
# This rebinds the base_url used earlier for the comment-id lookups.
base_url = comment_endpoint + '?sort=asc&fields=author,body,link_id,parent_id,id,created_utc&ids='

# Reference point for the "Elapsed" figure printed by fetch() below.
start_time = default_timer()

@sleep_and_retry
@limits(calls=6, period=4)
def fetch(session, i):
    """Fetch the batch of up to 1000 comments starting at index ``i`` of ``comment_ids``.

    Prints a progress line whenever ``i`` is a multiple of 10000. The request
    is attempted up to 100 times; each non-200 response is followed by a
    random pause of up to one second.

    Returns:
        The last response received, which may still be a non-200 one if
        every attempt failed.
    """
    # Slicing already stops at the end of the list, so no explicit bound is needed.
    batch = comment_ids[i:i + 1000]
    full_url = base_url + ','.join(batch)
    if not i % 10000:
        elapsed = int(default_timer() - start_time)
        print('(Elapsed {}s) Processing comment # {} of {}'.format(elapsed, i, ncomments))

    for _ in range(100):
        response = session.get(full_url)
        if response.status_code == 200:
            break
        print('Error (too many requests). Retrying after some random amount of time.')
        time.sleep(random.random())

    return response


async def get_data_asynchronous():
    """Download all comments in ``comment_ids`` in batches of 1000, concurrently.

    Each batch start index is handed to the module-level ``fetch`` on a pool
    of six worker threads. Comments from successful responses are stored in
    the module-level ``comment2data`` dict (comment id -> remaining fields);
    failed batches are reported on stdout and skipped.
    """
    with ThreadPoolExecutor(max_workers=6) as executor:
        with requests.Session() as session:
            loop = asyncio.get_event_loop()

            # Keep the batch offsets so a failed response can be traced back
            # to its batch. The error message used to reference `i`, which
            # only exists inside the list comprehension.
            batch_starts = list(range(0, ncomments, 1000))
            tasks = [
                loop.run_in_executor(executor, fetch, session, start)
                for start in batch_starts
            ]

            # gather() returns results in the order the tasks were passed in,
            # so responses line up with batch_starts.
            responses = await asyncio.gather(*tasks)
            for start, response in zip(batch_starts, responses):
                if response.status_code != 200:
                    # The placeholder for the comment id was missing ("()").
                    print('Error at {}-th comment_id ({})'.format(start, comment_ids[start]))
                else:
                    for comment in response.json()['data']:
                        comment2data[comment.pop('id')] = comment

# Cache file for the downloaded comments. It is stored under DATADIR like the
# other two caches of this notebook; it used to be a bare file name, i.e.
# relative to the current working directory.
# NOTE(review): the "after_Dec2019" suffix does not match the 2017 naming of
# the other cache files -- confirm the intended file name.
outname = DATADIR + subreddit + '_comment2data_after_Dec2019.pkl'
if os.path.exists(outname):
    # NOTE: pickle.load can execute arbitrary code; only load cache files
    # that were written by this notebook.
    with open(outname, 'rb') as infile:
        comment2data = pickle.load(infile)
else:
    loop = asyncio.get_event_loop()
    future = asyncio.ensure_future(get_data_asynchronous())
    loop.run_until_complete(future)
    if comment2data or not comment_ids:
        with open(outname, 'wb') as outfile:
            pickle.dump(comment2data, outfile)
    else:
        # Do not cache an empty result when comments were expected: the
        # cache would be silently reloaded on every later run.
        print('Warning: no comments were retrieved; not writing {}'.format(outname))

Collecting comments...
(Elapsed 0s) Processing comment # 0 of 108751
(Elapsed 4s) Processing comment # 10000 of 108751
(Elapsed 12s) Processing comment # 20000 of 108751
(Elapsed 20s) Processing comment # 30000 of 108751
(Elapsed 24s) Processing comment # 40000 of 108751
(Elapsed 32s) Processing comment # 50000 of 108751
(Elapsed 40s) Processing comment # 60000 of 108751
(Elapsed 44s) Processing comment # 70000 of 108751
(Elapsed 52s) Processing comment # 80000 of 108751
(Elapsed 60s) Processing comment # 90000 of 108751
(Elapsed 64s) Processing comment # 100000 of 108751


In [None]:
# Number of comments collected (entries in comment2data).
# NOTE(review): the recorded output of this cell is 0 although 108751 comment
# ids were listed above -- verify that the comment download returned data.
len(comment2data)

0