In [1]:
import pandas as pd
import requests
import json
import csv
import time
import datetime

In [2]:
# Function for building PushShift URLs
# as seen in https://medium.com/@RareLoot/using-pushshifts-
# api-to-extract-reddit-submissions-fb517b286563

def getPushshiftData(query, after, before):
    url = 'https://api.pushshift.io/reddit/search/comment/?q='+str(query)+'&size=1000&after='+str(after)+'&before='+str(before)
    print(url)
    r = requests.get(url)
    data = json.loads(r.text)
    return data['data']

In [3]:
# Function to extract key data points
# as seen in https://medium.com/@RareLoot/using-pushshifts-
# api-to-extract-reddit-submissions-fb517b286563

def collectSubData(subm):
    subData = list() #list to store data points
    comment_id = subm['id']
    text = subm['body']
    try:
        flair = subm['author_flair_text']
    except KeyError:
        flair = "NaN"    
    author = subm['author']
    is_submitter = subm['is_submitter']
    subreddit = subm['subreddit']
    sub_id = subm['subreddit_id']
    score = subm['score']
    parent_id = subm['parent_id']
    permalink = subm['permalink']
    
    created = datetime.datetime.fromtimestamp(subm['created_utc']) #1520561700.0
    
    subData.append((comment_id,text,flair,author,is_submitter,subreddit,
                    sub_id,score,parent_id,permalink,created))
    subStats[comment_id] = subData

In [4]:
# Parameters to query:
# All submissions with “face mask” in their title between 1st Jan ’20
# and 22 May ’20. 
# subCount tracks the no. of total submissions we collect. 
# subStats is the dictionary where we will store our data.


# before and after dates (Unix Timestamp)
after = "1582329600" # 04/01/2018 @ 12:00am (UTC)
before = "1590182370"  # 03/01/2018 @ 12:00am (UTC)
query = "face-mask"
subCount = 0
subStats = {}

In [5]:
# Run code
data = getPushshiftData(query, after, before)


# Will run until all posts have been gathered 
# from the 'after' date up until before date
while len(data) > 0:
    for submission in data:
        collectSubData(submission)
        subCount+=1
    # Calls getPushshiftData() with the created date of the last submission
    print(len(data))
    print(str(datetime.datetime.fromtimestamp(data[-1]['created_utc'])))
    after = data[-1]['created_utc']
    data = getPushshiftData(query, after, before)
    
print(len(data))

https://api.pushshift.io/reddit/search/comment/?q=face-mask&size=1000&after=1582329600&before=1590182370
1000
2020-02-24 08:32:00
https://api.pushshift.io/reddit/search/comment/?q=face-mask&size=1000&after=1582529520&before=1590182370
1000
2020-02-26 02:26:47
https://api.pushshift.io/reddit/search/comment/?q=face-mask&size=1000&after=1582680407&before=1590182370
1000
2020-02-27 13:54:05
https://api.pushshift.io/reddit/search/comment/?q=face-mask&size=1000&after=1582808045&before=1590182370
1000
2020-02-28 19:15:28
https://api.pushshift.io/reddit/search/comment/?q=face-mask&size=1000&after=1582913728&before=1590182370
1000
2020-03-01 00:40:48
https://api.pushshift.io/reddit/search/comment/?q=face-mask&size=1000&after=1583019648&before=1590182370
1000
2020-03-02 04:18:58
https://api.pushshift.io/reddit/search/comment/?q=face-mask&size=1000&after=1583119138&before=1590182370
1000
2020-03-03 06:07:39
https://api.pushshift.io/reddit/search/comment/?q=face-mask&size=1000&after=1583212059&bef

1000
2020-04-11 06:32:38
https://api.pushshift.io/reddit/search/comment/?q=face-mask&size=1000&after=1586579558&before=1590182370
1000
2020-04-12 01:32:39
https://api.pushshift.io/reddit/search/comment/?q=face-mask&size=1000&after=1586647959&before=1590182370
1000
2020-04-12 22:36:42
https://api.pushshift.io/reddit/search/comment/?q=face-mask&size=1000&after=1586723802&before=1590182370
1000
2020-04-13 19:44:00
https://api.pushshift.io/reddit/search/comment/?q=face-mask&size=1000&after=1586799840&before=1590182370
1000
2020-04-14 15:15:36
https://api.pushshift.io/reddit/search/comment/?q=face-mask&size=1000&after=1586870136&before=1590182370
1000
2020-04-15 06:03:06
https://api.pushshift.io/reddit/search/comment/?q=face-mask&size=1000&after=1586923386&before=1590182370
1000
2020-04-15 22:36:55
https://api.pushshift.io/reddit/search/comment/?q=face-mask&size=1000&after=1586983015&before=1590182370
1000
2020-04-16 13:27:35
https://api.pushshift.io/reddit/search/comment/?q=face-mask&size=

In [6]:
# Check submissions
print(str(len(subStats)) + " submissions have added to list")
print("1st entry is:")
print(list(subStats.values())[0][0][1] + " created: " + str(list(subStats.values())[0][0][5]))
print("Last entry is:")
print(list(subStats.values())[-1][0][1] + " created: " + str(list(subStats.values())[-1][0][5]))

124308 submissions have added to list
1st entry is:
 

We started moving westbound again, still about two hours out from the safehouse. The road was quiet, the only noise was the overactive police scanner and the odd freightliner passing by. Alfred had fallen asleep in his chair, but the sensors were hardwired into the front dash of the van. If anyone decided to attack, at least I would know about it. It was odd, for $100 million I had figured that enemies would be crawling out of the woodworks, but so far only two small groups of mercenaries. This was still the first night, and it was only just beginning. We drove for an hour and a half before an alert blared across the screen of the computer. “Sir, it appears you have a message.” Alfred said as he half lazily glared through his glasses. 

“From who?” 

“It appears to be encrypted. It reads ‘I know where you’re going, but don’t let it stop you, I will see you soon enough.’ Rather boring. . .” He trailed off as if he had more to say. 


In [7]:
# Upload to csv file
def updateSubs_file():
    upload_count = 0
    location = "/Users/merle-sophie/Desktop/FTL Hackathon 052020/Data"
    filename = "_Reddit_FaceMasks.csv"
    file = location + filename
    with open(file, 'w', newline='', encoding='utf-8') as file: 
        a = csv.writer(file, delimiter=',')
       # subData.append((comment_id,text,flair,author,is_submitter,subreddit,
       #             sub_id,score,parent_id,permalink))
        headers = ["Comment ID","Text","Flair","Author","Is Submitter","Subreddit","Sub Id","Score","Parent Id", "Permalink", "Created Date"]
        a.writerow(headers)
        for sub in subStats:
            a.writerow(subStats[sub][0])
            upload_count+=1
            
        print(str(upload_count) + " submissions have been uploaded")
updateSubs_file()

124308 submissions have been uploaded
