## Data Collection Part 2: Submissions
Extracting relevant Reddit submissions (i.e. posts) using the Pushshift API. 
Source: https://medium.com/@RareLoot/using-pushshifts-api-to-extract-reddit-submissions-fb517b286563

In [None]:
import pandas as pd
import requests
import json
import csv
import time
import datetime

#### Function for building PushShift URLs

In [None]:
def getPushshiftData(query, after, before):
    """Fetch one page of submissions matching *query* from the Pushshift API.

    Parameters
    ----------
    query : search term, sent as the ``q`` parameter.
    after, before : bounds of the search window, sent as the ``after`` and
        ``before`` parameters (the notebook passes Unix timestamps).

    Returns
    -------
    The value stored under the ``'data'`` key of the JSON response.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    requests.Timeout
        If the server does not answer within the timeout.
    KeyError
        If the JSON response has no ``'data'`` key.
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import urlencode

    # urlencode escapes spaces, '&', '#', ... so an arbitrary search term
    # cannot corrupt the query string.
    params = urlencode({'q': query, 'size': 1000, 'after': after, 'before': before})
    url = 'https://api.pushshift.io/reddit/search/submission/?' + params
    print(url)
    # A timeout prevents the notebook from hanging forever on a stalled
    # connection.
    r = requests.get(url, timeout=60)
    # Surface HTTP errors (rate limiting, server errors) directly instead of
    # as a confusing JSON decoding failure on the error page.
    r.raise_for_status()
    data = r.json()
    return data['data']

#### Function to extract key data points

In [None]:
def collectSubData(subm, store=None):
    """Extract the key data points of one submission and record them.

    Parameters
    ----------
    subm : dict
        One submission record as returned by the Pushshift API.
    store : dict, optional
        Dictionary that receives the result. When omitted, the module-level
        ``subStats`` dictionary is used.

    The record is stored as a one-element list holding the tuple
    ``(submission_id, title, selftext, flair, author, subreddit,
    subreddit_id, score, num_comments, num_crossposts, permalink, created)``
    under the key ``submission_id``.

    Raises
    ------
    KeyError
        If a mandatory field (e.g. ``'id'`` or ``'title'``) is missing.
    """
    if store is None:
        store = subStats

    submission_id = subm['id']
    row = (
        submission_id,
        subm['title'],
        # Optional fields fall back to the "NaN" placeholder.
        subm.get('selftext', "NaN"),
        subm.get('author_flair_text', "NaN"),
        subm['author'],
        subm['subreddit'],
        subm['subreddit_id'],
        subm['score'],
        subm['num_comments'],
        subm.get('num_crossposts', "NaN"),
        subm['permalink'],
        # NOTE: fromtimestamp() without a tz argument yields a naive datetime
        # in the local timezone of the machine running the notebook.
        datetime.datetime.fromtimestamp(subm['created_utc']),  # 1520561700.0
    )

    # Key by the submission id. Keying by the subreddit id would make every
    # submission of a subreddit overwrite the previous one.
    store[submission_id] = [row]

#### Parameters to query:
The query collects all submissions that include “face mask” in their title or selftext and were posted between 22 Feb ’20 and 22 May ’20.

In [None]:
# Search term and time window. The window bounds are Unix timestamps
# (seconds), kept as strings:
#   after  = 2020-02-22 00:00:00 UTC
#   before = 2020-05-22 21:19:30 UTC
query = "face-mask"
after = "1582329600"
before = "1590182370"

# subStats is the dictionary in which the collected data is stored;
# subCount tracks the total number of submissions collected.
subStats = {}
subCount = 0

#### Run code

In [None]:
# Fetch the first page of results for the configured window.
data = getPushshiftData(query, after, before)

# Keep paging until the API returns an empty page, i.e. every post between
# the 'after' date and the 'before' date has been gathered.
while data:
    for submission in data:
        collectSubData(submission)
        subCount += 1

    # The next request starts at the creation time of the last submission
    # of the page that was just processed.
    after = data[-1]['created_utc']

    # Progress report: page size and the timestamp reached so far.
    print(len(data))
    print(datetime.datetime.fromtimestamp(after))

    data = getPushshiftData(query, after, before)

# Size of the final page (the loop only ends on an empty page).
print(len(data))

#### Check submissions

In [None]:
# Quick sanity check of what was collected: the number of stored entries
# plus the title and creation date of the first and the last one.
print(str(len(subStats)) + " submissions have added to list")

# Guard against an empty result set, where indexing would raise IndexError.
if subStats:
    # Each value of subStats is a one-element list holding the data tuple.
    rows = [entry[0] for entry in subStats.values()]
    first_row, last_row = rows[0], rows[-1]
    # Tuple layout: index 1 is the title, index 11 the creation date
    # (index 5 is the subreddit name, not the date).
    print("1st entry is:")
    print(first_row[1] + " created: " + str(first_row[11]))
    print("Last entry is:")
    print(last_row[1] + " created: " + str(last_row[11]))

#### Store data in csv file

In [None]:
def updateSubs_file():
    """Write every entry of the module-level ``subStats`` to a CSV file.

    The file is opened in write mode (an existing file is replaced), gets a
    header row followed by one row per stored entry, and the number of rows
    written is printed at the end.

    NOTE: ``location`` and ``filename`` are joined without a path separator,
    so the resulting file is ``.../Data_Reddit_FaceMasks_Subm.csv``.
    """
    location = "/Users/merle-sophie/Desktop/FTL Hackathon 052020/Data"
    filename = "_Reddit_FaceMasks_Subm.csv"
    target = location + filename

    headers = ["Post ID","Title","Selftext","Flair","Author","Subreddit",
               "Subreddit_Id","Score","Total No. of Comments",
               "No. of Crossposts","Permalink","Publish Date"]

    upload_count = 0
    # newline='' lets the csv module control line endings itself.
    with open(target, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.writer(handle, delimiter=',')
        writer.writerow(headers)
        # Each stored value is a one-element list; its only item is the row.
        for rows in subStats.values():
            writer.writerow(rows[0])
            upload_count += 1

        print(str(upload_count) + " submissions have been uploaded")


updateSubs_file()