In [11]:
#Scraping based on this tutorial - https://rareloot.medium.com/using-pushshifts-api-to-extract-reddit-submissions-fb517b286563
import pandas as pd
import time
import datetime
import requests
import json
import time
import csv

In [12]:
#Using Pushshift's API to extract Reddit submissions -> this function builds a URL to query the API
#
#
#Parameters explained:
#subreddit = the subreddit you want to extract data from
#start_date = 4 and end_date = 2 means that the data will be extracted from 4 days ago to 2 days ago
#amount_posts = the amount of posts you want to extract -> can be max 1000
#
#

amount_posts = 999

def query_URL(subreddit, start_date, end_date, amount_posts):
    #Build the URL
    pushshift_url = "http://api.pushshift.io/reddit/search/submission?subreddit="+subreddit+"&after="+str(start_date)+"d&before="+str(end_date)+"d&size="+str(amount_posts)
    return pushshift_url

#Using Pushshift's API to extract Reddit submissions -> this function extracts the data from the URL
def get_data(url):
    #Get the data
    r = requests.get(url)
    data = json.loads(r.text)
    return data['data']


In [13]:
#queried these subreddits to get neutral data
subreddits = ['unpopularopinion','IWantToLearn', 'tifu', 'legaladvice']
labels = {'unpopularopinion': 5, 'IWantToLearn': 5, 'tifu': 5, 'legaladvice': 5 } #label of 5 means neutral

In [14]:
#function to query the past 100 days in intervals of 1 day
def query_subreddit(subreddit):
    all_data = []
    for i in range(1,102):
        start_date = i+1
        end_date = i
        url = query_URL(subreddit, start_date, end_date, amount_posts)
        data = get_data(url)
        for post in data:
            all_data.append([post['title'], post['selftext'], post['subreddit'], post['utc_datetime_str'], post['num_comments'], post['score'], labels[subreddit]])
        
    #convert to dataframe
    df = pd.DataFrame(all_data, columns = ['title', 'selftext', 'subreddit', 'date', 'num_comments', 'score', 'label'])

    #convert date to datetime
    df['date'] = pd.to_datetime(df['date'])

    #deal with "need to escape, but no escapechar set" error
    df['selftext'] = df['selftext'].str.replace('\n', '')
    df['title'] = df['title'].str.replace('\n', '')
    df['selftext'] = df['selftext'].str.replace('\r', '')
    df['title'] = df['title'].str.replace('\r', '')
    df['selftext'] = df['selftext'].str.replace('\t', '')
    df['title'] = df['title'].str.replace('\t', '')
    df['selftext'] = df['selftext'].str.replace('|', '')
    df['title'] = df['title'].str.replace('|', '')

    #save to csv - no clue why i need escapechar, it errors otherwise
    df.to_csv('reddit_{}.csv'.format(subreddit), index = False, escapechar=' ')

In [15]:
for subreddit in subreddits:
    query_subreddit(subreddit)
    print('Done: ' + subreddit)

  df['selftext'] = df['selftext'].str.replace('|', '')
  df['title'] = df['title'].str.replace('|', '')


Done: tifu


  df['selftext'] = df['selftext'].str.replace('|', '')
  df['title'] = df['title'].str.replace('|', '')


Done: legaladvice


In [16]:
#reach each reddit csv file and take out the rows where the selftext is empty or equal to [removed]
for subreddit in subreddits:
    df = pd.read_csv('reddit_{}.csv'.format(subreddit))
    df = df[df.selftext != '[removed]']
    df = df[df.selftext != '']
    df.to_csv('reddit_{}.csv'.format(subreddit), index = False)

#print length of each csv file
for subreddit in subreddits:
    df = pd.read_csv('reddit_{}.csv'.format(subreddit))
    print(len(df))

16144
1098
3763
48425


In [17]:
#checking there are no duplicate posts
def read_csv(subreddit):
    df = pd.read_csv('reddit_{}.csv'.format(subreddit))
    print(len(df))
    df.drop_duplicates()
    print(len(df))
    print('')
    return df

for subreddit in subreddits:
    read_csv(subreddit)

16144
16144

1098
1098

3763
3763

48425
48425



In [None]:
#save all csv files into one csv file
df = pd.concat([pd.read_csv('reddit_{}.csv'.format(subreddit)) for subreddit in subreddits])
df.to_csv('reddit_neutral_all.csv', index = False)

print(len(df))