In [1]:
from psaw import PushshiftAPI
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from tqdm import tqdm
import datetime
import os
from itertools import islice

# Get data

Every submission on r/news, r/television or r/worldnews posted between the 2022 Oscars and the 18th of April that matches any of the following terms has been downloaded:
slap, Will Smith, Chris Rock, Oscars.

Every comment has been downloaded from the above submissions.

News about people slapping with no connection to the Oscars may also have been downloaded.

In [14]:
# Subreddits covered by the first download run. To add more subreddits,
# replace the list (as done below), override start_time in the next cell
# and run the download code again.
subreddits = ['news', 'television', 'worldnews']

# This assignment replaces the list above, so when the notebook is run as-is
# only the subreddits below are queried.
subreddits = ['USnews','qualitynews', 'offbeat', 'OutOfTheLoop', 'Oscars', 'boxoffice', 'willsmith', 'entertainment' ]

# Search terms separated by '|'.
# NOTE(review): the string ends with a trailing '|' (an empty alternative);
# confirm how the Pushshift search treats it before relying on the filter.
query = 'slap|Will Smith|Chris Rock|chris rock|will smith|Oscars|keep my wife|'

# Submission attributes requested from the API.
fields = ['author', 'author_fullname', 'created_utc', 'id', 'num_comments',
          'score', 'subreddit', 'subreddit_id', 'title', 'upvote_ratio']

# Download window as Unix timestamps. The datetimes are explicitly UTC:
# .timestamp() on a naive datetime is interpreted in the machine's local time
# zone, which would shift the window by the local UTC offset.
start_time = int(datetime.datetime(2022, 3, 28, tzinfo=datetime.timezone.utc).timestamp()) # Oscars started midnight UTC
end_time = int(datetime.datetime(2022, 4, 18, tzinfo=datetime.timezone.utc).timestamp())


In [3]:
# Prepare ./data/submissions.csv: on the first run write a header-only file,
# on later runs load what is already saved and resume the download after the
# newest saved submission.
os.makedirs('./data', exist_ok=True)  # to_csv does not create missing directories

if not os.path.exists('./data/submissions.csv'):
    data_submissions = pd.DataFrame(columns=fields + ['created'])
    data_submissions.to_csv('./data/submissions.csv', index=False)
else:
    data_submissions = pd.read_csv('./data/submissions.csv')
    if len(data_submissions['created_utc']):
        # Cast to a plain Python int so the value has the same type as the
        # start_time computed in the configuration cell.
        # NOTE(review): resuming from the newest saved timestamp assumes the
        # submissions were saved oldest-first - confirm the API's sort order.
        start_time = int(data_submissions['created_utc'].max()) + 1 # override start_time here


In [4]:
# Client used for every Pushshift request in this notebook.
api = PushshiftAPI()

# Search parameters: all configured subreddits in one comma-separated string,
# restricted to the [start_time, end_time] window and to the query terms.
submission_search_params = {
    'subreddit': ','.join(subreddits),
    'after': start_time,
    'before': end_time,
    'q': query,
    'fields': fields,
}

# The search result is consumed in batches by the download loop below.
submission_generator = api.search_submissions(**submission_search_params)

In [5]:
# Number of submissions pulled from the generator and appended to disk per step.
save_n_subs_at_the_time = 100

# Batches are appended without a header, so every batch must use exactly the
# column order of the header already in the file. Read that order once.
csv_columns = pd.read_csv('./data/submissions.csv', nrows=0).columns

while True:
    generator_sliced = islice(submission_generator, save_n_subs_at_the_time)

    # reindex() puts the columns in header order, fills fields missing from
    # the whole batch with NaN and drops keys that are not in the header,
    # so values can never end up under the wrong column.
    temp_df = pd.DataFrame([obj.d_ for obj in generator_sliced]).reindex(columns=csv_columns)

    if not temp_df.empty:
        temp_df.to_csv('./data/submissions.csv', mode='a', index=False, header=False)

    # A short (or empty) batch means the generator is exhausted.
    if len(temp_df) != save_n_subs_at_the_time:
        break
        

In [6]:
# Reload the complete submissions table from disk, including rows appended above.
data_submissions = pd.read_csv(filepath_or_buffer='./data/submissions.csv')

In [11]:
# Download every comment of every saved submission and append them to
# ./data/comments.csv, resuming after the last submission already stored.
comment_fields = ['author', 'body', 'controversiality', 'created_utc',
                  'id', 'link_id', 'parent_id', 'score', 'score_hidden', 'subreddit', 'subreddit_id']
# Column order of the CSV header; every appended batch is aligned to it.
comment_columns = comment_fields + ['created', 'post_link_id']
last_id_index = 0

if not os.path.exists('./data/comments.csv'):
    df_comments = pd.DataFrame(columns=comment_columns)
    df_comments.to_csv('./data/comments.csv', index=False)
else:
    old_df = pd.read_csv('./data/comments.csv')
    # A header-only file has no last row; in that case start from the beginning.
    if len(old_df):
        last_id = old_df['post_link_id'].iloc[-1]
        matches = np.flatnonzero((data_submissions['id'] == last_id).to_numpy())
        if not len(matches):
            raise IndexError(
                f'submission {last_id!r} from comments.csv was not found in submissions.csv'
            )
        # Continue with the submission after the last one that was saved.
        last_id_index = matches[0] + 1

print(f'{len(data_submissions)} submissions in total')
pending_submissions = data_submissions[['subreddit', 'id']][last_id_index:]
# total= lets tqdm render a real progress bar instead of a bare counter.
for subreddit, submission_id in tqdm(pending_submissions.itertuples(index=False, name=None),
                                     total=len(pending_submissions)):

    comments_generator = api.search_comments(subreddit=subreddit,
                                             link_id=submission_id,
                                             fields=comment_fields)

    temp_df = pd.DataFrame([obj.d_ for obj in comments_generator])
    temp_df['post_link_id'] = [submission_id] * len(temp_df)
    # The file is appended without a header, so force the header's column
    # order; fields missing from the whole batch become NaN instead of
    # shifting the remaining values under the wrong column.
    temp_df = temp_df.reindex(columns=comment_columns)

    if not temp_df.empty:
        temp_df.to_csv('./data/comments.csv', mode='a', index=False, header=False)

681 submissions in total


681it [26:01,  2.29s/it]


In [12]:
# Load all downloaded comments from disk for the analysis below.
data_comments = pd.read_csv(filepath_or_buffer='./data/comments.csv')

# Data analysis

In [13]:
# Display the comments table; the notebook renders a truncated preview
# (first and last rows) rather than every row.
data_comments

Unnamed: 0,author,body,controversiality,created_utc,id,link_id,parent_id,score,score_hidden,subreddit,subreddit_id,created,post_link_id
0,[deleted],[removed],0,1649769321,i4fa17l,t3_u1ybvn,t3_u1ybvn,1,True,news,t5_2qh3l,1.649762e+09,u1ybvn
1,Avante-Gardenerd,Who wrote this headline? What the actual fuck?,0,1649769253,i4f9von,t3_u1ybvn,t3_u1ybvn,1,True,news,t5_2qh3l,1.649762e+09,u1ybvn
2,IamChooch,CNN reported it was Trump's fault. Will must o...,0,1649769204,i4f9rua,t3_u1ybvn,t1_i4f7mq1,1,True,news,t5_2qh3l,1.649762e+09,u1ybvn
3,[deleted],[removed],0,1649768969,i4f996c,t3_u1ybvn,t3_u1ybvn,1,True,news,t5_2qh3l,1.649762e+09,u1ybvn
4,[deleted],[removed],0,1649768846,i4f8zno,t3_u1ybvn,t1_i4f7mq1,1,True,news,t5_2qh3l,1.649762e+09,u1ybvn
...,...,...,...,...,...,...,...,...,...,...,...,...,...
74302,gnrc,Will Smith has lost his g dang mind.,0,1648435004,i2e2zpc,t3_tpz9w3,t3_tpz9w3,1,False,television,t5_2qh6e,1.648428e+09,tpz9w3
74303,justice4juicy2020,Just came to post this. Theres a more clear v...,0,1648435001,i2e2zi6,t3_tpz9w3,t3_tpz9w3,1,False,television,t5_2qh6e,1.648428e+09,tpz9w3
74304,JesusChristSuperDerp,i busted yo wife,0,1648434994,i2e2z0f,t3_tpz9w3,t3_tpz9w3,1,False,television,t5_2qh6e,1.648428e+09,tpz9w3
74305,Goose_Dickling,This was fucking crazy,0,1648434959,i2e2wjn,t3_tpz9w3,t3_tpz9w3,1,False,television,t5_2qh6e,1.648428e+09,tpz9w3
