In [1]:
import datetime
import os
import pprint
import time

import numpy as np
import pandas as pd
import praw
import requests
from bs4 import BeautifulSoup
from IPython.display import clear_output
from requests.exceptions import Timeout

# Reddit API client shared by the cells below.
# Credentials are read from environment variables so that secrets never live
# in the notebook or its version history. Set REDDIT_CLIENT_ID and
# REDDIT_CLIENT_SECRET before starting the kernel; a missing variable raises
# KeyError here instead of failing later with an opaque authentication error.
# NOTE(review): the id/secret that were previously hardcoded in this cell
# should be treated as leaked and revoked.
reddit = praw.Reddit(client_id=os.environ['REDDIT_CLIENT_ID'],
                     client_secret=os.environ['REDDIT_CLIENT_SECRET'],
                     user_agent=os.environ.get('REDDIT_USER_AGENT', 'my user agent'))

In [2]:
def load_dataframe():
    """Load previously saved results, or start an empty results table.

    Reads ``results.csv`` from the current working directory. Only a missing
    file is treated as "start fresh"; any other failure (empty file, malformed
    CSV, permission problem) is allowed to propagate. Swallowing those errors
    would hand the caller an empty table, and ``start()`` would later write
    that table back over the existing ``results.csv``.

    Returns:
        pandas.DataFrame: the contents of ``results.csv`` if it exists,
        otherwise an empty frame with the tracker's columns.

    Raises:
        Exception: whatever ``pandas.read_csv`` raises for a file that exists
        but cannot be read or parsed.
    """
    # NOTE: 'tile' (sic) is kept as-is because get_data() writes the same key;
    # renaming it here alone would split the title across two columns.
    columns = ['submissionid', 'tile', 'upvotes', 'upvote_ratio', 'comments',
               'top_level_comments', 'crossposts', 'awards', 'time_of_post',
               'time_of_request', 'locked', 'removed', 'reason_removed', 'domain',
               'url', 'uploader', 'subreddit', 'flair', 'fake']

    try:
        # if the csv file already exists, load that
        df = pd.read_csv('results.csv')
    except FileNotFoundError:
        # else, create a fresh one
        print('File not found. creating new dataframe...')
        return pd.DataFrame(data={column: [] for column in columns})

    print('File found')
    return df

In [3]:
def get_data(s, df):
    """Append one snapshot row for submission ``s`` to ``df``.

    Args:
        s: submission object exposing the attributes read below (``id``,
            ``title``, ``ups``, ``upvote_ratio``, ``num_comments``, ...).
        df (pandas.DataFrame): table collected so far; it is not modified.

    Returns:
        pandas.DataFrame: a new frame containing the rows of ``df`` followed by
        one row for ``s``, with a fresh 0..n-1 index.
    """
    # make the object non lazy (kept from the original: touching an attribute
    # first is what loads the submission's data)
    title = s.title

    # a submission counts as removed when a removal reason is attached
    removed = s.removal_reason is not None

    # ``author`` can be None (presumably a deleted account); previously this
    # raised AttributeError and the caller silently dropped the whole post.
    uploader = s.author.name if s.author is not None else None

    # NOTE(review): ``_comments_by_id`` is a private attribute; whether its
    # length really equals the number of top-level comments cannot be
    # confirmed from this file.
    row = {'submissionid' : [s.id], 'tile' : [title], 'upvotes' : [s.ups],
           'upvote_ratio' : [s.upvote_ratio], 'comments' : [s.num_comments],
           'top_level_comments' : [len(s._comments_by_id)],
           'crossposts' : [s.num_crossposts], 'awards' : [s.total_awards_received],
           'time_of_post' : [str(s.created_utc)], 'time_of_request' : [str(time.time())],
           'locked' : [s.locked], 'removed' : [removed],
           'reason_removed' : [s.removal_reason], 'domain' : [s.domain],
           'url' : [s.url], 'uploader' : [uploader],
           'subreddit' : [s.subreddit.display_name], 'flair' : [s.link_flair_text],
           'fake' : [None]}

    # ignore_index avoids every appended row carrying the index label 0
    return pd.concat([df, pd.DataFrame(row)], ignore_index=True)

In [4]:
def run(submissions, hours, df):
    """Repeatedly snapshot every submission until the time budget is used up.

    Args:
        submissions: iterable of submission ids. Pass a re-iterable collection
            (e.g. a list) if the posts should be polled more than once.
        hours: run time budget in hours.
        df (pandas.DataFrame): table to which snapshot rows are appended.

    Returns:
        pandas.DataFrame: ``df`` extended with one row per successful snapshot.
        Returns as soon as the budget is exceeded, or immediately when a pass
        finds no submissions at all.
    """
    # set hours to seconds and set start time
    seconds = hours * 3600
    start_time = time.time()
    loops = 1

    # loop until the run time exceeds ``seconds``
    while True:

        # get start time of the loop
        loop_time = time.time()
        posts = 0
        skipped = 0
        last_error = None

        # loop over every post
        for s in submissions:
            posts += 1

            # check if run time is more than seconds
            end_time = time.time()
            if end_time - start_time > seconds:

                # print run time, and return df
                print(round((end_time - start_time) / 3600,2))
                return df

            try:

                # try to get the data
                df = get_data(reddit.submission(id=s), df)
            except Exception as exc:
                # best effort: one bad post must not stop the tracker, but keep
                # count so failures are visible (KeyboardInterrupt still works)
                skipped += 1
                last_error = exc

        # An empty pass (empty list or exhausted iterator) would otherwise spin
        # forever, because the deadline is only checked inside the for-loop.
        if posts == 0:
            print('No submissions to track.')
            return df

        # measure after the pass so the statistics include the last post
        end_time = time.time()

        # print loop statistics
        t = datetime.datetime.now()
        print(f'Time elapsed(loop {loops}, posts {posts}): {round(end_time - loop_time, 2)} seconds.\n \
        Time since start: {round((end_time - start_time) / 3600,2)} hours at {t.hour}:{t.minute}\n')
        if skipped:
            print(f'Skipped {skipped} of {posts} posts because of errors (last error: {last_error!r})')
        loops += 1

In [5]:
def clean(url):
    """Download ``url`` and return the page's text as one line.

    Script and style elements are removed, the remaining text is split into
    whitespace-trimmed fragments, and only fragments with more than three
    space-separated words are kept and joined with single spaces.

    Args:
        url: address of the page to fetch.

    Returns:
        str: the extracted text, or ``''`` when the download times out (10 s)
        or fails for any other reason.
    """
    try:
        # try to get the news page in less than 10 seconds
        html = requests.get(url, timeout=10).text
    except Timeout:
        print('Timeout in clean()')
        return ''
    except Exception:
        # still best effort, but no longer traps KeyboardInterrupt/SystemExit
        print('Unknown error in clean()')
        return ''

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and warns about the guess.
    soup = BeautifulSoup(html, 'html.parser')

    # kill all script and style elements
    for element in soup(["script", "style"]):
        element.extract()

    # get text
    text = soup.get_text()

    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a fragment each (split on double spaces)
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank fragments and short ones (three words or fewer)
    return ' '.join(chunk for chunk in chunks if chunk and len(chunk.split(' ')) > 3)


In [6]:
def tag_fake_news(df, name):
    """Fill the ``fake`` column for every URL that has not been tagged yet.

    For each unique URL whose ``fake`` value is missing, the article text is
    fetched with ``clean()`` and posted to the local classification endpoint.
    The outcome is written back through ``succes()`` or ``fail()``. Progress
    is appended to ``Results folder\\progress.txt``.

    Args:
        df (pandas.DataFrame): results table with at least ``url`` and ``fake``.
        name: label written to the progress log (callers pass the file name).

    Returns:
        pandas.DataFrame: the table with ``fake`` updated where possible.
    """
    # set base url, get list of unique posts
    BASE = 'http://localhost:8080/fakebox/check'
    posts = list(df[df.fake.isnull()].url.unique())
    n_posts = len(posts)
    counter = 0

    # write progress log
    # NOTE(review): the backslash is only a path separator on Windows - confirm
    # the notebook is not meant to run elsewhere.
    with open('Results folder\\progress.txt', 'a') as f:
        f.write(f'In progress: {name}, to do: {n_posts}\n')

    # loop over every unique post
    for post in posts:

        # get cleaned content of news article
        content = clean(post)
        if content == '':

            # if no content is returned, call fail()
            df, counter = fail(post, df, counter, n_posts)
            continue

        # fresh payload per request so nothing leaks between iterations
        data = {"url": post,
                "title": '',
                "content": content}

        try:
            # try to get the api output as json
            response = requests.post(BASE, data=data, timeout=60).json()
        except Timeout:

            # if it times out, notify user and run fail()
            print('error: TIMEOUT' , post)
            df, counter = fail(post, df, counter, n_posts)
            continue
        except Exception:

            # any other failure (connection error or a non-json response);
            # Exception rather than a bare except so Ctrl-C can stop the run
            print('error: JSON' , post)
            df, counter = fail(post, df, counter, n_posts)
            continue

        # run succes() to add api response to dataframe
        df, counter = succes(post, df, counter, n_posts, response)

    # update progress log
    with open('Results folder\\progress.txt', 'a') as f:
        f.write(f'{name} done\n\n')

    return df

def fail(post, df, counter, n):
    """Record that tagging ``post`` did not work.

    Called whenever a step in tag_fake_news() fails. Prints the progress
    position, clears the ``fake`` value of every row whose ``url`` equals
    ``post`` and returns the (same) dataframe together with ``counter + 1``.
    """
    # report where the failure happened
    print(counter, ' of ', n, ' fail')

    # leave the verdict empty for all rows that share this url
    rows_for_post = df.url == post
    df.loc[rows_for_post, 'fake'] = None

    return df, counter + 1

def succes(post, df, counter, n, response):
    """Store the classifier verdict for ``post`` in the ``fake`` column.

    Called when every step in tag_fake_news() worked. Any decision other than
    ``'impartial'`` marks the url as fake (True); ``'impartial'`` marks it as
    not fake (False). If the response lacks the expected structure, the post
    is handed to ``fail()`` instead.

    Args:
        post: url being tagged; every row with this url is updated.
        df (pandas.DataFrame): results table, updated in place.
        counter: number of posts handled so far.
        n: total number of posts, used for the progress message.
        response: decoded api response, read at ``['content']['decision']``.

    Returns:
        tuple: ``(df, counter + 1)``.
    """
    try:
        # anything that is not 'impartial' counts as fake news
        is_fake = response['content']['decision'] != 'impartial'
        df.loc[df.url == post, 'fake'] = is_fake

        # update the user
        print(counter, ' of ', n, ' succes')
        counter += 1

    # if something goes wrong, run fail() anyway; Exception rather than a bare
    # except so KeyboardInterrupt/SystemExit are not swallowed
    except Exception:
        df, counter = fail(post, df, counter, n)
        print('No response')

    # return df and counter
    return df, counter

In [7]:
def start(subreddits, time, n_posts):
    """Collect the newest posts of each subreddit, track them and save the result.

    Args:
        subreddits: iterable of subreddit names.
        time: number of hours to keep tracking (note: inside this function the
            name shadows the ``time`` module, which is not used here).
        n_posts: how many of the newest posts to take from each subreddit.

    Returns:
        pandas.DataFrame: the tracked data, which is also written to
        ``results.csv``.
    """
    # existing results (or an empty table) to append to
    df = load_dataframe()

    # ids of the newest posts, subreddit after subreddit
    ids = [item.id
           for sub in subreddits
           for item in reddit.subreddit(sub).new(limit=n_posts)]

    # run the tracker for the requested number of hours
    df = run(ids, time, df)

    # the tagging step is disabled here:
    #df = tag_fake_news(df)

    # save result to .csv
    df.to_csv(r'results.csv', index=False, header=True)
    return df

In [8]:
# Subreddits to track (15 in total).
# 'UpliftingNews' was previously misspelled as 'Uplifitingnews'.
subs = ['news', 'politics', 'worldnews', 'UpliftingNews', 'coronavirus', 'covid19', 
        'worldevents', 'economics', 'environment', 'europe', 'republican', 'democrats', 
        'conservative', 'futurology', 'technology']

# First argument is the list of subreddits, second is the number of hours to run,
# third is the number of posts per subreddit
# df = start(subs, 12, 100)


In [9]:
# df = pd.read_csv('Results folder\\results day 1 - 3.csv')
# df = pd.concat([df, pd.read_csv('Results folder\\results day 4 - 6.csv')])
# df = pd.concat([df, pd.read_csv('Results folder\\results day 7 - 9.csv')])
# df = pd.concat([df, pd.read_csv('Results folder\\results day 10.csv')])

In [8]:
files = ['Results folder\\results day 10.csv', 'Results folder\\results day 1 - 3.csv', 
         'Results folder\\results day 4 - 6.csv', 'Results folder\\results day 7 - 9.csv']

# Every file is processed this many times. tag_fake_news() only looks at rows
# whose 'fake' value is still missing, so a later pass presumably serves as a
# retry for urls that failed earlier. This replaces two copy-pasted loops.
N_TAGGING_PASSES = 2

for _ in range(N_TAGGING_PASSES):
    for file in files:
        print(file)
        df = pd.read_csv(file)
        df = tag_fake_news(df, file)

        # write the tagged rows back to the same file
        df.to_csv(file, index=False, header=True)
        clear_output()