In [None]:
import requests
import json
import os
from dotenv import load_dotenv
from pathlib import Path
import sys
import praw
from datetime import datetime
import pandas as pd
from praw.models import MoreComments
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from os.path import exists

# Read the Reddit login settings stored in the local .env file
load_dotenv('logstuff.env')

# Account credentials (None when a variable is missing from the environment)
username = os.getenv('user')
password = os.getenv('password')

# API application credentials and the user agent sent with each request
app_id = os.getenv('app_id')
secret = os.getenv('secret')
useragent = os.getenv('useragent')
headers = {'User-Agent': useragent}
threadList = []

# Authenticated Reddit session used by every later cell
reddit = praw.Reddit(
    username=username,
    password=password,
    client_id=app_id,
    client_secret=secret,
    user_agent=useragent,
)

# Walk the 25 hottest r/politics submissions and keep, for each one, the
# second '_'-separated piece of its `name` attribute as the thread id
subreddit = reddit.subreddit('politics')
threadList.extend(
    hotThread.name.split('_')[1] for hotThread in subreddit.hot(limit=25)
)

print(threadList)


In [None]:

# gets comments from each comment tree
def getSubComments(comment, allComments, verbose=True):
    """Collect `comment` and everything beneath it into `allComments`.

    The tree is walked depth-first, parents before children, and every
    visited node is appended to `allComments` in visiting order.

    A node that has a `replies` attribute contributes those replies as its
    children. A node without one is asked for its children through its
    `comments()` method; when `verbose` is true a progress line with the
    running total is printed after each such call.

    Args:
        comment: root node of the tree to walk.
        allComments: list that is extended in place with the visited nodes.
        verbose: print a progress line whenever `comments()` had to be called.

    Returns:
        None; results are delivered through `allComments`.
    """
    # Explicit stack of child iterators stands in for the call stack; the
    # innermost (last) iterator is always drained first, giving pre-order.
    pendingIterators = [iter((comment,))]

    while pendingIterators:
        try:
            node = next(pendingIterators[-1])
        except StopIteration:
            # this level is exhausted, resume the parent level
            pendingIterators.pop()
            continue

        allComments.append(node)

        if hasattr(node, "replies"):
            children = node.replies
        else:
            children = node.comments()
            if verbose:
                print(f"fetching ({len(allComments)} comments fetched total)")

        pendingIterators.append(iter(children))
    
# iterates over all comment trees    
def getAll(r, submissionId, verbose=True):
    """Return every comment of a submission as one flat list.

    Args:
        r: object exposing `submission(id)`; the returned submission must
            have an iterable `comments` attribute holding the top-level
            comment trees.
        submissionId: id handed to `r.submission`.
        verbose: forwarded to `getSubComments` to control progress output.

    Returns:
        list containing, for each top-level comment in order, the nodes that
        `getSubComments` collects from that comment's tree.
    """
    submission = r.submission(submissionId)
    comments = submission.comments
    commentsList = []
    # Walk each top-level tree in turn. The previous version passed an
    # undefined name `comment` here instead of looping over `comments`.
    for comment in comments:
        getSubComments(comment, commentsList, verbose=verbose)
    return commentsList
  

In [None]:

# Column layout of the results table; this is also the column order of the CSVs
dfColumns = ['Name', 'TotalKarma', 'CommentLength', 'CommentDateTime',
             'AccountDateTime', 'ThreadDateTime', 'CommentKarma', 'AdjustedKarma',
             'AdjustedKarmaPercent', 'CommentURL']

# One dict per kept comment. Rows are gathered in a plain list and turned into
# a DataFrame once at the end: DataFrame.append no longer exists in pandas 2.x
# and re-copied the whole frame on every call.
commentRows = []

today = datetime.now()

# NOTE(review): datetime.fromtimestamp() converts the *_utc epoch values to the
# machine's local time zone - confirm that local (not UTC) timestamps are wanted.

# Banned-accounts log; `with` closes the file even if a request fails mid-scrape
with open('BannedAccounts.txt', 'a') as bannedAccounts:
    for url in threadList:
        # progress markers so a long run is visibly advancing
        print('tick')
        print(url)

        res = reddit.submission(url)
        submissionDateTime = datetime.fromtimestamp(res.created_utc)
        for comment in res.comments:
            if isinstance(comment, MoreComments):
                continue
            # A comment without an author must be skipped BEFORE any author
            # attribute is read; otherwise the AttributeError handler below
            # would record it as a banned account.
            if comment.author is None:
                continue
            try:
                accountDateTime = datetime.fromtimestamp(comment.author.created_utc)
                commentDateTime = datetime.fromtimestamp(comment.created_utc)
                # zero karma would divide by zero in AdjustedKarmaPercent
                if comment.author.comment_karma == 0:
                    continue
                commentURL = reddit.config.reddit_url + comment.permalink
                commentRows.append({
                    'Name': comment.author,
                    'TotalKarma': comment.author.comment_karma,
                    'CommentKarma': comment.score,
                    'CommentDateTime': commentDateTime,
                    'AdjustedKarma': -(comment.score - comment.author.comment_karma),
                    'AdjustedKarmaPercent': comment.score / comment.author.comment_karma,
                    'ThreadDateTime': submissionDateTime,
                    'AccountDateTime': accountDateTime,
                    'CommentLength': len(comment.body),
                    'CommentURL': commentURL,
                })
            except AttributeError:
                # The author object exists but lacks an expected attribute;
                # the notebook treats that as a banned account and logs it.
                print("Attribute error for " + str(comment))
                write = ['Author of comment ', str(comment), ', ', str(comment.author),
                         ', in thread ', str(res.name), '(', str(res.title), ')', ' shows as a banned account \n',
                         str(today), '\n']
                bannedAccounts.writelines(write)

# Build the frame in one step; `columns` fixes the order and still yields an
# empty, correctly labelled frame when nothing was collected
df = pd.DataFrame(commentRows, columns=dfColumns)

df.head(500)

In [None]:
# reports how many comment rows were collected
print(df.shape[0])

In [None]:
# Snapshot of this scraping run only (overwritten each time)
df.to_csv('out.csv', index = False)

# Running archive across runs: when the file is already there the rows are
# appended without a header line, otherwise a new file with headers is started
cumulativeExists = exists('cumulativeout.csv')
df.to_csv(
    'cumulativeout.csv',
    mode = 'a' if cumulativeExists else 'w',
    index = False,
    header = not cumulativeExists,
)
print('appending to cumulativeout.csv' if cumulativeExists else "started new cumulativeout.csv!")