# Reddit scrape notebook

Script to scrape Reddit r/AmITheAsshole subreddit for posts and comments, using praw: https://praw.readthedocs.io/en/latest/getting_started/quick_start.html

In [14]:
import praw
import pandas as pd
import datetime as dt
import time
from dotenv import load_dotenv
import os
import itertools

load_dotenv()

True

In [15]:
reddit = praw.Reddit(
    client_id=os.getenv("REDDIT_CLIENT_ID"),
    client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
    password=os.getenv("REDDIT_PASS"),
    user_agent=os.getenv("REDDIT_USER_AGENT"),
    username=os.getenv("REDDIT_UNAME")
)

In [16]:
reddit.read_only = True

In [17]:
subreddit = reddit.subreddit("AmItheAsshole")
print(subreddit.display_name)


AmItheAsshole


In [18]:
submissions = []
for submission in subreddit.top(limit=5002, time_filter='all'):
    
    comment_list = []
    submission.comments.replace_more(limit=0)
    for top_level_comment in submission.comments[1:]:
        comment_list.append(top_level_comment.body)
    
    submissions.append(
        {
            "title": submission.title,
            "body": submission.selftext,
            "verdict": submission.link_flair_text,
            "comments": comment_list,
            "num_comments": submission.num_comments,
            "upvotes": submission.score,
            "upvote_ratio": submission.upvote_ratio,
            "url": submission.permalink,
            "created": submission.created_utc,
            "edited": submission.edited                       
        }
    )


In [19]:
filename = "data/aita-5000.csv"
data = pd.DataFrame(submissions)
data.to_csv(filename)

In [20]:
data.head()

Unnamed: 0,title,body,verdict,comments,num_comments,upvotes,upvote_ratio,url,created,edited
0,AITA for telling my wife the lock on my daught...,My brother in-law (Sammy) lost his home shortl...,Not the A-hole,"[[deleted], NTA. Don't back down. You are the ...",5244,81018,0.92,/r/AmItheAsshole/comments/ocx94s/aita_for_tell...,1625316000.0,False
1,META: This sub is moving towards a value syste...,I’ve enjoyed reading and posting on this sub f...,META,[While I find this notion accurate to a degree...,6149,80921,0.92,/r/AmItheAsshole/comments/d6xoro/meta_this_sub...,1568998000.0,1574222702.0
2,"UPDATE, AITA for despising my mentally handica...","I'm back like I said I would be,. My [original...",UPDATE,[I'm glad your other family is coming through ...,1972,72782,0.96,/r/AmItheAsshole/comments/azvko1/update_aita_f...,1552322000.0,1552376988.0
3,AITA For suing my girlfriend after she had my ...,I'll try to keep this short. I had a [1967 Imp...,Not the A-hole,[NTA. My husband has several old cars. I haven...,2728,70803,0.98,/r/AmItheAsshole/comments/gr8bp3/aita_for_suin...,1590536000.0,1590606255.0
4,UPDATE: AITA for wanting to go to the funeral ...,I want to sincerely thank everyone who comment...,Update,[],2,67573,0.91,/r/AmItheAsshole/comments/cjetsa/update_aita_f...,1564423000.0,False


In [21]:
data = data.iloc[1:] # quitar el primer post que son las reglas del subreddit

In [22]:
# aqui creamos las columnas de los votos que tiene cada veredicto
def count_occurrences(comments, keyword):
    keyword = keyword.lower() 
    return sum(keyword in comment.lower() for comment in comments)

verdicts = ['YTA', 'YWBTA', 'NTA', 'YWNBTA', 'ESH', 'NAH', 'INFO']

for verdict in verdicts:
    data[verdict] = data['comments'].apply(lambda x: count_occurrences(x, verdict))

data.head()


Unnamed: 0,title,body,verdict,comments,num_comments,upvotes,upvote_ratio,url,created,edited,YTA,YWBTA,NTA,YWNBTA,ESH,NAH,INFO
1,META: This sub is moving towards a value syste...,I’ve enjoyed reading and posting on this sub f...,META,[While I find this notion accurate to a degree...,6149,80921,0.92,/r/AmItheAsshole/comments/d6xoro/meta_this_sub...,1568998000.0,1574222702.0,3,0,12,0,5,0,0
2,"UPDATE, AITA for despising my mentally handica...","I'm back like I said I would be,. My [original...",UPDATE,[I'm glad your other family is coming through ...,1972,72782,0.96,/r/AmItheAsshole/comments/azvko1/update_aita_f...,1552322000.0,1552376988.0,0,0,18,0,0,0,3
3,AITA For suing my girlfriend after she had my ...,I'll try to keep this short. I had a [1967 Imp...,Not the A-hole,[NTA. My husband has several old cars. I haven...,2728,70803,0.98,/r/AmItheAsshole/comments/gr8bp3/aita_for_suin...,1590536000.0,1590606255.0,0,0,124,0,0,0,1
4,UPDATE: AITA for wanting to go to the funeral ...,I want to sincerely thank everyone who comment...,Update,[],2,67573,0.91,/r/AmItheAsshole/comments/cjetsa/update_aita_f...,1564423000.0,False,0,0,0,0,0,0,0
5,AITA for pretending to get fired when customer...,I am a high schooler with a weekend job at a c...,Not the A-hole,[NTA - Customers can be fucking dicks. Usually...,3602,63523,0.92,/r/AmItheAsshole/comments/e5k3z2/aita_for_pret...,1575393000.0,False,37,0,23,0,24,2,12


In [23]:
data['created'] = data['created'].apply(lambda x: dt.datetime.fromtimestamp(x))
data['edited'] = data['edited'].apply(lambda x: dt.datetime.fromtimestamp(x) if x > 0 else None)


In [24]:
from ydata_profiling import ProfileReport

profile = ProfileReport(data, title="AITA data report")
profile.to_file("aita-report.html") 

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [25]:
processed_data_filename = "data/aita-5000-processed.csv"
data.to_csv(processed_data_filename)

In [26]:
data.shape

(996, 17)