# Reddit scrape notebook

Script to scrape Reddit r/AmITheAsshole subreddit for posts and comments, using praw: https://praw.readthedocs.io/en/latest/getting_started/quick_start.html

In [1]:
import praw
import pandas as pd
import datetime as dt
import time
from dotenv import load_dotenv
import os
import itertools

load_dotenv()

True

In [4]:
reddit = praw.Reddit(
    client_id=os.getenv("REDDIT_CLIENT_ID"),
    client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
    password=os.getenv("REDDIT_PASS"),
    user_agent=os.getenv("REDDIT_USER_AGENT"),
    username=os.getenv("REDDIT_UNAME")
)

In [5]:
reddit.read_only = True

In [6]:
subreddit = reddit.subreddit("AmItheAsshole")
print(subreddit.display_name)


AmItheAsshole


In [7]:
submissions = []
for submission in subreddit.top(limit=501, time_filter='week'):
    
    comment_list = []
    submission.comments.replace_more(limit=0)
    for top_level_comment in submission.comments[1:]:
        comment_list.append(top_level_comment.body)
    
    submissions.append(
        {
            "title": submission.title,
            "body": submission.selftext,
            "verdict": submission.link_flair_text,
            "comments": comment_list,
            "num_comments": submission.num_comments,
            "upvotes": submission.score,
            "upvote_ratio": submission.upvote_ratio,
            "url": submission.permalink,
            "created": submission.created_utc,
            "edited": submission.edited                       
        }
    )


In [8]:
filename = "aita-500-top-this-week.csv"
data = pd.DataFrame(submissions)
data.to_csv(filename)

In [9]:
data.head()

Unnamed: 0,title,body,verdict,comments,num_comments,upvotes,upvote_ratio,url,created,edited
0,AITA for asking my GF if she can take a shower?,"My GF really only showers once a week, twice i...",Not the A-hole POO Mode,"[""I'm not even sure if I started to resent her...",4253,15872,0.96,/r/AmItheAsshole/comments/1iy5n9x/aita_for_ask...,1740517000.0,1740790583.0
1,AITA for insulting my husband for what he said...,My daughter (17f) recently started dating this...,Not the A-hole,[So he would rather your daughter be married t...,1481,10857,0.96,/r/AmItheAsshole/comments/1iz4o6f/aita_for_ins...,1740621000.0,False
2,AITA for asking a guest to not crochet at my b...,I'm (28F) getting married this year (yay!) and...,Not the A-hole,[So... do we maybe see why brother's GF would ...,775,8091,0.97,/r/AmItheAsshole/comments/1j0isvh/aita_for_ask...,1740778000.0,1740780160.0
3,AITA for not comforting my bf after he didn’t ...,"Earlier in the day, I told my bf that I was go...",Not the A-hole,[It's not your job to help him regulate his em...,818,7515,0.96,/r/AmItheAsshole/comments/1j284st/aita_for_not...,1740968000.0,False
4,Update: AITA because I don't want my half brot...,this is an update to my [original post](https:...,UPDATE,[Definitely time for me to leave the internet ...,118,7296,0.99,/r/AmItheAsshole/comments/1j12uam/update_aita_...,1740845000.0,False


In [10]:
data = data.iloc[1:] # quitar el primer post que son las reglas del subreddit

In [11]:
# aqui creamos las columnas de los votos que tiene cada veredicto
def count_occurrences(comments, keyword):
    keyword = keyword.lower() 
    return sum(keyword in comment.lower() for comment in comments)

verdicts = ['YTA', 'YWBTA', 'NTA', 'YWNBTA', 'ESH', 'NAH', 'INFO']

for verdict in verdicts:
    data[verdict] = data['comments'].apply(lambda x: count_occurrences(x, verdict))

data.head()


Unnamed: 0,title,body,verdict,comments,num_comments,upvotes,upvote_ratio,url,created,edited,YTA,YWBTA,NTA,YWNBTA,ESH,NAH,INFO
1,AITA for insulting my husband for what he said...,My daughter (17f) recently started dating this...,Not the A-hole,[So he would rather your daughter be married t...,1481,10857,0.96,/r/AmItheAsshole/comments/1iz4o6f/aita_for_ins...,1740621000.0,False,9,0,164,0,13,2,0
2,AITA for asking a guest to not crochet at my b...,I'm (28F) getting married this year (yay!) and...,Not the A-hole,[So... do we maybe see why brother's GF would ...,775,8091,0.97,/r/AmItheAsshole/comments/1j0isvh/aita_for_ask...,1740778000.0,1740780160.0,2,0,161,0,0,3,3
3,AITA for not comforting my bf after he didn’t ...,"Earlier in the day, I told my bf that I was go...",Not the A-hole,[It's not your job to help him regulate his em...,818,7515,0.96,/r/AmItheAsshole/comments/1j284st/aita_for_not...,1740968000.0,False,12,0,143,0,21,5,2
4,Update: AITA because I don't want my half brot...,this is an update to my [original post](https:...,UPDATE,[Definitely time for me to leave the internet ...,118,7296,0.99,/r/AmItheAsshole/comments/1j12uam/update_aita_...,1740845000.0,False,1,0,2,0,0,0,0
5,WIBTA if I send an email to the bday girl’s mo...,My (43F) daughter Annie 9F) was invited to a p...,Not the A-hole,"[NTA, but: You should have asked, why there is...",645,7343,0.96,/r/AmItheAsshole/comments/1j2hp1b/wibta_if_i_s...,1741005000.0,1741020672.0,33,38,41,1,15,7,6


In [12]:
data['created'] = data['created'].apply(lambda x: dt.datetime.fromtimestamp(x))
data['edited'] = data['edited'].apply(lambda x: dt.datetime.fromtimestamp(x) if x > 0 else None)



In [13]:
from ydata_profiling import ProfileReport

profile = ProfileReport(data, title="AITA data report")
profile.to_file("aita-report.html") 

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [14]:
processed_data_filename = "aita-500-top-this-week-processed.csv"
data.to_csv(processed_data_filename)