In [1]:
import datetime
import os
import praw
import pandas as pd

from dataclasses import dataclass, asdict
from decouple import config
from tqdm import tqdm


# Reddit API client used by the helpers below through the module-level name
# `reddit`. The credentials come from `config(...)` lookups rather than
# literals, so they are not stored in the notebook.
# NOTE(review): user_agent is the placeholder "NA" — consider a descriptive
# value identifying this script.
reddit = praw.Reddit(
    client_id=config("REDDIT_KEY"),
    client_secret=config("REDDIT_SECRET"),
    user_agent="NA"
)



## Functions

In [2]:

def extract_attributes_from_subreddit(subreddit):
    """Copy a fixed set of attributes from a subreddit object into a dict.

    Each output key maps to the attribute of the same name, except
    ``videos_allowed`` and ``images_allowed``, which are read from
    ``allow_videos`` and ``allow_images``.

    Args:
        subreddit: Object exposing the attributes listed below.

    Returns:
        dict: One entry per attribute, in the order listed.
    """
    # (output key, attribute read from `subreddit`)
    key_to_attribute = (
        ("active_user_count", "active_user_count"),
        ("url", "url"),
        ("title", "title"),
        ("subscribers", "subscribers"),
        ("subreddit_type", "subreddit_type"),
        ("spoilers_enabled", "spoilers_enabled"),
        ("public_description", "public_description"),
        ("over18", "over18"),
        ("created", "created"),
        ("created_utc", "created_utc"),
        ("lang", "lang"),
        ("videos_allowed", "allow_videos"),
        ("images_allowed", "allow_images"),
    )
    return {key: getattr(subreddit, attribute) for key, attribute in key_to_attribute}


def convert_timestamp(ts):
    """Format a Unix timestamp as an ``MM-DD-YYYY`` string.

    The timestamp is interpreted with ``datetime.fromtimestamp`` without a
    ``tz`` argument, i.e. in the machine's local time zone.
    """
    return datetime.datetime.fromtimestamp(ts).strftime("%m-%d-%Y")



def get_submissions(subreddit_name: str, category: str, limit: int) -> list:
    '''
    Fetch submissions from one listing of a subreddit and return them as a list.

    Categories: hot, new or top (any other value is treated as top)

    >>> get_submissions('MrRobot', 'hot', 10)
    '''
    # Anything that is not explicitly 'hot' or 'new' falls back to 'top'.
    listing_name = category if category in ('hot', 'new') else 'top'
    fetch_listing = getattr(reddit.subreddit(subreddit_name), listing_name)
    return list(fetch_listing(limit=limit))



def get_comments(submission: "praw.models.reddit.submission.Submission", body: bool=False) -> list:
    """Return the comments obtained by iterating ``submission.comments``.

    Args:
        submission: Submission whose ``comments`` forest is read. The forest
            is modified in place by ``replace_more``.
        body: When True, return each comment's ``body`` text instead of the
            comment objects.

    Returns:
        list: Comment objects, or their body strings when ``body`` is True.
    """
    # The forest can contain MoreComments placeholders, which have no `.body`
    # (or `.author`) and would raise AttributeError downstream.
    # replace_more(limit=0) removes them without loading the hidden comments.
    submission.comments.replace_more(limit=0)
    if body:
        return [comment.body for comment in submission.comments]
    return list(submission.comments)



In [60]:
# Sample data for the cells below: up to 10 submissions from the "hot" listing
# of the MrRobot subreddit.
mr10h = get_submissions('MrRobot', 'hot', 10)

In [76]:

@dataclass
class Submission:
    """Flat record describing one Reddit submission.

    The field order is part of the constructor's positional signature.
    After construction, ``title`` and ``selftext`` have their commas replaced
    by spaces and ``created`` holds the string returned by
    ``convert_timestamp`` instead of the raw timestamp.
    """
    # --- where it was posted / what it says ---
    subreddit_url: str
    subreddit_name: str
    title: str
    selftext: str
    author: str
    created: int  # raw timestamp on input; rewritten in __post_init__

    # --- flags ---
    over_18: bool
    edited: bool
    is_original_content: bool
    locked: bool
    spoiler: bool

    # --- counters ---
    num_comments: int
    num_crossposts: int
    num_duplicates: int
    num_reports: int
    num_upvotes: int
    num_downvotes: int

    def __post_init__(self):
        # Replace commas with spaces in the free-text fields.
        for text_field in ("title", "selftext"):
            setattr(self, text_field, getattr(self, text_field).replace(',', ' '))
        self.created = convert_timestamp(self.created)
    
    
def submission_factory(subreddit):
    """Build a ``Submission`` record from a Reddit submission object.

    Args:
        subreddit: Despite the name, this is the *submission* object (it is
            read for ``title``, ``selftext``, ``author`` and so on). The
            parameter name is kept so existing callers are unaffected.

    Returns:
        Submission: Record populated from the object's attributes. ``author``
        is the account name, or ``"[deleted]"`` when the object has no author.
    """
    submission = subreddit
    parent_url = submission.subreddit.url
    # `author` is None when the posting account was deleted; reading `.name`
    # unconditionally would raise AttributeError and abort the whole export.
    author = submission.author
    return Submission(
        subreddit_url=parent_url,
        # A URL of the form '/r/<name>/' has the name as its second-to-last piece.
        subreddit_name=parent_url.split('/')[-2],
        title=submission.title,
        selftext=submission.selftext,
        author=author.name if author is not None else "[deleted]",
        created=submission.created,
        over_18=submission.over_18,
        edited=submission.edited,
        is_original_content=submission.is_original_content,
        locked=submission.locked,
        spoiler=submission.spoiler,
        num_comments=submission.num_comments,
        num_crossposts=submission.num_crossposts,
        num_duplicates=submission.num_duplicates,
        num_reports=submission.num_reports,
        num_upvotes=submission.ups,
        num_downvotes=submission.downs,
    )


@dataclass
class Comment:
    """Flat record describing one Reddit comment.

    After construction, ``body`` has its commas replaced by spaces and
    ``created`` holds the string returned by ``convert_timestamp`` instead of
    the raw timestamp.
    """
    author: str
    created: int  # raw timestamp on input; rewritten in __post_init__
    body: str
    ups: int
    downs: int
    subreddit: str
    submission: str

    def __post_init__(self):
        # Normalise the text first, then swap the timestamp for its date string.
        comma_free_body = self.body.replace(',', ' ')
        self.body = comma_free_body
        created_label = convert_timestamp(self.created)
        self.created = created_label
        
        
def comment_factory(comment):
    """Build a ``Comment`` record from a Reddit comment object.

    Args:
        comment: Object exposing ``author``, ``created``, ``body``, ``ups``,
            ``downs``, ``subreddit.title`` and ``submission.title``.

    Returns:
        Comment: Record populated from those attributes. ``author`` is the
        account name, or ``"[deleted]"`` when the comment has no author.
    """
    # `author` is None when the commenting account was deleted; reading
    # `.name` unconditionally would raise AttributeError.
    author = comment.author
    return Comment(
        author=author.name if author is not None else "[deleted]",
        created=comment.created,
        body=comment.body,
        ups=comment.ups,
        downs=comment.downs,
        subreddit=comment.subreddit.title,
        submission=comment.submission.title,
    )


# Build Comment records for the first sample submission. These assignments
# must run: the following cell displays `comments`, which is defined nowhere
# else, so leaving them commented out breaks a fresh Restart & Run All.
testcoms = get_comments(mr10h[0])
comments = [comment_factory(comment) for comment in testcoms]

In [78]:
# Show the Comment records; `comments` must already exist in the kernel (it is
# assigned from comment_factory results in the previous cell).
comments

[Comment(author='tavuskusu', created='09-05-2020', body='I recently started my first whole series rewatch since watching S4 live as it came out last fall. I needed some time. It definitely struck some deep chords within me. But damn. I don’t think I’ll ever find another show that matters so much to me.', ups=5, downs=0, subreddit='Mr. Robot', submission='Free Talk Monthly (Month of September 04, 2020)'),
 Comment(author='fifteensunflwrs', created='09-06-2020', body="The only inconsistency in this show: in the second episode  Elliot mentioned that now that he is going to try to have a normal life maybe he'll go to gym. But at the same time he is lowkey pretty ripped???? Literally unwatchable....", ups=2, downs=0, subreddit='Mr. Robot', submission='Free Talk Monthly (Month of September 04, 2020)'),
 Comment(author='ohsnapitserny', created='09-06-2020', body='So sorry  for the dumb question. But since our Elliot isn’t the real Elliot. Does the real Elliot looks totally different?', ups=1,

# Subreddits

In [80]:


# Run date used for the "asof" column (MM-DD-YYYY) and for CSV file names
# (MMDDYYYY). The clock is read once and the second form is derived from the
# first, so the two can never disagree if the cell runs across midnight.
date_asof = datetime.datetime.now().strftime("%m-%d-%Y")
date_file_format = date_asof.replace("-", "")


def get_top_100_subreddits():
    """Scrape subreddit names from the frontpagemetrics.com "top" page.

    Reads the HTML tables on the page, takes the first one, and keeps the
    text after the last '/' of every entry in its ``Reddit`` column.

    Returns:
        list: Subreddit names in table order.
    """
    # Read every table from the website; the ranking is the first one.
    page_tables = pd.read_html('https://frontpagemetrics.com/top')
    ranking_table = page_tables[0]

    # Keep only the last path segment of each entry.
    return [entry.rsplit('/', 1)[-1] for entry in ranking_table.Reddit]


def extract_attributes_from_subreddits(subreddit_names):
    """Collect the attribute dict of every named subreddit.

    Args:
        subreddit_names: Iterable of subreddit names.

    Returns:
        list: One dict per name, as produced by
        ``extract_attributes_from_subreddit``, in input order.
    """
    # Create all subreddit handles up front so the progress bar gets a sized list.
    subreddit_handles = list(map(reddit.subreddit, subreddit_names))

    attribute_rows = []
    for handle in tqdm(subreddit_handles):
        attribute_rows.append(extract_attributes_from_subreddit(handle))
    return attribute_rows
    

def store_subreddits_to_csv(subreddit_data, subreddit_names):
    """Write subreddit attribute rows to ``top_100_subreddits_<MMDDYYYY>.csv``.

    Args:
        subreddit_data: List of dicts, one per subreddit; each must contain a
            ``created`` timestamp.
        subreddit_names: Subreddit names aligned one-to-one with
            ``subreddit_data``; stored in the ``name`` column.

    Raises:
        ValueError: If ``subreddit_names`` and ``subreddit_data`` differ in
            length (raised by pandas on column assignment).
    """
    # Store the result in a DataFrame.
    subreddits_df = pd.DataFrame(subreddit_data)

    # Add a few derived columns.
    subreddits_df['created_date'] = subreddits_df['created'].apply(convert_timestamp)
    # Use the names passed in; the previous code read `top_100_subreddits`,
    # which exists only as a local inside get_top_100_subreddits (NameError).
    subreddits_df['name'] = list(subreddit_names)
    subreddits_df['asof'] = date_asof

    # Save as CSV, stamped with the run date.
    subreddits_df.to_csv(f'top_100_subreddits_{date_file_format}.csv', index=False)
    

In [53]:
# Driver for the subreddit export: fetch the names, pull each subreddit's
# attributes, write the CSV. The calls are commented out — presumably so a
# re-run does not repeat the scrape; uncomment all three to refresh the file.
# subreddit_names = get_top_100_subreddits()

# subreddit_data = extract_attributes_from_subreddits(subreddit_names)

# store_subreddits_to_csv(subreddit_data, subreddit_names)

100%|██████████| 100/100 [00:28<00:00,  3.55it/s]


# Submissions

In [44]:
    
def store_submissions_to_csv(category, num_subreddits=10, limit=10, subreddit_names=None):
    """Export submissions from several subreddits to ``<category>_submissions_<MMDDYYYY>.csv``.

    Args:
        category: Listing passed to ``get_submissions`` ('hot', 'new' or 'top').
        num_subreddits: How many subreddits to process, taken from the start
            of the name list.
        limit: Maximum number of submissions requested per subreddit.
        subreddit_names: Optional list of subreddit names. When omitted, the
            list is fetched with ``get_top_100_subreddits()``.
    """
    print('Getting list of subreddits...')

    # The previous code iterated an undefined global `top_100_subreddits`;
    # fetch the list here (as the message above announces) unless given one.
    if subreddit_names is None:
        subreddit_names = get_top_100_subreddits()

    submissions = []

    for subreddit_name in tqdm(subreddit_names[:num_subreddits]):
        submission_objects = get_submissions(subreddit_name, category, limit)
        # One plain dict per submission so the DataFrame is built only once.
        submissions.extend(
            asdict(submission_factory(submission_obj))
            for submission_obj in submission_objects
        )

    print('Data extraction complete.')
    print('Storing data to csv...')

    pd.DataFrame(submissions).to_csv(f'{category}_submissions_{date_file_format}.csv', index=False)



In [45]:
# `get_submissions_as_dict` is not defined anywhere in the notebook; the
# recorded output below is what store_submissions_to_csv prints.
store_submissions_to_csv('new', 2, 2)

  0%|          | 0/2 [00:00<?, ?it/s]

Getting list of subreddits...


100%|██████████| 2/2 [00:12<00:00,  6.06s/it]

Data extraction complete.
Storing data to csv...





# Comments