In [132]:
import datetime
import os
import praw
import pandas as pd

from dataclasses import dataclass, asdict
from decouple import config
from tqdm import tqdm


# Shared PRAW client; get_submissions() and the top-100 cells read it as a module-level global.
# The client id and secret are looked up by name through decouple's `config`, so they are
# not hardcoded in the notebook.
# NOTE(review): "NA" is a placeholder user agent — Reddit's API guidelines ask for a unique,
# descriptive string; consider replacing it.
reddit = praw.Reddit(
    client_id=config("REDDIT_KEY"),
    client_secret=config("REDDIT_SECRET"),
    user_agent="NA"
)



## Functions

In [135]:

def extract_attributes_from_subreddit(subreddit):
    '''
    Collect a fixed set of attributes from a subreddit object into a plain dict.

    Each attribute is read with ordinary attribute access, in the order listed
    below, so any object exposing these attributes works. Two output keys differ
    from the attribute they come from: "videos_allowed" <- allow_videos and
    "images_allowed" <- allow_images.

    Returns a dict with one entry per listed attribute.
    '''
    # (output key, attribute name on the subreddit object)
    key_to_attribute = (
        ("active_user_count", "active_user_count"),
        ("url", "url"),
        ("title", "title"),
        ("subscribers", "subscribers"),
        ("subreddit_type", "subreddit_type"),
        ("spoilers_enabled", "spoilers_enabled"),
        ("public_description", "public_description"),
        ("over18", "over18"),
        ("created", "created"),
        ("created_utc", "created_utc"),
        ("lang", "lang"),
        ("videos_allowed", "allow_videos"),
        ("images_allowed", "allow_images"),
    )
    return {key: getattr(subreddit, attribute) for key, attribute in key_to_attribute}


def convert_timestamp(ts):
    '''
    Format a POSIX timestamp as an "MM-DD-YYYY" date string.

    The timestamp is interpreted in the machine's local timezone
    (datetime.fromtimestamp is called without a tz argument).
    '''
    return datetime.datetime.fromtimestamp(ts).strftime("%m-%d-%Y")



def get_submissions(subreddit_name: str, category: str, limit: int) -> list:
    '''
    Fetch submissions from one listing of a subreddit and return them as a list.

    subreddit_name: name handed to reddit.subreddit().
    category: 'hot' or 'new' select those listings; any other value
        (including 'top') selects the top listing.
    limit: forwarded to the listing call as its `limit` argument.

    Example:
    >>> get_submissions('MrRobot', 'hot', 10)
    '''
    # Anything that is not explicitly 'hot' or 'new' falls through to 'top'.
    listing_name = category if category in ('hot', 'new') else 'top'
    fetch_listing = getattr(reddit.subreddit(subreddit_name), listing_name)
    return list(fetch_listing(limit=limit))



def get_comments(submission: "praw.models.Submission", body: bool=False) -> list:
    '''
    Return the top-level comments of a submission.

    submission: object whose `comments` attribute is iterable (a PRAW submission).
    body: when False (default) the comment objects themselves are returned,
        exactly as the forest yields them. When True only the comment texts
        are returned.

    A PRAW comment forest may contain "load more comments" placeholders
    (MoreComments) that have no `body` attribute. With body=True those
    entries are skipped instead of raising AttributeError.
    '''
    if body:
        # Duck-typed filter: keep only entries that actually carry text.
        return [comment.body for comment in submission.comments if hasattr(comment, "body")]
    return list(submission.comments)



In [41]:
# Up to ten submissions from the "hot" listing of r/MrRobot; the example cells below index into this list.
mr10h = get_submissions('MrRobot', 'hot', 10)

In [125]:

@dataclass
class Submission:
    '''
    Flat record of a single Reddit submission.

    After construction:

    * `title` and `selftext` have every comma replaced by a space
      (presumably so the text cannot break comma-separated output).
    * `created` holds an "MM-DD-YYYY" string. A numeric timestamp is
      converted with convert_timestamp(); a value that is already a string
      is kept as-is, so Submission(**asdict(sub)) and
      dataclasses.replace(sub, ...) work on an existing record instead of
      raising TypeError.
    '''
    subreddit_url: str
    subreddit_name: str
    title: str
    selftext: str
    author: str
    created: int  # numeric timestamp on input; "MM-DD-YYYY" string after __post_init__

    over_18: bool
    edited: bool
    is_original_content: bool
    locked: bool
    spoiler: bool

    num_comments: int
    num_crossposts: int
    num_duplicates: int
    num_reports: int  # can be None (it is in the example record shown in this notebook)
    num_upvotes: int
    num_downvotes: int

    def __post_init__(self):
        '''Strip commas from the text fields and turn `created` into a date string.'''
        self.title = self.title.replace(',', ' ')
        self.selftext = self.selftext.replace(',', ' ')
        # Convert raw timestamps only: an already formatted date string would
        # make datetime.fromtimestamp() raise TypeError on re-construction.
        if not isinstance(self.created, str):
            self.created = convert_timestamp(self.created)
        
        
@dataclass
class Comment:
    '''
    Placeholder dataclass for a single Reddit comment.

    No fields are declared yet, so instances carry no data.
    '''

In [138]:
# Build a Submission record from the first fetched post as a smoke test of the dataclass.
# Keyword arguments are used so a value cannot silently land in the wrong field.
first_post = mr10h[0]
testsub = Submission(
    subreddit_url=first_post.subreddit.url,
    # The url has the shape "/r/<name>/", so the second-to-last segment is the name.
    subreddit_name=first_post.subreddit.url.split('/')[-2],
    title=first_post.title,
    selftext=first_post.selftext,
    # PRAW reports a deleted account as author=None; guard the .name lookup
    # so such posts do not raise AttributeError.
    author=first_post.author.name if first_post.author is not None else "[deleted]",
    created=first_post.created,
    over_18=first_post.over_18,
    edited=first_post.edited,
    is_original_content=first_post.is_original_content,
    locked=first_post.locked,
    spoiler=first_post.spoiler,
    num_comments=first_post.num_comments,
    num_crossposts=first_post.num_crossposts,
    num_duplicates=first_post.num_duplicates,
    num_reports=first_post.num_reports,
    num_upvotes=first_post.ups,
    num_downvotes=first_post.downs,
)

testsub

Submission(subreddit_url='/r/MrRobot/', subreddit_name='MrRobot', title='Free Talk Monthly (Month of August 04  2020)', selftext='Comments are sorted by new by default. Feel free to talk about anything. Spoiler tags are not needed for any Mr. Robot talk.', author='AutoModerator', created='08-05-2020', over_18=False, edited=False, is_original_content=False, locked=False, spoiler=False, num_comments=14, num_crossposts=0, num_duplicates=0, num_reports=None, num_upvotes=8, num_downvotes=0)

In [152]:
# Top-level comment objects of the first fetched submission.
testcoms = get_comments(mr10h[0])

# Text of the first reply to the second top-level comment.
# NOTE(review): the hard-coded indices assume that comment exists and has at least one reply — verify when re-running.
testcoms[1].replies[0].body

'i think its an lsd friendly show. its so immersive.'

# Top 100 Subreddits

In [27]:
import pandas as pd
import datetime

# Read the clock once so both strings are guaranteed to describe the same day
# (two separate now() calls could straddle midnight and disagree).
run_started_at = datetime.datetime.now()
date_asof = run_started_at.strftime("%m-%d-%Y")  # dashed form, stored in the `asof` column
date_file_format = run_started_at.strftime("%m%d%Y")  # separator-free form, used in the CSV file name

# Scrape the HTML tables from the frontpagemetrics "top" page; pd.read_html returns a list of DataFrames.
top_100 = pd.read_html('https://frontpagemetrics.com/top')

In [3]:
# Take the first scraped table and keep only the text after the last "/" of each
# entry in its "Reddit" column (presumably turning "/r/<name>" into "<name>").
top_100_subreddits = [
    entry.rsplit('/', 1)[-1] for entry in top_100[0]["Reddit"]
]

In [23]:
# Create one PRAW subreddit handle per scraped name.
# NOTE(review): the recorded run finished at ~100k it/s, which suggests no network request happens here — the handles appear to be lazy.
subreddits = [reddit.subreddit(subreddit_name) for subreddit_name in tqdm(top_100_subreddits)]

100%|██████████| 100/100 [00:00<00:00, 117224.82it/s]


In [24]:
# Read the selected attributes of every subreddit into one dict per subreddit.
# NOTE(review): this is the slow step (~3 it/s in the recorded run), presumably because attribute access triggers the API fetch.
subreddit_data = [extract_attributes_from_subreddit(subreddit) for subreddit in tqdm(subreddits)]

100%|██████████| 100/100 [00:30<00:00,  3.32it/s]


In [9]:
# One row per subreddit, one column per extracted attribute.
subreddits_df = pd.DataFrame(subreddit_data)

In [28]:
# Add derived / bookkeeping columns to the frame in place.
# created_date: the `created` timestamp rendered as an MM-DD-YYYY string.
subreddits_df['created_date'] = subreddits_df.created.apply(convert_timestamp)
# Names are attached positionally, so this relies on subreddit_data keeping the order of top_100_subreddits.
subreddits_df['name'] = top_100_subreddits
# asof: the date string computed when the date cell was run, applied to every row.
subreddits_df['asof'] = date_asof

In [29]:
# Write the snapshot to a date-stamped CSV (top100subreddits<MMDDYYYY>.csv, relative path) without the index column.
subreddits_df.to_csv(f'top100subreddits{date_file_format}.csv', index=False)