In [2]:
import pandas as pd
import praw
import json
# import boto3
import io
import time
import pickle

In [3]:
def get_post_data(subreddit_name, subreddit_type = 'new', post_limit = 100, comment_limmit = 100, reddit = None):
    print(f'Getting Reddit Data: Subreddit: {subreddit_name} --- Number of Posts: {post_limit} --- Comment Limit : {comment_limmit}')
    
    ## post id로 가져올 수도 있음 (id argument로)
    subreddit = reddit.subreddit(subreddit_name)
    
    if subreddit_type =='top':
        print('Getting top posts')
        posts = subreddit.top(limit=post_limit)  
    elif subreddit_type=='new':
        print('Getting new posts')
        posts = subreddit.new(limit=post_limit)  
    elif subreddit_type=='hot':
        print('Getting hot posts')
        posts = subreddit.hot(limit=post_limit)  
    posts_with_comments = []
    for post in posts:
        post.comments.replace_more(limit=comment_limmit)
        comments = []
        for comment in post.comments.list():
            comment_data = {
                'body': comment.body,
                'author': str(comment.author),
                'score': comment.score,
                'created_utc': comment.created_utc,
                'is_top_level': comment.is_root,
                'parent_id': comment.parent_id,
                'depth': comment.depth,
                'gilded': comment.gilded
            }
            comments.append(comment_data)

        post_data = {
            'title': post.title,
            'selftext': post.selftext,
            'score': post.score,
            'url': post.url,
            'author': str(post.author),
            'created_utc': post.created_utc,
            'num_comments': post.num_comments,
            'upvote_ratio': post.upvote_ratio,
            'subreddit': str(post.subreddit),
            'comments': comments
        }
        posts_with_comments.append(post_data)
        #stream_to_s3('reddit-project-data', subreddit_name, post_data)
    print('Got Reddit Data')
    return posts_with_comments

In [4]:
# https://lovit.github.io/dataset/2019/01/16/get_reddit/
# https://praw.readthedocs.io/en/latest/index.html
print('Getting Reddit Credentials')
reddit_cred_file = 'utils/reddit_cred.json'
with open(reddit_cred_file, 'r') as file:
    reddit_cred = json.load(file)

# Reddit app
client_id = reddit_cred['client_id']
client_secret = reddit_cred['client_secret']
user_agent = reddit_cred['user_agent']
reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent=user_agent)
print('Got Reddit Credentials')

Getting Reddit Credentials
Got Reddit Credentials


In [8]:
## GET DATA
posts_with_comments = get_post_data(
    subreddit_name = 'stocks',
    subreddit_type = 'hot',
    post_limit = 5,
    comment_limmit = 10,
    reddit = reddit
    )

Getting Reddit Data: Subreddit: stocks --- Number of Posts: 5 --- Comment Limit : 10
Getting hot posts
Got Reddit Data


In [9]:
posts_with_comments

[{'title': 'Rate My Portfolio - r/Stocks Quarterly Thread March 2025',
  'selftext': "Please use this thread to discuss your portfolio, learn of other stock tickers & portfolios like [Warren Buffet's](https://buffett.online/en/portfolio/), and help out users by giving constructive criticism.\n\nWhy quarterly?  Public companies report earnings quarterly; many investors take this as an opportunity to rebalance their portfolios.  We highly recommend you do some reading:  Check out our wiki's list of [relevant posts & book recommendations.](https://www.reddit.com/r/stocks/wiki/index/#wiki_relevant_posts.2C_books.2C_wiki_recommendations)\n\nYou can find stocks on your own by using a scanner like your broker's or [Finviz.](https://finviz.com/screener.ashx)  To help further, here's a list of [relevant websites.](https://www.reddit.com/r/stocks/wiki/index/#wiki_relevant_websites.2Fapps)\n\nIf you don't have a broker yet, see our [list of brokers](https://www.reddit.com/r/stocks/wiki/index/#wik

In [5]:
file_path = './reddit_data/reddit_data_stoks_hot_10.pkl'

In [16]:
with open(file_path, 'wb') as f:
    pickle.dump(posts_with_comments, f)
print(f'Data saved to {file_path}')

Data saved to ./reddit_data/reddit_data_stoks_hot_10.pkl


In [6]:
with open(file_path, 'rb') as f:
    loaded_data = pickle.load(f)

In [10]:
df = pd.DataFrame(loaded_data)
df['all_text'] = df['title'] + df['selftext']
# df['clean_title'] = df['all_text'].apply(lambda x : clean_text(x))
# df = get_sentiment(df, 'clean_title')
# df['timestamp'] = df['created_utc'].apply(convert_utc)

# df['year'] = df['timestamp'].dt.year
# df['month'] = df['timestamp'].dt.month
# df['day'] = df['timestamp'].dt.day

In [11]:
df

Unnamed: 0,title,selftext,score,url,author,created_utc,num_comments,upvote_ratio,subreddit,comments,all_text
0,Rate My Portfolio - r/Stocks Quarterly Thread ...,Please use this thread to discuss your portfol...,132,https://www.reddit.com/r/stocks/comments/1j0w7...,AutoModerator,1740823000.0,335,0.97,stocks,[{'body': 'Late 20s. Decided to keep it simple...,Rate My Portfolio - r/Stocks Quarterly Thread ...
1,r/Stocks Daily Discussion & Options Trading Th...,"This is the daily discussion, so anything stoc...",11,https://www.reddit.com/r/stocks/comments/1kn3z...,AutoModerator,1747301000.0,274,0.99,stocks,"[{'body': 'For those who don't know, UNH is th...",r/Stocks Daily Discussion & Options Trading Th...
2,BREAKING: Walmart to hike prices imminently,"Earnings Call On prices\n\n""We will likely see...",1721,https://www.reddit.com/r/stocks/comments/1kn7v...,ActuallyMy,1747315000.0,392,0.95,stocks,[{'body': 'Get ready for a beautiful phone cal...,BREAKING: Walmart to hike prices imminentlyEar...
3,"No way Musk gets $50B. In my Aussie MBA, the U...","One reason is the CEO PAY RATIO, and the other...",253,https://www.reddit.com/r/stocks/comments/1kn74...,duck4355555,1747313000.0,85,0.81,stocks,"[{'body': '50B on a company going downhill, go...","No way Musk gets $50B. In my Aussie MBA, the U..."
4,Coinbase says hackers bribed staff to steal cu...,> Coinbase on Thursday reported that cyber cri...,162,https://www.reddit.com/r/stocks/comments/1kn7o...,WickedSensitiveCrew,1747314000.0,29,0.97,stocks,[{'body': 'Yeah great idea to offshore jobs wi...,Coinbase says hackers bribed staff to steal cu...
