In this notebook, we fetch Reddit posts and their comments using the Reddit API (via the PRAW library).

In [None]:
# installing required libraries
!pip install praw




[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import os

import pandas as pd
import praw

# Setup Reddit API
def setup_reddit_api(client_id, client_secret, user_agent):
    """Build a PRAW ``Reddit`` client from application credentials.

    Args:
        client_id: Forwarded to ``praw.Reddit`` as ``client_id``.
        client_secret: Forwarded to ``praw.Reddit`` as ``client_secret``.
        user_agent: Forwarded to ``praw.Reddit`` as ``user_agent``.

    Returns:
        The object produced by ``praw.Reddit(...)``.
    """
    credentials = {
        "client_id": client_id,
        "client_secret": client_secret,
        "user_agent": user_agent,
    }
    return praw.Reddit(**credentials)

# Infer content type (text, image, video, link, other)
def infer_content_type(post):
    """Classify a submission as "text", "image", "video", "link" or "other".

    Decision order:
      1. ``post.is_self`` is truthy               -> "text"
      2. ``post_hint == "image"``                 -> "image"
      3. a video ``post_hint`` or ``is_video``    -> "video"
      4. ``post_hint == "link"``                  -> "link"
      5. URL path ends with a known image suffix  -> "image"
      6. URL contains ``v.redd.it``               -> "video"
      7. anything else                            -> "other"

    The URL fallbacks (5-6) are always evaluated when the hint is missing *or*
    not one of the recognised values; ``is_video`` is honoured even when the
    submission carries no ``post_hint`` attribute.

    Args:
        post: Object exposing ``is_self`` and, optionally, ``post_hint``,
            ``is_video`` and ``url`` attributes.

    Returns:
        One of the strings "text", "image", "video", "link", "other".
    """
    if post.is_self:
        return "text"

    # These attributes are looked up defensively: the original code already
    # treated post_hint as optional, so the others are given safe defaults too.
    hint = getattr(post, "post_hint", None)
    is_video = bool(getattr(post, "is_video", False))

    if hint == "image":
        return "image"
    if hint in ("hosted:video", "rich:video") or is_video:
        return "video"
    if hint == "link":
        return "link"

    # No usable hint: fall back to inspecting the URL. Compare in lower case
    # and ignore any query string / fragment so "A.JPG?width=640" still matches.
    url = (getattr(post, "url", "") or "").lower()
    url_path = url.split("?", 1)[0].split("#", 1)[0]
    if url_path.endswith((".jpg", ".jpeg", ".png", ".gif")):
        return "image"
    if "v.redd.it" in url:
        return "video"
    return "other"

# Get posts from a subreddit (with content_type)
def get_subreddit_posts(reddit, subreddit_name, limit=400):
    """Collect a subreddit's "hot" listing into a DataFrame.

    Args:
        reddit: Client object exposing ``subreddit(name)``.
        subreddit_name: Name passed to ``reddit.subreddit``.
        limit: Value forwarded as ``limit`` to the ``hot`` listing.

    Returns:
        A ``pandas.DataFrame`` with one row per submission and the columns
        ``post_id``, ``title``, ``text``, ``author``, ``created_utc``,
        ``score``, ``num_comments``, ``upvote_ratio`` and ``content_type``.
    """

    def _as_record(submission):
        # Flatten one submission into a plain dict (one DataFrame row).
        return {
            'post_id': submission.id,
            'title': submission.title,
            'text': submission.selftext,
            'author': str(submission.author),
            'created_utc': submission.created_utc,
            'score': submission.score,
            'num_comments': submission.num_comments,
            'upvote_ratio': submission.upvote_ratio,
            'content_type': infer_content_type(submission),
        }

    hot_listing = reddit.subreddit(subreddit_name).hot(limit=limit)
    return pd.DataFrame([_as_record(submission) for submission in hot_listing])

# Get comments for a specific post
def get_post_comments(reddit, post_id, limit=400):
    """Load up to ``limit`` comments of one submission into a DataFrame.

    Only the entries obtained from ``submission.comments[:limit]`` are
    returned (in PRAW that slice indexes the top-level comments of the
    forest; replies are not walked here).

    Args:
        reddit: Client object exposing ``submission(id=...)``.
        post_id: Submission id; also written to every row as ``post_id``.
        limit: Maximum number of entries taken from the comment forest.

    Returns:
        A ``pandas.DataFrame`` with the columns ``comment_id``, ``post_id``,
        ``author``, ``text``, ``score`` and ``created_utc``.
    """
    submission = reddit.submission(id=post_id)
    # limit=0: per the PRAW docs this strips the "load more comments"
    # placeholders without issuing extra requests to expand them.
    submission.comments.replace_more(limit=0)

    rows = [
        {
            'comment_id': entry.id,
            'post_id': post_id,
            'author': str(entry.author),
            'text': entry.body,
            'score': entry.score,
            'created_utc': entry.created_utc,
        }
        for entry in submission.comments[:limit]
    ]
    return pd.DataFrame(rows)


In [18]:
# Connect to Reddit API
# Secrets are read from the environment so they never live in the notebook or
# its version history. Set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before
# starting Jupyter. Any credential that was ever saved inside a notebook
# should be treated as leaked and regenerated in the Reddit app settings.
client_id = os.environ.get("REDDIT_CLIENT_ID")
client_secret = os.environ.get("REDDIT_CLIENT_SECRET")

missing_vars = [
    var_name
    for var_name, value in (
        ("REDDIT_CLIENT_ID", client_id),
        ("REDDIT_CLIENT_SECRET", client_secret),
    )
    if not value
]
if missing_vars:
    # Fail early with an actionable message instead of an opaque auth error
    # on the first API request.
    raise RuntimeError(
        "Missing Reddit API credentials; set environment variable(s): "
        + ", ".join(missing_vars)
    )

# REDDIT_USER_AGENT is optional; the default is the string this notebook has
# always sent.
user_agent = os.environ.get(
    "REDDIT_USER_AGENT",
    "script : data_collection :v1 .0 (by/u/data_collection)",
)
reddit = setup_reddit_api(client_id, client_secret, user_agent)


In [19]:
# Scrape Reddit posts
import os

# Pull up to 300 submissions from the "datascience" subreddit.
posts_df_reddit = get_subreddit_posts(reddit, subreddit_name="datascience", limit=300)

# Make sure the output folder exists before writing into it.
output_dir = '../reddit_data'
os.makedirs(output_dir, exist_ok=True)

# Persist the posts table (without the DataFrame index) as CSV.
posts_df_reddit.to_csv(path_or_buf=f'{output_dir}/reddit_posts_df.csv', index=False)

# Last expression of the cell: show a preview of the posts table.
posts_df_reddit.head()

Unnamed: 0,post_id,title,text,author,created_utc,score,num_comments,upvote_ratio,content_type
0,1k44mgg,Weekly Entering & Transitioning - Thread 21 Ap...,\n\nWelcome to this week's entering & transit...,AutoModerator,1745208000.0,7,18,1.0,text
1,1i5inrb,Weekly Entering & Transitioning - Thread 20 Ja...,\n\nWelcome to this week's entering & transit...,AutoModerator,1737349000.0,13,46,1.0,text
2,1k6tz9y,Leadership said they doesn’t understand what w...,Our DS group was moved under a traditional IT ...,DeepNarwhalNetwork,1745506000.0,63,34,0.92,text
3,1k6wi45,What are some universities that you believe ar...,,Voldemort57,1745512000.0,14,43,0.72,other
4,1k6rj0y,Deep Analysis — the analytics analogue to deep...,,phicreative1997,1745499000.0,5,0,0.78,link


In [None]:
# Fetch comments for all posts in posts_df_reddit
all_comments = []

for post_id in posts_df_reddit['post_id']:
    try:
        comments_df = get_post_comments(reddit, post_id, limit=300)
    except Exception as e:
        # Best-effort crawl: one failing post must not abort the whole run.
        print(f"❌ Failed to fetch comments for {post_id}: {e}")
    else:
        all_comments.append(comments_df)

# Combine all comments into a single DataFrame.
# pd.concat raises ValueError when given nothing to concatenate, and posts
# without comments yield empty, column-less frames, so skip those and fall
# back to an empty table with the expected columns.
non_empty_frames = [frame for frame in all_comments if not frame.empty]
if non_empty_frames:
    all_comments_df = pd.concat(non_empty_frames, ignore_index=True)
else:
    all_comments_df = pd.DataFrame(
        columns=['comment_id', 'post_id', 'author', 'text', 'score', 'created_utc']
    )

# Save to CSV (create the target folder first so this cell does not depend on
# an earlier cell having created it).
comments_csv_path = "../reddit_data/reddit_post_comments.csv"
os.makedirs(os.path.dirname(comments_csv_path), exist_ok=True)
all_comments_df.to_csv(comments_csv_path, index=False)
print("💾 All comments saved to reddit_data/reddit_post_comments.csv")


✅ Retrieved 1 comments for post: 1k44mgg
✅ Retrieved 24 comments for post: 1i5inrb
✅ Retrieved 39 comments for post: 1k4geso
✅ Retrieved 93 comments for post: 1k3nxj7
✅ Retrieved 4 comments for post: 1k3e4nb
✅ Retrieved 23 comments for post: 1k32lrl
✅ Retrieved 1 comments for post: 1k3jt7b
✅ Retrieved 13 comments for post: 1k2y84g
✅ Retrieved 2 comments for post: 1k33k6t
✅ Retrieved 7 comments for post: 1k35lig
✅ Retrieved 12 comments for post: 1k2igce
✅ Retrieved 1 comments for post: 1k2u4nd
✅ Retrieved 32 comments for post: 1k26kp3
✅ Retrieved 4 comments for post: 1k2a8t6
✅ Retrieved 34 comments for post: 1k26920
✅ Retrieved 9 comments for post: 1k2ax74
✅ Retrieved 19 comments for post: 1k22cd4
✅ Retrieved 13 comments for post: 1k1wu9o
✅ Retrieved 11 comments for post: 1k1mjok
✅ Retrieved 4 comments for post: 1k1x464
✅ Retrieved 1 comments for post: 1k1vo23
✅ Retrieved 4 comments for post: 1k1lh3r
✅ Retrieved 1 comments for post: 1k1ohsp
✅ Retrieved 5 comments for post: 1k20azb
✅ Ret

In [13]:
# Preview the first rows of the combined comments DataFrame.
all_comments_df.head()

Unnamed: 0,comment_id,post_id,author,text,score,created_utc
0,m8l6gbd,1i5inrb,dogdiarrhea,Nonstandard transition question: how many peop...,5,1737573000.0
1,m8wl92v,1i5inrb,Left-Animal1559,I am a senior talent partner with Swish Analyt...,6,1737725000.0
2,m8y2o1w,1i5inrb,doorstoinfinity,Hi everyone!\n\nI'm transitioning from Data An...,4,1737741000.0
3,m84ed38,1i5inrb,Silent_Group6621,Can someone suggest me some DS/Analytics/ML pr...,3,1737354000.0
4,m85eg48,1i5inrb,j-unnlock,Here's a list of job boards I'm working on ser...,2,1737376000.0
