In [1]:
import datetime
import json
import os
import time

import pandas as pd
import praw


In [None]:
# Initialize Reddit instance.
# Credentials are taken from the REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET
# environment variables so real secrets never need to be written into this
# file. The placeholder strings are kept as fallbacks, so behaviour is
# unchanged when the variables are not set.
reddit = praw.Reddit(
    client_id=os.environ.get("REDDIT_CLIENT_ID", "your_client_id_here"),
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET", "your_client_secret_here"),
    user_agent="RR Analytics/1.0 (by /u/Eric_Xu1025)"
)

# Configuration
SUBREDDIT_NAME = "ipl"   # subreddit whose newest posts are listed
TOTAL_POSTS = 1000       # upper bound passed as `limit` to the listing
SAVE_INTERVAL = 500      # write a CSV batch every this many processed posts
REQUEST_DELAY = 2        # seconds to pause between posts

# Storage containers: module-level buffers filled by main() and written out
# (and, for periodic batches, emptied) by save_data().
posts_list = []
comments_list = []

def main():
    """Collect the newest posts of SUBREDDIT_NAME with their comments and export them.

    Phase 1 lists up to TOTAL_POSTS post IDs from the subreddit's "new"
    listing. Phase 2 fetches every post, appends one record to ``posts_list``
    and one record per comment to ``comments_list``, and calls ``save_data``
    every SAVE_INTERVAL posts. A last ``save_data(batch_id="final")`` writes
    whatever is still buffered.

    Errors are handled on a best-effort basis: a failure while listing IDs
    aborts the run (returns ``None`` early), while a failure on a single post
    is reported and the loop moves on to the next one.
    """
    print("[Phase 1/3] Collecting post IDs...")
    post_ids = []

    try:
        for post in reddit.subreddit(SUBREDDIT_NAME).new(limit=TOTAL_POSTS):
            post_ids.append(post.id)
            # NOTE(review): PRAW is documented to throttle its own requests,
            # so this per-item pause may be unnecessary - confirm before
            # removing it.
            time.sleep(0.5)
            if len(post_ids) % 100 == 0:
                print(f"Collected {len(post_ids)} post IDs", end="\r")
        print(f"\nTotal post IDs collected: {len(post_ids)}")
    except Exception as e:
        print(f"Error collecting post IDs: {str(e)}")
        return

    print("\n[Phase 2/3] Collecting post details...")
    comment_counter = 0

    for idx, post_id in enumerate(post_ids, 1):
        try:
            submission = reddit.submission(id=post_id)

            # The post record is stored before its comments are fetched, so a
            # failure further down leaves the post with only the comments
            # gathered so far.
            posts_list.append({
                "post_id": submission.id,
                "title": submission.title,
                "author": submission.author.name if submission.author else "Unknown",
                "score": submission.score,
                "comments_count": submission.num_comments,
                "url": submission.url,
                "text": submission.selftext
            })

            # Expand every "load more comments" placeholder before flattening
            # the comment tree.
            submission.comments.replace_more(limit=None)
            for comment in submission.comments.list():
                comments_list.append({
                    "post_id": submission.id,
                    "comment_id": comment.id,
                    "comment_author": comment.author.name if comment.author else "Anonymous",
                    "comment_body": comment.body,
                    "comment_score": comment.score,
                    "created_utc": comment.created_utc
                })
                comment_counter += 1

            print(f"Processed {idx}/{len(post_ids)} posts | Comments: {comment_counter}", end="\r")

        except Exception as e:
            print(f"\nError processing post {post_id}: {str(e)}")

        # Saving and throttling live outside the fetch `try` so they still
        # happen after a failed post: the periodic batch is not skipped and
        # the next request is not sent without a delay.
        if idx % SAVE_INTERVAL == 0:
            try:
                save_data(batch_id=idx)
            except Exception as e:
                # Keep the run alive if a batch cannot be written.
                print(f"\nError saving batch {idx}: {str(e)}")

        time.sleep(REQUEST_DELAY)

    save_data(batch_id="final")
    print("\n[Phase 3/3] CSV export complete!")

def save_data(batch_id):
    """Write the buffered post and comment records to timestamped CSV files.

    Args:
        batch_id: Label embedded in both file names. Any value other than the
            string "final" marks a periodic save, after which both buffers
            are emptied so the next batch starts from scratch.

    A buffer that holds no records is skipped rather than written, because an
    empty DataFrame has no columns and would yield a CSV without a header row.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    outputs = (
        (posts_list, f"reddit_posts_batch_{batch_id}_{timestamp}.csv"),
        (comments_list, f"reddit_comments_batch_{batch_id}_{timestamp}.csv"),
    )

    written = []
    for records, filename in outputs:
        if not records:
            continue
        pd.DataFrame(records).to_csv(filename, index=False)
        written.append(filename)

    if written:
        print(f"\n✔️ Saved: {', '.join(written)}")
    else:
        print(f"\nNothing to save for batch {batch_id}")

    # Optional: Clear lists if saving periodically
    if batch_id != "final":
        posts_list.clear()
        comments_list.clear()

# Script entry point: run the full collection when executed directly.
# NOTE: main() returns None both on success and on its early-exit error path,
# so the message below is printed in either case.
if __name__ == "__main__":
    main()
    print("\n✅ Operation completed successfully!")


Version 7.1.4 of praw is outdated. Version 7.8.1 was released Friday October 25, 2024.


[Phase 1/3] Collecting post IDs...
Collected 900 post IDs
Total post IDs collected: 972

[Phase 2/3] Collecting post details...
Processed 500/972 posts | Comments: 22429
✔️ Saved: reddit_posts_batch_500_20250421_190157.csv, reddit_comments_batch_500_20250421_190157.csv
Processed 791/972 posts | Comments: 36042