# Reddit Sentiment Analysis Data Scraper
Extract posts and comments from Reddit for sentiment analysis - structured JSON format.

## 1. Configuration & Setup

In [None]:
# Install dependencies
%pip install praw python-dotenv pandas

In [1]:
import praw
import pandas as pd
from datetime import datetime
import json
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Initialize Reddit API
reddit = praw.Reddit(
    client_id=os.getenv('REDDIT_CLIENT_ID'),
    client_secret=os.getenv('REDDIT_CLIENT_SECRET'),
    username=os.getenv('REDDIT_USERNAME'),
    password=os.getenv('REDDIT_PASSWORD'),
    user_agent=os.getenv('REDDIT_USER_AGENT')
)

print(f"‚úÖ Reddit API initialized (Read-only: {reddit.read_only})")

‚úÖ Reddit API initialized (Read-only: False)


In [2]:
# ========================================
# CONFIGURATION VARIABLES
# ========================================

# Subreddit to scrape
SUBREDDIT_NAME = "learnpython"

# Number of posts to retrieve
MAX_POSTS = 10

# Number of comments per post to retrieve
MAX_COMMENTS_PER_POST = 50

# Post sorting method: 'hot', 'new', 'top', 'rising'
SORT_METHOD = "hot"

# Time filter for 'top' sorting: 'all', 'day', 'week', 'month', 'year'
TIME_FILTER = "week"

print("Configuration:")
print(f"  Subreddit: r/{SUBREDDIT_NAME}")
print(f"  Max Posts: {MAX_POSTS}")
print(f"  Max Comments per Post: {MAX_COMMENTS_PER_POST}")
print(f"  Sort Method: {SORT_METHOD}")
print(f"  Time Filter: {TIME_FILTER}")

Configuration:
  Subreddit: r/learnpython
  Max Posts: 10
  Max Comments per Post: 50
  Sort Method: hot
  Time Filter: week


## 2. Data Extraction Functions

In [3]:
def extract_comment_data(comment):
    """Extract relevant data from a comment for sentiment analysis."""
    return {
        'id': comment.id,
        'author': str(comment.author) if comment.author else '[deleted]',
        'body': comment.body,
        'score': comment.score,
        'upvotes': comment.ups,
        'downvotes': comment.downs,
        'created_utc': datetime.fromtimestamp(comment.created_utc).isoformat(),
        'edited': bool(comment.edited),
        'is_submitter': comment.is_submitter,  # Is comment by post author
        'controversiality': comment.controversiality,
        'depth': getattr(comment, 'depth', 0),
        'permalink': f"https://reddit.com{comment.permalink}"
    }


def extract_post_data(post, max_comments=50):
    """Extract relevant data from a post including its comments."""
    
    # Get post data
    post_data = {
        'id': post.id,
        'title': post.title,
        'author': str(post.author) if post.author else '[deleted]',
        'selftext': post.selftext,  # Post content/body
        'score': post.score,
        'upvotes': post.ups,
        'downvotes': post.downs,
        'upvote_ratio': post.upvote_ratio,
        'num_comments': post.num_comments,
        'created_utc': datetime.fromtimestamp(post.created_utc).isoformat(),
        'edited': bool(post.edited),
        'is_self': post.is_self,  # Text post vs link
        'url': post.url,
        'permalink': f"https://reddit.com{post.permalink}",
        'link_flair_text': post.link_flair_text,
        'over_18': post.over_18,
        'spoiler': post.spoiler,
        'stickied': post.stickied,
        'locked': post.locked,
        'comments': []
    }
    
    # Get comments
    try:
        # Replace "MoreComments" with actual comments
        post.comments.replace_more(limit=0)
        
        # Get all comments (flattened)
        all_comments = post.comments.list()
        
        # Process up to max_comments
        for comment in all_comments[:max_comments]:
            try:
                comment_data = extract_comment_data(comment)
                post_data['comments'].append(comment_data)
            except Exception as e:
                print(f"    ‚ö†Ô∏è Error processing comment: {e}")
                continue
                
    except Exception as e:
        print(f"  ‚ö†Ô∏è Error getting comments: {e}")
    
    return post_data


print("‚úÖ Extraction functions defined")

‚úÖ Extraction functions defined


## 3. Scrape Posts & Comments

In [4]:
# Get subreddit
subreddit = reddit.subreddit(SUBREDDIT_NAME)

# Get posts based on sort method
if SORT_METHOD == 'hot':
    posts = subreddit.hot(limit=MAX_POSTS)
elif SORT_METHOD == 'new':
    posts = subreddit.new(limit=MAX_POSTS)
elif SORT_METHOD == 'top':
    posts = subreddit.top(time_filter=TIME_FILTER, limit=MAX_POSTS)
elif SORT_METHOD == 'rising':
    posts = subreddit.rising(limit=MAX_POSTS)
else:
    posts = subreddit.hot(limit=MAX_POSTS)

# Extract data from all posts
all_posts_data = []

print(f"\nüîç Scraping r/{SUBREDDIT_NAME}...\n")

for idx, post in enumerate(posts, 1):
    print(f"[{idx}/{MAX_POSTS}] Processing: {post.title[:60]}...")
    print(f"         Comments: {post.num_comments}")
    
    post_data = extract_post_data(post, max_comments=MAX_COMMENTS_PER_POST)
    all_posts_data.append(post_data)
    
    print(f"         ‚úÖ Extracted {len(post_data['comments'])} comments")

print(f"\n‚úÖ Scraping complete! Retrieved {len(all_posts_data)} posts")


üîç Scraping r/learnpython...

[1/10] Processing: Ask Anything Monday - Weekly Thread...
         Comments: 0
         ‚úÖ Extracted 0 comments
[2/10] Processing: Ask Anything Monday - Weekly Thread...
         Comments: 9
         ‚úÖ Extracted 8 comments
[3/10] Processing: i wanna start to learn coding...
         Comments: 7
         ‚úÖ Extracted 7 comments
[4/10] Processing: How did you go about learning Python, and how long did it ta...
         Comments: 11
         ‚úÖ Extracted 10 comments
[5/10] Processing: Why does Spark spill to disk even with tons of memory? What ...
         Comments: 5
         ‚úÖ Extracted 5 comments
[6/10] Processing: How to get inference.predictor module for LimiX model?...
         Comments: 0
         ‚úÖ Extracted 0 comments
[7/10] Processing: Working on maps in python text based game...
         Comments: 4
         ‚úÖ Extracted 4 comments
[8/10] Processing: How Do I Even Start?...
         Comments: 12
         ‚úÖ Extracted 12 comments
[9/10

## 4. View as DataFrame

In [5]:
# Create DataFrame from posts (excluding nested comments for table view)
posts_for_df = []

for post in all_posts_data:
    post_copy = post.copy()
    post_copy['num_comments_extracted'] = len(post['comments'])
    del post_copy['comments']  # Remove nested comments for cleaner table
    posts_for_df.append(post_copy)

posts_df = pd.DataFrame(posts_for_df)

print(f"üìä Posts Overview ({len(posts_df)} posts):\n")
print(posts_df[['title', 'author', 'score', 'upvote_ratio', 'num_comments', 'num_comments_extracted']].to_string())
print(f"\n‚úÖ Full data with nested comments stored in 'all_posts_data' variable")

# Display full DataFrame
posts_df

üìä Posts Overview (10 posts):

                                                                                                                                                                                     title                author  score  upvote_ratio  num_comments  num_comments_extracted
0                                                                                                                                                      Ask Anything Monday - Weekly Thread         AutoModerator      1          0.60             0                       0
1                                                                                                                                                      Ask Anything Monday - Weekly Thread         AutoModerator      3          0.81             9                       8
2                                                                                                                                                            i wann

Unnamed: 0,id,title,author,selftext,score,upvotes,downvotes,upvote_ratio,num_comments,created_utc,edited,is_self,url,permalink,link_flair_text,over_18,spoiler,stickied,locked,num_comments_extracted
0,1pmt14q,Ask Anything Monday - Weekly Thread,AutoModerator,"Welcome to another /r/learnPython weekly ""Ask ...",1,1,0,0.6,0,2025-12-14T19:00:58,False,True,https://www.reddit.com/r/learnpython/comments/...,https://reddit.com/r/learnpython/comments/1pmt...,,False,False,True,False,0
1,1paxmgz,Ask Anything Monday - Weekly Thread,AutoModerator,"Welcome to another /r/learnPython weekly ""Ask ...",3,3,0,0.81,9,2025-11-30T19:01:06,False,True,https://www.reddit.com/r/learnpython/comments/...,https://reddit.com/r/learnpython/comments/1pax...,,False,False,True,False,8
2,1pohs4v,i wanna start to learn coding,G2-118,so i‚Äôve heard that python is the best to start...,6,6,0,0.75,7,2025-12-16T18:59:21,False,True,https://www.reddit.com/r/learnpython/comments/...,https://reddit.com/r/learnpython/comments/1poh...,,False,False,False,False,7
3,1po7kn6,"How did you go about learning Python, and how ...",Practical-Secret3344,I recently transitioned from Cybersecurity to ...,12,12,0,0.85,11,2025-12-16T12:14:34,False,True,https://www.reddit.com/r/learnpython/comments/...,https://reddit.com/r/learnpython/comments/1po7...,,False,False,False,False,10
4,1pnxqkr,Why does Spark spill to disk even with tons of...,Familiar_Network_108,i‚Äôm running a pretty big Apache Spark job. lot...,21,21,0,0.86,5,2025-12-16T04:28:57,False,True,https://www.reddit.com/r/learnpython/comments/...,https://reddit.com/r/learnpython/comments/1pnx...,,False,False,False,False,5
5,1poh82m,How to get inference.predictor module for Limi...,d8gfdu89fdgfdu32432,I'm trying to run this model¬†https://huggingfa...,1,1,0,1.0,0,2025-12-16T18:34:46,False,True,https://www.reddit.com/r/learnpython/comments/...,https://reddit.com/r/learnpython/comments/1poh...,,False,False,False,False,0
6,1po3r3k,Working on maps in python text based game,here-to-aviod-sleep,While working on my text based game I had trou...,4,4,0,0.83,4,2025-12-16T09:46:47,False,True,https://www.reddit.com/r/learnpython/comments/...,https://reddit.com/r/learnpython/comments/1po3...,,False,False,False,False,4
7,1po1q97,How Do I Even Start?,Temporary-Fold2043,So i have to learn Python to have enough knowl...,4,4,0,0.64,12,2025-12-16T08:19:08,False,True,https://www.reddit.com/r/learnpython/comments/...,https://reddit.com/r/learnpython/comments/1po1...,,False,False,False,False,12
8,1pofrw1,What python Learning Path Is The Most Useful?,StomachSoft9643,I just decided to learn python after learning ...,0,0,0,0.25,4,2025-12-16T17:32:40,False,True,https://www.reddit.com/r/learnpython/comments/...,https://reddit.com/r/learnpython/comments/1pof...,,False,False,False,False,4
9,1pnxypd,What is the best way to figure out dependency ...,HuygensFresnel,"I have a python library that depends on Numpy,...",6,6,0,0.87,5,2025-12-16T04:44:02,False,True,https://www.reddit.com/r/learnpython/comments/...,https://reddit.com/r/learnpython/comments/1pnx...,,False,False,False,False,5


## 5. JSON Structure Example

In [7]:
# Show example of first post with nested comments
if all_posts_data:
    print("üìÑ Example JSON Structure (First Post):\n")
    
    # Create a simplified example for display (limit comments to 3)
    example = all_posts_data[1].copy()
    example['comments'] = example['comments'][:3] if len(example['comments']) > 3 else example['comments']
    
    print(json.dumps(example, indent=2))
else:
    print("No data to display")

üìÑ Example JSON Structure (First Post):

{
  "id": "1paxmgz",
  "title": "Ask Anything Monday - Weekly Thread",
  "author": "AutoModerator",
  "selftext": "Welcome to another /r/learnPython weekly \"Ask Anything\\* Monday\" thread\n\nHere you can ask all the questions that you wanted to ask but didn't feel like making a new thread.\n\n\\* It's primarily intended for simple questions but as long as it's about python it's allowed.\n\nIf you have any suggestions or questions about this thread use the message the moderators button in the sidebar.\n\n**Rules:**\n\n* Don't downvote stuff - instead explain what's wrong with the comment, if it's against the rules \"report\" it and it will be dealt with.\n* Don't post stuff that doesn't have absolutely anything to do with python.\n* Don't make fun of someone for not knowing something, insult anyone etc - this will result in an immediate ban.\n\nThat's it.",
  "score": 3,
  "upvotes": 3,
  "downvotes": 0,
  "upvote_ratio": 0.81,
  "num_comment

## 6. Summary Statistics

In [None]:
# Calculate statistics
total_posts = len(all_posts_data)
total_comments = sum(len(post['comments']) for post in all_posts_data)
avg_score = sum(post['score'] for post in all_posts_data) / total_posts if total_posts > 0 else 0
avg_comments_per_post = total_comments / total_posts if total_posts > 0 else 0

print("=" * 80)
print("üìä SCRAPING SUMMARY")
print("=" * 80)
print(f"\nSubreddit: r/{SUBREDDIT_NAME}")
print(f"Posts Retrieved: {total_posts}")
print(f"Total Comments: {total_comments}")
print(f"Average Score per Post: {avg_score:.2f}")
print(f"Average Comments per Post: {avg_comments_per_post:.2f}")
print(f"Average Upvote Ratio: {posts_df['upvote_ratio'].mean():.2%}")

print("\n" + "=" * 80)

## 7. Export Data

In [None]:
# Create main JSON structure
output_data = {
    'metadata': {
        'subreddit': SUBREDDIT_NAME,
        'scraped_at': datetime.now().isoformat(),
        'total_posts': len(all_posts_data),
        'total_comments': sum(len(post['comments']) for post in all_posts_data),
        'sort_method': SORT_METHOD,
        'time_filter': TIME_FILTER if SORT_METHOD == 'top' else None
    },
    'posts': all_posts_data
}

# Save to JSON file
filename = f"reddit_{SUBREDDIT_NAME}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
with open(filename, 'w', encoding='utf-8') as f:
    json.dump(output_data, f, indent=2, ensure_ascii=False)

print(f"‚úÖ Data exported to: {filename}")
print(f"   File size: {os.path.getsize(filename) / 1024:.2f} KB")

# Also save posts DataFrame as CSV (without nested comments)
csv_filename = f"reddit_{SUBREDDIT_NAME}_posts_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
posts_df.to_csv(csv_filename, index=False)
print(f"‚úÖ Posts DataFrame exported to: {csv_filename}")

## 8. Quick Data Access Examples

In [None]:
# Examples of how to access the data

print("üìö Quick Access Examples:\n")

# Example 1: Get all post titles
print("1. All post titles:")
for i, post in enumerate(all_posts_data[:3], 1):
    print(f"   {i}. {post['title']}")

# Example 2: Get all comment text from first post
if all_posts_data and all_posts_data[0]['comments']:
    print(f"\n2. Comments from first post (showing first 3):")
    for i, comment in enumerate(all_posts_data[0]['comments'][:3], 1):
        print(f"   {i}. [{comment['author']}] {comment['body'][:80]}...")

# Example 3: Posts with highest score
print("\n3. Top 3 posts by score:")
sorted_posts = sorted(all_posts_data, key=lambda x: x['score'], reverse=True)
for i, post in enumerate(sorted_posts[:3], 1):
    print(f"   {i}. Score: {post['score']:4d} | {post['title'][:60]}")

# Example 4: Total sentiment indicators
print("\n4. Overall sentiment indicators:")
total_post_score = sum(p['score'] for p in all_posts_data)
total_comment_score = sum(c['score'] for p in all_posts_data for c in p['comments'])
print(f"   Total Post Scores: {total_post_score:,}")
print(f"   Total Comment Scores: {total_comment_score:,}")
print(f"   Combined Score: {total_post_score + total_comment_score:,}")