In [1]:
import time
from datetime import datetime, timezone

import pandas as pd
import requests

class RedditDeepScraper:
    """Download a Reddit post's comment tree from the public ``.json`` endpoint and flatten it.

    Args:
        timeout: Seconds to wait on each HTTP request before giving up.
    """

    def __init__(self, timeout=30):
        # One shared session so the custom User-Agent is sent with every request.
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ResearchBot/2.0"})
        # requests has no default timeout; without one a stalled connection
        # would block the whole scrape indefinitely.
        self.timeout = timeout

    def fetch_all_comments(self, subreddit, post_id):
        """Return every comment found in the post's JSON payload as a flat list.

        Args:
            subreddit: Subreddit name without the ``r/`` prefix.
            post_id: Post id as used in the post URL.

        Returns:
            A list of comment dicts (see ``_parse_tree`` for the keys) in
            depth-first order. An empty list is returned when the request
            fails, the status is not 200, or the payload does not have the
            expected ``[post_listing, comment_listing]`` shape.
        """
        url = f"https://www.reddit.com/r/{subreddit}/comments/{post_id}.json"
        try:
            response = self.session.get(url, timeout=self.timeout)
        except requests.RequestException:
            # Same best-effort contract as a non-200 answer: no comments.
            return []
        if response.status_code != 200:
            return []

        # Reddit JSON structure: [PostObject, CommentObject]
        # We target the 'children' in the CommentObject
        try:
            comment_forest = response.json()[1]['data']['children']
        except (ValueError, LookupError, TypeError):
            # Body was not JSON, or not the two-element listing we expect
            # (e.g. an error object returned with status 200).
            return []

        flat_comments = []
        self._parse_tree(comment_forest, flat_comments)
        return flat_comments

    def _parse_tree(self, children, result_list):
        """Recursively walk the comment tree, appending one dict per comment to ``result_list``.

        Only children of kind ``'t1'`` (comments) are recorded; any other kind
        is skipped. NOTE: Reddit represents collapsed/unloaded branches as
        ``'more'`` placeholders, so very large threads may be incomplete.

        Each appended dict has the keys ``comment_id``, ``parent_id``,
        ``author``, ``body``, ``score`` and ``created_utc`` (a timezone-aware
        UTC ``datetime``, or ``None`` when the field is absent).
        """
        for child in children:
            if child['kind'] != 't1':  # t1 = Comment
                continue

            data = child['data']
            created = data.get('created_utc')
            result_list.append({
                "comment_id": data.get('id'),
                "parent_id": data.get('parent_id'),
                "author": data.get('author'),
                "body": data.get('body'),
                "score": data.get('score'),
                # The value is epoch seconds; pin tz to UTC so the result does
                # not depend on the timezone of the machine running the scrape.
                "created_utc": datetime.fromtimestamp(created, tz=timezone.utc) if created is not None else None
            })

            # 'replies' is a nested listing dict when there are replies and an
            # empty string otherwise, hence the isinstance check.
            replies = data.get('replies')
            if replies and isinstance(replies, dict):
                inner_children = replies.get('data', {}).get('children', [])
                self._parse_tree(inner_children, result_list)

scraper = RedditDeepScraper()

In [2]:
# --- Configuration ---------------------------------------------------------
SUBREDDIT = "Audi"
QUERY = "bmw"
POST_LIMIT = 100
REQUEST_TIMEOUT_S = 30  # give up on a stalled search request instead of hanging
REQUEST_DELAY_S = 5     # pause between post fetches to stay polite to the API

# 1. Search the subreddit for matching posts
search_url = f"https://www.reddit.com/r/{SUBREDDIT}/search.json"
search_params = {"q": QUERY, "restrict_sr": "on", "sort": "hot", "limit": POST_LIMIT}
search_response = scraper.session.get(search_url, params=search_params, timeout=REQUEST_TIMEOUT_S)
# Fail loudly on 4xx/5xx: otherwise the error surfaces later as an unrelated
# JSON decoding error or KeyError.
search_response.raise_for_status()
search_res = search_response.json()
posts = [p['data'] for p in search_res['data']['children']]

# 2. Collect one row per post, followed by one row per comment of that post
all_data_rows = []

for post in posts:
    p_id = post['id']
    p_title = post['title']
    print(f"Deep scraping: {p_title[:60]}...")

    # Convert the epoch value to a timezone-aware UTC datetime so the
    # 'created_utc' column holds datetimes for posts as well, rather than a
    # raw float next to the comments' datetime values.
    post_created = post.get('created_utc')
    all_data_rows.append({
        "type": "post",
        "post_id": p_id,
        "post_title": p_title,
        "comment_id": None,
        "author": post.get('author'),
        "body": post.get('selftext', ''),
        "score": post.get('score'),
        "created_utc": datetime.fromtimestamp(post_created, tz=timezone.utc) if post_created is not None else None
    })

    # 3. Get all comments in the tree
    comments = scraper.fetch_all_comments(SUBREDDIT, p_id)

    # Build new dicts instead of mutating the ones returned by the scraper.
    for c in comments:
        all_data_rows.append({**c, "type": "comment", "post_id": p_id, "post_title": p_title})

    time.sleep(REQUEST_DELAY_S)

# 4. Create DataFrame
df = pd.DataFrame(all_data_rows)
print(f"\nDone! Captured {len(df)} total rows (posts + comments) across {len(posts)} posts.")

Deep scraping: First Time Audi Owner...
Deep scraping: Talk me out of it: 2016 A5 Sportback 3.0T (B8.5) with 80k km...
Deep scraping: Don't buy the new A5. I did....
Deep scraping: We'll never have it this good again.....
Deep scraping: Is Volkswagen Group currently producing more interesting and...
Deep scraping: US Spec S5/SQ5 AWD system...
Deep scraping: Audi Connect Activation Help...
Deep scraping: Buying used @ 40-60k miles...
Deep scraping: Audi owners - typecasting...
Deep scraping: Hiring European Automotive Technician — Houston, TX...
Deep scraping: What made you choose audi from the german?...
Deep scraping: Questions questions...
Deep scraping: Audi RS3 vs BMW M2/M3 steering wheel, which is better?...
Deep scraping: M5 CS copied RS 5 color combo...
Deep scraping: Building a rally A3 (8V) Convertible with custom widebody...
Deep scraping: Finally bit the bullet and bought a new 2025 Q8...
Deep scraping: Few questions on the A3 2026...
Deep scraping: More photos of the hatc

In [3]:
import os
from pathlib import Path

# Output location: ../data relative to the notebook's working directory.
target_dir = Path("..") / "data"
# Derive the post count in the name from POST_LIMIT so the filename stays
# truthful when the limit is changed in the configuration cell.
filename = f"{SUBREDDIT}_{QUERY}_{POST_LIMIT}posts_full.csv"
file_path = target_dir / filename

target_dir.mkdir(parents=True, exist_ok=True)

# 3. Final Export
if not df.empty:
    # utf-8-sig writes a BOM so Excel detects the encoding correctly.
    df.to_csv(file_path, index=False, encoding='utf-8-sig')
    print(f"Successfully exported {len(df)} entries.")
    print(f"File location: {file_path.resolve()}")
else:
    print("DataFrame is empty. No file was saved.")

# reindex (rather than df[[...]]) keeps this preview from raising KeyError
# when the scrape returned nothing and the frame has no columns.
df.reindex(columns=['type', 'post_id', 'author', 'body']).head(10)

Successfully exported 5132 entries.
File location: C:\fh-mit\s1\data-eng\DataEngineering_Project\data\Audi_bmw_100posts_full.csv


Unnamed: 0,type,post_id,author,body
0,post,1q4ia7n,Colgray21,Traded up from a BMW 535i to this 2024 Q5 and ...
1,comment,1q4ia7n,ecom_ryan,Welcome to the club!
2,post,1q3wjtk,ThatOtherOmar,I’m considering trading my 2007 BMW E92 328i f...
3,comment,1q3wjtk,Beemeristic,"Easy, if you don't have maintenance/repair mon..."
4,comment,1q3wjtk,PurpleSlightlyRed,Previous owner spent more money on the fake RS...
5,comment,1q3wjtk,alexberbo,"Skip, those mods are terrible, fake RS bumper ..."
6,comment,1q3wjtk,96JY,The seats are a shade of poop.
7,comment,1q3wjtk,No-Room-3886,I didnt know these existed. Previous dudes don...
8,comment,1q3wjtk,xRaffaell,Probably fake kms or not mentained properly du...
9,comment,1q3wjtk,Super-Total-661,You dont need us to talk you out of it or into...
