In [5]:
import praw
import json
import time
import os
from dotenv import load_dotenv
load_dotenv()

# Reddit API client. Each setting is read from the environment variable with
# the same name in upper case (CLIENT_ID, CLIENT_SECRET, USER_AGENT); a
# variable that is not set is passed through as None.
reddit = praw.Reddit(
    **{
        setting: os.getenv(setting.upper())
        for setting in ("client_id", "client_secret", "user_agent")
    }
)

# ---- Parameters ----
subreddits = ["AskReddit", "LifeProTips"]  # subreddits to pull posts from
desired_total_posts = 1000                 # overall target, split evenly per subreddit
top_n_comments = 3                         # top-level comments kept per post
min_score_threshold = 100                  # minimum post score

# Accumulator for the collected post records
results = list()

# Listing size per subreddit: the even share of the target plus an
# overfetch buffer of 100 posts.
posts_per_subreddit = 100 + desired_total_posts // len(subreddits)

# ---- Fetch posts and top-level comments ----
# For every subreddit, walk the "hot" listing and keep posts that are not
# pinned and meet the score threshold, together with their first
# `top_n_comments` usable top-level comments. Each kept post is appended to
# `results` as a plain dict.

# Per-subreddit quota; computed once instead of on every collected post.
target_per_subreddit = desired_total_posts // len(subreddits)

for sub in subreddits:
    print(f"Fetching from r/{sub}")
    subreddit = reddit.subreddit(sub)

    collected = 0
    for submission in subreddit.hot(limit=posts_per_subreddit):
        # Skip pinned posts and posts below the score threshold.
        if submission.stickied or submission.score < min_score_threshold:
            continue

        # Strip the "load more comments" placeholders so that only real
        # comments remain in the forest.
        submission.comments.replace_more(limit=0)

        # Filter first, then take: a pinned or blank comment no longer uses up
        # one of the `top_n_comments` slots (slicing before filtering could
        # leave a post with fewer usable comments than requested).
        top_comments = []
        for comment in submission.comments:
            if len(top_comments) >= top_n_comments:
                break
            # Pinned comments are skipped, mirroring the pinned-post check above.
            if getattr(comment, "stickied", False):
                continue
            body = getattr(comment, "body", "").strip()
            if body:
                top_comments.append(body)

        # A post without any usable comment is not collected.
        if not top_comments:
            continue

        post_data = {
            "subreddit": sub,
            "title": submission.title.strip(),
            "selftext": submission.selftext.strip(),
            "score": submission.score,
            "url": submission.url,
            "id": submission.id,
            "created_utc": submission.created_utc,
            "comments": top_comments
        }

        results.append(post_data)
        collected += 1
        print(f"[{collected}] Collected from r/{sub}")

        if collected >= target_per_subreddit:
            break

        time.sleep(1)  # Rate limit

# ---- Save to JSON file ----
# Serialise every collected post as one indented JSON array; non-ASCII
# characters are written as-is (UTF-8) rather than as \u escapes.
with open("reddit_wisdom_data.json", mode="w", encoding="utf-8") as json_file:
    json_file.write(json.dumps(results, ensure_ascii=False, indent=2))

print(f"\nSaved {len(results)} high-quality posts with top comments.")

Fetching from r/AskReddit
[1] Collected from r/AskReddit
[2] Collected from r/AskReddit
[3] Collected from r/AskReddit
[4] Collected from r/AskReddit
[5] Collected from r/AskReddit
[6] Collected from r/AskReddit
[7] Collected from r/AskReddit
[8] Collected from r/AskReddit
[9] Collected from r/AskReddit
[10] Collected from r/AskReddit
[11] Collected from r/AskReddit
[12] Collected from r/AskReddit
[13] Collected from r/AskReddit
[14] Collected from r/AskReddit
[15] Collected from r/AskReddit
[16] Collected from r/AskReddit
[17] Collected from r/AskReddit
[18] Collected from r/AskReddit
[19] Collected from r/AskReddit
[20] Collected from r/AskReddit
[21] Collected from r/AskReddit
[22] Collected from r/AskReddit
[23] Collected from r/AskReddit
[24] Collected from r/AskReddit
[25] Collected from r/AskReddit
[26] Collected from r/AskReddit
[27] Collected from r/AskReddit
[28] Collected from r/AskReddit
[29] Collected from r/AskReddit
[30] Collected from r/AskReddit
[31] Collected from r/A

In [6]:
import json
import re
from pathlib import Path

# --- Paths ---
# Raw scrape (JSON array) and the cleaned output (one JSON object per line);
# both are relative to the current working directory and share a file stem.
INPUT_PATH = Path("reddit_wisdom_data.json")
OUTPUT_PATH = INPUT_PATH.with_suffix(".jsonl")

# --- Bot/Spam Filter ---
def is_valid_comment(text: str) -> bool:
    """Return True unless the comment looks like bot, moderation or link-only noise.

    A comment is rejected when, compared case-insensitively, it contains any
    of the marker phrases below, or when it contains "http" and the original
    text is shorter than 50 characters.

    Args:
        text: The comment body to inspect.

    Returns:
        False for a rejected comment, True otherwise (including for an empty
        string).
    """
    lowered = text.lower()

    junk_markers = (
        "i am a bot",
        "this action was performed automatically",
        "[removed]",
        "[deleted]",
        "moderator",
    )
    for marker in junk_markers:
        if marker in lowered:
            return False

    # Likely just a link: short text that contains a URL scheme.
    if len(text) < 50 and "http" in lowered:
        return False

    return True

# --- Improved text cleaner ---
def clean_text(text: str) -> str:
    """Strip URLs, collapse whitespace and lower-case the text.

    The substitutions run in order: every run of non-whitespace characters
    starting at a lowercase "http" is deleted, then every run of whitespace is
    replaced by a single space. The result is trimmed and lower-cased.

    Args:
        text: Raw text to normalise.

    Returns:
        The normalised text; an empty string if nothing is left.
    """
    substitutions = (
        (r"http\S+", ""),  # Remove URLs
        (r"\s+", " "),     # Normalize whitespace
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip().lower()

# --- Load raw data ---
raw_data = json.loads(INPUT_PATH.read_text(encoding="utf-8"))

processed = []
skipped = 0

for entry in raw_data:
    title = entry.get("title", "").strip()
    selftext = entry.get("selftext", "").strip()

    # Filter junk/bot comments
    valid_comments = list(filter(is_valid_comment, entry.get("comments", [])))

    # An entry with no title, no body and no surviving comment is only counted.
    if not (title or selftext or valid_comments):
        skipped += 1
        continue

    # Title, body and all surviving comments, space-separated, then normalised.
    combined_raw = " ".join((title, selftext, " ".join(valid_comments)))

    record = {
        "title": title,
        "selftext": selftext,
        "comments": valid_comments,
        # First surviving comment, or "" when none passed the filter.
        "top_comment": next(iter(valid_comments), ""),
        "combined_clean": clean_text(combined_raw),
        "url": entry.get("url", ""),
        "score": entry.get("score", 0),
        "subreddit": entry.get("subreddit", ""),
        "created_utc": entry.get("created_utc"),
    }
    processed.append(record)

print(f"✓ Loaded {len(raw_data)} entries")
print(f"✓ Processed {len(processed)} valid entries")
print(f"✗ Skipped {skipped} empty or low-quality entries")

# --- Save cleaned JSONL ---
# One JSON object per line, in the same order as `processed`.
with OUTPUT_PATH.open("w", encoding="utf-8") as outfile:
    outfile.writelines(f"{json.dumps(item)}\n" for item in processed)

print(f"✓ Saved cleaned data to {OUTPUT_PATH.resolve()}")


✓ Loaded 260 entries
✓ Processed 260 valid entries
✗ Skipped 0 empty or low-quality entries
✓ Saved cleaned data to /Users/martinkrawtzow/Library/CloudStorage/OneDrive-MichaelMöhleundRainerBraker/Studium/Master/Semester2/Social Media Analytics/SMA_Capstone/reddit_wisdom_data.jsonl
