In [8]:
import praw
import json
import time
import os
from dotenv import load_dotenv

# ---- Load Environment Variables ----
load_dotenv()

reddit = praw.Reddit(
    client_id=os.getenv("CLIENT_ID"),
    client_secret=os.getenv("CLIENT_SECRET"),
    user_agent=os.getenv("USER_AGENT")
)

# ---- Parameters ----
subreddit_name = "LifeProTips"
desired_total_posts = 10000
top_n_comments = 3
min_score_threshold = 100
save_every = 500  # Save every N posts
output_file = "reddit_lpt_data.json"

results = []
seen_ids = set()

# ---- Try to Resume From Existing File ----
if os.path.exists(output_file):
    with open(output_file, "r", encoding="utf-8") as f:
        results = json.load(f)
        seen_ids = {post["id"] for post in results}
    print(f"✔ Resumed with {len(results)} previously collected posts.")

# ---- Main Collection Loop ----
print(f"\n🔍 Fetching from r/{subreddit_name}")
subreddit = reddit.subreddit(subreddit_name)
collected = 0

for submission in subreddit.top(limit=None, time_filter="all"):
    if submission.id in seen_ids:
        continue
    if submission.stickied or submission.score < min_score_threshold:
        continue

    try:
        submission.comments.replace_more(limit=0)
        top_comments = [
            comment.body.strip()
            for comment in submission.comments[:top_n_comments]
            if hasattr(comment, "body") and comment.body.strip()
        ]

        if not top_comments:
            continue

        post_data = {
            "subreddit": subreddit_name,
            "title": submission.title.strip(),
            "selftext": submission.selftext.strip(),
            "score": submission.score,
            "url": submission.url,
            "id": submission.id,
            "created_utc": submission.created_utc,
            "comments": top_comments
        }

        results.append(post_data)
        seen_ids.add(submission.id)
        collected += 1

        if collected % 50 == 0:
            print(f"→ {collected} posts collected...")

        if collected % save_every == 0:
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(results, f, ensure_ascii=False, indent=2)
            print(f"💾 Auto-saved after {collected} posts.")

        if len(results) >= desired_total_posts:
            break

        time.sleep(1)

    except Exception as e:
        print(f"⚠️ Error on submission {submission.id}: {e}")
        time.sleep(2)

print(f"\n✅ Finished r/{subreddit_name}: Collected {collected} new posts")

# ---- Final Save ----
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"\n🎉 Total saved: {len(results)} posts → {output_file}")



🔍 Fetching from r/LifeProTips


KeyboardInterrupt: 

In [6]:
import json
import re
from pathlib import Path

# --- Paths ---
INPUT_PATH = Path("reddit_wisdom_data.json")
OUTPUT_PATH = Path("reddit_wisdom_data.jsonl")

# --- Bot/Spam Filter ---
def is_valid_comment(text: str) -> bool:
    lower = text.lower()
    return not any([
        "i am a bot" in lower,
        "this action was performed automatically" in lower,
        "[removed]" in lower,
        "[deleted]" in lower,
        "moderator" in lower,
        "http" in lower and len(text) < 50  # Likely just a link
    ])

# --- Improved text cleaner ---
def clean_text(text: str) -> str:
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"\s+", " ", text)  # Normalize whitespace
    return text.strip().lower()

# --- Load raw data ---
with open(INPUT_PATH, "r", encoding="utf-8") as f:
    raw_data = json.load(f)

processed = []
skipped = 0

for entry in raw_data:
    title = entry.get("title", "").strip()
    selftext = entry.get("selftext", "").strip()
    comments = entry.get("comments", [])

    # Filter junk/bot comments
    valid_comments = [c for c in comments if is_valid_comment(c)]

    if not title and not selftext and not valid_comments:
        skipped += 1
        continue

    top_comment = valid_comments[0] if valid_comments else ""
    combined_raw = f"{title} {selftext} {' '.join(valid_comments)}"
    combined_clean = clean_text(combined_raw)

    processed.append({
        "title": title,
        "selftext": selftext,
        "comments": valid_comments,
        "top_comment": top_comment,
        "combined_clean": combined_clean,
        "url": entry.get("url", ""),
        "score": entry.get("score", 0),
        "subreddit": entry.get("subreddit", ""),
        "created_utc": entry.get("created_utc", None),
    })

print(f"✓ Loaded {len(raw_data)} entries")
print(f"✓ Processed {len(processed)} valid entries")
print(f"✗ Skipped {skipped} empty or low-quality entries")

# --- Save cleaned JSONL ---
with open(OUTPUT_PATH, "w", encoding="utf-8") as outfile:
    for item in processed:
        json.dump(item, outfile)
        outfile.write("\n")

print(f"✓ Saved cleaned data to {OUTPUT_PATH.resolve()}")


✓ Loaded 260 entries
✓ Processed 260 valid entries
✗ Skipped 0 empty or low-quality entries
✓ Saved cleaned data to /Users/martinkrawtzow/Library/CloudStorage/OneDrive-MichaelMöhleundRainerBraker/Studium/Master/Semester2/Social Media Analytics/SMA_Capstone/reddit_wisdom_data.jsonl
