In [3]:
import json, time, re, sys
from datetime import datetime
from urllib.parse import urlparse
import requests
import pandas as pd

In [8]:
# reddit_ao3_ai_sentiment.py
import argparse, json, re, sys, time
from datetime import datetime
from urllib.parse import urlparse
import requests
import pandas as pd

# User-Agent header value sent with every request issued by fetch_json().
# NOTE(review): the contact address looks like a placeholder -- replace it
# with a real one before running this against a live service.
UA = "AO3-Reddit-Sentiment/2.0 (contact: you@example.com)"

# ---------- Sentiment (VADER) ----------
def get_sia():
    """Build a VADER ``SentimentIntensityAnalyzer``.

    NLTK is imported inside the function rather than at module load. If the
    VADER lexicon cannot be located in NLTK's data path it is downloaded
    (quietly) before the analyzer is constructed.
    """
    import nltk
    from nltk.sentiment import SentimentIntensityAnalyzer

    lexicon_resource = "sentiment/vader_lexicon.zip"
    try:
        nltk.data.find(lexicon_resource)
    except LookupError:
        # Lexicon not installed locally yet: fetch it once.
        nltk.download("vader_lexicon", quiet=True)

    analyzer = SentimentIntensityAnalyzer()
    return analyzer

# ---------- URL helpers ----------
def ensure_scheme(url: str) -> str:
    """Return *url* with an ``https://`` prefix when it has no http(s) scheme.

    The scheme check is case-insensitive. For scheme-less input any leading
    slashes are stripped before the prefix is added; input that already
    starts with ``http://`` or ``https://`` is returned untouched.
    """
    lowered = url.lower()
    has_scheme = lowered.startswith("http://") or lowered.startswith("https://")
    return url if has_scheme else "https://" + url.lstrip("/")

def to_json_url(thread_url: str, sort: str = "best") -> str:
    """Normalize a Reddit thread URL to its JSON endpoint.

    Accepts the URL with or without a scheme, a trailing slash, or a
    trailing ``.json``. Any query string or fragment already present on the
    input (e.g. share-link tracking parameters) is discarded so it cannot
    end up inside the path.

    Args:
        thread_url: URL of the thread; surrounding whitespace is ignored.
        sort: Comment sort order to request. The value is percent-encoded,
            and the CLI spelling ``"q&a"`` is translated to ``"qa"`` so the
            ampersand cannot split the query string.

    Returns:
        ``<scheme>://<host><path>/.json?sort=<sort>&raw_json=1``.
        ``raw_json=1`` asks Reddit not to HTML-escape ``<``, ``>`` and ``&``
        in comment bodies, which the downstream blockquote stripping and
        sentiment scoring rely on.
    """
    # Only urlparse is imported at file level, so pull quote in locally.
    from urllib.parse import quote

    sort_aliases = {"q&a": "qa"}

    parts = urlparse(ensure_scheme(thread_url.strip()))

    # Reduce the path to the bare thread path, then append exactly one
    # "/.json" regardless of how the input was terminated.
    path = parts.path.rstrip("/")
    if path.endswith(".json"):
        path = path[: -len(".json")].rstrip("/")
    path += "/.json"

    sort_value = quote(sort_aliases.get(sort, sort), safe="")
    query = f"sort={sort_value}&raw_json=1"
    return parts._replace(path=path, params="", query=query, fragment="").geturl()

# ---------- HTTP fetch with retries ----------
def fetch_json(url: str, max_retries: int = 4, base_sleep: float = 1.5):
    """GET *url* and return the decoded JSON body, retrying on failure.

    Args:
        url: Endpoint to fetch.
        max_retries: Total number of attempts. Values below 1 are treated
            as 1 so that at least one request is always made.
        base_sleep: Base back-off in seconds; the wait before the next
            attempt is ``base_sleep * attempt``.

    Returns:
        The parsed JSON payload.

    Raises:
        requests.RequestException: If the final attempt fails. This includes
            an ``HTTPError`` when the final attempt is still rate limited
            (HTTP 429), so the function never silently returns ``None``.
    """
    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        is_last = attempt == attempts
        try:
            r = requests.get(url, headers={"User-Agent": UA}, timeout=30)
            if r.status_code == 429 and not is_last:
                retry_after = r.headers.get("Retry-After")
                try:
                    wait = float(retry_after) if retry_after else base_sleep * attempt
                except ValueError:
                    # Retry-After may be an HTTP date rather than a number
                    # of seconds; fall back to our own back-off.
                    wait = base_sleep * attempt
                wait = max(0.0, wait)  # time.sleep rejects negative values
                print(f"Rate limited (429). Sleeping {wait:.1f}s...")
                time.sleep(wait)
                continue
            # On the last attempt a 429 lands here and is raised like any
            # other HTTP error instead of falling out of the loop.
            r.raise_for_status()
            return r.json()
        except requests.RequestException as e:
            if is_last:
                raise
            wait = base_sleep * attempt
            print(f"Fetch error ({e}). Retry {attempt}/{max_retries} in {wait:.1f}s...")
            time.sleep(wait)

# ---------- Flatten reddit comment tree ----------
def walk_comments(node, out, link_id, parent_id=None, depth=0,
                  skip_authors=frozenset({"[deleted]", "AutoModerator"})):
    """Flatten one comment node and all of its replies into *out*.

    Args:
        node: A comment "thing" dict; its payload is read from ``node["data"]``.
        out: List that receives one row dict per kept comment (mutated in place).
        link_id: Identifier of the thread, copied onto every row.
        parent_id: Id of the enclosing comment as supplied by the caller.
            It is accepted for interface compatibility only; the
            ``parent_id`` stored in each row is the comment's own
            ``parent_id`` field.
        depth: Nesting level of *node* (0 for top-level comments).
        skip_authors: Authors whose comments are not recorded. Their replies
            are still visited. The default is an immutable ``frozenset`` so
            it cannot be modified across calls.

    Returns:
        None. Rows are appended to *out*.
    """
    from datetime import timezone  # file level imports only `datetime`

    data = node.get("data", {})
    if "body" in data:
        author = data.get("author")
        if author not in skip_authors:
            body = data.get("body") or ""
            # Clean: remove blockquotes and collapse whitespace
            clean = re.sub(r"(?m)^>.*\n?", "", body)
            clean = re.sub(r"\s+", " ", clean).strip()

            created_utc = data.get("created_utc")
            if isinstance(created_utc, (int, float)):
                # Timezone-aware conversion (utcfromtimestamp is deprecated),
                # then drop tzinfo so the ISO string keeps its naive-UTC
                # format. A timestamp of 0 is valid and is converted too.
                created_iso = (
                    datetime.fromtimestamp(created_utc, tz=timezone.utc)
                    .replace(tzinfo=None)
                    .isoformat()
                )
            else:
                created_iso = ""

            out.append({
                "link_id": link_id,
                "comment_id": data.get("id"),
                "parent_id": data.get("parent_id"),
                "depth": depth,
                "author": author,
                "score": data.get("score"),
                "created_utc": created_utc,
                "created_iso": created_iso,
                "body": clean,
            })

    # Recurse into replies. A comment without replies carries a non-dict
    # value here, which the isinstance check skips.
    replies = data.get("replies")
    if isinstance(replies, dict):
        children = replies.get("data", {}).get("children", [])
        for c in children:
            if c.get("kind") == "t1":  # only comment nodes; other kinds are ignored
                walk_comments(c, out, link_id, data.get("id"), depth + 1, skip_authors)

def load_comments(json_root) -> pd.DataFrame:
    """Turn a thread's JSON payload into a flat DataFrame of comments.

    ``json_root[0]`` is read for the post (its ``id`` becomes ``link_id``)
    and ``json_root[1]`` for the comment tree. Every top-level child of kind
    ``"t1"`` is flattened through ``walk_comments``; other kinds are skipped.
    """
    thread_id = json_root[0]["data"]["children"][0]["data"].get("id")

    rows = []
    for top_level in json_root[1]["data"]["children"]:
        if top_level.get("kind") != "t1":
            continue
        walk_comments(top_level, rows, thread_id)
    return pd.DataFrame(rows)

# ---------- Sentiment analysis ----------
def analyze_sentiment(df: pd.DataFrame, weight_by_score: bool = False):
    """Score every comment body with VADER and summarise the results.

    Args:
        df: Comments frame with a ``body`` column (and a ``score`` column
            when *weight_by_score* is true). It is not modified; a new frame
            is returned.
        weight_by_score: If true, ``avg_compound`` is a weighted mean using
            ``score + 1`` clipped at 0 as the weight (missing scores count
            as 0). When every weight is 0 the plain mean is used instead.

    Returns:
        ``(scored_df, summary)`` where *scored_df* is *df* plus the VADER
        score columns and a ``sentiment`` label, and *summary* is a dict with
        ``n_comments``, ``avg_compound`` and the share of each label.
    """
    from numpy import average
    sia = get_sia()
    polarity = [sia.polarity_scores(text) for text in df["body"]]
    # Reuse df's index so the column-wise concat lines rows up positionally
    # even when df was filtered or re-indexed before being passed in.
    sent = pd.DataFrame(polarity, index=df.index)
    df = pd.concat([df, sent], axis=1)

    def bucket(x):
        if x >= 0.05:
            return "positive"
        if x <= -0.05:
            return "negative"
        return "neutral"
    df["sentiment"] = df["compound"].apply(bucket)

    avg_compound = float(df["compound"].mean())
    if weight_by_score:
        weights = (df["score"].fillna(0) + 1).clip(lower=0)
        # numpy.average raises ZeroDivisionError when the weights sum to 0
        # (every comment scored -1 or lower); keep the plain mean then.
        if float(weights.sum()) > 0:
            avg_compound = float(average(df["compound"], weights=weights))

    summary = {
        "n_comments": int(len(df)),
        "avg_compound": avg_compound,
        "share_negative": float((df["sentiment"] == "negative").mean()),
        "share_neutral":  float((df["sentiment"] == "neutral").mean()),
        "share_positive": float((df["sentiment"] == "positive").mean()),
    }
    return df, summary

# ---------- CLI ----------
def parse_args(argv=None):
    """Parse command-line options.

    Args:
        argv: Optional list of argument strings. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``. Passing an explicit list
            allows use from a notebook, where ``sys.argv`` holds the kernel
            launcher's own flags (e.g. ``-f``) and parsing it fails with
            "unrecognized arguments".

    Returns:
        The populated ``argparse.Namespace``.
    """
    p = argparse.ArgumentParser(
        description="Scrape a Reddit thread's comments and run sentiment analysis."
    )
    p.add_argument("url", help="Full Reddit thread URL (include https://)")
    p.add_argument("-o", "--out", default=None, help="Output CSV filename (default auto)")
    p.add_argument("--sort", choices=["best","top","new","controversial","old","q&a"],
                   default="best", help="Sort for the Reddit JSON endpoint")
    p.add_argument("--weight-by-score", action="store_true",
                   help="Weight average sentiment by comment score")
    p.add_argument("--no-chart", action="store_true",
                   help="Skip saving sentiment bar chart")
    return p.parse_args(argv)


def main(argv=None):
    """Fetch a thread, score its comments, and write the CSV, summary and chart.

    Args:
        argv: Optional argument list forwarded to ``parse_args``. Leave as
            ``None`` for normal command-line use; in a notebook call e.g.
            ``main(["https://www.reddit.com/r/.../comments/.../"])``.

    Exits with status 1 for a bad URL argument, 2 when the fetch fails,
    3 when the payload cannot be parsed, and 0 when no comments are found.
    """
    args = parse_args(argv)

    # Guard against flags being passed as the "url"
    if args.url.startswith("-"):
        print("Error: First positional argument must be the Reddit thread URL (including https://).")
        sys.exit(1)

    jurl = to_json_url(args.url, sort=args.sort)
    print(f"Fetching JSON: {jurl}")
    try:
        j = fetch_json(jurl)
    except Exception as e:
        print(f"Failed to fetch JSON: {e}")
        sys.exit(2)

    try:
        df = load_comments(j)
    except Exception as e:
        print(f"Failed to parse comments: {e}")
        sys.exit(3)

    if df.empty:
        print("No comments found (thread may be empty, locked, or requires login).")
        sys.exit(0)

    df, summary = analyze_sentiment(df, weight_by_score=args.weight_by_score)

    # Save CSV
    if args.out:
        out_csv = args.out
    else:
        # Take the host from the normalized URL: it always carries a scheme,
        # whereas a scheme-less args.url would parse to an empty netloc.
        host = urlparse(jurl).netloc.replace(".", "_")
        out_csv = f"reddit_comments_{host}.csv"
    df.to_csv(out_csv, index=False, encoding="utf-8")
    print(f"Saved {len(df)} comments to {out_csv}")

    # Print summary
    print("\n--- Sentiment Summary ---")
    for k, v in summary.items():
        print(f"{k}: {v}")

    # Examples
    example_cols = ["compound", "body"]
    print("\nMost negative examples:")
    print(df.sort_values("compound").head(5)[example_cols].to_string(index=False, max_colwidth=120))
    print("\nMost positive examples:")
    print(df.sort_values("compound", ascending=False).head(5)[example_cols].to_string(index=False, max_colwidth=120))

    # Optional chart (best effort: any failure is reported and skipped)
    if not args.no_chart:
        try:
            import matplotlib.pyplot as plt
            counts = df["sentiment"].value_counts().reindex(["negative","neutral","positive"]).fillna(0)
            fig, ax = plt.subplots(figsize=(5,4))
            try:
                ax.bar(counts.index, counts.values)
                ax.set_title("Reddit Sentiment on AI-Generated Writing")
                ax.set_ylabel("Number of comments")
                fig.tight_layout()
                fig.savefig("reddit_ao3_ai_sentiment_counts.png", dpi=200)
                print("Saved bar chart: reddit_ao3_ai_sentiment_counts.png")
            finally:
                # Release the figure so repeated runs do not accumulate
                # open figures.
                plt.close(fig)
        except Exception as e:
            print("Chart skipped:", e)


if __name__ == "__main__":
    main()


usage: ipykernel_launcher.py [-h] [-o OUT]
                             [--sort {best,top,new,controversial,old,q&a}]
                             [--weight-by-score] [--no-chart]
                             url
ipykernel_launcher.py: error: unrecognized arguments: -f


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
