In [1]:
import pandas as pd

In [5]:
#!/usr/bin/env python3
"""
DailyDabbers post + comment network CSV
post ▸ comment (depth 1) ▸ reply (depth ≥2)  …
"""

from pathlib import Path
import pandas as pd
from tqdm import tqdm

# ── 1.  FILES ────────────────────────────────────────────────────
# Folder holding the extracted r/DailyDabbers subset CSVs.
# NOTE(review): "Coulumns" looks like a typo, but the recorded run of this cell
# succeeded with it, so it presumably matches the folder name on disk.
_SUBSET_DIR = Path(
    "/Users/jacksonsorenson/Documents/Computational Media Lab/Weed Study/"
    "CSV for mass database organization/R:DailyDabbers/Extracted Coulumns"
)
POSTS_FILE = _SUBSET_DIR / "r_daily_dabbers_posts_subset.csv"
COMMENTS_FILE = _SUBSET_DIR / "r_dailydabbers_comments_subset.csv"

# The threaded network CSV is written to the current user's Desktop.
OUT_FILE = Path.home().joinpath("Desktop", "r_dailydabbers_network.csv")

# ── 2.  HELPERS ─────────────────────────────────────────────────
def strip_prefix(x: str) -> str:
    """Return *x* without its leading ``<kind>_`` prefix.

    Everything up to and including the first underscore is dropped
    (``"t3_abc"`` -> ``"abc"``). Strings without an underscore and
    non-string values (e.g. NaN, None) are returned unchanged.
    """
    if not isinstance(x, str):
        return x
    _kind, underscore, remainder = x.partition("_")
    return remainder if underscore else x
def to_int(x):
    """Convert *x* to ``int``, returning ``pd.NA`` when that is impossible.

    Accepts plain integer strings ("1609459200") as well as float-formatted
    ones ("1609459200.0"), which ``int()`` alone rejects. Missing values
    (None, NaN, "") and non-numeric or infinite inputs yield ``pd.NA``.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        pass
    # Second chance for float-formatted text such as "1609459200.0".
    try:
        return int(float(x))
    except (TypeError, ValueError, OverflowError):
        return pd.NA

# ── 3.  LOAD + NORMALISE ───────────────────────────────────────
# keep_default_na=False stops pandas from converting literal text such as
# "NA", "null", "None" or "nan" (all plausible usernames / comment bodies)
# into missing values. Empty cells then load as "" instead of NaN.
posts    = pd.read_csv(POSTS_FILE, dtype=str, keep_default_na=False)
comments = pd.read_csv(COMMENTS_FILE, dtype=str, keep_default_na=False)

# Prefix-free IDs, plus an integer timestamp that is used only for ordering
# (the original string timestamp is what gets written to the output).
posts["post_id"]       = posts["id"].apply(strip_prefix)
posts["created_utc_i"] = posts["created_utc"].apply(to_int)

comments["comment_id"]     = comments["id"].apply(strip_prefix)
comments["parent_id_norm"] = comments["parent_id"].apply(strip_prefix)
comments["created_utc_i"]  = comments["created_utc"].apply(to_int)

# ── 4.  CHILD LISTS  ────────────────────────────────────────────
# Map each normalised parent id -> list of its direct child comment rows.
children = {}
for _, row in comments.iterrows():
    children.setdefault(row["parent_id_norm"], []).append(row)

# Order siblings chronologically. Rows whose timestamp could not be parsed
# hold pd.NA, and `pd.NA or 0` raises TypeError, so test for NA explicitly
# and sort those rows first (key 0).
for lst in children.values():
    lst.sort(key=lambda r: 0 if pd.isna(r["created_utc_i"]) else r["created_utc_i"])

# ── 5.  TEXT LOOK-UPS  ─────────────────────────────────────────
def _text(value):
    """Return *value* if it is a string, otherwise "" (covers NaN / None)."""
    return value if isinstance(value, str) else ""

# post_id -> "title\n\nselftext". Missing fields become "" rather than the
# literal text "nan" that formatting a NaN into an f-string would produce.
post_text = {
    r["post_id"]: f"{_text(r.get('title'))}\n\n{_text(r.get('selftext'))}"
    for _, r in posts.iterrows()
}

# comment_id -> comment body ("" when the body is missing).
comment_text = {
    r["comment_id"]: _text(r.get("body"))
    for _, r in comments.iterrows()
}

def parent_text(pid):
    """Return the text of the parent *pid*: a comment body if *pid* is a known
    comment, else the post text, else ""."""
    if pid in comment_text:
        return comment_text[pid]
    return post_text.get(pid, "")

# ── 6.  DEPTH-FIRST WALK  ──────────────────────────────────────
rows_out = []

def dfs(parent_id: str, depth: int):
    """Append one output row per descendant of *parent_id* to ``rows_out``.

    Descendants are visited depth-first in pre-order, siblings in the order
    stored in ``children``. Direct children get ``depth``; each nesting level
    below adds one. Rows at depth 1 are labelled "comment", deeper ones
    "reply".

    The walk uses an explicit stack instead of recursion so that very deep
    reply chains cannot hit Python's recursion limit.
    """
    # Each stack entry: (iterator over one node's children, depth of those children).
    stack = [(iter(children.get(parent_id, [])), depth)]
    while stack:
        siblings, level = stack[-1]
        com = next(siblings, None)
        if com is None:            # this sibling group is exhausted
            stack.pop()
            continue

        cid = com["comment_id"]
        # link_id may be absent, missing (NaN) or lack the "t3_" prefix.
        link_id = com.get("link_id", "")
        rows_out.append({
            "node_type"        : "comment" if level == 1 else "reply",
            "depth"            : level,
            "id"               : cid,
            "parent_id"        : com["parent_id_norm"],
            "post_id"          : link_id.split("_", 1)[-1] if isinstance(link_id, str) else "",
            "created_utc"      : com["created_utc"],
            "author"           : com["author"],
            "score"            : com["score"],
            "body_text"        : comment_text[cid],
            "parent_body_text" : parent_text(com["parent_id_norm"]),
            "distinguished"    : com.get("distinguished", "")
        })
        # Descend into this comment's replies before continuing with its siblings.
        stack.append((iter(children.get(cid, [])), level + 1))

def _post_row(post):
    """Build the depth-0 output record for one submission row."""
    post_key = post["post_id"]
    return {
        "node_type"        : "post",
        "depth"            : 0,
        "id"               : post_key,
        "parent_id"        : "",
        "post_id"          : post_key,
        "created_utc"      : post["created_utc"],
        "author"           : post["author"],
        "score"            : post["score"],
        "body_text"        : post_text[post_key],
        "parent_body_text" : "",
        "distinguished"    : post.get("distinguished", ""),
        "num_comments"     : post.get("num_comments", ""),
        "media"            : post.get("media", "")
    }

# Oldest submissions first; every post row is followed directly by its thread.
posts = posts.sort_values("created_utc_i")
for _idx, submission in tqdm(posts.iterrows(), total=len(posts), desc="Threading"):
    rows_out.append(_post_row(submission))
    dfs(submission["post_id"], 1)

# ── 7.  SAVE  ───────────────────────────────────────────────────
network = pd.DataFrame(rows_out)
network.to_csv(OUT_FILE, index=False)
print(f"✅  Thread network saved → {OUT_FILE}")


Threading: 100%|██████████| 4284/4284 [00:00<00:00, 13659.89it/s]


✅  Thread network saved → /Users/jacksonsorenson/Desktop/r_dailydabbers_network.csv


In [6]:
#!/usr/bin/env python3
"""
Build a hierarchical (post › comment › reply) CSV
for r/Dabs using the extracted subset files.
"""

from pathlib import Path
import pandas as pd
from tqdm import tqdm

# ── 1.  FILE LOCATIONS ───────────────────────────────────────────
# Folder holding the extracted r/Dabs subset CSVs.
_SUBSET_DIR = Path(
    "/Users/jacksonsorenson/Documents/Computational Media Lab/Weed Study/"
    "CSV for mass database organization/R:Dabs/Extracted Columns"
)
POSTS_FILE = _SUBSET_DIR / "r_dabs_posts_subset.csv"
COMMENTS_FILE = _SUBSET_DIR / "r_dabs_comments_subset.csv"

# The threaded network CSV is written to the current user's Desktop.
OUT_FILE = Path.home().joinpath("Desktop", "r_dabs_network.csv")

# ── 2.  SMALL HELPERS ────────────────────────────────────────────
def strip_id(x: str) -> str:
    """Remove a leading 't3_' / 't1_' style prefix if present.

    Everything up to and including the first underscore is dropped.
    Strings without an underscore and non-string values (e.g. NaN, None)
    are returned unchanged.
    """
    if not isinstance(x, str):
        return x
    _kind, underscore, remainder = x.partition("_")
    return remainder if underscore else x

def to_int(x):
    """Convert *x* to ``int``, returning ``pd.NA`` when that is impossible.

    Accepts plain integer strings ("1609459200") as well as float-formatted
    ones ("1609459200.0"), which ``int()`` alone rejects. Missing values
    (None, NaN, "") and non-numeric or infinite inputs yield ``pd.NA``.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        pass
    # Second chance for float-formatted text such as "1609459200.0".
    try:
        return int(float(x))
    except (TypeError, ValueError, OverflowError):
        return pd.NA

# ── 3.  LOAD & NORMALISE ────────────────────────────────────────
# keep_default_na=False stops pandas from converting literal text such as
# "NA", "null", "None" or "nan" (all plausible usernames / comment bodies)
# into missing values. Empty cells then load as "" instead of NaN.
posts    = pd.read_csv(POSTS_FILE, dtype=str, keep_default_na=False)
comments = pd.read_csv(COMMENTS_FILE, dtype=str, keep_default_na=False)

# Prefix-free IDs, plus an integer timestamp that is used only for ordering
# (the original string timestamp is what gets written to the output).
posts["post_id"]        = posts["id"].apply(strip_id)
posts["created_utc_int"] = posts["created_utc"].apply(to_int)

comments["comment_id"]      = comments["id"].apply(strip_id)
comments["parent_id_norm"]  = comments["parent_id"].apply(strip_id)
comments["created_utc_int"] = comments["created_utc"].apply(to_int)

# ── 4.  BUILD CHILD LISTS FOR EACH PARENT ───────────────────────
# Map each normalised parent id -> list of its direct child comment rows.
children = {}
for _, row in comments.iterrows():
    children.setdefault(row["parent_id_norm"], []).append(row)

# Chronological order within each parent. Rows whose timestamp could not be
# parsed hold pd.NA, and `pd.NA or 0` raises TypeError, so test for NA
# explicitly and sort those rows first (key 0).
for lst in children.values():
    lst.sort(key=lambda r: 0 if pd.isna(r["created_utc_int"]) else r["created_utc_int"])

# ── 5.  QUICK TEXT LOOK-UPS ─────────────────────────────────────
def _text(value):
    """Return *value* if it is a string, otherwise "" (covers NaN / None)."""
    return value if isinstance(value, str) else ""

# post_id -> "title\n\nselftext". Missing fields become "" rather than the
# literal text "nan" that formatting a NaN into an f-string would produce.
post_text = {
    r["post_id"]: f"{_text(r.get('title'))}\n\n{_text(r.get('selftext'))}"
    for _, r in posts.iterrows()
}

# comment_id -> comment body ("" when the body is missing).
comment_text = {
    r["comment_id"]: _text(r.get("body"))
    for _, r in comments.iterrows()
}

def get_parent_txt(pid):
    """Return the text of the parent *pid*: a comment body if *pid* is a known
    comment, else the post text, else ""."""
    if pid in comment_text:
        return comment_text[pid]
    return post_text.get(pid, "")

# ── 6.  DEPTH-FIRST WALK TO EMIT ROWS ───────────────────────────
rows = []

def dfs(parent_id: str, depth: int):
    """Attach all descendants of *parent_id* to ``rows`` depth-first.

    Descendants are visited in pre-order, siblings in the order stored in
    ``children``. Direct children get ``depth``; each nesting level below
    adds one. Rows at depth 1 are labelled "comment", deeper ones "reply".

    The walk uses an explicit stack instead of recursion so that very deep
    reply chains cannot hit Python's recursion limit.
    """
    # Each stack entry: (iterator over one node's children, depth of those children).
    stack = [(iter(children.get(parent_id, [])), depth)]
    while stack:
        siblings, level = stack[-1]
        com = next(siblings, None)
        if com is None:            # this sibling group is exhausted
            stack.pop()
            continue

        cid = com["comment_id"]
        # link_id may be absent in the subset or missing (NaN) for a row;
        # NaN is truthy, so `... or ""` alone would let it through.
        link = strip_id(com.get("link_id", ""))
        rows.append({
            "node_type"        : "comment" if level == 1 else "reply",
            "depth"            : level,
            "id"               : cid,
            "parent_id"        : com["parent_id_norm"],
            "post_id"          : link if isinstance(link, str) else "",
            "created_utc"      : com["created_utc"],
            "author"           : com["author"],
            "score"            : com["score"],
            "body_text"        : comment_text[cid],
            "parent_body_text" : get_parent_txt(com["parent_id_norm"]),
            "distinguished"    : com.get("distinguished", "")
        })
        # Descend into this comment's replies before continuing with its siblings.
        stack.append((iter(children.get(cid, [])), level + 1))

def _post_row(post):
    """Build the depth-0 output record for one submission row."""
    post_key = post["post_id"]
    return {
        "node_type"        : "post",
        "depth"            : 0,
        "id"               : post_key,
        "parent_id"        : "",
        "post_id"          : post_key,
        "created_utc"      : post["created_utc"],
        "author"           : post["author"],
        "score"            : post["score"],
        "body_text"        : post_text[post_key],
        "parent_body_text" : "",
        "distinguished"    : post.get("distinguished", ""),
        "num_comments"     : post.get("num_comments", ""),
        "media"            : post.get("media", "")
    }

# Oldest submissions first; every post row is followed directly by its thread.
posts = posts.sort_values("created_utc_int")
for _idx, submission in tqdm(posts.iterrows(), total=len(posts), desc="Threading"):
    rows.append(_post_row(submission))
    dfs(submission["post_id"], depth=1)

# ── 7.  SAVE OUTPUT ─────────────────────────────────────────────
network = pd.DataFrame(rows)
network.to_csv(OUT_FILE, index=False)
print(f"✅  Thread network saved → {OUT_FILE}")


Threading: 100%|██████████| 113708/113708 [00:13<00:00, 8475.18it/s] 


✅  Thread network saved → /Users/jacksonsorenson/Desktop/r_dabs_network.csv
