In [3]:
#!/usr/bin/env python3
"""
Extract the analysis-ready columns from r_daily_dabbers_posts.csv
and write a lighter CSV to the Desktop.
"""

from pathlib import Path
import pandas as pd

# ── 1. FILE LOCATIONS ─────────────────────────────────────────────
SRC  = Path(
    "/Users/jacksonsorenson/Documents/Computational Media Lab/Weed Study/"
    "CSV for mass database organization/R:DailyDabbers/r_daily_dabbers_posts.csv"
)
DEST = Path.home() / "Desktop" / "r_daily_dabbers_posts_subset.csv"

# ── 2. COLUMNS TO KEEP (IN OUTPUT ORDER) ──────────────────────────
# The order is spelled out in a list: iterating a set of strings can yield a
# different order on every interpreter start (string hashing is randomised),
# which made the column order of the written CSV unpredictable between runs.
# Names are lower-case because the header is lower-cased after loading.
KEEP_ORDERED = [
    "author",
    "created_utc",
    "distinguished",
    "id",
    "media",          # Pushshift calls it “media”
    "num_comments",
    "score",
    "title",
    "selftext",
    "subreddit",
]
# Set view of the same names, used only for the membership check below.
KEEP = set(KEEP_ORDERED)

# ── 3. LOAD, FILTER, SAVE  ───────────────────────────────────────
print("📥  Reading source file …")
# dtype=str: every column is read as text, so no numeric type inference is
# applied to any column.
# NOTE(review): read_csv's default NA parsing still turns literal cell values
# such as "NA" or "null" into missing values, which are then written out as
# empty fields — confirm this is acceptable for title/selftext/author.
df = pd.read_csv(SRC, dtype=str)
df.columns = df.columns.str.lower()      # handle Title vs title, Score vs score

# Stop before writing anything if the export lacks an expected column.
missing = KEEP - set(df.columns)
if missing:
    raise KeyError(f"Column(s) not found in file: {missing}")

print("✂️  Selecting relevant columns …")
df_subset = df[KEEP_ORDERED]             # list → deterministic column order

print("💾  Writing trimmed CSV …")
df_subset.to_csv(DEST, index=False)      # index=False: no row-number column
print(f"✅  Done! Saved to → {DEST}")


📥  Reading source file …
✂️  Selecting relevant columns …
💾  Writing trimmed CSV …
✅  Done! Saved to → /Users/jacksonsorenson/Desktop/r_daily_dabbers_posts_subset.csv


In [4]:
#!/usr/bin/env python3
"""
Trim r_dailydabbers_comments copy.csv down to the columns needed for
thread-building and analysis, then save the lighter file to the Desktop.
"""

from pathlib import Path
import pandas as pd

# ── 1. FILE LOCATIONS ─────────────────────────────────────────────
SRC  = Path(
    "/Users/jacksonsorenson/Documents/Computational Media Lab/Weed Study/"
    "CSV for mass database organization/R:DailyDabbers/"
    "r_dailydabbers_comments copy.csv"
)
DEST = Path.home() / "Desktop" / "r_dailydabbers_comments_subset.csv"

# ── 2. COLUMNS TO KEEP (IN OUTPUT ORDER) ──────────────────────────
# The order is spelled out in a list: iterating a set of strings can yield a
# different order on every interpreter start (string hashing is randomised),
# which made the column order of the written CSV unpredictable between runs.
# Names are lower-case because the header is lower-cased after loading.
KEEP_ORDERED = [
    "author",
    "body",
    "created_utc",
    "id",
    "parent_id",
    "score",
    "distinguished",
    "subreddit",
]
# Set view of the same names, used only for the membership check below.
KEEP = set(KEEP_ORDERED)

# ── 3. LOAD, FILTER, SAVE  ───────────────────────────────────────
print("📥  Reading source file …")
# dtype=str: every column is read as text, so no numeric type inference is
# applied to any column.
# NOTE(review): read_csv's default NA parsing still turns literal cell values
# such as "NA" or "null" into missing values, which are then written out as
# empty fields — confirm this is acceptable for body/author.
df = pd.read_csv(SRC, dtype=str)
df.columns = df.columns.str.lower()      # normalise capitalisation (Body → body)

# Stop before writing anything if the export lacks an expected column.
missing = KEEP - set(df.columns)
if missing:
    raise KeyError(f"Column(s) not found in file: {missing}")

print("✂️  Selecting relevant columns …")
df_subset = df[KEEP_ORDERED]             # list → deterministic column order

print("💾  Writing trimmed CSV …")
df_subset.to_csv(DEST, index=False)      # index=False: no row-number column
print(f"✅  Done! Saved to → {DEST}")


📥  Reading source file …
✂️  Selecting relevant columns …
💾  Writing trimmed CSV …
✅  Done! Saved to → /Users/jacksonsorenson/Desktop/r_dailydabbers_comments_subset.csv


In [2]:
#!/usr/bin/env python3
"""
Create r_dabs_posts_subset.csv with columns in the order:
author, created_utc, distinguished, id, media, num_comments,
score, title, selftext, subreddit
"""

from pathlib import Path
import pandas as pd

# ── 1. OUTPUT SCHEMA ─────────────────────────────────────────────
# Column names (lower-case) in the exact order they are written to the
# subset file.
KEEP_ORDERED = [
    "author", "created_utc", "distinguished", "id",
    "media",  # Pushshift calls it “media”
    "num_comments", "score", "title", "selftext", "subreddit",
]

# ── 2. INPUT / OUTPUT PATHS ──────────────────────────────────────
DEST = Path.home() / "Desktop" / "r_dabs_posts_subset.csv"
SRC = Path(
    "/Users/jacksonsorenson/Documents/Computational Media Lab/Weed Study/"
    "CSV for mass database organization/R:Dabs/r_dabs_posts.csv"
)

# ── 3. READ THE EXPORT ───────────────────────────────────────────
print("📥  Reading source file …")
# Every column is loaded as text; header names are lower-cased so the
# lookup below is insensitive to capitalisation in the export.
df = pd.read_csv(SRC, dtype=str).rename(columns=str.lower)

# ── 4. VALIDATE, TRIM, WRITE ─────────────────────────────────────
# Abort before writing if any requested column is absent from the header.
missing = set(KEEP_ORDERED).difference(df.columns)
if missing:
    raise KeyError(f"Column(s) not found in file: {missing}")

print("✂️  Selecting columns in the specified order …")
df_subset = df.loc[:, KEEP_ORDERED]

print("💾  Writing trimmed CSV …")
df_subset.to_csv(DEST, index=False)  # the row index is not written
print(f"✅  Done! Saved to → {DEST}")


📥  Reading source file …
✂️  Selecting columns in the specified order …
💾  Writing trimmed CSV …
✅  Done! Saved to → /Users/jacksonsorenson/Desktop/r_dabs_posts_subset.csv


In [4]:
#!/usr/bin/env python3
"""
Create r_dabs_comments_subset.csv with columns in the order:
author, body, created_utc, id, parent_id, score,
distinguished, subreddit
"""

from pathlib import Path
import pandas as pd

# ── 1. OUTPUT SCHEMA ─────────────────────────────────────────────
# Column names (lower-case) in the exact order they are written to the
# subset file.
KEEP_ORDERED = [
    "author",         # commenter username
    "body",           # text of the comment
    "created_utc",    # UTC timestamp
    "id",             # unique comment ID
    "parent_id",      # ID of parent (post or comment)
    "score",          # up-vote score
    "distinguished",  # moderator/admin flag
    "subreddit",      # should be "Dabs"
]

# ── 2. INPUT / OUTPUT PATHS ──────────────────────────────────────
DEST = Path.home() / "Desktop" / "r_dabs_comments_subset.csv"
SRC = Path(
    "/Users/jacksonsorenson/Documents/Computational Media Lab/Weed Study/"
    "CSV for mass database organization/R:Dabs/rdabs_comments.csv"
)

# ── 3. READ THE EXPORT ───────────────────────────────────────────
print("📥  Reading source file …")
# Every column is loaded as text; header names are lower-cased so the
# lookup below is insensitive to capitalisation in the export.
df = pd.read_csv(SRC, dtype=str).rename(columns=str.lower)

# ── 4. VALIDATE, TRIM, WRITE ─────────────────────────────────────
# Abort before writing if any requested column is absent from the header.
missing = set(KEEP_ORDERED).difference(df.columns)
if missing:
    raise KeyError(f"Column(s) not found in file: {missing}")

print("✂️  Selecting columns in the specified order …")
df_subset = df.loc[:, KEEP_ORDERED]

print("💾  Writing trimmed CSV …")
df_subset.to_csv(DEST, index=False)  # the row index is not written
print(f"✅  Done! Saved to → {DEST}")


📥  Reading source file …
✂️  Selecting columns in the specified order …
💾  Writing trimmed CSV …
✅  Done! Saved to → /Users/jacksonsorenson/Desktop/r_dabs_comments_subset.csv


In [6]:
#!/usr/bin/env python3
"""
Build overlap_users_network.csv where the first column is `author`.
"""

from pathlib import Path
import pandas as pd

# ── 1. FILE PATHS ────────────────────────────────────────────────
DABS_NET = Path(
    "/Users/jacksonsorenson/Documents/Computational Media Lab/Weed Study/"
    "CSV for mass database organization/R:Dabs/Combined Dabs W Network/"
    "r_dabs_network.csv"
)
DAILY_NET = Path(
    "/Users/jacksonsorenson/Documents/Computational Media Lab/Weed Study/"
    "CSV for mass database organization/R:DailyDabbers/Combined Daily Dabbers W Network/"
    "r_dailydabbers_network.csv"
)
OUT = Path.home() / "Desktop" / "overlap_users_network.csv"

# ── 2. LOAD ──────────────────────────────────────────────────────
# Both files are read entirely as text; headers are lower-cased so the
# column lookups below do not depend on capitalisation.
dabs   = pd.read_csv(DABS_NET,  dtype=str)
daily  = pd.read_csv(DAILY_NET, dtype=str)
dabs.columns  = dabs.columns.str.lower()
daily.columns = daily.columns.str.lower()

# Both frames are indexed by "author" below; report a missing column by file
# name instead of surfacing a bare pandas KeyError.
for _path, _frame in ((DABS_NET, dabs), (DAILY_NET, daily)):
    if "author" not in _frame.columns:
        raise KeyError(f"Column 'author' not found in file: {_path.name}")

# ── 3.  OVERLAP USERS ────────────────────────────────────────────
# Authors (missing values excluded) that occur in both files.
# NOTE(review): placeholder or bot account names, if present in both files,
# would be counted as overlapping users — confirm whether they should be
# filtered out first.
overlap = set(dabs["author"].dropna()) & set(daily["author"].dropna())
dabs  = dabs[dabs["author"].isin(overlap)].copy()
daily = daily[daily["author"].isin(overlap)].copy()
# Tag every row with the file it came from before the two are stacked.
dabs["source_sub"]  = "Dabs"
daily["source_sub"] = "DailyDabbers"

# ── 4.  COMBINE & SORT ──────────────────────────────────────────
df = pd.concat([dabs, daily], ignore_index=True)
if "created_utc" not in df.columns:
    raise KeyError("Column 'created_utc' not found in either network file")
# created_utc was read as text; sort on a numeric copy so ordering is
# numeric rather than lexicographic. Values that cannot be parsed become NaN.
# NOTE(review): this helper column is also written to the output file —
# confirm that downstream consumers expect it.
df["created_utc_int"] = pd.to_numeric(df["created_utc"], errors="coerce")
df = df.sort_values(["author", "created_utc_int"])

# ── 5.  REORDER SO AUTHOR IS FIRST ──────────────────────────────
cols = ["author"] + [c for c in df.columns if c != "author"]
df = df[cols]

# ── 6.  SAVE ────────────────────────────────────────────────────
df.to_csv(OUT, index=False)      # index=False: no row-number column
print(f"✅  overlap saved → {OUT}")


✅  overlap saved → /Users/jacksonsorenson/Desktop/overlap_users_network.csv
