In [None]:
import re
from pathlib import Path

import numpy as np
import pandas as pd

# Read the CSV at csv_path, replace missing "body" values with an empty
# string, and store each body's character count in "char_len".
csv_path = "results/reddit_prep.csv"
df = pd.read_csv(csv_path).assign(
    body=lambda frame: frame["body"].fillna(""),
    # Evaluated after "body" above, so it measures the filled text.
    char_len=lambda frame: frame["body"].str.len(),
)

In [7]:
# Summary statistics: number of rows, character-length mean (SD), and unique authors

def basic_stats(sub):
    """Summarise a frame of discussions as a three-entry Series.

    Parameters
    ----------
    sub : pandas.DataFrame
        Must provide a numeric ``char_len`` column and an ``author`` column.

    Returns
    -------
    pandas.Series
        Row count, the ``char_len`` mean and standard deviation formatted as
        ``"mean (SD)"`` with one decimal each, and the number of distinct
        ``author`` values.
    """
    lengths = sub["char_len"]
    # Series.std() uses the pandas default (sample standard deviation).
    mean_and_sd = "{:.1f} ({:.1f})".format(lengths.mean(), lengths.std())

    stats = {}
    stats["No. of discussions scraped"] = len(sub)
    stats["No. of characters, mean (SD)"] = mean_and_sd
    stats["Unique authors"] = sub["author"].nunique()
    return pd.Series(stats)

# One column of basic_stats per slice of df: everything, then rows whose
# "type" is "comment", then rows whose "type" is "submission".
summary = pd.concat(
    {
        label: basic_stats(df if kind is None else df[df["type"] == kind])
        for label, kind in (
            ("All discussions", None),
            ("Comments", "comment"),
            ("Posts", "submission"),
        )
    },
    axis=1,
)



In [17]:
# keyword descriptive stats

# Search terms, one line per active ingredient followed by its brand names.
# "victoza" was previously misspelled "victorza", so the whole-word search
# could not match the correctly spelled brand name.
keywords = [
    "liraglutide", "victoza", "xultophy", "saxenda",
    "semaglutide", "ozempic", "wegovy",
    "tirzepatide", "mounjaro", "zepbound"
]

def keyword_table(sub, total_label, search_terms=None):
    """Count the rows of ``sub`` whose ``body`` mentions each search term.

    Matching is case-insensitive and whole-word (``\\b`` on both sides of the
    escaped term). A row is counted at most once per term, however many times
    the term occurs in it.

    Parameters
    ----------
    sub : pandas.DataFrame
        Frame with a ``body`` text column.
    total_label : str
        Prefix for the two output columns, ``"<total_label> #"`` and
        ``"<total_label> %"``.
    search_terms : iterable of str, optional
        Terms to search for. Defaults to the module-level ``keywords`` list.

    Returns
    -------
    pandas.DataFrame
        One row per term with columns ``"Search word"``, the match count, and
        the count as a percentage of ``len(sub)`` rounded to one decimal
        (``0.0`` when ``sub`` is empty).
    """
    terms = keywords if search_terms is None else search_terms
    total_n = len(sub)
    count_col = f"{total_label} #"
    pct_col = f"{total_label} %"

    records = []
    for kw in terms:
        pattern = fr"\b{re.escape(kw)}\b"
        # na=False: a missing body is a non-match rather than a NaN in the mask.
        mask = sub["body"].str.contains(pattern, case=False, regex=True, na=False)
        n = int(mask.sum())
        # Guard the division so an empty slice reports 0.0 instead of raising.
        pct = n / total_n * 100 if total_n else 0.0
        records.append({"Search word": kw, count_col: n, pct_col: round(pct, 1)})

    # Fixing the columns keeps the schema stable even when `terms` is empty,
    # so a later merge on "Search word" still works.
    return pd.DataFrame(records, columns=["Search word", count_col, pct_col])

# Keyword counts for every row, for rows whose "type" is "comment", and for
# rows whose "type" is "submission".
kw_all      = keyword_table(df, "All discussions")
kw_comments = keyword_table(df[df["type"] == "comment"], "Comments")
# Label is capitalised ("Posts", previously "posts") so the column headers
# follow the same convention as "All discussions" and "Comments".
kw_posts    = keyword_table(df[df["type"] == "submission"], "Posts")

# Merge the three tables side by side, aligned on the search term.
kw_df = kw_all.merge(kw_comments, on="Search word").merge(kw_posts, on="Search word")



In [18]:
# Community (subreddit) descriptive stats
def community_stats(sub):
    """Rank subreddits by how many rows of ``sub`` they contribute.

    Parameters
    ----------
    sub : pandas.DataFrame
        Frame with a ``subreddit`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``subreddit``, ``Count`` and ``Percent``, ordered by ``Count``
        descending. ``Percent`` is ``Count`` as a share of ``len(sub)``,
        rounded to one decimal.
    """
    n_rows = len(sub)
    # NOTE: groupby's default drops rows with a missing subreddit, while
    # n_rows still counts them, so Percent need not sum to 100.
    rows_per_subreddit = sub.groupby("subreddit").size()
    ranked = rows_per_subreddit.sort_values(ascending=False).reset_index(name="Count")
    ranked["Percent"] = (ranked["Count"] / n_rows * 100).round(1)
    return ranked

# Subreddit rankings for every row, then separately for rows whose "type" is
# "comment" and rows whose "type" is "submission".
comm_all = community_stats(df)
comm_comments, comm_posts = (
    community_stats(df[df["type"] == kind]) for kind in ("comment", "submission")
)


In [None]:
# Print each table under its heading for a quick visual check.
# sep="\n" puts the heading and the table on separate lines, exactly as two
# consecutive print calls would.
print("\n=== Table 1. Post and Comment Summary Statistics ===", summary, sep="\n")
print("\n=== Table 2. GLP-1 Agonist Keyword Occurrences ===", kw_df, sep="\n")
print("\n=== Table 3. Top 20 Subreddits (All discussions) ===", comm_all.head(20), sep="\n")

In [21]:
# Save the three tables as CSV files under tables/.
# Create the directory first: to_csv raises if the target folder is missing.
tables_dir = Path("tables")
tables_dir.mkdir(parents=True, exist_ok=True)

# summary keeps its index (the statistic names); the other two tables have a
# plain positional index, so it is omitted.
summary.to_csv(tables_dir / "table1_summary.csv")
kw_df.to_csv(tables_dir / "table2_keywords.csv", index=False)
comm_all.to_csv(tables_dir / "table3_communities_all.csv", index=False)