In [8]:
import pandas as pd
import hashlib
from supabase import create_client, Client
import os
from dotenv import load_dotenv
from itertools import islice



In [2]:
# Load variables from a .env file (if one is found) into the process
# environment, then build the Supabase client that the later cells use.
load_dotenv()

SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_KEY = os.getenv("SUPABASE_KEY")

# os.getenv returns None for an unset variable. Fail here, naming the
# variable(s), rather than handing None to create_client.
_missing_env = [
    name
    for name, value in (("SUPABASE_URL", SUPABASE_URL), ("SUPABASE_KEY", SUPABASE_KEY))
    if not value
]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env)
        + ". Set them in the environment or in a .env file."
    )

supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)


In [3]:
# Read the preprocessed CSV into the module-level frame `df`. Later cells read
# its "text", "subreddit", "year", "month" and "created_utc" columns.
df = pd.read_csv(filepath_or_buffer="preprocessed_redditData-removedtopic.csv")


In [12]:
print("Script started!", flush=True)


def hash_content(content: str) -> str:
    """Return the MD5 hex digest of ``content`` encoded as UTF-8.

    The digest serves as the deduplication key (``content_hash``) for post
    text in this notebook.

    Args:
        content: Text to fingerprint.

    Returns:
        The 32-character lowercase hexadecimal MD5 digest.
    """
    digest = hashlib.md5()
    digest.update(content.encode("utf-8"))
    return digest.hexdigest()


# --- dim_content: ensure every distinct post text has a row, then map text -> id ---

# Hash each non-null text. Null texts get a None hash and are removed by the
# dropna() below, so they never reach dim_content.
df["content_hash"] = df["text"].apply(lambda x: hash_content(str(x)) if pd.notna(x) else None)
unique_contents = df[["text", "content_hash"]].drop_duplicates().dropna()
print(f"Unique content rows: {len(unique_contents)}", flush=True)

chunk_size = 500   # rows per Supabase request (used for lookups and inserts)
existing_map = {}  # content_hash -> dim_content id

# Plain Python lists: cheaper to slice and iterate than building a Series per
# row with iterrows().
content_texts = unique_contents["text"].tolist()
content_hashes = unique_contents["content_hash"].tolist()

# Step 1: look up which hashes are already stored, one chunk per request.
for i in range(0, len(content_hashes), chunk_size):
    result = supabase.table("dim_content") \
        .select("id", "content_hash") \
        .in_("content_hash", content_hashes[i:i + chunk_size]) \
        .execute()
    if result.data:
        existing_map.update({row["content_hash"]: row["id"] for row in result.data})

print(f"Existing content hashes in DB: {len(existing_map)}", flush=True)


# Step 2: insert the texts whose hash was not found, recording the new ids.
missing_rows = unique_contents[~unique_contents["content_hash"].isin(existing_map.keys())]
print(f"Missing content rows to insert: {len(missing_rows)}", flush=True)

if not missing_rows.empty:
    to_insert = [
        {"content": text, "content_hash": content_hash}
        for text, content_hash in zip(missing_rows["text"], missing_rows["content_hash"])
    ]
    for i in range(0, len(to_insert), chunk_size):
        batch = to_insert[i:i + chunk_size]
        inserted = supabase.table("dim_content").insert(batch).execute()
        if inserted.data:
            for row in inserted.data:
                existing_map[row["content_hash"]] = row["id"]
        else:
            # Without returned rows the new ids are unknown; say so instead of
            # silently leaving these hashes unmapped.
            print(
                f"Warning: insert of {len(batch)} dim_content rows returned no data; "
                "their ids could not be recorded.",
                flush=True,
            )


# Step 3: text -> dim_content id, for every text whose hash now has an id.
content_id_map = {
    text: existing_map[content_hash]
    for text, content_hash in zip(content_texts, content_hashes)
    if content_hash in existing_map
}
print(f"Total content IDs mapped: {len(content_id_map)}", flush=True)

unmapped_count = sum(1 for content_hash in content_hashes if content_hash not in existing_map)
if unmapped_count:
    # The fact_post loop skips posts whose text is absent from content_id_map.
    print(f"Warning: {unmapped_count} content rows have no id and will be skipped.", flush=True)
# Memo of dimension ids already resolved in this session, keyed "<table>:<value>".
dim_cache = {}


def get_or_create_id_cached(table, column, value):
    """Return the ``id`` of the row in ``table`` where ``column`` equals ``value``.

    The result is served from ``dim_cache`` when available. Otherwise the row
    is looked up through the module-level ``supabase`` client and, if no row
    matches, inserted as ``{column: value}``. The resolved id is cached before
    it is returned.

    Note that the cache key is built from ``table`` and ``value`` only;
    ``column`` is not part of it.

    Args:
        table: Name of the dimension table.
        column: Column to match on (and to populate on insert).
        value: Value to look up or insert.

    Returns:
        The ``id`` field of the first matching row, or of the newly inserted row.
    """
    cache_key = f"{table}:{value}"
    if cache_key not in dim_cache:
        lookup = supabase.table(table).select("id").eq(column, value).execute()
        if lookup.data:
            resolved_id = lookup.data[0]["id"]
        else:
            created = supabase.table(table).insert({column: value}).execute()
            resolved_id = created.data[0]["id"]
        dim_cache[cache_key] = resolved_id
    return dim_cache[cache_key]

# --- fact_post: one row per post with usable text, referencing dimension ids ---
fact_post_rows = []
# Why rows were left out, so drops are visible in the cell output.
skipped_counts = {"blank_text": 0, "unmapped_content": 0, "error": 0}

for idx, row in df.iterrows():
    try:
        text = row["text"]
        if pd.isna(text) or not str(text).strip():
            skipped_counts["blank_text"] += 1
            continue
        # Resolve the content id before the other dimensions, so a row that is
        # skipped here does not trigger lookups or inserts for them.
        content_id = content_id_map.get(text)
        if content_id is None:  # explicit None test: a falsy id such as 0 is still valid
            skipped_counts["unmapped_content"] += 1
            continue
        subreddit_id = get_or_create_id_cached("dim_subreddit", "name", row["subreddit"])
        year_id = get_or_create_id_cached("dim_year", "year", int(row["year"]))
        month_id = get_or_create_id_cached("dim_month", "month", int(row["month"]))
        # NOTE(review): pd.to_datetime treats a bare number as nanoseconds since
        # the epoch. If created_utc holds epoch seconds this needs unit="s".
        # Confirm the column's format.
        day = pd.to_datetime(row["created_utc"]).day
        day_id = get_or_create_id_cached("dim_day", "day", day)
        fact_post_rows.append({
            "subreddit_id": subreddit_id,
            "content_id": content_id,
            "year_id": year_id,
            "month_id": month_id,
            "day_id": day_id
        })
    except Exception as e:
        # Best-effort load: report the offending row and continue with the rest.
        skipped_counts["error"] += 1
        print(f"Skipping row {idx} due to error: {e}", flush=True)

print(f"Total fact_post rows ready: {len(fact_post_rows)}", flush=True)
if any(skipped_counts.values()):
    print(f"Skipped rows: {skipped_counts}", flush=True)

if fact_post_rows:
    inserted_count = 0
    try:
        for i in range(0, len(fact_post_rows), chunk_size):
            batch = fact_post_rows[i:i + chunk_size]
            supabase.table("fact_post").insert(batch).execute()
            inserted_count += len(batch)
    except Exception:
        # Batches before the failure were already sent; report how far the
        # load got before re-raising.
        print(
            f"fact_post insert stopped after {inserted_count} of {len(fact_post_rows)} rows.",
            flush=True,
        )
        raise
    print(f"Inserted {len(fact_post_rows)} rows into fact_post.", flush=True)
else:
    print("No valid rows to insert.", flush=True)

Script started!
Unique content rows: 63805
Existing content hashes in DB: 0
Missing content rows to insert: 63805
Total content IDs mapped: 63805
Total fact_post rows ready: 63986
Inserted 63986 rows into fact_post.


In [14]:
# Export post_summary to CSV.
# A single select("*") is subject to the API's per-request row cap, so the
# table is fetched in pages until an empty page comes back.
# NOTE(review): no ordering is applied, so page boundaries rely on the server
# returning rows in a stable order; add .order(<key column>) if post_summary
# exposes one.
summary_page_size = 1000
summary_rows = []
summary_offset = 0
while True:
    response = supabase.table("post_summary") \
        .select("*") \
        .range(summary_offset, summary_offset + summary_page_size - 1) \
        .execute()
    page = response.data or []
    if not page:
        break
    summary_rows.extend(page)
    # Advance by what was actually returned, so a server cap smaller than the
    # requested page size cannot cause rows to be skipped.
    summary_offset += len(page)

df_summary = pd.DataFrame(summary_rows)
df_summary.to_csv("post_summary.csv", index=False)

print("post_summary saved to post_summary.csv")

post_summary saved to post_summary.csv
