In [None]:
import pandas as pd
from pathlib import Path


# All locations are resolved relative to the parent of the current working
# directory (the notebook is expected to run from a sub-folder of the project).
PROJECT_ROOT = Path.cwd().parent
DATA_DIR = PROJECT_ROOT.joinpath("data")
OUTPUT_DIR = PROJECT_ROOT.joinpath("preprocessed_data")

# Make sure the destination for cleaned files exists before anything is written.
OUTPUT_DIR.mkdir(exist_ok=True)

# Dataset key -> CSV location inside DATA_DIR.
FILE_PATHS = {
    key: DATA_DIR / filename
    for key, filename in [
        ("ks_projects_2016_12", "ks-projects-201612.csv"),
        ("ks_projects_2018_01", "ks-projects-201801.csv"),
        ("live_campaigns", "live.csv"),
        ("most_backed", "most_backed.csv"),
    ]
}

# Report, per dataset, whether its input file is present on disk.
for name, path in FILE_PATHS.items():
    if path.exists():
        status = "FOUND"
    else:
        status = "MISSING"
    print(f"{name:<20} → {status}")


In [None]:
def preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``df``; the caller's frame is never modified.

    Steps, in order:

    1. Normalize column labels: strip surrounding whitespace, lower-case,
       replace spaces with underscores.
    2. If an ``id`` column exists, keep only the first row for each id.
    3. If both ``deadline`` and ``launched_at`` exist, parse them as epoch
       seconds (unparseable values become ``NaT``) and add
       ``campaign_duration_days``, the days component of
       ``deadline - launched_at``.
    4. Coerce ``goal``, ``pledged``, ``usd_goal_real`` and
       ``usd_pledged_real`` (where present) to numeric; unparseable values
       become ``NaN``.
    5. Drop every column in which more than 90% of the rows are missing.
    6. Fill the remaining gaps: median for numeric columns, most frequent
       value for all other columns.

    Parameters
    ----------
    df : pd.DataFrame
        Raw frame whose column labels are strings.

    Returns
    -------
    pd.DataFrame
        A new frame with normalized column names. The row index of the
        surviving rows is preserved (it is not reset after de-duplication).
    """
    # 1) normalize column names.
    normalized_columns = (
        df.columns
          .str.strip()
          .str.lower()
          .str.replace(" ", "_", regex=False)
    )
    # set_axis returns a new frame, so the caller's labels stay as they were;
    # every later step also derives a new frame instead of assigning in place.
    out = df.set_axis(normalized_columns, axis=1)

    # 2) drop exact duplicates on id (first occurrence wins).
    if "id" in out.columns:
        out = out.drop_duplicates(subset="id")

    # 3) + 4) collect converted columns, then attach them in one `assign` call.
    # `assign` works on a copy, which avoids SettingWithCopyWarning on the
    # filtered frame produced by drop_duplicates.
    converted = {}
    if {"deadline", "launched_at"}.issubset(out.columns):
        deadline = pd.to_datetime(out["deadline"], unit="s", errors="coerce")
        launched = pd.to_datetime(out["launched_at"], unit="s", errors="coerce")
        converted["deadline"] = deadline
        converted["launched_at"] = launched
        converted["campaign_duration_days"] = (deadline - launched).dt.days
    for col in ("goal", "pledged", "usd_goal_real", "usd_pledged_real"):
        if col in out.columns:
            converted[col] = pd.to_numeric(out[col], errors="coerce")
    if converted:
        out = out.assign(**converted)

    # 5) drop columns with >90% missing, i.e. keep a column only when at least
    # 10% of its rows are populated. Integer arithmetic (count * 10 >= n_rows)
    # avoids the truncation of int(n_rows * 0.10), which let columns with more
    # than 90% missing slip through (e.g. 1 populated row out of 15).
    non_null_counts = out.notna().sum()
    keep_mask = (non_null_counts * 10 >= len(out)).to_numpy()
    out = out.loc[:, keep_mask]

    # 6) impute remaining missings. Only columns that actually contain gaps
    # are inspected, so no median/mode is computed needlessly.
    fill_values = {}
    has_gaps = out.isna().any().to_numpy()
    for col in out.columns[has_gaps]:
        series = out[col]
        if series.dtype.kind in "biufc":  # numeric
            fill_values[col] = series.median()
        else:                             # categorical/text/datetime
            mode = series.mode()
            fill_values[col] = mode.iloc[0] if not mode.empty else ""

    # One frame-level fillna replaces the per-column
    # `df[col].fillna(..., inplace=True)` calls, which operate on a temporary
    # Series and are not guaranteed to write back into the frame.
    if fill_values:
        out = out.fillna(value=fill_values)
    return out


In [None]:
# Clean every configured dataset and write the result to OUTPUT_DIR.
for name, path in FILE_PATHS.items():
    print(f"Processing {name}…")
    if not path.exists():
        # Skip missing inputs so the remaining datasets are still processed.
        print(f"  ERROR: file not found at {path}")
        continue

    # Try UTF-8 first: decoding a UTF-8 file as latin1 never fails but silently
    # turns every non-ASCII character into mojibake, which would then be
    # written to the cleaned CSV. Files that are not valid UTF-8 fall back to
    # latin1, which accepts any byte sequence.
    try:
        raw = pd.read_csv(path, encoding="utf-8", low_memory=False)
    except UnicodeDecodeError:
        raw = pd.read_csv(path, encoding="latin1", low_memory=False)
    print(f"  raw shape:   {raw.shape}")

    clean = preprocess_df(raw)
    print(f"  clean shape: {clean.shape}")
    display(clean.head())

    # One output file per dataset, named after its key in FILE_PATHS.
    out_path = OUTPUT_DIR / f"{name}_clean.csv"
    clean.to_csv(out_path, index=False)
    print(f"  saved to    {out_path}\n")
