In [1]:
from pathlib import Path
import pandas as pd

# Directory the raw CSV exports are read from (resolved against the current working directory).
DATA_DIR = Path(".")

# Directory the cleaned CSVs are written to; created if it does not exist yet.
OUT_DIR = Path("data_clean")
OUT_DIR.mkdir(exist_ok=True)

# Raw input files, one constant per table.
ORDERS_FILE = DATA_DIR.joinpath("fact_orders_2023_2025.csv")
CUSTOMERS_FILE = DATA_DIR.joinpath("dim_customers_2023_2025.csv")
SPEND_FILE = DATA_DIR.joinpath("fact_marketing_spend_daily_2023_2025.csv")


In [2]:
# Currency parsing: convert money strings (e.g. "€ 1.234,56") into numeric values
import numpy as np
import pandas as pd

def parse_money(series):
    """Convert a Series of money-formatted values into numbers.

    The euro sign and spaces (regular, no-break and narrow no-break) are
    stripped first. The role of "," and "." is then decided per value:

    * exactly one comma, located after every dot -> the comma is the decimal
      separator and dots are thousands separators ("1.234,56" -> 1234.56,
      "12,5" -> 12.5);
    * exactly one dot, located after one or more commas -> the dot is the
      decimal separator and commas are thousands separators
      ("1,234.56" -> 1234.56);
    * several commas and no dot, or several dots and no comma -> the repeated
      separator is a thousands separator ("1,234,567" -> 1234567).

    A value with a single separator is inherently ambiguous; a lone comma is
    read as a decimal comma and a lone dot as a decimal point, so "1,234" and
    "1.234" both parse to 1.234.

    Parameters
    ----------
    series : pandas.Series
        Values to parse; they are converted to ``str`` before cleaning.

    Returns
    -------
    pandas.Series
        Numeric Series with the same index. Values that still cannot be
        parsed after normalisation become NaN (``errors="coerce"``).
    """
    text = series.astype(str)

    # Drop the currency sign and every space variant used as padding or grouping.
    for token in ("€", "\u00a0", "\u202f", " "):
        text = text.str.replace(token, "", regex=False)

    n_comma = text.str.count(",")
    n_dot = text.str.count(r"\.")
    last_comma = text.str.rfind(",")   # -1 when the value has no comma
    last_dot = text.str.rfind(".")     # -1 when the value has no dot

    def _as_mask(condition):
        # Missing entries must never select a row.
        return condition.fillna(False).astype(bool)

    comma_is_decimal = _as_mask((n_comma == 1) & (last_comma > last_dot))
    comma_is_grouping = _as_mask(
        ((n_dot == 1) & (n_comma >= 1) & (last_dot > last_comma))
        | ((n_dot == 0) & (n_comma > 1))
    )
    dot_is_grouping = _as_mask((n_comma == 0) & (n_dot > 1))

    without_dots = text.str.replace(".", "", regex=False)
    without_commas = text.str.replace(",", "", regex=False)

    # The three masks are mutually exclusive; values matching none of them are
    # passed through unchanged and left for pd.to_numeric to accept or coerce.
    normalised = text.where(
        ~comma_is_decimal, without_dots.str.replace(",", ".", regex=False)
    )
    normalised = normalised.where(~comma_is_grouping, without_commas)
    normalised = normalised.where(~dot_is_grouping, without_dots)

    return pd.to_numeric(normalised, errors="coerce")



In [3]:
# --- Load the raw tables -----------------------------------------------------
orders = pd.read_csv(ORDERS_FILE)
cust   = pd.read_csv(CUSTOMERS_FILE)
spend  = pd.read_csv(SPEND_FILE)

# --- Parse date columns ------------------------------------------------------
# pd.to_datetime raises on values it cannot interpret, so bad dates fail loudly.
orders["order_date"] = pd.to_datetime(orders["order_date"])
cust["first_order_date"] = pd.to_datetime(cust["first_order_date"])
cust["first_order_month"] = pd.to_datetime(cust["first_order_month"])
spend["date"] = pd.to_datetime(spend["date"])

# --- Parse money columns -----------------------------------------------------
# parse_money coerces unparseable values to NaN. Report any value that was
# present in the raw data but is NaN after parsing, so data loss is visible
# before the cleaned files are written (the run itself is not interrupted).
for label, frame, column in (
    ("orders", orders, "order_net_revenue"),
    ("spend", spend, "spend"),
):
    raw_values = frame[column]
    parsed = parse_money(raw_values)
    coerced = parsed.isna() & raw_values.notna()
    if coerced.any():
        print(
            f"⚠️ {label}.{column}: {int(coerced.sum())} value(s) could not be parsed "
            f"and became NaN, e.g. {raw_values[coerced].head(5).tolist()}"
        )
    frame[column] = parsed

# --- Write cleaned tables ----------------------------------------------------
# Output names are derived from the input constants so the two cannot drift apart.
for frame, source_file in (
    (orders, ORDERS_FILE),
    (cust, CUSTOMERS_FILE),
    (spend, SPEND_FILE),
):
    frame.to_csv(OUT_DIR / source_file.name, index=False)

print(f"✅ Written to: {OUT_DIR.resolve()}")


✅ Written to: /Users/konradkuleta/Cohort Analysis/data_clean
