# In [None]:  (notebook cell marker left over from export)
"""
Pipeline for Retail Customer Segmentation (RFM + Clustering).
"""

from pathlib import Path
import pandas as pd

from config import RAW_DATA_DIR, PROCESSED_DATA_DIR


def fetch_olist() -> Path:
    """
    Return the expected location of the Olist orders CSV.

    Despite the name, nothing is downloaded here: the Olist dataset has to
    be obtained from Kaggle manually and its core CSVs saved into the raw
    data directory (RAW_DATA_DIR). This function only builds the path to
    the orders file; it does not check that the file exists.
    """
    orders_filename = "olist_orders_dataset.csv"
    return RAW_DATA_DIR / orders_filename


def run_pipeline():
    """
    Run the retail segmentation pipeline (currently a scaffold).

    What runs today: the raw and processed data directories are created if
    they are missing, the orders CSV path is resolved, and a status message
    is printed. No data is read or written yet.

    Planned steps (not yet implemented):

    1. Load order, customer, item and payment tables from Olist.
    2. Build a transaction-level dataframe (customer_id, order_date, revenue).
    3. Compute RFM metrics per customer:
        - Recency
        - Frequency
        - Monetary value
    4. Standardize RFM and fit K-Means clusters.
    5. Optionally reduce dimensionality with PCA/UMAP for visualization.
    6. Join in review sentiment (NLP) for psychographic flavor.
    7. Export segment-level summaries and customer-level labels for Tableau.
    """
    # Make sure both data directories exist before anything reads or writes.
    for data_dir in (RAW_DATA_DIR, PROCESSED_DATA_DIR):
        data_dir.mkdir(parents=True, exist_ok=True)

    # Not consumed yet; the planned read_csv call in the outline below uses it.
    orders_path = fetch_olist()

    # TODO: implement:
    # orders = pd.read_csv(orders_path)
    # items = pd.read_csv(RAW_DATA_DIR / "olist_order_items_dataset.csv")
    # payments = pd.read_csv(RAW_DATA_DIR / "olist_order_payments_dataset.csv")
    # customers = pd.read_csv(RAW_DATA_DIR / "olist_customers_dataset.csv")
    #
    # tx = build_transactions(orders, items, payments, customers)
    # rfm = compute_rfm(tx)
    # rfm_clusters = cluster_customers(rfm)
    #
    # rfm_clusters.to_csv(PROCESSED_DATA_DIR / "customer_segments.csv", index=False)

    status_message = "Retail segmentation pipeline outline executed (add implementation)."
    print(status_message)


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run_pipeline()