In [24]:
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta

def generate_collection_from_ar(fact_ar: pd.DataFrame, df_customers: pd.DataFrame, df_bu: pd.DataFrame, output_dir="data_collection"):
    """
    Build a synthetic collections/dunning dataset on top of existing tables.

    Copies of df_customers and df_bu are enriched with randomly generated
    collection-related columns, a small agent table is created, and a dunning
    table is derived from fact_ar. All four tables are written as CSV files
    into output_dir and also returned.

    Input:
        fact_ar — invoice-level DataFrame (not modified; a copy is used).
                  Columns read by this function:
                  ['invoice_number','invoice_date','due_date','paid_date','amount','status'].
                  Any other columns are carried through to the dunning table unchanged.
        df_customers — existing customer dimension (not modified; a copy is enriched).
                  Needs 'customer_id' when 'CUSTOMERNUMBER' is absent.
        df_bu — existing business unit dimension (not modified; a copy is enriched).
                  Needs 'bu_id' when 'Company Code' is absent.
        output_dir — directory the CSV files are written to; created if missing.
    Output:
        dict with keys 'customers', 'business_units', 'agent', 'dunning',
        each holding the corresponding DataFrame.
    Raises:
        KeyError — if fact_ar lacks one of the columns listed above.
    Side effects:
        Reseeds NumPy's global random generator with 42, prints progress
        messages, and writes four CSV files under output_dir.
    """

    # Fail fast, before touching the filesystem, if the fact table cannot be processed.
    required_cols = ["invoice_number", "invoice_date", "due_date", "paid_date", "amount", "status"]
    missing_cols = [col for col in required_cols if col not in fact_ar.columns]
    if missing_cols:
        raise KeyError(f"fact_ar is missing required column(s): {missing_cols}")

    os.makedirs(output_dir, exist_ok=True)
    # Fixed seed: the random draws below are reproducible as long as their
    # order is not changed, so keep the statement order when editing.
    np.random.seed(42)
    print("Building collection dataset from fact_GL...")

    # ==========================================================
    # 1️⃣ ENRICH EXISTING CUSTOMERS
    # ==========================================================
    n_cust = len(df_customers)
    df_customers = df_customers.copy()  # never mutate the caller's frame

    if "CUSTOMERNUMBER" not in df_customers.columns:
        df_customers["CUSTOMERNUMBER"] = df_customers["customer_id"]

    # Add new collection attributes
    df_customers["SFBusinessArea"] = np.random.choice(
        ["Feedstock", "Gas", "Certificates", "Retail"], n_cust
    )
    df_customers["CCARCUSTOMERNUMBER"] = ["CCAR-" + str(i).zfill(6) for i in range(n_cust)]
    df_customers["Customer Email"] = [f"{str(cid).lower()}@example.com" for cid in df_customers["CUSTOMERNUMBER"]]
    df_customers["creditblock"] = np.random.choice(["Yes", "No"], n_cust, p=[0.05, 0.95])
    df_customers["CREDITLIMITLOCALCURRENCY"] = np.random.uniform(20000, 100000, n_cust).round(2)
    df_customers["DunsNumber"] = np.random.randint(100000000, 999999999, n_cust)
    df_customers["DNB_CL"] = np.random.choice(["A", "B", "C"], n_cust, p=[0.6, 0.3, 0.1])
    df_customers["PAYMENTTERM"] = np.random.choice(["Net 30", "Net 45", "Net 60"], n_cust)
    df_customers["FACTORINGCUSTOMER"] = np.random.choice(["Yes", "No"], n_cust, p=[0.2, 0.8])
    df_customers["CustomerSegment"] = np.random.choice(["SME", "Corporate", "Enterprise"], n_cust)
    df_customers["RiskCategoryText"] = np.random.choice(["Low", "Medium", "High"], n_cust, p=[0.6, 0.3, 0.1])
    df_customers["CUSTOMERCREATIONDATE"] = pd.Timestamp("2020-01-01") + pd.to_timedelta(
        np.random.randint(0, 1200, n_cust), unit="D"
    )
    df_customers["RTEXT"] = np.random.choice(["OK", "Review ongoing", "Escalated"], n_cust)
    # One scalar draw per customer, kept as-is so the seeded sequence is unchanged.
    df_customers["CLERKPHONE"] = ["+45" + str(np.random.randint(20000000, 99999999)) for _ in range(n_cust)]

    # ==========================================================
    # 2️⃣ ENRICH EXISTING BU TABLE
    # ==========================================================
    n_bu = len(df_bu)
    df_bu = df_bu.copy()  # never mutate the caller's frame

    if "Company Code" not in df_bu.columns:
        df_bu["Company Code"] = df_bu["bu_id"]

    df_bu["SALESORGANIZATION"] = ["SO-" + str(i).zfill(3) for i in range(1, n_bu + 1)]
    df_bu["CreditcontrolArea"] = np.random.choice(["CCA01", "CCA02", "CCA03"], n_bu)
    df_bu["SALESORGANIZATION_DESC"] = ["Sales Org " + str(i) for i in range(1, n_bu + 1)]

    # ==========================================================
    # 3️⃣ FACT_AGENT
    # ==========================================================
    agents = ["Anna Møller", "Jacob Kristensen", "Laura Holm", "Søren Olesen", "Peter Jensen"]
    df_agent = pd.DataFrame({
        "AGENT_ID": range(1, len(agents) + 1),
        "AGENT": agents,
        "AGENT_EMAIL": [a.lower().replace(" ", ".") + "@company.com" for a in agents]
    })

    # ==========================================================
    # 4️⃣ FACT_DUNNING
    # ==========================================================
    # Use the fact table that was passed in. The previous version read an
    # undefined name here and only ran when a global with that name happened
    # to exist in the kernel.
    df_dunning = fact_ar.copy()
    n_rows = len(df_dunning)

    # Unparseable dates become NaT instead of raising.
    df_dunning["invoice_date"] = pd.to_datetime(df_dunning["invoice_date"], errors="coerce")
    df_dunning["due_date"] = pd.to_datetime(df_dunning["due_date"], errors="coerce")
    df_dunning["paid_date"] = pd.to_datetime(df_dunning["paid_date"], errors="coerce")

    df_dunning["DUNNINGLEVEL"] = np.random.choice([0, 1, 2, 3, 4], n_rows, p=[0.5, 0.25, 0.15, 0.08, 0.02])
    df_dunning["BILLINGDOCUMENT"] = df_dunning["invoice_number"].str.replace("INV:", "BILL:", regex=False)
    df_dunning["REFERENCEDOCUMENTNUMBER"] = df_dunning["invoice_number"]
    df_dunning["ACCOUNTINGDOCUMENTNUMBER"] = [f"ACC-{i:07d}" for i in range(n_rows)]
    df_dunning["ACCOUNTINGDOCUMENTDATE"] = df_dunning["invoice_date"]
    df_dunning["FISCALYEAR"] = df_dunning["invoice_date"].dt.year
    df_dunning["ITEM"] = np.random.randint(10, 99, n_rows)
    df_dunning["ITEMTEXT"] = np.random.choice(
        ["", "Correction", "Reversal", "Write-off"], n_rows, p=[0.7, 0.1, 0.1, 0.1]
    )
    df_dunning["NETDUEDATE"] = df_dunning["due_date"]
    # Run date is the wall-clock date of this call, so it differs between runs.
    df_dunning["RUNDATE"] = datetime.now().date()
    df_dunning["AMOUNTLOCALCURRENCY"] = df_dunning["amount"]
    # Fixed divisor of 7.45 applied to every row, paired with CURRENCY = "DKK" below.
    df_dunning["AMOUNTEUR"] = (df_dunning["amount"] / 7.45).round(2)
    df_dunning["CASHDISCOUNTDISCOUNT"] = np.random.uniform(0, 0.05, n_rows).round(3)
    df_dunning["CURRENCY"] = "DKK"
    df_dunning["DOCUMENTTYPE"] = np.random.choice(["DR", "KR", "DZ"], n_rows)
    df_dunning["CLEARINGDATE"] = df_dunning["paid_date"]
    df_dunning["AGENT_ID"] = np.random.choice(df_agent["AGENT_ID"], n_rows)
    df_dunning["LASTDUNNEDON"] = df_dunning["due_date"] + pd.to_timedelta(np.random.randint(5, 60, n_rows), unit="D")
    df_dunning["Status"] = df_dunning["status"]
    df_dunning["SFCaseNumber"] = ["SF-" + str(i).zfill(6) for i in range(n_rows)]
    df_dunning["ReasonCode"] = np.random.choice(["RC01", "RC02", "RC03", "RC04"], n_rows)
    df_dunning["Segment"] = np.random.choice(["Feedstock", "Energy", "Certificates", "Retail"], n_rows)
    df_dunning["RC Description"] = np.random.choice(["Customer delay", "Technical issue", "Invoice dispute"], n_rows)
    df_dunning["Explanation"] = np.random.choice(["Awaiting payment", "In dispute", "Follow-up needed"], n_rows)
    df_dunning["NextActionDate"] = df_dunning["due_date"] + pd.to_timedelta(np.random.randint(10, 40, n_rows), unit="D")
    # Days between due date and payment; rows with a missing paid/due date get 0
    # and therefore end up with CollectionInitiated == "No".
    df_dunning["ActionDaysOverdue"] = (df_dunning["paid_date"] - df_dunning["due_date"]).dt.days.fillna(0)
    df_dunning["CollectionInitiated"] = np.where(df_dunning["ActionDaysOverdue"] > 0, "Yes", "No")
    df_dunning["NextAction"] = np.random.choice(["Send Reminder", "Call Customer", "Escalate"], n_rows)
    df_dunning["ActionPending"] = np.random.choice(["Yes", "No"], n_rows, p=[0.3, 0.7])
    # Drawn independently of AGENT_ID, so the owner name need not match the assigned agent.
    df_dunning["ActionOwner"] = np.random.choice(df_agent["AGENT"], n_rows)

    # ==========================================================
    # 5️⃣ SAVE
    # ==========================================================
    df_agent.to_csv(os.path.join(output_dir, "dim_agent.csv"), index=False)
    df_dunning.to_csv(os.path.join(output_dir, "fact_dunning.csv"), index=False)
    df_customers.to_csv(os.path.join(output_dir, "dim_customers.csv"), index=False)
    df_bu.to_csv(os.path.join(output_dir, "dim_business_units.csv"), index=False)
    print(f"✅ Collection dataset saved under {output_dir}/")

    return {
        "customers": df_customers,
        "business_units": df_bu,
        "agent": df_agent,
        "dunning": df_dunning
    }


In [25]:
# Read the three input CSVs in one pass: the AR fact table first, then the
# business unit and customer dimensions.
fact_ar, df_bus, df_cust = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../data/outputdata_v1/fact/fact_ar.csv",
        "../../data/outputdata_v1/dimensions/business_unit.csv",
        "../../data/outputdata_v1/dimensions/customer.csv",
    )
)

In [26]:
# Build the collection tables; keyword arguments make the table-to-parameter
# mapping explicit.
data_dunning = generate_collection_from_ar(
    fact_ar=fact_ar,
    df_customers=df_cust,
    df_bu=df_bus,
)

Building collection dataset from fact_GL...
✅ Collection dataset saved under data_collection/


In [27]:
# Preview one table from the returned dict. The commented-out lines are
# alternative previews; the dict also has a "dunning" key.
#data_dunning["customers"]
#data_dunning["business_units"]
data_dunning["agent"]

Unnamed: 0,AGENT_ID,AGENT,AGENT_EMAIL
0,1,Anna Møller,anna.møller@company.com
1,2,Jacob Kristensen,jacob.kristensen@company.com
2,3,Laura Holm,laura.holm@company.com
3,4,Søren Olesen,søren.olesen@company.com
4,5,Peter Jensen,peter.jensen@company.com
