In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

In [2]:
DATA_DIR = Path("..") / "data"


#### helpers

In [3]:
def read_csv_flex(path: Path) -> pd.DataFrame:
    try:
        df = pd.read_csv(path)
        if df.shape[1] == 1:
            df = pd.read_csv(path, sep=";")
    except Exception:
        df = pd.read_csv(path, sep=";")
    return df

def assert_cols(df: pd.DataFrame, required: set, name: str):
    missing = required - set(df.columns)
    assert not missing, f"{name}: missing columns {missing}"


### Data Loading

In [4]:
aob = read_csv_flex(DATA_DIR/"aob_config.csv")
payout = read_csv_flex(DATA_DIR/"payout_rules.csv")
planning = read_csv_flex(DATA_DIR/"planning_rules.csv")
ws_cfg = read_csv_flex(DATA_DIR/"ws_config.csv")
date_dim = read_csv_flex(DATA_DIR/"date_dim.csv")
org = read_csv_flex(DATA_DIR/"org_hierarchy.csv")
accounts = read_csv_flex(DATA_DIR/"accounts_dim.csv")
fact = read_csv_flex(DATA_DIR/"sales_monthly.csv")

In [5]:
# parse dates
if "date" in fact.columns:
    fact["date"] = pd.to_datetime(fact["date"], errors="coerce")
    date_dim["date"] = pd.to_datetime(date_dim["date"], errors="coerce")

fact.head(3), org.head(3), accounts.head(3), date_dim.head(3)

(        date  year  month quarter country_id area_id salesperson_id  \
 0 2024-01-01  2024      1      Q1         PL    PL_N     SP_BIA_001   
 1 2024-02-01  2024      2      Q1         PL    PL_N     SP_BIA_001   
 2 2024-03-01  2024      3      Q1         PL    PL_N     SP_BIA_001   
 
                account_id  tier  plan_revenue  actual_revenue  \
 0  ACC_T1_SP_BIA_001_0001     1   8941.951131     9210.898894   
 1  ACC_T1_SP_BIA_001_0001     1   8941.951131     8761.233837   
 2  ACC_T1_SP_BIA_001_0001     1  10219.372722    11954.465374   
 
    last_year_revenue  windfall_flag  shortfall_flag  sales_driven  \
 0        7440.401123              0               0             0   
 1        7440.401123              0               0             0   
 2        8503.315569              1               0             0   
 
    healthy_prev3    actual_adj  
 0              0   9210.898894  
 1              1   8761.233837  
 2              1  10219.372722  ,
   country_id country_man

### Schema checks

In [6]:
req_fact = {"date","year","month","quarter",
            "country_id","area_id","salesperson_id","account_id","tier",
            "plan_revenue","actual_revenue","last_year_revenue"}
req_org  = {"country_id","area_id","area_name","aob_band","salesperson_id","city","tenure_years"}
req_acc  = {"account_id","account_name","tier","city","salesperson_id","is_new"}

assert_cols(fact, req_fact, "fact")
assert_cols(org, req_org, "org")
assert_cols(accounts, req_acc, "accounts")

assert fact["date"].notna().all(), "fact: null dates"
assert len(date_dim)==12, "date_dim should be monthly (12 rows)"


### Integrity

In [7]:
issues = []

# referential integrity
missing_acc = fact.loc[~fact["account_id"].isin(accounts["account_id"]), "account_id"].unique()
if len(missing_acc): issues.append(("missing accounts in dim", len(missing_acc)))

missing_sp = fact.loc[~fact["salesperson_id"].isin(org["salesperson_id"]), "salesperson_id"].unique()
if len(missing_sp): issues.append(("missing salespeople in org", len(missing_sp)))

# area mismatch vs org mapping
sp2area = org.set_index("salesperson_id")["area_id"].to_dict()
mism = fact[fact.apply(lambda r: sp2area.get(r["salesperson_id"]) != r["area_id"], axis=1)]
if len(mism): issues.append(("area_id mismatch rows", len(mism)))

# 12 months per account
months_per_acc = fact.groupby("account_id")["date"].nunique()
not_12 = months_per_acc[months_per_acc!=12]
if len(not_12): issues.append(("accounts with !=12 months", len(not_12)))

# duplicates
dups = fact.duplicated(subset=["account_id","date"]).sum()
if dups: issues.append(("duplicate (account_id,date)", int(dups)))

issues


[]

### Values sanity

In [8]:
neg_counts = {c: int((fact[c] < 0).sum()) for c in ["plan_revenue","actual_revenue","last_year_revenue"] if c in fact}
neg_counts

# NEW account rule: if is_new==1 then LY==0 and Plan==0 for 2024
merged = fact.merge(accounts[["account_id","is_new"]], on="account_id", how="left")
violations_new = merged[(merged["is_new"]==1) & ((merged["last_year_revenue"]!=0) | (merged["plan_revenue"]!=0))]
violations_new[["account_id","date","plan_revenue","last_year_revenue"]].head()


Unnamed: 0,account_id,date,plan_revenue,last_year_revenue


### Distribution

In [9]:
summary = {
    "salespeople": org["salesperson_id"].nunique(),
    "accounts": accounts["account_id"].nunique(),
    "rows_fact": len(fact),
    "months_distinct": fact["date"].dt.to_period("M").nunique(),
    "years": fact["date"].dt.year.unique().tolist(),
}
tier_dist = accounts["tier"].value_counts().rename_axis("tier").reset_index(name="accounts_by_tier")
city_aob = org.groupby(["city","aob_band"]).size().reset_index(name="count")

summary, tier_dist.head(10), city_aob


({'salespeople': 10,
  'accounts': 628,
  'rows_fact': 7536,
  'months_distinct': 12,
  'years': [2024]},
    tier  accounts_by_tier
 0     1               506
 1     3               108
 2     5                14,
         city aob_band  count
 0  Bialystok      LOW      1
 1  Bydgoszcz      LOW      1
 2     Gdansk      MED      1
 3     Krakow     HIGH      1
 4       Lodz      MED      1
 5     Lublin      LOW      1
 6     Poznan      MED      1
 7   Szczecin      LOW      1
 8     Warsaw     HIGH      1
 9    Wroclaw     HIGH      1)

### QA

In [10]:
qa = pd.DataFrame(issues, columns=["issue","count"])
qa_path = DATA_DIR/"qa_report.csv"
qa.to_csv(qa_path, index=False)
qa_path, qa


(WindowsPath('../data/qa_report.csv'),
 Empty DataFrame
 Columns: [issue, count]
 Index: [])