# Profit Leakage Detection — Advanced Statistical Methods
Integrated analysis using **Bayesian inference**, **Statistical Process Control (SPC)**, and **Bootstrapping** to detect margin anomalies and quantify uncertainty.

In [ ]:
# Data, plotting and Bayesian tooling used by the cells below.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pymc as pm
import arviz as az

# Read the service-order CSV; `order_dt` is parsed as datetimes so it can be
# bucketed by week further down.
df = pd.read_csv(
    'data/service_orders_synthetic.csv',
    parse_dates=['order_dt'],
)
df.head()

## Bayesian Hierarchical Modeling
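We estimate each region's true mean margin with partial pooling, where $y_{ij}$ is the margin % of order $j$ in region $i$, $\mu_i$ is that region's mean, and $\mu_0$ is the overall mean across regions: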
\[ y_{ij} \sim \mathcal{N}(\mu_i, \sigma^2), \quad \mu_i \sim \mathcal{N}(\mu_0, \tau^2) \]

In [ ]:
# Hierarchical (partial-pooling) model of margin % by region.
# Rows with a missing region or margin are dropped before fitting.
y = df[['region', 'margin_pct']].dropna()
regions = y['region'].astype('category')
idx = regions.cat.codes.values  # integer region code per order; indexes mu_group
y_obs = y['margin_pct'].values

with pm.Model() as model:
    # Priors are centred on 0.15 with scale 0.1.
    # NOTE(review): this presumes margin_pct is stored as a fraction (0.15 = 15%) — confirm.
    mu_overall = pm.Normal('mu_overall', mu=0.15, sigma=0.1)
    sigma_group = pm.HalfNormal('sigma_group', sigma=0.1)  # between-region spread
    sigma = pm.HalfNormal('sigma', sigma=0.1)              # within-region (order-level) spread
    # One mean per region, shrunk towards mu_overall.
    mu_group = pm.Normal(
        'mu_group',
        mu=mu_overall,
        sigma=sigma_group,
        shape=regions.cat.categories.size,
    )
    pm.Normal('y_like', mu=mu_group[idx], sigma=sigma, observed=y_obs)
    # A fixed seed makes the posterior draws (and every number derived from
    # them) reproducible from one run of the notebook to the next.
    idata = pm.sample(
        1000,
        tune=1000,
        target_accept=0.9,
        chains=2,
        cores=1,
        progressbar=False,
        random_seed=42,
    )
az.summary(idata, var_names=['mu_overall', 'sigma_group', 'sigma', 'mu_group']).head()

In [ ]:
# Posterior probability that each region's true mean margin is negative.
# az.extract stacks (chain, draw) into a single trailing `sample` dimension, so
# the extracted array is laid out as (region, sample). Reducing over the named
# `sample` dimension gives one probability per region; reducing over axis 0
# would instead average across regions and return one value per draw, which
# does not line up with the region labels below.
draws = az.extract(idata, var_names=['mu_group'])
post = draws.to_numpy()
probs = (draws < 0).mean(dim='sample').to_numpy()
pd.DataFrame({'region': regions.cat.categories, 'p_loss': probs}).sort_values('p_loss', ascending=False)

## SPC — Shewhart and CUSUM Charts
Tracks the weekly mean margin % to check process stability (Shewhart chart) and to detect small, persistent drifts (CUSUM chart).

In [ ]:
# Weekly mean margin %, keyed by ISO week number.
# Grouping on a named key and calling reset_index() always yields the two
# columns ['week', 'mean_margin']. With `as_index=False` and a grouping key that
# is not a column of `df`, some pandas versions leave the key out of the result,
# and assigning two column names afterwards then fails.
# NOTE(review): the ISO week number carries no year, so if the data spans more
# than one year, same-numbered weeks are merged and the order is not
# chronological — confirm the date range or add the ISO year to the key.
iso_week = df['order_dt'].dt.isocalendar().week.rename('week')
weekly = (
    df.groupby(iso_week)['margin_pct']
      .mean()
      .rename('mean_margin')
      .reset_index()
)

# Centre line and 3-sigma control limits computed from the weekly means.
m, s = weekly['mean_margin'].mean(), weekly['mean_margin'].std()
ucl, lcl = m + 3 * s, m - 3 * s

plt.figure(figsize=(10, 4))
plt.plot(weekly['week'], weekly['mean_margin'], marker='o')
plt.axhline(m, color='blue', linestyle='--')
plt.axhline(ucl, color='red', linestyle='--')
plt.axhline(lcl, color='red', linestyle='--')
plt.title('Shewhart Control Chart')
plt.xlabel('Week')
plt.ylabel('Mean Margin%')
plt.show()

In [ ]:
# Two-sided CUSUM on the weekly means, using the centre `m` and spread `s`
# computed for the Shewhart chart.
k = 0.5 * s  # slack removed from each deviation before it is accumulated
h = 5 * s    # threshold a cumulative sum must cross to raise a signal
pos = [0]    # upward sum, floored at 0 (leading 0 is the starting state)
neg = [0]    # downward sum, capped at 0 (leading 0 is the starting state)
signals = []
for week_pos, value in enumerate(weekly['mean_margin']):
    upward = pos[-1] + (value - (m + k))
    downward = neg[-1] + (value - (m - k))
    pos.append(max(0, upward))
    neg.append(min(0, downward))
    # Record the position of every week where either sum is past the threshold.
    if pos[-1] > h or neg[-1] < -h:
        signals.append(week_pos)

plt.figure(figsize=(8, 4))
# Skip the initial 0 so the plotted points line up one-to-one with the weeks.
plt.plot(pos[1:], label='CUSUM+')
plt.plot(neg[1:], label='CUSUM-')
plt.legend()
plt.title('CUSUM Chart')
plt.show()
signals

## Bootstrapping Confidence Intervals for Loss Rate

In [ ]:
# Bootstrap a 95% percentile interval for the share of orders with a negative margin.
# NOTE(review): rows whose margin_amount is missing compare as "not < 0" and so
# count as non-loss orders — confirm that is the intended denominator.
loss = (df['margin_amount'] < 0).astype(int)

# One seeded generator shared by all resamples, so the interval is reproducible
# between runs while each resample still differs from the others.
rng = np.random.default_rng(42)
means = [
    loss.sample(frac=1, replace=True, random_state=rng).mean()
    for _ in range(1000)
]
ci = np.percentile(means, [2.5, 97.5])

plt.hist(means, bins=30, color='skyblue')
plt.axvline(ci[0], color='red')
plt.axvline(ci[1], color='red')
plt.title('Bootstrap 95% CI for Loss Rate')
plt.show()
ci

## Priority Scoring — Frequency × Financial Impact

In [ ]:
# Roll orders up to (region, job_code) segments: order count, summed margin,
# and the share of orders with a negative margin.
is_loss = df['margin_amount'] < 0
seg = (
    df.assign(loss=is_loss)
      .groupby(['region', 'job_code'], as_index=False)
      .agg(
          orders=('service_order_no', 'count'),
          total_margin=('margin_amount', 'sum'),
          loss_rate=('loss', 'mean'),
      )
)

# Priority = size of the net loss (0 for segments with a non-negative total)
# weighted by 0.5 plus the segment's loss rate.
net_loss = -seg['total_margin'].clip(upper=0)
rate_weight = 0.5 + seg['loss_rate']
seg['priority'] = net_loss * rate_weight

top = seg.sort_values('priority', ascending=False).head(10)
labels = top['region'] + ' | ' + top['job_code'].astype(str)

plt.figure(figsize=(8, 4))
plt.barh(labels, -top['total_margin'])
plt.gca().invert_yaxis()  # highest-priority segment at the top
plt.title('Top 10 Loss Segments')
plt.tight_layout()
plt.show()
top