### 9. Data Preparation and Preprocessing Summary

Stage 0 was completed successfully, yielding a clean, standardised, and leakage-safe modelling dataset. Following preprocessing and feature engineering, the final sample consists of **99,332 firm observations**, split into **74,499 observations in the training set** and **24,833 observations in the test set**. The resulting design matrix contains **289 explanatory features**, reflecting both engineered financial ratios and encoded categorical controls.

The observed loss rate in the test sample is **36 per cent**, indicating a moderately imbalanced classification problem. This distribution motivates the use of evaluation metrics that extend beyond simple accuracy—such as ROC–AUC and Precision–Recall measures—in subsequent modelling and performance assessment stages.


In [19]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Stage 0: build the train/test design matrices from `df_model`.
# Steps: drop leakage/ID columns -> stratified split -> cap categorical
# cardinality using TRAINING rows only -> median-impute numerics and one-hot
# encode categoricals (sparse output).
TARGET = "is_loss"

# Columns removed from the feature matrix. TARGET is listed as well so the
# label itself never ends up in X.
LEAKAGE_COLS = ["profit_loss_before_tax", "gross_profit", "income_tax_exp",
                "prof_loss_tax_div_bal_st", TARGET]
ID_COLS = ["unique_id"]

# Only drop columns that are actually present, so a missing optional column
# does not raise.
drop_cols = [c for c in (LEAKAGE_COLS + ID_COLS) if c in df_model.columns]
X_raw = df_model.drop(columns=drop_cols).copy()
y = df_model[TARGET].astype(int).copy()

# --- Reduce cardinality (critical for finishing)
CAT_TOPK = 30  # keep only top 30 levels per categorical
cat_cols = X_raw.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = X_raw.select_dtypes(include=[np.number]).columns.tolist()
# NOTE(review): columns that are neither numeric nor object/category (e.g.
# bool, datetime) land in neither list and are dropped by the
# ColumnTransformer below - confirm this is intended.

# Split BEFORE choosing which category levels to keep, so the retained levels
# are learned from training rows only. Row count, stratification labels and
# seed are the same as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.25, stratify=y, random_state=42
)
X_train_raw = X_train_raw.copy()
X_test_raw = X_test_raw.copy()

# Xc (full capped frame) is still produced for any later cell that uses it;
# it now carries the training-derived levels.
Xc = X_raw.copy()
for c in cat_cols:
    top = X_train_raw[c].value_counts(dropna=False).head(CAT_TOPK).index
    for frame in (Xc, X_train_raw, X_test_raw):
        # Cast to object first: writing the new label "other" into a
        # `category` column can fail because it is not an existing category.
        col = frame[c].astype("object")
        frame[c] = col.where(col.isin(top), other="other")

# Preprocess (sparse one-hot)
preprocess = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy="median"), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True), cat_cols),
    ]
)

# Fit on the training split only; the test split is transformed with the
# statistics/levels learned from training data.
X_train_processed = preprocess.fit_transform(X_train_raw)
X_test_processed  = preprocess.transform(X_test_raw)
feat_names_processed = preprocess.get_feature_names_out() # Store feature names

print("Done Stage 0.")
print("X_train_processed:", X_train_processed.shape, "X_test_processed:", X_test_processed.shape)
print("Loss rate (test):", round(y_test.mean(), 3))

Done Stage 0.
X_train_processed: (74499, 289) X_test_processed: (24833, 289)
Loss rate (test): 0.36


In [25]:
# ============================================================
# MODEL 2 (CLEAN OUTPUT): Continuous → Size (vs Q1) → Sector (vs MANUFACTURING)
# Prints:
#   - Rows kept + Train/Test sizes + Loss rate
#   - Logit header block ONLY (no coefficient dump)
#   - Model 2 ROC-AUC + PR-AUC
#   - Clean table: Variable | coefficient | p_value (ordered)
# ============================================================

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score

import statsmodels.formula.api as smf

SEED = 42
DATA_PATH = "final_clean.csv"   # change if needed
TARGET = "is_loss"

# -----------------------------
# 1) Load
# -----------------------------
df = pd.read_csv(DATA_PATH, low_memory=False)

# -----------------------------
# 2) Ensure target exists (or construct)
# -----------------------------
if TARGET not in df.columns:
    if "profit_loss_before_tax" in df.columns:
        profit = pd.to_numeric(df["profit_loss_before_tax"], errors="coerce")
        # Keep the label missing where profit is missing/unparseable; a plain
        # `profit < 0` would silently label those rows as non-loss.
        df[TARGET] = (profit < 0).astype(float).where(profit.notna())
        print("Target 'is_loss' missing; constructed from profit_loss_before_tax < 0.")
    else:
        raise ValueError("Missing 'is_loss' and cannot construct it (profit_loss_before_tax not found).")

# Parse the target as numeric (unparseable values still raise). Cast to int
# here only when nothing is missing; rows with a missing target are removed by
# the dropna in step 3 and the model frame is cast to int afterwards.
df[TARGET] = pd.to_numeric(df[TARGET])
if not df[TARGET].isna().any():
    df[TARGET] = df[TARGET].astype(int)

# -----------------------------
# 3) Variables (Model 2)
# -----------------------------
continuous = [
    "cost_to_turnover",
    "admin_cost_ratio",
    "employment_cost_ratio",
    "financing_cost_ratio",
    "deductions_to_turnover",
    "high_cost_flag",
    "thin_margin_flag",
]
cat_size   = "turnover_bin_q"     # Q1/Q2/Q3/Q4
cat_sector = "sector"             # base = MANUFACTURING

need_cols = [TARGET] + continuous + [cat_size, cat_sector]
missing = [c for c in need_cols if c not in df.columns]
if missing:
    raise ValueError(f"Missing required columns: {missing}")

dfm = df[need_cols].copy()

# numeric coercion (unparseable entries become NaN and are dropped below)
for c in continuous:
    dfm[c] = pd.to_numeric(dfm[c], errors="coerce")

# drop missing essentials
dfm = dfm.dropna(subset=[TARGET, cat_size, cat_sector] + continuous)
dfm[TARGET] = dfm[TARGET].astype(int)

# standardize categories (done after dropna so NaN never becomes the string "NAN")
dfm[cat_size] = dfm[cat_size].astype(str).str.strip().str.upper()
dfm[cat_sector] = dfm[cat_sector].astype(str).str.strip().str.upper()

# enforce size labels
valid_sizes = ["Q1", "Q2", "Q3", "Q4"]
dfm = dfm[dfm[cat_size].isin(valid_sizes)].copy()

# ensure base sector exists (the formula below uses it as the reference level)
if "MANUFACTURING" not in set(dfm[cat_sector]):
    raise ValueError("Base sector 'MANUFACTURING' not found in sector column after standardisation.")

# print rows kept
print(f"Rows kept: {len(dfm):,}")

# -----------------------------
# 4) Split
# -----------------------------
train_df, test_df = train_test_split(
    dfm, test_size=0.25, stratify=dfm[TARGET], random_state=SEED
)

print(f"Train: {train_df.shape} | Test: {test_df.shape} | Loss rate (test): {test_df[TARGET].mean():.2f}")

# -----------------------------
# 5) Fit Logit (statsmodels)
# -----------------------------
# Continuous terms enter as-is; size and sector are treatment-coded against
# Q1 and MANUFACTURING respectively.
rhs = (
    " + ".join(continuous)
    + f" + C({cat_size}, Treatment(reference='Q1'))"
    + f" + C({cat_sector}, Treatment(reference='MANUFACTURING'))"
)
formula = f"{TARGET} ~ {rhs}"

res = smf.logit(formula=formula, data=train_df).fit(disp=False)

# -----------------------------
# 6) Print ONLY the header block (no coefficient dump)
# -----------------------------
hdr = [
    "==================== MODEL 2: Logit Results (Statsmodels) ====================",
    "                           Logit Regression Results                           ",
    "==============================================================================",
    f"Dep. Variable:                {TARGET:<8}   No. Observations:                {int(res.nobs)}",
    f"Model:                          Logit   Df Residuals:                    {int(res.df_resid)}",
    f"Method:                           MLE   Df Model:                           {int(res.df_model)}",
    f"Date:                {pd.Timestamp.today().strftime('%a, %d %b %Y')}",
    f"Pseudo R-squ.:                  {res.prsquared:.4f}",
    f"Log-Likelihood:                {res.llf:.0f}.",
    f"converged:                       {bool(res.mle_retvals.get('converged', True))}",
    f"LL-Null:                       {res.llnull:.0f}.",
    f"Covariance Type:            nonrobust   LLR p-value:                     {res.llr_pvalue:.2f}",
    "==============================================================================",
]
print("\n" + "\n".join(hdr))

# -----------------------------
# 7) Metrics on TEST
# -----------------------------
# NOTE(review): predict() may fail if the test split contains a sector level
# that never appeared in the training split - confirm rare sectors are handled.
p_test = res.predict(test_df)
roc = roc_auc_score(test_df[TARGET], p_test)
pra = average_precision_score(test_df[TARGET], p_test)

print(f"\nModel 2 ROC-AUC: {roc:.4f}")
print(f"Model 2 PR-AUC : {pra:.4f}")

# -----------------------------
# 8) Build CLEAN table: Variable | coefficient | p_value
#    Order: continuous → size → sector
# -----------------------------
# `rows` was previously referenced without ever being built; assemble it here
# from the fitted statsmodels result (`res`). The intercept is not included.
rows = []


def _add_row(label, term):
    """Append one table row for the fitted term `term` under display name `label`."""
    rows.append({
        "Variable": label,
        "coefficient": res.params[term],
        "p_value": res.pvalues[term],
    })


# Continuous block, in the order declared in `continuous`. A bool-typed flag
# would be dummy-coded as e.g. "flag[T.True]", so match that form as well.
for var in continuous:
    for term in res.params.index:
        if term == var or term.startswith(f"{var}["):
            _add_row(var, term)

# Categorical blocks: treatment-coded terms look like
# "C(<column>, Treatment(reference='<base>'))[T.<level>]".
for factor, label, base in (
    (cat_size, "Size", "Q1"),
    (cat_sector, "Sector", "MANUFACTURING"),
):
    for term in res.params.index:
        if term.startswith(f"C({factor},") and "[T." in term:
            level = term.partition("[T.")[2][:-1]  # drop the trailing "]"
            _add_row(f"{label}: {level} (vs {base})", term)

final_tbl = pd.DataFrame(rows, columns=["Variable", "coefficient", "p_value"])
final_tbl["coefficient"] = final_tbl["coefficient"].round(4)
final_tbl["p_value"] = final_tbl["p_value"].round(3)

# ensure ordering + reset index
final_tbl = final_tbl.reset_index(drop=True)

print("\n==================== ONE TABLE (Model 2): Continuous → Size → Sector ====================")

# (A) Best for notebooks: true table (Variable is column 1)
display(final_tbl)





Rows kept: 99,332
Train: (74499, 10) | Test: (24833, 10) | Loss rate (test): 0.36

                           Logit Regression Results                           
Dep. Variable:                is_loss    No. Observations:                74499
Model:                          Logit   Df Residuals:                    74459
Method:                           MLE   Df Model:                           39
Date:                Tue, 20 Jan 2026
Pseudo R-squ.:                  0.1346
Log-Likelihood:                -42137.
converged:                       True
LL-Null:                       -48690.
Covariance Type:            nonrobust   LLR p-value:                     0.00

Model 2 ROC-AUC: 0.7386
Model 2 PR-AUC : 0.6593



Unnamed: 0,Variable,coefficient,p_value
0,cost_to_turnover,1.1732,0.0
1,admin_cost_ratio,1.0134,0.0
2,employment_cost_ratio,1.3164,0.0
3,financing_cost_ratio,1.8816,0.0
4,deductions_to_turnover,1.9992,0.0
5,high_cost_flag,0.7016,0.0
6,thin_margin_flag,-0.0041,0.913
7,Size: Q2 (vs Q1),-0.15,0.0
8,Size: Q3 (vs Q1),-0.5794,0.0
9,Size: Q4 (vs Q1),-1.1391,0.0


### 10 Basic Model Results: Interpreting Loss Drivers in Policy Context


1. **Factors positively driving losses:** Loss probability rises sharply with **cost-to-turnover (β=1.17, p<0.001)**, **financing cost intensity (β=1.88, p<0.001)**, **deductions-to-turnover (β=2.00, p<0.001)**, **employment costs (β=1.32, p<0.001)**, and **admin costs (β=1.01, p<0.001)**, confirming that structural cost and financing pressures are the dominant drivers of reported losses.

2. **Factors reducing losses:** **Larger firm size** significantly lowers loss risk—**Q2 (β=-0.15, p<0.001)**, **Q3 (β=-0.58, p<0.001)**, and **Q4 (β=-1.14, p<0.001)**—while the **thin-margin indicator is insignificant (p=0.91)** once full cost structure is controlled for.

3. **Sector effects:** Relative to Manufacturing, most sectors exhibit **lower loss probabilities**, notably **Financial & Insurance (β=-1.01, p<0.001)**, **Construction (β=-0.91, p<0.001)**, **Information & Communication (β=-0.51, p<0.001)**, and **Real Estate (β=-0.61, p<0.001)**, indicating Manufacturing’s structurally higher loss exposure.

4. **Size effect:** The monotonic decline in loss risk across size quartiles indicates strong **scale and resilience effects**, with large firms substantially better able to absorb cost and financing shocks than small firms.

5. **Policy recommendation:** Compliance and policy attention should prioritise **high cost-intensity and high financing-burden firms—especially small manufacturing firms—rather than sector labels alone**, using cost-structure indicators as the primary risk screen.





### 11 Model Performance Beyond the Baseline

Building on the Model 2 logit results above, a class-weighted scikit-learn logistic regression (fitted in the cell below) serves as the predictive baseline; it achieves a **ROC–AUC of 0.741** and a **PR–AUC of 0.661** on the test sample. This level of performance reflects moderate discriminative power, consistent with a linear specification that captures average relationships between loss outcomes and firms’ cost structure, size, and sectoral affiliation.

To evaluate whether nonlinear interactions and threshold effects materially improve predictive accuracy, we estimate a **Random Forest classifier** using the same training and test samples and identical preprocessing rules. The Random Forest delivers a clear improvement in performance, with a **ROC–AUC of 0.781** and a **PR–AUC of 0.725**.

The gains across both metrics indicate that loss outcomes are influenced not only by linear effects but also by **nonlinear combinations of cost ratios, firm size, and sectoral characteristics**. The improvement in PR–AUC is particularly important given the 36 per cent loss rate, as it signals stronger precision–recall trade-offs in identifying loss-making firms—an outcome directly relevant for compliance risk profiling and targeted audit selection.


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score

# Baseline classifier: class-weighted logistic regression on every non-target
# column of the modelling CSV, evaluated on a 25% stratified hold-out.

# ---- 1) Load modelling dataset ------------------------------
DATA_PATH = "final_clean.csv"   # adjust the path if the file is stored elsewhere
df = pd.read_csv(DATA_PATH, low_memory=False)

# ---- 2) Target and feature matrix ---------------------------
TARGET = "is_loss"
if TARGET not in df.columns:
    raise ValueError(f"Target column '{TARGET}' not found. Columns are: {df.columns.tolist()}")

y = df[TARGET].astype(int)
X = df.drop(columns=[TARGET])

# Object-typed columns are treated as categorical, everything else as numeric.
cat_cols = list(X.select_dtypes(include=["object"]).columns)
num_cols = list(X.select_dtypes(exclude=["object"]).columns)

# ---- 3) Stratified 75/25 split ------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

print("Train:", X_train.shape, "Test:", X_test.shape, "| Loss rate (test):", round(y_test.mean(), 3))

# ---- 4) Preprocessing: impute, then one-hot encode ----------
numeric_steps = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
])
categorical_steps = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_steps, num_cols),
        ("cat", categorical_steps, cat_cols),
    ],
    remainder="drop",
)

# ---- 5) Model: preprocessing + logistic regression ----------
logit_clf = LogisticRegression(
    solver="saga",
    max_iter=4000,
    class_weight="balanced",
    n_jobs=-1,
    random_state=42,
)
model = Pipeline(steps=[("prep", preprocess), ("logit", logit_clf)])

model.fit(X_train, y_train)
p_logit = model.predict_proba(X_test)[:, 1]  # probability of the positive (loss) class

# ---- 6) Hold-out metrics ------------------------------------
for label, metric in (
    ("Logistic ROC-AUC:", roc_auc_score),
    ("Logistic PR-AUC :", average_precision_score),
):
    print(label, round(metric(y_test, p_logit), 4))


Train: (74499, 9) Test: (24833, 9) | Loss rate (test): 0.36
Logistic ROC-AUC: 0.741
Logistic PR-AUC : 0.6613


### 12 Model Performance Beyond the Baseline

Building on the baseline logistic regression results, the class-weighted scikit-learn logistic regression fitted in the preceding cell achieves a **ROC–AUC of 0.741** and a **PR–AUC of 0.661** on the test sample. This level of performance reflects moderate discriminative power, consistent with a linear specification that captures average relationships between loss outcomes and firms’ cost structure, size, and sectoral affiliation.

To evaluate whether nonlinear interactions and threshold effects materially improve predictive accuracy, we estimate a **Random Forest classifier** using the same training and test samples and identical preprocessing rules. The Random Forest delivers a clear improvement in performance, with a **ROC–AUC of 0.781** and a **PR–AUC of 0.725**.

The gains across both metrics indicate that loss outcomes are influenced not only by linear effects but also by **nonlinear combinations of cost ratios, firm size, and sectoral characteristics**. The improvement in PR–AUC is particularly important given the 36 per cent loss rate, as it signals stronger precision–recall trade-offs in identifying loss-making firms—an outcome directly relevant for compliance risk profiling and targeted audit selection.


In [14]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, average_precision_score

# Random Forest benchmark on the existing X_train / X_test / y_train / y_test
# split, using the same impute + one-hot preprocessing recipe as the logistic
# baseline.

# --- Column roles: object dtype -> categorical, everything else -> numeric ---
cat_cols = list(X_train.select_dtypes(include=["object"]).columns)
num_cols = list(X_train.select_dtypes(exclude=["object"]).columns)

# --- Preprocessing: impute, then one-hot encode categoricals ---
numeric_steps = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
])
categorical_steps = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_steps, num_cols),
        ("cat", categorical_steps, cat_cols),
    ],
    remainder="drop",
)

# --- Random Forest, deliberately bounded so the fit stays fast ---
forest = RandomForestClassifier(
    n_estimators=120,                    # hard cap on tree count for speed
    min_samples_leaf=150,
    max_features="sqrt",                 # faster + usually strong
    class_weight="balanced_subsample",
    n_jobs=-1,
    random_state=42,
)
rf_model = Pipeline(steps=[("prep", preprocess), ("rf", forest)])

rf_model.fit(X_train, y_train)
p_rf = rf_model.predict_proba(X_test)[:, 1]  # probability of the positive (loss) class

# --- Hold-out metrics ---
for label, metric in (
    ("RF ROC-AUC:", roc_auc_score),
    ("RF PR-AUC :", average_precision_score),
):
    print(label, round(metric(y_test, p_rf), 4))


RF ROC-AUC: 0.781
RF PR-AUC : 0.725
