In [1]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from pathlib import Path

# ---------- Parameters ----------
# Input CSV file; a relative path, so it is resolved against the working directory.
CSV_PATH = Path("cvm_indicators_dataset_2011-2021.csv")
# Name of the target column.
TARGET    = "LABEL"          # 1 = distress
# Columns excluded from the per-column test loop (this set includes the target itself).
META_COLS = {"ID", "QUARTER", TARGET}

# ---------- 工具函数 ----------
def cramers_v(x, y):
    """Compute Cramér's V between two categorical vectors.

    The statistic is derived from the Pearson chi-square of the ``x`` by ``y``
    contingency table, computed without Yates' continuity correction.

    Args:
        x: First vector of category labels (for example a 0/1 flag).
        y: Second vector of category labels, paired with ``x``.

    Returns:
        Cramér's V as a float in [0, 1], or ``nan`` when the contingency
        table has fewer than two rows or columns (for example when either
        vector is constant), because the statistic is undefined there.
    """
    tbl = pd.crosstab(x, y)
    # With a single row or column the denominator below is zero, so the
    # original computation was a 0/0 division that emitted a RuntimeWarning.
    # Return nan explicitly instead.
    if min(tbl.shape) < 2:
        return float("nan")
    chi2 = chi2_contingency(tbl, correction=False)[0]
    n = tbl.values.sum()
    return float(np.sqrt(chi2 / (n * (min(tbl.shape) - 1))))

# ---------- Load data ----------
df = pd.read_csv(CSV_PATH)

# ---------- Per-column test ----------
# For every column outside META_COLS, build a 0/1 flag marking the rows where
# the value equals 0, then test that flag against the target column.
target = df[TARGET]
# The target check does not depend on the column, so it is done once. With a
# single-class target (all 0 or all 1) no column can be tested.
if target.nunique() >= 2:
    feature_cols = df.columns.difference(META_COLS)
else:
    feature_cols = []

results = []
for col in feature_cols:
    flag = (df[col] == 0).astype(int)
    # Only test columns that contain at least one 0 and at least one non-0.
    if flag.nunique() < 2:
        continue
    p = chi2_contingency(pd.crosstab(flag, target), correction=False)[1]
    v = cramers_v(flag, target)
    results.append({"variable": col, "cramers_v": v, "p_value": p})

# Name the columns explicitly: when no column qualified, a DataFrame built
# from an empty list has no columns and the query would fail on the
# undefined name "cramers_v".
res_df = (pd.DataFrame(results, columns=["variable", "cramers_v", "p_value"])
          .query("cramers_v > 0.10 and p_value < 0.05")
          .sort_values("cramers_v", ascending=False))

print("Variables whose structural-zero flag correlates with distress (V > 0.10 & p < 0.05):")
print(res_df.to_string(index=False))


Variables whose structural-zero flag correlates with distress (V > 0.10 & p < 0.05):
variable  cramers_v       p_value
     A30   0.154063 4.811851e-125
     A33   0.123066  1.730725e-80
