In [1]:
import numpy as np
import pandas as pd
from math import sqrt

def assign_grade(s, scheme='fixed'):
    """Map numeric scores to letter grades 'A' (best) through 'E' (worst).

    Args:
        s: pandas Series of scores; it is cast to float before binning.
        scheme: 'fixed' bins on hard-coded score bands; 'quantile' bins on
            the 0/20/40/60/80/100th percentiles of ``s`` itself.

    Returns:
        The categorical Series produced by ``pd.cut`` with labels
        E, D, C, B, A from the lowest bin to the highest. Under 'fixed',
        scores below 0 or at/above 901 fall outside every bin and become NaN.

    Raises:
        ValueError: if ``scheme`` is neither 'fixed' nor 'quantile'.
    """
    scores = s.astype(float)

    # Lower score = higher risk, so the lowest bin is labelled E.
    grade_labels = ['E', 'D', 'C', 'B', 'A']

    if scheme not in ('fixed', 'quantile'):
        raise ValueError("scheme은 'fixed' 또는 'quantile'이어야 합니다.")

    if scheme == 'quantile':
        # Quintile edges: each band holds roughly 20% of the scores.
        cut_points = scores.quantile([0.0, 0.2, 0.4, 0.6, 0.8, 1.0]).to_list()
        eps = 1e-9
        # Widen the outer edges slightly so the min and max always land in a bin.
        cut_points[0] -= eps
        cut_points[-1] += eps
        return pd.cut(scores, bins=cut_points, labels=grade_labels, include_lowest=True)

    # Fixed half-open bands [lo, hi):
    # E [0, 620), D [620, 680), C [680, 740), B [740, 800), A [800, 901).
    return pd.cut(
        scores,
        bins=[0, 620, 680, 740, 800, 901],
        labels=grade_labels,
        right=False,
        include_lowest=True,
    )


In [2]:
df = pd.read_csv('client_full_dataset_0811.csv')

# Display order for the grade columns: A first, E last.
order_grade = ['A','B','C','D','E']

# Grade the display score under both schemes and store each result as an
# ordered categorical following order_grade.
for col, scheme in [('grade_fixed', 'fixed'), ('grade_quantile', 'quantile')]:
    graded = assign_grade(df['score_display'], scheme=scheme)
    df[col] = pd.Categorical(graded, categories=order_grade, ordered=True)

# 신규/기존 × 등급별 연체율/표본수 비교
def wilson_ci(k, n, z=1.96):
    """Wilson score confidence interval for a binomial proportion.

    Args:
        k: Number of events (e.g. bad accounts).
        n: Number of trials (sample size).
        z: Normal quantile for the desired confidence level
            (1.96 gives roughly a 95% two-sided interval).

    Returns:
        Tuple ``(low, high)`` clipped to [0, 1]. ``(nan, nan)`` is returned
        when ``n`` or ``k`` is missing or ``n`` is not positive, because no
        interval is defined in those cases.
    """
    # Missing inputs must be rejected up front: the built-in max()/min() used
    # for clipping below would otherwise turn a NaN bound into 0.0 / 1.0.
    if pd.isna(n) or pd.isna(k) or n <= 0:
        return (np.nan, np.nan)
    p = k / n
    denom = 1 + (z**2)/n
    center = (p + (z**2)/(2*n)) / denom
    half = (z/denom) * np.sqrt((p*(1-p)/n) + (z**2)/(4*n*n))
    return (max(0.0, center-half), min(1.0, center+half))

def summarize(by_col):
    """Tabulate bad rate and sample size by client type and grade.

    Reads the module-level ``df`` and ``order_grade``.

    Args:
        by_col: Name of the grade column in ``df`` to group by
            (e.g. 'grade_fixed').

    Returns:
        Tuple ``(rate_pct, counts)`` of DataFrames indexed by
        ``is_new_client`` with one column per grade in ``order_grade``.
        ``rate_pct`` is bad/cnt * 100 rounded to 2 decimals; ``counts`` is the
        group size with missing cells filled with 0 and cast to int.
    """
    # observed=False is passed explicitly: the grade column is categorical and
    # pandas is changing the default, so this pins the current behaviour (all
    # category combinations are kept) and silences the FutureWarning.
    # Summing TARGET counts bad accounts -- assumes TARGET is a 0/1 flag.
    g = df.groupby(['is_new_client', by_col], dropna=False, observed=False).agg(
        cnt=('TARGET','size'), bad=('TARGET','sum')
    ).reset_index()
    g['bad_rate'] = g['bad']/g['cnt']
    # Wilson interval per cell; computed here but not part of the returned tables.
    g[['ci_low','ci_high']] = g.apply(
        lambda r: pd.Series(wilson_ci(int(r['bad']), int(r['cnt']))), axis=1
    )
    piv_rate = g.pivot(index='is_new_client', columns=by_col, values='bad_rate').reindex(columns=order_grade)
    piv_cnt  = g.pivot(index='is_new_client', columns=by_col, values='cnt').reindex(columns=order_grade)
    return (piv_rate*100).round(2), piv_cnt.fillna(0).astype(int)

# One (bad-rate %, sample-count) pair of tables per grading scheme.
(rate_f, cnt_f), (rate_q, cnt_q) = (
    summarize(grade_col) for grade_col in ('grade_fixed', 'grade_quantile')
)

print(
    "▶ fixed 연체율(%)\n", rate_f,
    "\n\n▶ fixed 표본수\n", cnt_f,
)
print(
    "\n▶ quantile 연체율(%)\n", rate_q,
    "\n\n▶ quantile 표본수\n", cnt_q,
)


▶ fixed 연체율(%)
 grade_fixed       A     B     C     D      E
is_new_client                               
기존             1.53  2.44  4.03  6.24  13.49
신규             1.27  2.80  4.73  7.49  14.40 

▶ fixed 표본수
 grade_fixed        A      B      C      D      E
is_new_client                                   
기존             13477  28833  46498  53677  96634
신규              8819   8635  12093  12622  26223

▶ quantile 연체율(%)
 grade_quantile     A     B     C      D      E
is_new_client                                 
기존              2.17  4.13  6.30  10.50  16.58
신규              2.07  4.87  7.51  11.63  16.63 

▶ quantile 표본수
 grade_quantile      A      B      C      D      E
is_new_client                                    
기존              43663  48718  49214  50228  47296
신규              17803  12575  11572  11972  14470


  g = df.groupby(['is_new_client', by_col], dropna=False).agg(
  g = df.groupby(['is_new_client', by_col], dropna=False).agg(


In [4]:
import numpy as np
import pandas as pd
from math import sqrt

# --------------------------
# 0) 윌슨 신뢰구간 함수
# --------------------------
def wilson_ci(k, n, z=1.96):
    """Return the Wilson score interval for ``k`` events out of ``n`` trials.

    Args:
        k: Event count.
        n: Sample size.
        z: Normal quantile of the confidence level (1.96 is roughly 95%).

    Returns:
        ``(low, high)`` with both bounds clipped to [0, 1], or ``(nan, nan)``
        when either input is missing or ``n`` is not positive.
    """
    # A missing k must be caught here as well as a missing n: the clipping at
    # the end uses built-in max()/min(), which would silently turn NaN bounds
    # into the meaningless interval (0.0, 1.0).
    if pd.isna(n) or pd.isna(k) or n <= 0:
        return (np.nan, np.nan)
    p = k / n
    denom = 1 + (z**2)/n
    center = (p + (z**2)/(2*n)) / denom
    half = (z/denom) * np.sqrt((p*(1-p)/n) + (z**2)/(4*n*n))
    return (max(0.0, center-half), min(1.0, center+half))

# --------------------------
# 1) 등급 부여 함수
# --------------------------
def assign_grade(s, scheme='fixed'):
    """Bin scores into grades E (lowest bin) through A (highest bin).

    Args:
        s: pandas Series of scores, cast to float before binning.
        scheme: 'fixed' uses hard-coded score bands; 'quantile' uses the
            quintiles of ``s`` so each grade holds roughly 20% of the rows.

    Returns:
        Categorical Series from ``pd.cut`` labelled E, D, C, B, A in
        ascending score order; scores outside every bin become NaN.

    Raises:
        ValueError: for any other ``scheme`` value.
    """
    s = s.astype(float)
    labels = ['E','D','C','B','A']

    # Each branch only decides the bin edges and which side of a bin is closed.
    if scheme == 'fixed':
        # Left-closed bands: [0, 620), [620, 680), [680, 740), [740, 800), [800, 901).
        edges = [0, 620, 680, 740, 800, 901]
        closed_right = False
    elif scheme == 'quantile':
        lo, q20, q40, q60, q80, hi = s.quantile([0.0, 0.2, 0.4, 0.6, 0.8, 1.0]).to_list()
        eps = 1e-9
        # Pad the outermost edges so the minimum and maximum scores are included.
        edges = [lo - eps, q20, q40, q60, q80, hi + eps]
        closed_right = True
    else:
        raise ValueError("scheme은 'fixed' 또는 'quantile'만 가능")

    return pd.cut(s, bins=edges, labels=labels, right=closed_right, include_lowest=True)

# --------------------------
# 2) 등급별 집계 함수
# --------------------------
def summarize(df, grade_col):
    """Aggregate bad rate, sample size and mean score by client type and grade.

    The input frame is not modified: the grouping runs on a narrow private
    frame holding only the columns that are needed.

    Args:
        df: DataFrame with columns 'is_new_client', 'TARGET',
            'score_display' and ``grade_col``.
        grade_col: Name of the grade column to group by.

    Returns:
        Tuple ``(pivot_rate, pivot_cnt, pivot_mean)`` of DataFrames indexed by
        ``is_new_client`` with columns A..E: the bad rate as a fraction
        (bad / cnt), the group size, and the mean of 'score_display'.
    """
    order_grade = ['A','B','C','D','E']

    # Build a small working frame rather than overwriting df[grade_col], so
    # the caller's frame keeps its original grade column.
    work = df[['is_new_client', 'TARGET', 'score_display']].assign(
        **{grade_col: pd.Categorical(df[grade_col], categories=order_grade, ordered=True)}
    )

    # observed=False is passed explicitly: the key is categorical and pandas is
    # changing the default, so this pins the current behaviour (empty grade
    # cells are kept) and silences the FutureWarning.
    # Summing TARGET counts bad accounts -- assumes TARGET is a 0/1 flag.
    grp = work.groupby(['is_new_client', grade_col], dropna=False, observed=False).agg(
        cnt=('TARGET','size'),
        bad=('TARGET','sum'),
        mean_score=('score_display','mean')
    ).reset_index()

    grp['bad_rate'] = grp['bad'] / grp['cnt']
    # Wilson interval per cell; computed here but not part of the returned tables.
    grp[['ci_low','ci_high']] = grp.apply(
        lambda r: pd.Series(wilson_ci(int(r['bad']), int(r['cnt']))), axis=1
    )

    pivot_rate = grp.pivot(index='is_new_client', columns=grade_col, values='bad_rate').reindex(columns=order_grade)
    pivot_cnt  = grp.pivot(index='is_new_client', columns=grade_col, values='cnt').reindex(columns=order_grade)
    pivot_mean = grp.pivot(index='is_new_client', columns=grade_col, values='mean_score').reindex(columns=order_grade)

    return (pivot_rate, pivot_cnt, pivot_mean)

# --------------------------
# 3) 데이터 로드 & 두 방식 비교
# --------------------------
df = pd.read_csv('client_full_dataset_0811.csv')

# Grade the display score under both schemes (fixed bands and quintiles).
scores = df['score_display']
df['grade_fixed'] = assign_grade(scores, scheme='fixed')
df['grade_quantile'] = assign_grade(scores, scheme='quantile')

# Aggregate each scheme on its own copy of the frame.
rate_f, cnt_f, mean_f = summarize(df.copy(), 'grade_fixed')
rate_q, cnt_q, mean_q = summarize(df.copy(), 'grade_quantile')

# --------------------------
# 4) Output
# --------------------------
def _print_report(title, rate, cnt, mean):
    """Print one scheme's bad-rate (%), sample-count and mean-score tables."""
    print(title)
    print((rate*100).round(2))
    print("\n[표본수]")
    print(cnt.fillna(0).astype(int))
    print("\n[평균 점수]")
    print(mean.round(2))

_print_report("=== [Fixed 방식] 신규/기존 × 등급별 연체율(%) ===", rate_f, cnt_f, mean_f)
_print_report("\n=== [Quantile 방식] 신규/기존 × 등급별 연체율(%) ===", rate_q, cnt_q, mean_q)

=== [Fixed 방식] 신규/기존 × 등급별 연체율(%) ===
grade_fixed       A     B     C     D      E
is_new_client                               
기존             1.53  2.44  4.03  6.24  13.49
신규             1.27  2.80  4.73  7.49  14.40

[표본수]
grade_fixed        A      B      C      D      E
is_new_client                                   
기존             13477  28833  46498  53677  96634
신규              8819   8635  12093  12622  26223

[평균 점수]
grade_fixed         A       B       C       D       E
is_new_client                                        
기존             832.00  766.15  708.16  649.49  554.45
신규             852.15  766.83  708.20  649.89  542.71

=== [Quantile 방식] 신규/기존 × 등급별 연체율(%) ===
grade_quantile     A     B     C      D      E
is_new_client                                 
기존              2.17  4.13  6.30  10.50  16.58
신규              2.07  4.87  7.51  11.63  16.63

[표본수]
grade_quantile      A      B      C      D      E
is_new_client                                    
기존              4

  grp = df.groupby(['is_new_client', grade_col], dropna=False).agg(
  grp = df.groupby(['is_new_client', grade_col], dropna=False).agg(


In [5]:
# Columns kept in the exported score file: identifier, target, the two score
# columns and both grade columns.
cols = ['SK_ID_CURR', 'TARGET', 'score_display', 'score_risk', 'grade_fixed', 'grade_quantile']

df_export = df.loc[:, cols]

In [None]:
# Ad-hoc look at the full working frame; this cell has no execution count,
# i.e. it was not run in the saved notebook.
df

In [7]:
# Write the selected score/grade columns to CSV without the index column.
# 'utf-8-sig' writes a UTF-8 byte-order mark at the start of the file --
# presumably so spreadsheet tools detect the encoding (TODO confirm intent).
df_export.to_csv('client_final_score_0813.csv', index=False, encoding='utf-8-sig')

In [9]:
# Save the full frame, which now also carries the grade_fixed and
# grade_quantile columns, as a new dated snapshot (index column omitted,
# UTF-8 with byte-order mark).
df.to_csv('client_full_dataset_0813.csv', index=False, encoding='utf-8-sig')