In [1]:
import pandas as pd
import sys
import os
import numpy as np
from sklearn.preprocessing import LabelEncoder
from scipy.stats import spearmanr, chi2_contingency, pointbiserialr

In [2]:
def cramers_v(x, y):
    confusion_matrix = pd.crosstab(x, y)
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    return np.sqrt(phi2 / min(k-1, r-1))

In [4]:
df_trian = pd.read_csv(r"C:\Users\rtc32\OneDrive\바탕 화면\데이톤\Datasets\application_train.csv")
df_credit = pd.read_csv(r"C:\Users\rtc32\OneDrive\바탕 화면\데이톤\Datasets\credit_card_balance.csv")
df_install = pd.read_csv(r"C:\Users\rtc32\OneDrive\바탕 화면\데이톤\Datasets\installments_payments.csv")

In [3]:
df_breau_blance = pd.read_csv(r"C:\Users\rtc32\OneDrive\바탕 화면\데이톤\Datasets\bureau_balance.csv")
df_bureau = pd.read_csv(r"C:\Users\rtc32\OneDrive\바탕 화면\데이톤\Datasets\bureau.csv")
df_poscash = pd.read_csv(r"C:\Users\rtc32\OneDrive\바탕 화면\데이톤\Datasets\POS_CASH_balance.csv")
df_prev = pd.read_csv(r"C:\Users\rtc32\OneDrive\바탕 화면\데이톤\Datasets\previous_application.csv")

In [5]:
df_train = df_trian[['SK_ID_CURR', 'TARGET']]

In [8]:
df_train.shape

(307511, 2)

In [9]:
df_credit.shape

(3840312, 23)

In [10]:
df_credit = df_credit.merge(df_train, on='SK_ID_CURR', how='left')

In [11]:
df_credit.shape

(3840312, 24)

In [12]:
df_install.shape

(13605401, 8)

In [14]:
for df in [df_credit, df_install]:
    for col in df.columns:
        if df[col].dtype == 'int64':
            df[col] = df[col].astype('int32')
        elif df[col].dtype == 'float64':
            df[col] = df[col].astype('float32')

In [16]:
df_install = df_install.merge(df_train, on='SK_ID_CURR', how='left')

In [19]:
df_credit.dropna(inplace=True)

In [27]:
df_credit.shape

(2556314, 24)

In [20]:
# 수치형 변수 리스트
numeric_cols = df_credit.select_dtypes(include=['int32', 'float32']).columns.drop(['SK_ID_CURR', 'TARGET'])

# 범주형 변수 리스트
categorical_cols = df_credit.select_dtypes(include=['object']).columns

# 수치형 변수와 TARGET의 pointbiserial correlation
for col in numeric_cols:
    corr, p = pointbiserialr(df_credit['TARGET'], df_credit[col])
    print(f"{col}: pointbiserial corr={corr:.4f}, p-value={p:.4g}")

# 범주형 변수와 TARGET의 Cramér's V
for col in categorical_cols:
    v = cramers_v(df_credit[col], df_credit['TARGET'])
    print(f"{col}: Cramér's V={v:.4f}")

SK_ID_PREV: pointbiserial corr=0.0032, p-value=3.668e-07
MONTHS_BALANCE: pointbiserial corr=0.0458, p-value=0
AMT_BALANCE: pointbiserial corr=0.0515, p-value=0
AMT_CREDIT_LIMIT_ACTUAL: pointbiserial corr=0.0260, p-value=0
AMT_DRAWINGS_ATM_CURRENT: pointbiserial corr=0.0229, p-value=1.003e-292
AMT_DRAWINGS_CURRENT: pointbiserial corr=0.0190, p-value=8.518e-203
AMT_DRAWINGS_OTHER_CURRENT: pointbiserial corr=0.0017, p-value=0.006298
AMT_DRAWINGS_POS_CURRENT: pointbiserial corr=0.0035, p-value=2.274e-08
AMT_INST_MIN_REGULARITY: pointbiserial corr=0.0410, p-value=0
AMT_PAYMENT_CURRENT: pointbiserial corr=0.0127, p-value=3.149e-92
AMT_PAYMENT_TOTAL_CURRENT: pointbiserial corr=0.0120, p-value=1.981e-81
AMT_RECEIVABLE_PRINCIPAL: pointbiserial corr=0.0510, p-value=0
AMT_RECIVABLE: pointbiserial corr=0.0512, p-value=0
AMT_TOTAL_RECEIVABLE: pointbiserial corr=0.0513, p-value=0
CNT_DRAWINGS_ATM_CURRENT: pointbiserial corr=0.0369, p-value=0
CNT_DRAWINGS_CURRENT: pointbiserial corr=0.0361, p-value=0

In [22]:
df_install.dropna(inplace=True)

In [28]:
df_install.shape

(11589009, 9)

In [26]:
# 수치형 변수 리스트
numeric_cols = df_install.select_dtypes(include=['int32', 'float32', 'int64', 'float64']).columns.drop(['SK_ID_CURR', 'TARGET'])

# 범주형 변수 리스트
categorical_cols = df_install.select_dtypes(include=['object']).columns

# 수치형 변수와 TARGET의 pointbiserial correlation
for col in numeric_cols:
    corr, p = pointbiserialr(df_install['TARGET'], df_install[col])
    print(f"{col}: pointbiserial corr={corr:.4f}, p-value={p:.4g}")

# 범주형 변수와 TARGET의 Cramér's V
for col in categorical_cols:
    v = cramers_v(df_install[col], df_install['TARGET'])
    print(f"{col}: Cramér's V={v:.4f}")

SK_ID_PREV: pointbiserial corr=-0.0002, p-value=0.4438
NUM_INSTALMENT_VERSION: pointbiserial corr=-0.0101, p-value=1.734e-261
NUM_INSTALMENT_NUMBER: pointbiserial corr=-0.0163, p-value=0
DAYS_INSTALMENT: pointbiserial corr=0.0348, p-value=0
DAYS_ENTRY_PAYMENT: pointbiserial corr=0.0351, p-value=0
AMT_INSTALMENT: pointbiserial corr=-0.0015, p-value=2.438e-07
AMT_PAYMENT: pointbiserial corr=-0.0036, p-value=5.896e-35


In [6]:
def correlation_report(df, df_name='dataframe'):
    print(f"\n=== {df_name} ===")
    # 수치형 변수 리스트
    numeric_cols = df.select_dtypes(include=['int32', 'float32', 'int64', 'float64']).columns.drop(['SK_ID_CURR', 'TARGET'])
    # 범주형 변수 리스트
    categorical_cols = df.select_dtypes(include=['object']).columns

    # 수치형 변수와 TARGET의 pointbiserial correlation
    for col in numeric_cols:
        corr, p = pointbiserialr(df['TARGET'], df[col])
        print(f"{col}: pointbiserial corr={corr:.4f}, p-value={p:.4g}")

    # 범주형 변수와 TARGET의 Cramér's V
    for col in categorical_cols:
        v = cramers_v(df[col].fillna('missing'), df['TARGET'])
        print(f"{col}: Cramér's V={v:.4f}")


In [8]:
df_trian.dropna(inplace=True)

In [10]:
df_poscash_merged = df_poscash.merge(df_train, on='SK_ID_CURR', how='left')
df_prev_merged = df_prev.merge(df_train, on='SK_ID_CURR', how='left')

In [11]:

df_bureau_merged = df_bureau.merge(df_train, on='SK_ID_CURR', how='left')



In [12]:
df_poscash_merged.dropna(inplace=True)
df_prev_merged.dropna(inplace=True)
df_bureau_merged.dropna(inplace=True)
df_trian.dropna(inplace=True)

In [14]:
merged_dfs = [
    df_trian,
    df_bureau_merged, 
    df_poscash_merged, 
    df_prev_merged
]
merged_names = [
    'df_trian', 
    'df_bureau_merged',  
    'df_poscash_merged', 
    'df_prev_merged'
]

for df, name in zip(merged_dfs, merged_names):
    print(f"\n=== {name} ===")
    numeric_cols = df.select_dtypes(include=['int32', 'float32', 'int64', 'float64']).columns.drop(['SK_ID_CURR', 'TARGET'])
    categorical_cols = df.select_dtypes(include=['object']).columns

    results = []

    for col in numeric_cols:
        corr, p = pointbiserialr(df['TARGET'], df[col])
        results.append({'variable': col, 'type': 'numeric', 'corr': abs(corr), 'p_value': p})

    for col in categorical_cols:
        v = cramers_v(df[col].fillna('missing'), df['TARGET'])
        results.append({'variable': col, 'type': 'categorical', 'corr': abs(v), 'p_value': None})

    # 내림차순 정렬 후 출력
    results_sorted = sorted(results, key=lambda x: x['corr'], reverse=True)
    for r in results_sorted:
        print(f"{r['variable']} ({r['type']}): corr={r['corr']:.4f}, p-value={r['p_value']}")


=== df_trian ===
FLAG_MOBIL (numeric): corr=nan, p-value=nan
EXT_SOURCE_3 (numeric): corr=0.1507, p-value=6.961734524966711e-45
EXT_SOURCE_2 (numeric): corr=0.1310, p-value=3.1947562843210103e-34
EXT_SOURCE_1 (numeric): corr=0.1295, p-value=1.6311798519247395e-33
DAYS_EMPLOYED (numeric): corr=0.0630, p-value=5.114511754822699e-09
REGION_RATING_CLIENT_W_CITY (numeric): corr=0.0612, p-value=1.3319373983871017e-08
REGION_RATING_CLIENT (numeric): corr=0.0536, p-value=6.510807066637697e-07
FLOORSMAX_AVG (numeric): corr=0.0461, p-value=1.8838074235306675e-05
FLOORSMAX_MODE (numeric): corr=0.0460, p-value=1.9805004302615255e-05
DAYS_BIRTH (numeric): corr=0.0411, p-value=0.0001369139681425598
AMT_INCOME_TOTAL (numeric): corr=0.0398, p-value=0.00022540909625652864
FLOORSMIN_AVG (numeric): corr=0.0329, p-value=0.002272205922911595
FLOORSMIN_MODE (numeric): corr=0.0317, p-value=0.003289527498505338
LIVINGAPARTMENTS_AVG (numeric): corr=0.0281, p-value=0.009144622944903752
APARTMENTS_AVG (numeric)

  rpb, prob = pearsonr(x, y)
  return np.sqrt(phi2 / min(k-1, r-1))
  rpb, prob = pearsonr(x, y)


DAYS_ENDDATE_FACT (numeric): corr=0.0412, p-value=2.0742592431927516e-16
DAYS_CREDIT (numeric): corr=0.0384, p-value=2.1113841653292625e-14
AMT_CREDIT_SUM_OVERDUE (numeric): corr=nan, p-value=nan
CREDIT_TYPE (categorical): corr=0.0312, p-value=None
DAYS_CREDIT_UPDATE (numeric): corr=0.0305, p-value=1.2909920306832028e-09
AMT_CREDIT_MAX_OVERDUE (numeric): corr=0.0224, p-value=8.404324369420824e-06
DAYS_CREDIT_ENDDATE (numeric): corr=0.0191, p-value=0.0001446284973624677
CREDIT_ACTIVE (categorical): corr=0.0156, p-value=None
AMT_CREDIT_SUM_LIMIT (numeric): corr=0.0071, p-value=0.15727147526009752
SK_ID_BUREAU (numeric): corr=0.0056, p-value=0.26848937835255693
CREDIT_CURRENCY (categorical): corr=0.0045, p-value=None
CNT_CREDIT_PROLONG (numeric): corr=0.0036, p-value=0.4697107707691481
AMT_CREDIT_SUM_DEBT (numeric): corr=0.0019, p-value=0.7059638666976843
CREDIT_DAY_OVERDUE (numeric): corr=0.0015, p-value=0.7673113693987369
AMT_CREDIT_SUM (numeric): corr=0.0013, p-value=0.7979353865289296

  rpb, prob = pearsonr(x, y)
  return np.sqrt(phi2 / min(k-1, r-1))
  return np.sqrt(phi2 / min(k-1, r-1))
  return np.sqrt(phi2 / min(k-1, r-1))
  return np.sqrt(phi2 / min(k-1, r-1))
  return np.sqrt(phi2 / min(k-1, r-1))
  return np.sqrt(phi2 / min(k-1, r-1))
  return np.sqrt(phi2 / min(k-1, r-1))
  return np.sqrt(phi2 / min(k-1, r-1))


In [15]:
df_credit_kpi = pd.read_csv("credit_kpi.csv")
df_installment_kpi = pd.read_csv("installment_kpi.csv")

In [16]:
df_credit_kpi.dropna(inplace=True)
df_installment_kpi.dropna(inplace=True)

In [17]:
merged_dfs = [
    df_credit_kpi,
    df_installment_kpi
]
merged_names = [
    'df_credit_kpi',
    'df_installment_kpi'
]

for df, name in zip(merged_dfs, merged_names):
    print(f"\n=== {name} ===")
    numeric_cols = df.select_dtypes(include=['int32', 'float32', 'int64', 'float64']).columns.drop(['SK_ID_CURR', 'TARGET'])
    categorical_cols = df.select_dtypes(include=['object']).columns

    results = []

    for col in numeric_cols:
        corr, p = pointbiserialr(df['TARGET'], df[col])
        results.append({'variable': col, 'type': 'numeric', 'corr': abs(corr), 'p_value': p})

    for col in categorical_cols:
        v = cramers_v(df[col].fillna('missing'), df['TARGET'])
        results.append({'variable': col, 'type': 'categorical', 'corr': abs(v), 'p_value': None})

    # 내림차순 정렬 후 출력
    results_sorted = sorted(results, key=lambda x: x['corr'], reverse=True)
    for r in results_sorted:
        print(f"{r['variable']} ({r['type']}): corr={r['corr']:.4f}, p-value={r['p_value']}")


=== df_credit_kpi ===
AVG_UTIL_RATIO (numeric): corr=0.1262, p-value=8.866363200873621e-101
OTHER_RATIO (numeric): corr=nan, p-value=nan
STATUS_Active (numeric): corr=0.0911, p-value=3.164485429042871e-53
AMT_PAYMENT_CURRENT_max (numeric): corr=0.0790, p-value=2.1176643438301963e-40
AMT_BALANCE_min (numeric): corr=0.0722, p-value=5.4876337129834026e-34
AMT_TOTAL_RECEIVABLE_min (numeric): corr=0.0716, p-value=1.671209984835706e-33
AMT_CREDIT_LIMIT_ACTUAL_max (numeric): corr=0.0671, p-value=1.2664008324531166e-29
AMT_PAYMENT_CURRENT_std (numeric): corr=0.0633, p-value=1.670675170007816e-26
CNT_DRAWINGS_AVG (numeric): corr=0.0605, p-value=2.1426621838296852e-24
AMT_CREDIT_LIMIT_ACTUAL_mean (numeric): corr=0.0593, p-value=1.7659018272310384e-23
AMT_BALANCE_mean (numeric): corr=0.0587, p-value=4.850305687980885e-23
AMT_TOTAL_RECEIVABLE_mean (numeric): corr=0.0580, p-value=1.651952247384159e-22
AMT_DRAWINGS_CURRENT_max (numeric): corr=0.0549, p-value=2.5905302155593493e-20
AMT_INST_MIN_REGU

  rpb, prob = pearsonr(x, y)
