In [1]:
# 기본
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob

# 경고 뜨지 않게 설정
import warnings
warnings.filterwarnings('ignore')

# 그래프 설정
sns.set()

# 그래프 기본 설정
plt.rcParams['font.family'] = 'Malgun Gothic'
# plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams['figure.figsize'] = 12, 6
plt.rcParams['font.size'] = 14
plt.rcParams['axes.unicode_minus'] = False

# 데이터 전처리 알고리즘
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# 학습용과 검증용으로 나누는 함수
from sklearn.model_selection import train_test_split

# 교차 검증
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# 평가함수
# 분류용
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# 회귀용
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# 모델의 최적의 하이퍼 파라미터를 찾기 위한 도구
from sklearn.model_selection import GridSearchCV

# 머신러닝 알고리즘 - 분류
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# 머신러닝 알고리즘 - 회귀
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor

# 차원 축소
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# 군집
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift
from sklearn.cluster import estimate_bandwidth

# 학습 모델 저장을 위한 라이브러리
import pickle

In [3]:
# 1. 모든 parquet 파일 경로 가져오기
file_list = glob.glob('open/train/1.회원정보/2018*_train_회원정보.parquet')

# 2. 데이터프레임 리스트 생성
df_list = [pd.read_parquet(f) for f in file_list]

# 3. 하나로 합치기
all_df = pd.concat(df_list, ignore_index=True)

# 4. 확인
print(all_df.shape)
all_df.head(10)

(2400000, 78)


Unnamed: 0,기준년월,ID,남녀구분코드,연령,Segment,회원여부_이용가능,회원여부_이용가능_CA,회원여부_이용가능_카드론,소지여부_신용,소지카드수_유효_신용,...,할인금액_제휴연회비_B0M,청구금액_기본연회비_B0M,청구금액_제휴연회비_B0M,상품관련면제카드수_B0M,임직원면제카드수_B0M,우수회원면제카드수_B0M,기타면제카드수_B0M,카드신청건수,Life_Stage,최종카드발급경과월
0,201807,TRAIN_000000,2,40대,D,1,1,0,1,1,...,0,0,0,0개,0개,0개,0개,0,자녀성장(2),22
1,201807,TRAIN_000001,1,30대,E,1,1,1,1,1,...,0,0,0,0개,0개,0개,0개,0,자녀성장(1),18
2,201807,TRAIN_000002,1,30대,C,1,1,0,1,1,...,0,0,0,0개,0개,0개,0개,0,자녀출산기,20
3,201807,TRAIN_000003,2,40대,D,1,1,0,1,2,...,0,0,0,0개,0개,0개,0개,1,자녀성장(2),17
4,201807,TRAIN_000004,2,40대,E,1,1,1,1,1,...,0,0,0,0개,0개,0개,0개,1,자녀성장(1),15
5,201807,TRAIN_000005,2,30대,E,1,1,0,1,1,...,0,0,0,0개,0개,0개,0개,0,자녀성장(1),51
6,201807,TRAIN_000006,1,20대,E,1,1,0,1,1,...,0,0,0,0개,0개,0개,0개,0,독신,6
7,201807,TRAIN_000007,2,60대,E,1,1,1,1,2,...,0,0,0,0개,0개,0개,0개,0,노년생활,28
8,201807,TRAIN_000008,1,30대,C,1,1,1,1,3,...,0,0,0,0개,0개,0개,0개,1,자녀출산기,5
9,201807,TRAIN_000009,1,20대,E,1,0,0,1,1,...,0,0,0,0개,0개,0개,0개,0,독신,10


In [5]:

from scipy.stats import chi2_contingency, f_oneway

def cramers_v(confusion_matrix):
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    r, k = confusion_matrix.shape
    return np.sqrt(chi2 / (n * (min(k, r) - 1)))

def eta_squared(anova_ss_between, total_ss):
    return anova_ss_between / total_ss if total_ss != 0 else np.nan

# 수치형/범주형 자동 구분
num_cols = all_df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = all_df.select_dtypes(exclude=[np.number]).columns.tolist()

results = []

for col in all_df.columns:
    if col == 'Segment':
        continue

    if col in num_cols:
        data = all_df[[col, 'Segment']].dropna()
        groups = [data[data['Segment'] == val][col] for val in data['Segment'].unique()]
        try:
            stat = f_oneway(*groups).statistic
            ss_between = sum([(g.mean() - data[col].mean())**2 * len(g) for g in groups])
            ss_total = sum((data[col] - data[col].mean())**2)
            eta2 = eta_squared(ss_between, ss_total)
            results.append({'변수': col, '유형': '수치형', '계수종류': 'Eta²', '상관계수': eta2})
        except:
            continue

    elif col in cat_cols:
        contingency = pd.crosstab(all_df[col], all_df['Segment'])
        if contingency.shape[0] > 1 and contingency.shape[1] > 1:
            try:
                v = cramers_v(contingency)
                results.append({'변수': col, '유형': '범주형', '계수종류': "Cramér's V", '상관계수': v})
            except:
                continue

# 결과 정리
result_df = pd.DataFrame(results)
result_df = result_df.sort_values(by='상관계수', ascending=False).reset_index(drop=True)

# 결과 출력
display(result_df)

Unnamed: 0,변수,유형,계수종류,상관계수
0,ID,범주형,Cramér's V,1.000000
1,이용금액_R3M_신용체크,수치형,Eta²,0.389813
2,이용금액_R3M_신용,수치형,Eta²,0.347924
3,_1순위카드이용금액,수치형,Eta²,0.329734
4,이용카드수_신용체크,수치형,Eta²,0.166022
...,...,...,...,...
68,이용카드수_체크_가족,수치형,Eta²,
69,이용금액_R3M_체크_가족,수치형,Eta²,
70,연회비할인카드수_B0M,수치형,Eta²,
71,할인금액_기본연회비_B0M,수치형,Eta²,


In [9]:
# 수치형/범주형 자동 구분
num_cols = all_df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = all_df.select_dtypes(exclude=[np.number]).columns.tolist()

anova = []
chi = []

for col in all_df.columns:
    if col == 'Segment':
        continue

    if col in num_cols:
        data = all_df[[col, 'Segment']].dropna()
        groups = [data[data['Segment'] == val][col] for val in data['Segment'].unique()]
        try:
            stat = f_oneway(*groups).statistic
            ss_between = sum([(g.mean() - data[col].mean())**2 * len(g) for g in groups])
            ss_total = sum((data[col] - data[col].mean())**2)
            eta2 = eta_squared(ss_between, ss_total)
            anova.append({'변수': col, '유형': '수치형', '계수종류': 'Eta²', '상관계수': eta2})
        except:
            continue

    elif col in cat_cols:
        contingency = pd.crosstab(all_df[col], all_df['Segment'])
        if contingency.shape[0] > 1 and contingency.shape[1] > 1:
            try:
                v = cramers_v(contingency)
                chi.append({'변수': col, '유형': '범주형', '계수종류': "Cramér's V", '상관계수': v})
            except:
                continue

# 결과 정리
result_df1 = pd.DataFrame(anova)
result_df2 = pd.DataFrame(chi)
result_df1 = result_df1.sort_values(by='상관계수', ascending=False).reset_index(drop=True)
result_df2 = result_df2.sort_values(by='상관계수', ascending=False).reset_index(drop=True)

# 결과 출력
display(result_df1)
display(result_df2)

Unnamed: 0,변수,유형,계수종류,상관계수
0,이용금액_R3M_신용체크,수치형,Eta²,0.389813
1,이용금액_R3M_신용,수치형,Eta²,0.347924
2,_1순위카드이용금액,수치형,Eta²,0.329734
3,이용카드수_신용체크,수치형,Eta²,0.166022
4,_2순위카드이용금액,수치형,Eta²,0.165938
...,...,...,...,...
59,이용카드수_체크_가족,수치형,Eta²,
60,이용금액_R3M_체크_가족,수치형,Eta²,
61,연회비할인카드수_B0M,수치형,Eta²,
62,할인금액_기본연회비_B0M,수치형,Eta²,


Unnamed: 0,변수,유형,계수종류,상관계수
0,ID,범주형,Cramér's V,1.0
1,_2순위신용체크구분,범주형,Cramér's V,0.154185
2,가입통신회사코드,범주형,Cramér's V,0.053978
3,Life_Stage,범주형,Cramér's V,0.049693
4,_1순위신용체크구분,범주형,Cramér's V,0.046067
5,직장시도명,범주형,Cramér's V,0.045916
6,연령,범주형,Cramér's V,0.04583
7,거주시도명,범주형,Cramér's V,0.034416
8,연회비발생카드수_B0M,범주형,Cramér's V,0.008985


### 범주형 컬럼 지정한 경우 

In [11]:
# 분석용 패키지 임포트
import pandas as pd
import numpy as np
from scipy.stats import f_oneway
import seaborn as sns
import matplotlib.pyplot as plt

# 범주형 상관계수용 함수 (Cramér's V)
from scipy.stats import chi2_contingency

def cramers_v(confusion_matrix):
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))

# 수치형 중 범주형으로 처리할 컬럼 지정
manual_cat_cols = [
    '성별', '동의여부', '남녀구분코드', '연령', 'Life_Stage',
    '가입통신회사코드', '거주시도명', '직장시도명',
    '회원여부_이용가능', '회원여부_이용가능_CA', '회원여부_이용가능_카드론',
    '회원여부_연체', '탈퇴횟수_누적', '탈회회수_발급6개월이내', '탈회횟수_발급1년이내',
    '소지여부_신용', '소지카드수_유효_신용', '소지카드수_이용가능_신용',
    '보유여부_해외겸용_본인', '이용가능여부_해외겸용_본인', '이용여부_3M_해외겸용_보인',
    '연회비발생카드수_BOM', '_1순위신용체크구분', '_2순위신용체크구분',
    '이용거절여부_카드론', '동의여부_한도증액안내',
    '수신거부여부_TM', '수신거부여부_DM', '수신거부여부_메일', '수신거부여부_SMS',
    '마케팅동의여부'
]

# <- 여기 본인 데이터에 맞게 컬럼명 입력

# 수치형, 범주형 컬럼 자동 구분
num_cols = all_df.select_dtypes(include=[np.number]).columns.tolist()
num_cols = [col for col in num_cols if col not in manual_cat_cols]

cat_cols = all_df.select_dtypes(exclude=[np.number]).columns.tolist()
cat_cols += manual_cat_cols

# 결과 리스트
anova = []
chi = []

# 컬럼 반복
for col in all_df.columns:
    if col == 'Segment':
        continue

    if col in num_cols:
        data = all_df[[col, 'Segment']].dropna()
        groups = [data[data['Segment'] == val][col] for val in data['Segment'].unique()]
        try:
            stat = f_oneway(*groups).statistic
            ss_between = sum([(g.mean() - data[col].mean())**2 * len(g) for g in groups])
            ss_total = sum((data[col] - data[col].mean())**2)
            eta2 = ss_between / ss_total
            anova.append({'변수': col, '유형': '수치형', '계수종류': 'Eta²', '상관계수': eta2})
        except:
            continue

    elif col in cat_cols:
        contingency = pd.crosstab(all_df[col], all_df['Segment'])
        if contingency.shape[0] > 1 and contingency.shape[1] > 1:
            try:
                v = cramers_v(contingency)
                chi.append({'변수': col, '유형': '범주형', '계수종류': "Cramér's V", '상관계수': v})
            except:
                continue

# 결과 데이터프레임
result_df1 = pd.DataFrame(anova)
result_df2 = pd.DataFrame(chi)

# 정렬
result_df1 = result_df1.sort_values(by='상관계수', ascending=False).reset_index(drop=True)
result_df2 = result_df2.sort_values(by='상관계수', ascending=False).reset_index(drop=True)

# 출력
display(result_df1)
display(result_df2)

Unnamed: 0,변수,유형,계수종류,상관계수
0,이용금액_R3M_신용체크,수치형,Eta²,0.389813
1,이용금액_R3M_신용,수치형,Eta²,0.347924
2,_1순위카드이용금액,수치형,Eta²,0.329734
3,이용카드수_신용체크,수치형,Eta²,0.166022
4,_2순위카드이용금액,수치형,Eta²,0.165938
5,_1순위카드이용건수,수치형,Eta²,0.154889
6,_2순위카드이용건수,수치형,Eta²,0.151003
7,이용카드수_신용,수치형,Eta²,0.147888
8,이용가능카드수_신용체크,수치형,Eta²,0.129013
9,이용가능카드수_신용,수치형,Eta²,0.127935


Unnamed: 0,변수,유형,계수종류,상관계수
0,ID,범주형,Cramér's V,0.912872
1,소지카드수_이용가능_신용,범주형,Cramér's V,0.200274
2,소지카드수_유효_신용,범주형,Cramér's V,0.173719
3,이용가능여부_해외겸용_본인,범주형,Cramér's V,0.164515
4,_2순위신용체크구분,범주형,Cramér's V,0.154176
5,보유여부_해외겸용_본인,범주형,Cramér's V,0.14787
6,수신거부여부_TM,범주형,Cramér's V,0.110451
7,수신거부여부_메일,범주형,Cramér's V,0.110134
8,수신거부여부_DM,범주형,Cramér's V,0.108477
9,수신거부여부_SMS,범주형,Cramér's V,0.096991


In [13]:
# 엑셀로 저장
result_df1.to_excel("open/1.회원정보_Anova_결과.xlsx", index=False)
result_df2.to_excel("open/1.회원정보_카이제곱_결과.xlsx", index=False)


In [37]:
# Label Encoding
le = LabelEncoder()
all_df['Segment_encoded'] = le.fit_transform(all_df['Segment'].astype(str))

# 수치형 변수 + Segment_encoded만 선택
num_cols_with_seg = num_cols + ['Segment_encoded']

# 상관계수 계산
corr_df = all_df[num_cols_with_seg].corr()

# Segment_encoded 기준 상관계수 추출
segment_corr = corr_df[['Segment_encoded']].drop(index='Segment_encoded').reset_index()
segment_corr.columns = ['변수', '상관계수']

# 정렬: 절대값 기준으로만 정렬 (값은 원래 그대로)
segment_corr = segment_corr.reindex(segment_corr['상관계수'].abs().sort_values(ascending=False).index).reset_index(drop=True)

display(segment_corr.style.format({'상관계수': "{:.3f}"}))


Unnamed: 0,변수,상관계수
0,이용금액_R3M_신용체크,-0.623
1,이용금액_R3M_신용,-0.589
2,_1순위카드이용금액,-0.574
3,_2순위카드이용금액,-0.4
4,이용카드수_신용체크,-0.398
5,_1순위카드이용건수,-0.384
6,이용카드수_신용,-0.377
7,_2순위카드이용건수,-0.376
8,이용가능카드수_신용,-0.353
9,이용가능카드수_신용체크,-0.352


### Anova vs Pearson 비교 
- 일단, 수치형 컬럼만 비교 진행했음.

In [41]:
# ===== Pearson 준비 =====
pearson_df = segment_corr[['변수', '상관계수']].copy()
pearson_df.columns = ['변수', 'Pearson']
pearson_df['Pearson_rank'] = pearson_df['Pearson'].abs().rank(ascending=False, method='min')
pearson_df['Pearson_rank'] = pearson_df['Pearson_rank'].fillna(0).astype(int)

# ===== ANOVA 준비 =====
anova_df = result_df1[['변수', '상관계수']].copy()
anova_df.columns = ['변수', 'Eta2']
anova_df['Eta2_rank'] = anova_df['Eta2'].rank(ascending=False, method='min')
anova_df['Eta2_rank'] = anova_df['Eta2_rank'].fillna(0).astype(int)

# ===== 병합 =====
merged_df = pd.merge(anova_df, pearson_df, on='변수', how='outer')

# ===== 순위 차이 =====
merged_df['Rank_diff'] = (merged_df['Eta2_rank'] - merged_df['Pearson_rank']).abs()

# ===== NaN 제거 & 정렬 =====
merged_df_clean = merged_df.dropna(subset=['Eta2', 'Pearson'])
merged_df_clean = merged_df_clean.sort_values(by='Eta2', ascending=False, na_position='last').reset_index(drop=True)

# ===== 출력 =====
display(merged_df_clean.style.format({'Eta2': "{:.3f}", 'Pearson': "{:.3f}"}))


Unnamed: 0,변수,Eta2,Eta2_rank,Pearson,Pearson_rank,Rank_diff
0,이용금액_R3M_신용체크,0.39,1,-0.623,1,0
1,이용금액_R3M_신용,0.348,2,-0.589,2,0
2,_1순위카드이용금액,0.33,3,-0.574,3,0
3,이용카드수_신용체크,0.166,4,-0.398,5,1
4,_2순위카드이용금액,0.166,5,-0.4,4,1
5,_1순위카드이용건수,0.155,6,-0.384,6,0
6,_2순위카드이용건수,0.151,7,-0.376,8,1
7,이용카드수_신용,0.148,8,-0.377,7,1
8,이용가능카드수_신용체크,0.129,9,-0.352,10,1
9,이용가능카드수_신용,0.128,10,-0.353,9,1
