In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

import warnings 
warnings.filterwarnings('ignore')

sns.set() 

plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rcParams['figure.figsize'] = 12, 6 
plt.rcParams['font.size'] = 14 
plt.rcParams['axes.unicode_minus'] = False 

# 결측치 시각화를 위한 라이브러리
import missingno 

# 데이터 전처리 알고리즘 
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import StandardScaler 

# 학습용과 검증용으로 나누는  함수 
from sklearn.model_selection import train_test_split 

# 교차검증 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# 평가함수
# 분류용
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# 회귀용
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# 모델의 최적의 하이퍼 파라미터를 찾기 위한도구 
from sklearn.model_selection import GridSearchCV 

# 머신러닝 알고리즘 - 분류 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import AdaBoostClassifier 
from sklearn.ensemble import GradientBoostingClassifier 
from lightgbm import LGBMClassifier 
from xgboost import XGBClassifier 
from sklearn.ensemble import VotingClassifier

# 머신러닝 알고리즘 - 회귀 
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import Ridge 
from sklearn.linear_model import Lasso 
from sklearn.linear_model import ElasticNet 
from sklearn.svm import SVR 
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor 
from sklearn.ensemble import GradientBoostingRegressor 
from lightgbm import LGBMRegressor 
from xgboost import XGBRegressor 
from sklearn.ensemble import VotingRegressor 

# 학습 모델 저장을 위한 라이브러리
import pickle 


In [2]:
train_df  = pd.read_parquet(r'C:\Users\SAMSUNG\OneDrive\바탕 화면\파이널 프로젝트\train\6.채널정보\201807_train_채널정보.parquet')
sg_df = pd.read_csv(r'C:\Users\SAMSUNG\OneDrive\바탕 화면\파이널 프로젝트\july_segment.csv', encoding='utf-8-sig')


In [16]:
df = train_df.merge(sg_df, on='ID', how='left')

In [18]:
df

Unnamed: 0,기준년월,ID,인입횟수_ARS_R6M,이용메뉴건수_ARS_R6M,인입일수_ARS_R6M,인입월수_ARS_R6M,인입후경과월_ARS,인입횟수_ARS_B0M,이용메뉴건수_ARS_B0M,인입일수_ARS_B0M,...,당사PAY_방문월수_R6M,당사멤버쉽_방문횟수_B0M,당사멤버쉽_방문횟수_R6M,당사멤버쉽_방문월수_R6M,OS구분코드,홈페이지_금융건수_R6M,홈페이지_선결제건수_R6M,홈페이지_금융건수_R3M,홈페이지_선결제건수_R3M,Segment
0,201807,TRAIN_000000,10회 이상,10회 이상,8,6,0,2,6,2,...,0,22,221,6,Android,0,0,0,0,D
1,201807,TRAIN_000001,1회 이상,1회 이상,0,0,0,0,0,0,...,0,0,0,0,,0,0,0,0,E
2,201807,TRAIN_000002,1회 이상,1회 이상,1,1,0,2,5,1,...,0,0,0,0,Android,11,6,5,5,C
3,201807,TRAIN_000003,10회 이상,10회 이상,10,6,0,2,6,2,...,0,23,219,6,Android,0,0,0,0,D
4,201807,TRAIN_000004,1회 이상,1회 이상,0,0,0,0,0,0,...,0,0,0,0,Android,0,0,0,0,E
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399995,201807,TRAIN_399995,1회 이상,1회 이상,0,0,0,0,0,0,...,0,0,0,0,,0,0,0,0,E
399996,201807,TRAIN_399996,1회 이상,1회 이상,1,1,2,0,0,0,...,0,0,8,1,,0,0,0,0,D
399997,201807,TRAIN_399997,1회 이상,1회 이상,0,0,0,0,0,0,...,0,0,0,0,,0,0,0,0,C
399998,201807,TRAIN_399998,1회 이상,1회 이상,0,0,0,0,0,0,...,0,0,0,0,,0,0,0,0,E


In [19]:
drop_cols = [
    '기준년월', 'ID', 'Segment',
    '인입횟수_ARS_R6M', '이용메뉴건수_ARS_R6M',
    '방문횟수_PC_R6M', '방문일수_PC_R6M',
    '방문횟수_앱_R6M', 'OS구분코드'
]

num_df = df.drop(columns=drop_cols)

In [None]:
# 숫자열 데이터들의 ANOVA 검정 

from scipy.stats import f_oneway

results = []

for col in num_df.columns:
    groups = [df[df['Segment'] == seg][col].dropna() for seg in df['Segment'].unique()]
    if len(groups) >= 2:
        f_val, p_val = f_oneway(*groups)
        results.append({'변수명': col, 'F값': f_val, 'p값': p_val})

anova_df = pd.DataFrame(results).sort_values(by='p값')

print(anova_df.head())


               변수명           F값   p값
0     인입일수_ARS_R6M  3349.570956  0.0
24     인입월수_IB_R6M  2809.754931  0.0
25   이용메뉴건수_IB_R6M  2493.827199  0.0
26   인입후경과월_IB_R6M  2257.433582  0.0
27     인입횟수_IB_B0M   839.211717  0.0
..             ...          ...  ...
84      불만제기건수_B0M          NaN  NaN
85     불만제기건수_R12M          NaN  NaN
87  당사PAY_방문횟수_B0M          NaN  NaN
88  당사PAY_방문횟수_R6M          NaN  NaN
89  당사PAY_방문월수_R6M          NaN  NaN

[97 rows x 3 columns]


In [None]:
# 분산이 1인 값들은 모두 제외하기 

anova_df_clean = anova_df.dropna()
anova_df_clean

Unnamed: 0,변수명,F값,p값
0,인입일수_ARS_R6M,3349.570956,0.000000
24,인입월수_IB_R6M,2809.754931,0.000000
25,이용메뉴건수_IB_R6M,2493.827199,0.000000
26,인입후경과월_IB_R6M,2257.433582,0.000000
27,인입횟수_IB_B0M,839.211717,0.000000
...,...,...,...
69,IB문의건수_비밀번호_R6M,4.271000,0.001862
20,방문일수_모바일웹_B0M,2.640691,0.031945
19,방문횟수_모바일웹_B0M,2.453937,0.043650
72,IB문의건수_부대서비스_R6M,1.056871,0.376095


In [28]:
# 문자열 데이터를 크레이머스V로 검정해보기

import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

def cramers_v(x, y):
    confusion_matrix = pd.crosstab(x, y)
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    r, k = confusion_matrix.shape
    return np.sqrt(chi2 / (n * (min(k-1, r-1))))


cat_cols = [col for col in df.columns if df[col].dtype == 'object' and col != 'Segment']
results = []

for col in cat_cols:
    try:
        v = cramers_v(df['Segment'], df[col])
        results.append({'변수명': col, 'Cramers_V': v})
    except:
        continue

cramers_df = pd.DataFrame(results).sort_values(by='Cramers_V', ascending=False)
print(cramers_df)




              변수명  Cramers_V
0              ID   1.000000
1    인입횟수_ARS_R6M   0.110708
2  이용메뉴건수_ARS_R6M   0.109692
4     방문일수_PC_R6M   0.080878
5      방문횟수_앱_R6M   0.073706
3     방문횟수_PC_R6M   0.073537
6          OS구분코드   0.032469


In [34]:
# DataFrame 형식으로 sort_value를 활용하여 나타내기 

cramers_df = pd.DataFrame(results).sort_values(by='Cramers_V', ascending=False)
cramers_df

Unnamed: 0,변수명,Cramers_V
0,ID,1.0
1,인입횟수_ARS_R6M,0.110708
2,이용메뉴건수_ARS_R6M,0.109692
4,방문일수_PC_R6M,0.080878
5,방문횟수_앱_R6M,0.073706
3,방문횟수_PC_R6M,0.073537
6,OS구분코드,0.032469


In [35]:
# ID 드롭시키기 

cramers_df = cramers_df[cramers_df['변수명'] != 'ID']
cramers_df

Unnamed: 0,변수명,Cramers_V
1,인입횟수_ARS_R6M,0.110708
2,이용메뉴건수_ARS_R6M,0.109692
4,방문일수_PC_R6M,0.080878
5,방문횟수_앱_R6M,0.073706
3,방문횟수_PC_R6M,0.073537
6,OS구분코드,0.032469


In [45]:
# ANOVA 결과에 검정 타입 추가
anova_df_clean['검정타입'] = 'ANOVA'
anova_df_clean['Cramers_V'] = None  # Cramers_V 컬럼만 미리 추가

# Cramér's V 결과에 검정 타입 추가
cramers_df['검정타입'] = 'CramersV'
cramers_df['F값'] = None  # F값 컬럼 추가
cramers_df['p값'] = None  # p값 컬럼 추가

# 컬럼 순서 맞추기
anova_df_clean = anova_df_clean[['변수명', 'F값', 'p값', 'Cramers_V', '검정타입']]
cramers_df = cramers_df[['변수명', 'F값', 'p값', 'Cramers_V', '검정타입']]

# concat으로 합치기
merged_df = pd.concat([anova_df_clean, cramers_df], ignore_index=True)

# 정렬하면 보기 더 좋음 (선택사항)
merged_df = merged_df.sort_values(by='검정타입').reset_index(drop=True)

# 결과 보기
print(merged_df)

               변수명           F값   p값  Cramers_V      검정타입
0     인입일수_ARS_R6M  3349.570956  0.0        NaN     ANOVA
1      방문횟수_PC_B0M  2041.023620  0.0        NaN     ANOVA
2      방문일수_PC_B0M  2100.404422  0.0        NaN     ANOVA
3    방문월수_모바일웹_R6M   599.606566  0.0        NaN     ANOVA
4      방문월수_PC_R6M  2788.634090  0.0        NaN     ANOVA
..             ...          ...  ...        ...       ...
62    인입횟수_ARS_R6M          NaN  NaN   0.110708  CramersV
63  이용메뉴건수_ARS_R6M          NaN  NaN   0.109692  CramersV
64     방문일수_PC_R6M          NaN  NaN   0.080878  CramersV
65      방문횟수_앱_R6M          NaN  NaN   0.073706  CramersV
66          OS구분코드          NaN  NaN   0.032469  CramersV

[67 rows x 5 columns]


In [46]:
merged_df.to_csv('C:/Users/SAMSUNG/OneDrive/바탕 화면/6.채널정보(결측치 확인/anova.csv', index=False, encoding='utf-8-sig')
