### 라이브러리 임포트

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings(action='ignore')

### 데이터 불러오기

In [2]:
data = pd.read_csv("../Module03_머신러닝/membership.csv")
print(data.shape)
data.head()

(8807, 15)


Unnamed: 0,id,MEMBERSHIP_TERM_YEARS,ANNUAL_FEES,MEMBER_MARITAL_STATUS,MEMBER_GENDER,MEMBER_ANNUAL_INCOME,MEMBER_OCCUPATION_CD,MEMBERSHIP_PACKAGE,MEMBER_AGE_AT_ISSUE,ADDITIONAL_MEMBERS,PAYMENT_MODE,AGENT_CODE,MEMBERSHIP_STATUS,START_DATE,END_DATE
0,0,22,100000.0,M,M,450000.0,1.0,TYPE-B,27,1,ANNUAL,666099,0,20091029,
1,1,88,149029.58,,M,,6.0,TYPE-A,75,0,ANNUAL,132000,0,20131014,
2,2,93,102999.94,,M,600000.0,1.0,TYPE-A,49,0,ANNUAL,1407177,0,20130531,
3,3,17,147576.48,M,F,450000.0,1.0,TYPE-A,47,3,MONTHLY,343344,0,20130331,
4,4,39,100000.0,M,M,600000.0,1.0,TYPE-B,36,3,ANNUAL,119199,0,20080313,


In [3]:
data.columns = ['id', 'term', 'fee', 'married', 'gender', 'income', 'job',
'package', 'age', 'family', 'payment', 'agent', 'target', 'start_date', 'end_date']

### 결측값 제거

In [4]:
# end_date 열 삭제
data = data.drop(['end_date'], axis=1)

In [5]:
# 결측치가 있는 행 제거
data.dropna(how='any')

Unnamed: 0,id,term,fee,married,gender,income,job,package,age,family,payment,agent,target,start_date
0,0,22,100000.00,M,M,450000.0,1.0,TYPE-B,27,1,ANNUAL,666099,0,20091029
3,3,17,147576.48,M,F,450000.0,1.0,TYPE-A,47,3,MONTHLY,343344,0,20130331
4,4,39,100000.00,M,M,600000.0,1.0,TYPE-B,36,3,ANNUAL,119199,0,20080313
6,6,17,103016.16,S,M,999996.0,2.0,TYPE-A,35,0,MONTHLY,348099,0,20130930
9,9,12,300002.40,M,M,349992.0,1.0,TYPE-B,46,2,MONTHLY,1360477,1,20121130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8801,8801,12,100000.00,M,M,300000.0,2.0,TYPE-B,37,2,ANNUAL,1296622,1,20120806
8803,8803,49,100000.00,M,M,399996.0,1.0,TYPE-B,46,1,ANNUAL,841477,0,20100208
8804,8804,54,600001.16,M,M,2250000.0,2.0,TYPE-A,51,3,ANNUAL,1347855,0,20130331
8805,8805,27,100000.00,M,M,600000.0,1.0,TYPE-B,27,2,ANNUAL,798833,0,20100127


### 평균값 대체

In [6]:
# 결측치를 직업별 평균값으로 대체
data['income'] = data['income'].fillna(data.groupby('job')['income'].transform('mean'))
data['income'].isnull().sum() # 직업 결측치가 있기 때문에 결측치 16개 발생

16

In [7]:
# 결측치를 평균값으로 대체
data['income'] = data['income'].fillna(data['income'].mean()) # 따라서 전체 평균값으로 대체
data['income'].isnull().sum()

0

In [8]:
data['income_log'] = np.log1p(data['income'])
data.head()
# start_date : 날짜로 인식할수 있게 to_datetime 으로 변환하거나 가입 후 지난 날짜를 계산한 변수로 변경

Unnamed: 0,id,term,fee,married,gender,income,job,package,age,family,payment,agent,target,start_date,income_log
0,0,22,100000.0,M,M,450000.0,1.0,TYPE-B,27,1,ANNUAL,666099,0,20091029,13.017005
1,1,88,149029.58,,M,544756.571429,6.0,TYPE-A,75,0,ANNUAL,132000,0,20131014,13.208096
2,2,93,102999.94,,M,600000.0,1.0,TYPE-A,49,0,ANNUAL,1407177,0,20130531,13.304687
3,3,17,147576.48,M,F,450000.0,1.0,TYPE-A,47,3,MONTHLY,343344,0,20130331,13.017005
4,4,39,100000.0,M,M,600000.0,1.0,TYPE-B,36,3,ANNUAL,119199,0,20080313,13.304687


### 최빈값 대체

In [9]:
data['gender'] = data['gender'].fillna(data['gender'].mode()[0])
data['married'] = data['married'].fillna(data['married'].mode()[0])

data.isnull().sum()

id             0
term           0
fee            0
married        0
gender         0
income         0
job           34
package        0
age            0
family         0
payment        0
agent          0
target         0
start_date     0
income_log     0
dtype: int64

In [10]:
data['isna_job'] = data['job'].isnull()
data['job'] = data['job'].fillna('etc')

# data = data.dropna(subset=['job'], how='any')
data.isnull().sum()

id            0
term          0
fee           0
married       0
gender        0
income        0
job           0
package       0
age           0
family        0
payment       0
agent         0
target        0
start_date    0
income_log    0
isna_job      0
dtype: int64

### Label Encoding

In [11]:
# 숫자형 변수
numeric_features = []
for col in data.columns:
    if data[col].dtype != 'object':
        numeric_features.append(col)
        
numeric_features        

['id',
 'term',
 'fee',
 'income',
 'age',
 'family',
 'target',
 'start_date',
 'income_log',
 'isna_job']

In [12]:
# 범주형 변수
categorical_features = []
for col in data.columns:
    if data[col].dtype == 'O':
        categorical_features.append(col)
        
categorical_features

['married', 'gender', 'job', 'package', 'payment', 'agent']

In [13]:
# 범주형 변수 레이블 인코딩
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for col in categorical_features:

    data[col] = data[col].astype(str).astype('category')
    data[col] = le.fit_transform(data[col])

data.head()

Unnamed: 0,id,term,fee,married,gender,income,job,package,age,family,payment,agent,target,start_date,income_log,isna_job
0,0,22,100000.0,1,1,450000.0,0,1,27,1,0,3055,0,20091029,13.017005,False
1,1,88,149029.58,1,1,544756.571429,5,0,75,0,0,1078,0,20131014,13.208096,False
2,2,93,102999.94,1,1,600000.0,0,0,49,0,0,1460,0,20130531,13.304687,False
3,3,17,147576.48,1,0,450000.0,0,0,47,3,1,2204,0,20130331,13.017005,False
4,4,39,100000.0,1,1,600000.0,0,1,36,3,0,494,0,20080313,13.304687,False


### Feature 선택

In [14]:
selected_features = ['term', 'fee', 'married', 'gender', 'job', 'package',
       'age', 'family', 'payment', 'income_log', 'isna_job']

X = data.loc[:, selected_features].copy()
y = data.loc[:, 'target'].copy()

print(X.shape, y.shape)

(8807, 11) (8807,)


### K-fold 교차검증

In [15]:
# 모든 데이터를 검증셋과 훈련셋으로 사용할 수 있다.
# Cross-Validation (교차검증)
# 각각의 K 번의 검증의 Score 들의 평균을 보되, 분산도 고려해보아야 한다. (분산은 무조건 작을수록 좋다.)
# 특수한 데이터에만 잘 적용되는게 아닌, 모델의 일반화된 성능을 체크할 수 있다.
# 매번 K-Fold 로 검증하기엔 시간적으로 부담이라 메인으로 쓰기엔 힘든 경우가 있을 것이다.
# Hold-Out 교차검증으로 어느 정도 찾아내고 그 후에 그 데이터를 K-Fold 교차검증하는게 좋다.
# K-Fold 교차검증에 바로 시간을 쓰는 것보다 어느 변수를 사용할 것인지에 고민을 하고 정교하게 만들때 사용하자.
# K 개의 예측값의 평균으로 최종 예측값을 내는 형태이다.

In [16]:
X.head()

Unnamed: 0,term,fee,married,gender,job,package,age,family,payment,income_log,isna_job
0,22,100000.0,1,1,0,1,27,1,0,13.017005,False
1,88,149029.58,1,1,5,0,75,0,0,13.208096,False
2,93,102999.94,1,1,0,0,49,0,0,13.304687,False
3,17,147576.48,1,0,0,0,47,3,1,13.017005,False
4,39,100000.0,1,1,0,1,36,3,0,13.304687,False


In [17]:
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
# y값의 분포를 고려해서 n등분할 때 StratifiedKFold
# 그렇지 않다면 KFold
import numpy as np
import xgboost as xgb

cv_scores = {
    'f1': [],
    'precision': [],
    'recall': []
}

skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True) 
# n등분된 값이 동일하도록 random_state (시드값) 고정. 단, 운영체제가 바뀌면 결과가 달라질 수도 있다.

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y), 1): 
# 만약에 split 을 회귀모델에 쓴다면 pd.cut 을 활용해서 연속적인 수치데이터를 구간으로 나눈다.
# enumerate 로 몇 번째 fold 인지 1부터 시작, skf.split 은 train_idx, valid_idx 반환한다.
    X_tr, X_val = X.iloc[train_idx], X.iloc[valid_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[valid_idx]
    
    cv_model = xgb.XGBClassifier(
        n_estimators=100, 
        max_depth=2, 
        subsample=0.7, 
        colsample_bytree=0.5, 
        reg_alpha=3, 
        reg_lambda=0.2,
        scale_pos_weight=2.5, 
        random_state=42)
    
    cv_model.fit(X_tr, y_tr)
    y_pred = cv_model.predict(X_val)
    
    # 평가 지표 계산 (n_split 개의 평가지표)
    cv_scores['f1'].append(f1_score(y_val, y_pred))
    cv_scores['precision'].append(precision_score(y_val, y_pred))
    cv_scores['recall'].append(recall_score(y_val, y_pred))
    
    print(f"Fold {fold} scores:")
    print(f"F1: {cv_scores['f1'][-1]:.4f}")
    print(f"Precision: {cv_scores['precision'][-1]:.4f}")
    print(f"Recall: {cv_scores['recall'][-1]:.4f}\n")

# 최종 평균 성능 출력 (위에서 계산한 n_split 개의 평가지표의 평균값)
for metric in cv_scores:
    scores = cv_scores[metric]
    print(f"Average {metric}: {np.mean(scores):.4f} (+/- {np.std(scores):.4f})")

Fold 1 scores:
F1: 0.5159
Precision: 0.3994
Recall: 0.7285

Fold 2 scores:
F1: 0.5358
Precision: 0.4108
Recall: 0.7701

Fold 3 scores:
F1: 0.5336
Precision: 0.4116
Recall: 0.7584

Fold 4 scores:
F1: 0.5214
Precision: 0.4061
Recall: 0.7285

Fold 5 scores:
F1: 0.5312
Precision: 0.4161
Recall: 0.7341

Average f1: 0.5276 (+/- 0.0076)
Average precision: 0.4088 (+/- 0.0057)
Average recall: 0.7439 (+/- 0.0171)


In [18]:
# 데이터프레임으로 변환 - 평균과 표준편차 포함하여 출력
cv_scores_df = pd.DataFrame(cv_scores).T
cv_scores_df['mean'] = cv_scores_df.mean(axis=1)
cv_scores_df['std'] = cv_scores_df.std(axis=1)
cv_scores_df

# 첫 번째 fold 성능이 상대적으로 떨어지기 때문에 여기서 검증셋 데이터를 볼 필요가 있다.
# recall 값이 중요한 경우 : 질병 예측 -> 실제로 질병이 있는데 못 맞추면 치명적이니까 recall 값이 중요하다.

Unnamed: 0,0,1,2,3,4,mean,std
f1,0.515915,0.535761,0.533597,0.521448,0.531165,0.527577,0.007611
precision,0.399384,0.410768,0.411585,0.406054,0.416136,0.408785,0.005686
recall,0.728464,0.770093,0.758427,0.728464,0.734082,0.743906,0.017147


In [19]:
# KFold 로 구현하기
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import KFold
# y값의 분포를 고려해서 등분할 때 StratifiedKFold
# 그렇지 않다면 KFold
import numpy as np

cv_scores = {
    'f1': [],
    'precision': [],
    'recall': []
}

skf = KFold(n_splits=5, shuffle=False) 
# n등분된 값이 동일하도록 random_state (시드값) 고정. 단, 운영체제가 바뀌면 결과가 달라질 수도 있다.

for fold, (train_idx, valid_idx) in enumerate(skf.split(X), 1):
# KFold 의 경우 StratifiedKFold 와 다르게 split 에서 y 값이 필수가 아니다.
# enumerate 로 몇 번째 fold 인지 1부터 시작, skf.split 은 train_idx, valid_idx 반환
    X_tr, X_val = X.iloc[train_idx], X.iloc[valid_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[valid_idx]
    
    cv_model = xgb.XGBClassifier(
        n_estimators=100, 
        max_depth=2, 
        subsample=0.7, 
        colsample_bytree=0.5, 
        reg_alpha=3, 
        reg_lambda=0.2,
        scale_pos_weight=2.5, 
        random_state=42)
    
    cv_model.fit(X_tr, y_tr)
    y_pred = cv_model.predict(X_val)
    
    # 평가 지표 계산 (n_split 개의 평가지표)
    cv_scores['f1'].append(f1_score(y_val, y_pred))
    cv_scores['precision'].append(precision_score(y_val, y_pred))
    cv_scores['recall'].append(recall_score(y_val, y_pred))
    
    print(f"Fold {fold} scores:")
    print(f"F1: {cv_scores['f1'][-1]:.4f}")
    print(f"Precision: {cv_scores['precision'][-1]:.4f}")
    print(f"Recall: {cv_scores['recall'][-1]:.4f}\n")

# 최종 평균 성능 출력 (위에서 계산한 n_split 개의 평가지표의 평균값)
for metric in cv_scores:
    scores = cv_scores[metric]
    print(f"Average {metric}: {np.mean(scores):.4f} (+/- {np.std(scores):.4f})")

Fold 1 scores:
F1: 0.5515
Precision: 0.4316
Recall: 0.7635

Fold 2 scores:
F1: 0.5088
Precision: 0.3939
Recall: 0.7184

Fold 3 scores:
F1: 0.5257
Precision: 0.4083
Recall: 0.7378

Fold 4 scores:
F1: 0.5185
Precision: 0.3910
Recall: 0.7691

Fold 5 scores:
F1: 0.5319
Precision: 0.4169
Recall: 0.7345

Average f1: 0.5273 (+/- 0.0143)
Average precision: 0.4084 (+/- 0.0150)
Average recall: 0.7447 (+/- 0.0189)


In [20]:
# 데이터프레임으로 변환 - 평균과 표준편차 포함하여 출력
cv_scores_df_kfold = pd.DataFrame(cv_scores).T
cv_scores_df_kfold['mean'] = cv_scores_df.mean(axis=1)
cv_scores_df_kfold['std'] = cv_scores_df.std(axis=1)
cv_scores_df_kfold

Unnamed: 0,0,1,2,3,4,mean,std
f1,0.551499,0.50882,0.525684,0.51847,0.531929,0.453296,0.196651
precision,0.431633,0.393908,0.40829,0.391045,0.416925,0.3512,0.152446
recall,0.763538,0.718391,0.737828,0.76908,0.734545,0.640084,0.275135
