In [1]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier


In [2]:
# Load the dataset
df = pd.read_csv('Data/gym_revised.csv')

# Target is 'Churn'; the features are every column except 'Churn', 'Phone' and 'Unnamed: 0'
y = df['Churn']
X = df.drop(columns=['Churn', 'Phone', 'Unnamed: 0'])

# Stratified 75 / 25 train / validation split with a fixed seed
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.25,
    stratify=y,
    random_state=0,
)

# Class ratios (churn N / Y) for the full data, the training part and the validation part
for labels in (y, y_train, y_valid):
    counts = np.unique(labels, return_counts=True)[1]
    print(counts / labels.size)

[0.73475 0.26525]
[0.73466667 0.26533333]
[0.735 0.265]


In [3]:
# Train the baseline models (Gradient Boosting, RandomForest, KNN, XGBoost);
# apart from the fixed seed every estimator uses its default settings.

models = [
    ("GBM", GradientBoostingClassifier(random_state=0)),
    ("RF", RandomForestClassifier(random_state=0)),
    ("KNN", KNeighborsClassifier()),
    ("XGB", XGBClassifier(random_state=0)),
]

# Metrics computed from hard class predictions, in the row order of the summary table
label_metrics = (accuracy_score, precision_score, recall_score, f1_score)

baseline_result_valid = {}

for name, model in models:
    model.fit(X_train, y_train)

    pred_valid = model.predict(X_valid)
    # ROC AUC is computed from the probability column at index 1
    pred_proba_valid = model.predict_proba(X_valid)[:, 1]

    scores = [metric(y_valid, pred_valid) for metric in label_metrics]
    scores.append(roc_auc_score(y_valid, pred_proba_valid))

    baseline_result_valid[name] = scores

# One column per model, one row per metric (rendered as the cell output)
pd.DataFrame(baseline_result_valid, index=['accuracy', 'precision', 'recall', 'f1', 'roc-auc'])

Unnamed: 0,GBM,RF,KNN,XGB
accuracy,0.935,0.923,0.864,0.938
precision,0.920168,0.88843,0.747126,0.914286
recall,0.826415,0.811321,0.735849,0.845283
f1,0.870775,0.848126,0.741445,0.878431
roc-auc,0.978295,0.970838,0.882647,0.979828


In [4]:
# Model tuning / training helper (Grid Search or Randomized Search, chosen by the size of the parameter space)
def cv(t, model, params):
    """Tune ``model`` with a cross-validated hyper-parameter search and report ROC AUC.

    The search is fitted on the notebook-level ``X_train`` / ``y_train``; the
    winning estimator is then scored on both the training data and the
    notebook-level ``X_valid`` / ``y_valid``. Best parameters, best estimator,
    best cross-validation score and both ROC AUC values are printed.
    The function does not persist the model; callers save the returned
    estimator themselves.

    Parameters
    ----------
    t : str
        Search strategy: ``'grid'`` builds a ``GridSearchCV``, ``'rand'`` builds
        a ``RandomizedSearchCV`` with ``n_iter=60`` and ``random_state=0``.
        Both use 4-fold CV, ``scoring='roc_auc'`` and ``n_jobs=-1``.
    model : estimator
        Estimator to tune; it must provide ``predict_proba``.
    params : dict
        Parameter grid / candidate values handed to the search object.

    Returns
    -------
    estimator
        ``best_estimator_`` of the fitted search.

    Raises
    ------
    ValueError
        If ``t`` is neither ``'grid'`` nor ``'rand'``.
    """
    # Reject unknown strategies up front; previously this fell through both
    # branches and crashed later with an UnboundLocalError.
    if t not in ('grid', 'rand'):
        raise ValueError(f"t must be 'grid' or 'rand', got {t!r}")

    # The search object is named `search` so it does not shadow this function's name.
    if t == 'grid':
        search = GridSearchCV(
            estimator=model,
            param_grid=params,
            scoring='roc_auc',
            cv=4,
            n_jobs=-1
        )
    else:
        search = RandomizedSearchCV(
            model, params,
            cv=4,
            scoring='roc_auc',
            n_jobs=-1,
            n_iter=60,
            random_state=0
        )

    search.fit(X_train, y_train)
    print(search.best_params_)
    print(search.best_estimator_)
    print(search.best_score_)

    best_model = search.best_estimator_

    # Only probabilities are needed for ROC AUC, so no hard-label predictions are computed.
    pred_train_proba = best_model.predict_proba(X_train)
    pred_valid_proba = best_model.predict_proba(X_valid)

    # Column index 1 of predict_proba is used as the score for ROC AUC.
    train_score = roc_auc_score(y_train, pred_train_proba[:, 1])
    valid_score = roc_auc_score(y_valid, pred_valid_proba[:, 1])

    print('Train Score:', train_score)
    print('Valid Score:', valid_score)
    return best_model


In [7]:
# RandomForest Grid Search
model_rf = RandomForestClassifier(random_state=0)

# Exhaustive grid: 7 values of n_estimators x 6 values of max_depth = 42 candidates
params_rf = {
    'n_estimators': [100, 200, 300, 400, 500, 600, 700],
    'max_depth': [2, 3, 4, 5, 6, 7],
}

# Make sure the output folder exists *before* the search runs, so a missing
# directory cannot make joblib.dump fail once the tuning has finished.
os.makedirs('models', exist_ok=True)

best_rf = cv('grid', model_rf, params_rf)
# Persist the tuned estimator; the returned list of written file names is the cell output
joblib.dump(best_rf, 'models/best_rf.pkl')


{'max_depth': 7, 'n_estimators': 500}
RandomForestClassifier(max_depth=7, n_estimators=500, random_state=0)
0.9682737644666162
Train Score: 0.9878042663407783
Valid Score: 0.9683532280836863


['models/best_rf.pkl']

In [6]:
# Gradient Boosting Randomized Search

model_gb = GradientBoostingClassifier(random_state=0)

# 4 x 5 x 5 x 6 = 600 possible combinations, hence the 'rand' strategy
# instead of an exhaustive grid.
params_gb = {
    'learning_rate': [0.001, 0.01, 0.1, 0.2],
    'n_estimators': [1000, 2000, 3000, 4000, 5000],
    'max_depth': range(1, 6),
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1]
}

# Make sure the output folder exists *before* the search runs, so a missing
# directory cannot make joblib.dump fail once the tuning has finished.
os.makedirs('models', exist_ok=True)

best_gb = cv('rand', model_gb, params_gb)
# Persist the tuned estimator; the returned list of written file names is the cell output
joblib.dump(best_gb, 'models/best_gb.pkl')


{'subsample': 0.7, 'n_estimators': 4000, 'max_depth': 2, 'learning_rate': 0.01}
GradientBoostingClassifier(learning_rate=0.01, max_depth=2, n_estimators=4000,
                           random_state=0, subsample=0.7)
0.9829569809118187
Train Score: 0.9965190061012867
Valid Score: 0.9823796688486715


['models/best_gb.pkl']

In [8]:
# XGBoost Randomized Search

model_xgb = XGBClassifier(random_state=0)

# 4 x 5 x 5 x 6 = 600 possible combinations, hence the 'rand' strategy
# instead of an exhaustive grid.
params_xgb = {
    'learning_rate': [0.001, 0.01, 0.1, 0.2],
    'n_estimators': [1000, 2000, 3000, 4000, 5000],
    'max_depth': range(1, 6),
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1]
}

# Make sure the output folder exists *before* the search runs, so a missing
# directory cannot make joblib.dump fail once the tuning has finished.
os.makedirs('models', exist_ok=True)

best_xgb = cv('rand', model_xgb, params_xgb)
# Persist the tuned estimator; the returned list of written file names is the cell output
joblib.dump(best_xgb, 'models/best_xgb.pkl')


{'subsample': 0.7, 'n_estimators': 4000, 'max_depth': 2, 'learning_rate': 0.01}
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.01, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=2, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=4000, n_jobs=None,
              num_parallel_tree=None, random_state=0, ...)
0.9825260604291877
Train Score: 0.9951105345237986
Valid Score: 0.98182518290335


['models/best_xgb.pkl']