In [1]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Purpose of this cell: load the gym-churn CSV, build a stratified 80/20
# split, standardize the features, oversample the training part with SMOTE,
# persist both parts as CSV, and expose X_train / y_train / X_test / y_test
# for the model cells that follow.

# 1. Load the raw data.
df = pd.read_csv('../original_data/gym_churn_us.csv')

# 2. Every column except 'Phone' is a feature; 'Churn' is the target.
X = df.drop(columns=['Phone', 'Churn'])
y = df['Churn']

# 3. Drop rows with missing feature values and keep the target aligned
#    with the surviving rows by index label.
X = X.dropna()
y = y.loc[X.index]

# 4. Stratified train/test split (20% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Standardize: the scaler is fitted on the training part only and then
#    applied unchanged to the test part.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 6. SMOTE oversampling, applied to the training part only.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# 7. Wrap the arrays back into DataFrames carrying the original feature names.
train_resampled = pd.DataFrame(X_train_resampled, columns=X.columns)
train_resampled['Churn'] = y_train_resampled.values

test_scaled = pd.DataFrame(X_test_scaled, columns=X.columns)
test_scaled['Churn'] = y_test.values

# 8. Persist both parts. The output directory is created first so that
#    to_csv does not fail on a fresh checkout where it does not exist yet.
output_dir = Path('splited_data')
output_dir.mkdir(parents=True, exist_ok=True)
train_resampled.to_csv(output_dir / 'train_resampled.csv', index=False)
test_scaled.to_csv(output_dir / 'test_scaled.csv', index=False)

# 9. Variables consumed by the model cells below. From here on X_train /
#    y_train refer to the scaled AND SMOTE-resampled training data, and
#    X_test / y_test to the scaled test data.
# NOTE(review): the later cells run cross-validation directly on this
# resampled X_train, so validation folds are drawn from oversampled data and
# CV scores will tend to look better than test scores. Resampling inside the
# CV loop (e.g. an imblearn Pipeline) would avoid that - TODO confirm and
# refactor together with the model cells.
X_train = train_resampled.drop('Churn', axis=1)
y_train = train_resampled['Churn']
X_test = test_scaled.drop('Churn', axis=1)
y_test = test_scaled['Churn']


# LogisticRegression

In [2]:
import pandas as pd
from numpy import logspace
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score, make_scorer
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score

# Purpose of this cell: fit a baseline LogisticRegression, report it on the
# test set, estimate its F1 with 5-fold CV, then tune it with GridSearchCV
# and report the tuned model on the test set.
# NOTE(review): X_train is the SMOTE-resampled frame built in the first cell,
# so the CV folds below are drawn from already-oversampled data; the recorded
# CV F1 (~0.93) is higher than the recorded class-1 test F1 (0.86).

# Baseline: default-configured logistic regression, evaluated on the test set.
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("👀 모델이 학습한 클래스 순서:", model.classes_)

print("\n📋 [LogisticRegression] Classification Report:\n", classification_report(y_test, y_pred))
print("✅ [LogisticRegression] Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print()

# 5-fold stratified CV scored by F1 (f1_score defaults, i.e. class 1 is the
# positive label).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_scorer = make_scorer(f1_score)

# Cross-validate the SAME baseline configuration that produced the test
# report above. Previously `model` was rebound to a solver='liblinear'
# estimator right here, so the CV score described a different model than the
# one that had just been evaluated.
f1_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring=f1_scorer)
print("📈 [LogisticRegression] 평균 F1 Score:", f1_scores.mean())
print()

# Hyper-parameter search. Both solvers listed here accept the 'l1' and 'l2'
# penalties, so every combination in the grid is valid.
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': logspace(-3, 3, 7),  # 0.001 ... 1000
    'solver': ['liblinear', 'saga'],
    'max_iter': [100, 200, 500]
}

grid = GridSearchCV(
    estimator=LogisticRegression(random_state=42),
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=cv,
    n_jobs=-1,
    verbose=1
)

grid.fit(X_train, y_train)

print("✅ [LogisticRegression] Best Parameters:", grid.best_params_)
print("🏆 [LogisticRegression] Best F1 Score from GridSearchCV:", grid.best_score_)
print()

# Evaluate the tuned model on the held-out test set.
best_model = grid.best_estimator_
y_pred_best = best_model.predict(X_test)

print("✅ [Best 모델] Confusion Matrix:\n", confusion_matrix(y_test, y_pred_best))
print("\n📋 [Best 모델] Classification Report:\n", classification_report(y_test, y_pred_best))


👀 모델이 학습한 클래스 순서: [0 1]

📋 [LogisticRegression] Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.94      0.95       588
           1       0.84      0.89      0.86       212

    accuracy                           0.93       800
   macro avg       0.90      0.92      0.91       800
weighted avg       0.93      0.93      0.93       800

✅ [LogisticRegression] Confusion Matrix:
 [[552  36]
 [ 23 189]]

📈 [LogisticRegression] 평균 F1 Score: 0.932191841022167

Fitting 5 folds for each of 84 candidates, totalling 420 fits
✅ [LogisticRegression] Best Parameters: {'C': np.float64(1.0), 'max_iter': 100, 'penalty': 'l2', 'solver': 'saga'}
🏆 [LogisticRegression] Best F1 Score from GridSearchCV: 0.9323875961342851

✅ [Best 모델] Confusion Matrix:
 [[552  36]
 [ 23 189]]

📋 [Best 모델] Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.94      0.95       588
           1       0.84    

| 질문                | 답변                                                        |
| ----------------- | --------------------------------------------------------- |
| 0 기준 학습? 1 기준 학습? | `1`이 **positive class**로 사용됨                              |
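| 내가 따로 설정 안 해도?    | 네. `f1_score` 등 이진 분류 지표의 기본값이 `pos_label=1`이므로 별도 설정 없이 **class=1**이 기준 |
| 어떤 지표들이 1 기준인가요?  | `make_scorer(f1_score)`로 계산한 CV/GridSearch 점수는 `class 1` 기준. `classification_report`는 클래스 0과 1 각각의 `precision`, `recall`, `f1-score`를 모두 보여줌 |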


# KNN

In [4]:
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, f1_score, make_scorer
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# Purpose of this cell: baseline KNN -> test report -> 5-fold CV F1 ->
# grid search -> test report of the tuned model.

# CV protocol and scorer used for both the baseline estimate and the search.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_scorer = make_scorer(f1_score)

# Baseline KNN with library defaults, evaluated on the test set.
knn_model = KNeighborsClassifier()
y_pred = knn_model.fit(X_train, y_train).predict(X_test)

print("👀 [KNN] Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n📋 [KNN] Classification Report:\n", classification_report(y_test, y_pred))

# 5-fold CV of the baseline, scored by F1.
cv_scores = cross_val_score(
    knn_model,
    X_train,
    y_train,
    scoring=f1_scorer,
    cv=cv,
)
print("📈 [KNN] 평균 F1 Score (CV):", cv_scores.mean())

# Search space: 6 neighbourhood sizes x 2 weightings x 3 distance metrics.
# (An earlier, smaller grid varied n_neighbors in [3, 5, 7, 9] and the
# Minkowski power p in [1, 2].)
param_grid = dict(
    n_neighbors=[3, 5, 7, 9, 11, 13],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan', 'chebyshev'],
)

grid = GridSearchCV(
    KNeighborsClassifier(),
    param_grid,
    scoring=f1_scorer,
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
# Recorded test result of the tuned model from a previous run:
# confusion matrix [[513, 75], [41, 171]], class-1 F1 0.75, accuracy 0.85.
grid.fit(X_train, y_train)

print("✅ [KNN] Best Parameters:", grid.best_params_)
print("🏆 [KNN] Best F1 Score from GridSearchCV:", grid.best_score_)

# Evaluate the tuned model on the held-out test set.
best_knn = grid.best_estimator_
y_pred_best = best_knn.predict(X_test)

print("\n✅ [Best KNN 모델] Confusion Matrix:\n", confusion_matrix(y_test, y_pred_best))
print("\n📋 [Best KNN 모델] Classification Report:\n", classification_report(y_test, y_pred_best))


👀 [KNN] Confusion Matrix:
 [[503  85]
 [ 35 177]]

📋 [KNN] Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.86      0.89       588
           1       0.68      0.83      0.75       212

    accuracy                           0.85       800
   macro avg       0.81      0.85      0.82       800
weighted avg       0.87      0.85      0.85       800

📈 [KNN] 평균 F1 Score (CV): 0.9010183851481506
Fitting 5 folds for each of 36 candidates, totalling 180 fits
✅ [KNN] Best Parameters: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
🏆 [KNN] Best F1 Score from GridSearchCV: 0.9223134431681856

✅ [Best KNN 모델] Confusion Matrix:
 [[513  75]
 [ 41 171]]

📋 [Best KNN 모델] Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.87      0.90       588
           1       0.70      0.81      0.75       212

    accuracy                           0.85       800
   macro avg   

# DecisionTree

In [5]:
from sklearn.tree import DecisionTreeClassifier

# Purpose of this cell: baseline decision tree -> test report -> 5-fold CV F1
# -> grid search. Relies on the metric / model_selection names imported by the
# earlier cells.

# CV protocol and scorer (same settings as the preceding cells).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_scorer = make_scorer(f1_score)

# Baseline tree, evaluated on the test set.
model = DecisionTreeClassifier(random_state=42)
y_pred = model.fit(X_train, y_train).predict(X_test)
print("[DecisionTree] Classification Report:")
print(classification_report(y_test, y_pred))

# 5-fold CV of the baseline, scored by F1.
cv_score = cross_val_score(
    model,
    X_train,
    y_train,
    scoring=f1_scorer,
    cv=cv,
)
print("DecisionTree CV 평균 F1 Score:", cv_score.mean())

# Search space. Wider than the earlier grid, which only varied max_depth in
# [3, 5, 7, None] and min_samples_split in [2, 5, 10].
param_grid = dict(
    max_depth=[3, 5, 7, 10, 15, None],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)

# Recorded result of a previous run: baseline CV F1 0.9098; best params
# {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 4,
#  'min_samples_split': 10} with CV F1 0.9232.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=cv,
    n_jobs=-1,
)
grid.fit(X_train, y_train)
print("Best DT Params:", grid.best_params_)
print("Best DT F1 Score:", grid.best_score_)


[DecisionTree] Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.92      0.93       588
           1       0.79      0.82      0.81       212

    accuracy                           0.90       800
   macro avg       0.86      0.87      0.87       800
weighted avg       0.90      0.90      0.90       800

DecisionTree CV 평균 F1 Score: 0.9097980178720965
Best DT Params: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Best DT F1 Score: 0.923219461011462


# RandomForest

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Purpose of this cell: baseline random forest -> test report -> 5-fold CV F1
# -> grid search. `cv` and `f1_scorer` are the objects defined by the earlier
# cells.

# Baseline forest, evaluated on the test set.
model = RandomForestClassifier(random_state=42)
y_pred = model.fit(X_train, y_train).predict(X_test)
print("[RandomForest] Classification Report:")
print(classification_report(y_test, y_pred))

# 5-fold CV of the baseline, scored by F1.
cv_score = cross_val_score(
    model,
    X_train,
    y_train,
    scoring=f1_scorer,
    cv=cv,
)
print("RandomForest CV 평균 F1 Score:", cv_score.mean())

# Search space. The earlier grid only varied n_estimators in [100, 200] and
# max_depth in [5, 10, None].
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
    bootstrap=[True, False],
)
# Recorded result of a previous run: baseline CV F1 0.9434; best params
# {'bootstrap': True, 'max_depth': 20, 'max_features': 'sqrt',
#  'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
# with CV F1 0.9470.

grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=cv,
    n_jobs=-1,
)
grid.fit(X_train, y_train)
print("Best RF Params:", grid.best_params_)
print("Best RF F1 Score:", grid.best_score_)


[RandomForest] Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.94      0.94       588
           1       0.84      0.84      0.84       212

    accuracy                           0.92       800
   macro avg       0.89      0.89      0.89       800
weighted avg       0.92      0.92      0.92       800

RandomForest CV 평균 F1 Score: 0.9433826232445307
Best RF Params: {'bootstrap': True, 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Best RF F1 Score: 0.9469541977071406


# XGBoost

In [8]:
from xgboost import XGBClassifier

# Purpose of this cell: baseline XGBoost classifier -> test report -> 5-fold
# CV F1 -> grid search. `cv` and `f1_scorer` come from the earlier cells.

# Baseline booster, evaluated on the test set.
model = XGBClassifier(eval_metric='logloss', random_state=42)
y_pred = model.fit(X_train, y_train).predict(X_test)
print("[XGBoost] Classification Report:")
print(classification_report(y_test, y_pred))

# 5-fold CV of the baseline, scored by F1.
cv_score = cross_val_score(
    model,
    X_train,
    y_train,
    scoring=f1_scorer,
    cv=cv,
)
print("XGBoost CV 평균 F1 Score:", cv_score.mean())

# Search space. The earlier grid only varied n_estimators in [50, 100],
# max_depth in [3, 5] and learning_rate in [0.1, 0.3].
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    gamma=[0, 1],
    reg_alpha=[0, 0.5],
    reg_lambda=[1, 2],
)

# Recorded result of a previous run: baseline CV F1 0.9531; best params
# {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5,
#  'n_estimators': 200, 'reg_alpha': 0.5, 'reg_lambda': 1, 'subsample': 1.0}
# with CV F1 0.9579.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=cv,
    n_jobs=-1,
)
grid.fit(X_train, y_train)
print("Best XGB Params:", grid.best_params_)
print("Best XGB F1 Score:", grid.best_score_)


[XGBoost] Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.95      0.96       588
           1       0.87      0.89      0.88       212

    accuracy                           0.94       800
   macro avg       0.91      0.92      0.92       800
weighted avg       0.94      0.94      0.94       800

XGBoost CV 평균 F1 Score: 0.9530772038170456
Best XGB Params: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'reg_alpha': 0.5, 'reg_lambda': 1, 'subsample': 1.0}
Best XGB F1 Score: 0.9579225903148009


# SVC

In [13]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# Purpose of this cell: baseline SVC -> test report -> 5-fold CV F1 -> grid
# search. `cv` and `f1_scorer` come from the earlier cells.
# NOTE(review): RandomizedSearchCV, uniform and randint are imported but not
# used anywhere in this cell.

# Baseline SVC, evaluated on the test set.
# NOTE(review): probability=True is not needed by anything in this cell (only
# predict() is called and scoring is F1 on hard labels); the scikit-learn
# docs describe it as slowing down fit - confirm whether predict_proba is
# needed elsewhere before removing it.
model = SVC(probability=True, random_state=42)
y_pred = model.fit(X_train, y_train).predict(X_test)
print("[SVC] Classification Report:")
print(classification_report(y_test, y_pred))

# 5-fold CV of the baseline, scored by F1.
cv_score = cross_val_score(
    model,
    X_train,
    y_train,
    scoring=f1_scorer,
    cv=cv,
)
print("SVC CV 평균 F1 Score:", cv_score.mean())

# Recorded results kept from previous runs:
#  - an older run (presumably with the smaller grid C in [0.1, 1, 10],
#    kernel in ['linear', 'rbf']) reported baseline CV F1 0.8084 and best
#    params {'C': 0.1, 'kernel': 'linear'} with CV F1 0.8197;
#  - the run with the grid below reported baseline CV F1 0.9336 and best
#    params {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'} with CV F1 0.9451.

# Search space (16 candidates). Per the original author's note, gamma only
# matters for the rbf kernel.
param_grid = dict(
    C=[0.01, 0.1, 1, 10],
    kernel=['linear', 'rbf'],
    gamma=['scale', 'auto'],
)

grid = GridSearchCV(
    model,
    param_grid,
    scoring=f1_scorer,
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

grid.fit(X_train, y_train)
print("Best SVC Params:", grid.best_params_)
print("Best SVC F1 Score:", grid.best_score_)


[SVC] Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.93      0.95       588
           1       0.83      0.89      0.86       212

    accuracy                           0.92       800
   macro avg       0.89      0.91      0.90       800
weighted avg       0.92      0.92      0.92       800

SVC CV 평균 F1 Score: 0.933628225073703
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best SVC Params: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}
Best SVC F1 Score: 0.9451157816205367


# MLP Classifier

In [15]:
from sklearn.neural_network import MLPClassifier

# Purpose of this cell: baseline MLP -> test report -> 5-fold CV F1 -> grid
# search. `cv` and `f1_scorer` come from the earlier cells.

# Baseline network (iteration cap raised to 2000), evaluated on the test set.
model = MLPClassifier(random_state=42, max_iter=2000)
y_pred = model.fit(X_train, y_train).predict(X_test)
print("[MLP] Classification Report:")
print(classification_report(y_test, y_pred))

# 5-fold CV of the baseline, scored by F1.
cv_score = cross_val_score(
    model,
    X_train,
    y_train,
    scoring=f1_scorer,
    cv=cv,
)
print("MLP CV 평균 F1 Score:", cv_score.mean())

# Search space. The earlier grid only varied hidden_layer_sizes in
# [(50,), (100,), (50, 50)] and activation in ['relu', 'tanh'].
param_grid = dict(
    hidden_layer_sizes=[(64,), (128,), (64, 32)],
    activation=['relu', 'tanh'],
    alpha=[0.0001, 0.001],
    solver=['adam', 'sgd'],
    learning_rate=['constant', 'adaptive'],
    learning_rate_init=[0.001, 0.01],
)

grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=cv,
    n_jobs=-1,
)
grid.fit(X_train, y_train)
print("Best MLP Params:", grid.best_params_)
print("Best MLP F1 Score:", grid.best_score_)


[MLP] Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.96      0.97       588
           1       0.90      0.92      0.91       212

    accuracy                           0.95       800
   macro avg       0.93      0.94      0.94       800
weighted avg       0.95      0.95      0.95       800

MLP CV 평균 F1 Score: 0.959638661578792
Best MLP Params: {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (128,), 'learning_rate': 'constant', 'learning_rate_init': 0.001, 'solver': 'adam'}
Best MLP F1 Score: 0.9600965856974139


# VotingClassifier

In [16]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Purpose of this cell: soft-voting ensemble of logistic regression, random
# forest and XGBoost -> test report -> 5-fold CV F1. `cv` and `f1_scorer`
# come from the earlier cells.
# An earlier variant of this ensemble used a DecisionTreeClassifier
# (random_state=42) in place of the logistic regression member.
# NOTE(review): SVC is imported here but is not part of the ensemble.

voting_clf = VotingClassifier(
    voting='soft',
    estimators=[
        ('lr', LogisticRegression(max_iter=1000, random_state=42)),
        ('rf', RandomForestClassifier(random_state=42)),
        ('xgb', XGBClassifier(eval_metric='logloss', random_state=42)),
    ],
)

# Fit on the training data and report on the held-out test set.
y_pred = voting_clf.fit(X_train, y_train).predict(X_test)
print("[VotingClassifier] Classification Report:")
print(classification_report(y_test, y_pred))

# 5-fold CV of the ensemble, scored by F1.
cv_score = cross_val_score(
    voting_clf,
    X_train,
    y_train,
    scoring=f1_scorer,
    cv=cv,
)
print("VotingClassifier CV 평균 F1 Score:", cv_score.mean())


[VotingClassifier] Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.95      0.95       588
           1       0.86      0.89      0.88       212

    accuracy                           0.93       800
   macro avg       0.91      0.92      0.92       800
weighted avg       0.93      0.93      0.93       800

VotingClassifier CV 평균 F1 Score: 0.9531650171035461
