In [11]:
#Male

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from pprint import pprint


df = pd.read_csv("data/all_m.csv", encoding='ISO-8859-1')

features = df[["GRADE", "e_s_rcrd", "e_res", "e_aid", "pr_ht", "pr_hd", "wc_mn", 
               "f_br", "f_fru", "f_drink", "f_ff", "f_wat", "pa_tot", "pa_msc", 
               "o_br_fq", "o_slnt", "hw", "ecz_dg_lt", "v_trt", "tc_lt", "tc_days",  
               "s_edu", "sp_t", "m_slp_en", "m_str", "m_lon", "GAD", "m_sui_con",  
               "m_sui_pln", "m_sui_att"]]
target = df["SP"]

#원-핫 인코딩
features = pd.get_dummies(features, drop_first=True)

#데이터 분할 (6:2:2)
X_train, X_temp, y_train, y_temp = train_test_split(features, target, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

#결측값 제거
X_train = X_train.dropna()
y_train = y_train.loc[X_train.index]
X_val = X_val.dropna()
y_val = y_val.loc[X_val.index]
X_test = X_test.dropna()
y_test = y_test.loc[X_test.index]

#데이터 표준화
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

#하이퍼파라미터 설정
param_grid_lr = {
    "C": [0.01, 0.1, 1, 10],
    "max_iter": [200, 500, 1000]
}

#로지스틱 회귀 모델 학습
lr_model = LogisticRegression(random_state=42)
grid_lr = GridSearchCV(lr_model, param_grid_lr, cv=10, scoring='accuracy', n_jobs=-1)
grid_lr.fit(X_train, y_train)

print("Logistic Regression - Best Parameters:")
pprint(grid_lr.best_params_)

#예측
best_lr = grid_lr.best_estimator_
val_preds = best_lr.predict(X_val)
test_preds = best_lr.predict(X_test)

#평가 지표 계산
val_accuracy = accuracy_score(y_val, val_preds)
test_accuracy = accuracy_score(y_test, test_preds)
precision = precision_score(y_test, test_preds, average='macro')
recall = recall_score(y_test, test_preds, average='macro')
f1 = f1_score(y_test, test_preds, average='macro')

#AUC 계산
val_probs = best_lr.predict_proba(X_val)[:, 1]
test_probs = best_lr.predict_proba(X_test)[:, 1]
val_auc = roc_auc_score(y_val, val_probs)
test_auc = roc_auc_score(y_test, test_probs)


print("\nEvaluation Metrics:")
print("Validation Accuracy: {:.4f}".format(val_accuracy))
print("Test Accuracy: {:.4f}".format(test_accuracy))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
print("F1 Score: {:.4f}".format(f1))
print("Validation AUC: {:.4f}".format(val_auc))
print("Test AUC: {:.4f}".format(test_auc))

Logistic Regression - Best Parameters:
{'C': 0.01, 'max_iter': 200}

Evaluation Metrics:
Validation Accuracy: 0.7659
Test Accuracy: 0.7758
Precision: 0.6988
Recall: 0.5341
F1 Score: 0.5094
Validation AUC: 0.6711
Test AUC: 0.6938


In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from pprint import pprint

#파라미터: 상위 변수 개수 및 탐색 반복수
N_TOP_FEATURES = 25
N_ITER = 10


df = pd.read_csv("data/all_m.csv", encoding='ISO-8859-1')

features = df[["GRADE", "e_s_rcrd", "e_res", "e_aid", "pr_ht", "pr_hd", "wc_mn", 
               "f_br", "f_fru", "f_drink", "f_ff", "f_wat", "pa_tot", "pa_msc", 
               "o_br_fq", "o_slnt", "hw", "ecz_dg_lt", "v_trt", "tc_lt", "tc_days",  
               "s_edu", "sp_t", "m_slp_en", "m_str", "m_lon", "GAD", "m_sui_con",  
               "m_sui_pln", "m_sui_att"]]
target = df["SP"]

#원-핫 인코딩
features = pd.get_dummies(features, drop_first=True)

#데이터 분할
X_train, X_temp, y_train, y_temp = train_test_split(
    features, target, test_size=0.4, random_state=42, stratify=target
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

#결측값 제거
X_train = X_train.dropna()
y_train = y_train.loc[X_train.index]
X_val = X_val.dropna()
y_val = y_val.loc[X_val.index]
X_test = X_test.dropna()
y_test = y_test.loc[X_test.index]

#Feature Selection
rf_temp = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_temp.fit(X_train, y_train)
feature_importances = pd.Series(rf_temp.feature_importances_, index=X_train.columns)
top_features = feature_importances.sort_values(ascending=False).head(N_TOP_FEATURES).index

X_train = X_train[top_features]
X_val = X_val[top_features]
X_test = X_test[top_features]

#하이퍼파라미터
param_dist_rf = {
    "n_estimators": [100, 200, 300],
    "max_depth": [5, 10, 15, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "class_weight": [None, "balanced"]
}

rf_model = RandomForestClassifier(random_state=42, n_jobs=-1)
random_search_rf = RandomizedSearchCV(
    rf_model,
    param_distributions=param_dist_rf,
    n_iter=N_ITER,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    random_state=42
)
random_search_rf.fit(X_train, y_train)

print("Random Forest - Best Parameters:")
pprint(random_search_rf.best_params_)

#예측
best_rf = random_search_rf.best_estimator_
val_preds = best_rf.predict(X_val)
test_preds = best_rf.predict(X_test)

#평가 지표 계산
val_accuracy = accuracy_score(y_val, val_preds)
test_accuracy = accuracy_score(y_test, test_preds)
precision = precision_score(y_test, test_preds, average='macro')
recall = recall_score(y_test, test_preds, average='macro')
f1 = f1_score(y_test, test_preds, average='macro')

#AUC 계산
val_probs = best_rf.predict_proba(X_val)[:, 1]
test_probs = best_rf.predict_proba(X_test)[:, 1]
val_auc = roc_auc_score(y_val, val_probs)
test_auc = roc_auc_score(y_test, test_probs)


print("\nEvaluation Metrics:")
print("Validation Accuracy: {:.4f}".format(val_accuracy))
print("Test Accuracy: {:.4f}".format(test_accuracy))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
print("F1 Score: {:.4f}".format(f1))
print("Validation AUC: {:.4f}".format(val_auc))
print("Test AUC: {:.4f}".format(test_auc))


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Random Forest - Best Parameters:
{'class_weight': None,
 'max_depth': 15,
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'n_estimators': 200}

Evaluation Metrics:
Validation Accuracy: 0.7603
Test Accuracy: 0.7613
Precision: 0.6590
Recall: 0.5195
F1 Score: 0.4792
Validation AUC: 0.6716
Test AUC: 0.6597


In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from xgboost import XGBClassifier
from scipy.stats import randint, uniform
from pprint import pprint

# 파라미터
N_TOP_FEATURES = 25
N_ITER = 10

#데이터 로드
df = pd.read_csv("data/all_m.csv", encoding='ISO-8859-1')

features = df[["GRADE", "e_s_rcrd", "e_res", "e_aid", "pr_ht", "pr_hd", "wc_mn", 
               "f_br", "f_fru", "f_drink", "f_ff", "f_wat", "pa_tot", "pa_msc", 
               "o_br_fq", "o_slnt", "hw", "ecz_dg_lt", "v_trt", "tc_lt", "tc_days",  
               "s_edu", "sp_t", "m_slp_en", "m_str", "m_lon", "GAD", "m_sui_con",  
               "m_sui_pln", "m_sui_att"]]
target = df["SP"]

#원-핫 인코딩
features = pd.get_dummies(features, drop_first=True)

#데이터 분할
X_train, X_temp, y_train, y_temp = train_test_split(features, target, test_size=0.4, random_state=42, stratify=target)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

#결측값 제거
X_train = X_train.dropna()
y_train = y_train.loc[X_train.index]
X_val = X_val.dropna()
y_val = y_val.loc[X_val.index]
X_test = X_test.dropna()
y_test = y_test.loc[X_test.index]

#Feature Selection
xgb_temp = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_temp.fit(X_train, y_train)
feature_importances = pd.Series(xgb_temp.feature_importances_, index=X_train.columns)
top_features = feature_importances.sort_values(ascending=False).head(N_TOP_FEATURES).index

X_train = X_train[top_features]
X_val = X_val[top_features]
X_test = X_test[top_features]

#하이퍼파라미터 범위
param_dist = {
    'n_estimators': randint(80, 200),
    'max_depth': [3, 5, 7],
    'learning_rate': uniform(0.01, 0.09),
    'subsample': uniform(0.8, 0.2),
    'colsample_bytree': uniform(0.8, 0.2),
    'min_child_weight': randint(1, 8),
    'gamma': uniform(0, 0.2),
    'reg_alpha': uniform(0, 0.15),
    'reg_lambda': uniform(0, 0.15),
    'scale_pos_weight': [1, sum(y_train == 0) / sum(y_train == 1)]
}

#RandomizedSearchCV
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_dist,
    n_iter=N_ITER, cv=3, scoring='roc_auc', n_jobs=-1, verbose=1, random_state=42
)
random_search.fit(X_train, y_train)

print("XGBoost - Best Parameters:")
pprint(random_search.best_params_)

#예측
best_xgb = random_search.best_estimator_
val_preds = best_xgb.predict(X_val)
test_preds = best_xgb.predict(X_test)

#평가 지표 계산
val_accuracy = accuracy_score(y_val, val_preds)
test_accuracy = accuracy_score(y_test, test_preds)
precision = precision_score(y_test, test_preds, average='macro')
recall = recall_score(y_test, test_preds, average='macro')
f1 = f1_score(y_test, test_preds, average='macro')

#AUC 계산
val_probs = best_xgb.predict_proba(X_val)[:, 1]
test_probs = best_xgb.predict_proba(X_test)[:, 1]
val_auc = roc_auc_score(y_val, val_probs)
test_auc = roc_auc_score(y_test, test_probs)


print("\nEvaluation Metrics:")
print("Validation Accuracy: {:.4f}".format(val_accuracy))
print("Test Accuracy: {:.4f}".format(test_accuracy))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
print("F1 Score: {:.4f}".format(f1))
print("Validation AUC: {:.4f}".format(val_auc))
print("Test AUC: {:.4f}".format(test_auc))


Parameters: { "use_label_encoder" } are not used.



Fitting 3 folds for each of 10 candidates, totalling 30 fits
XGBoost - Best Parameters:
{'colsample_bytree': 0.8749080237694725,
 'gamma': 0.19014286128198324,
 'learning_rate': 0.07587945476302645,
 'max_depth': 3,
 'min_child_weight': 5,
 'n_estimators': 182,
 'reg_alpha': 0.06687491292803867,
 'reg_lambda': 0.014996237372700432,
 'scale_pos_weight': 1,
 'subsample': 0.9732352291549871}

Evaluation Metrics:
Validation Accuracy: 0.7552
Test Accuracy: 0.7642
Precision: 0.6707
Recall: 0.5340
F1 Score: 0.5089
Validation AUC: 0.6747
Test AUC: 0.6723


Parameters: { "use_label_encoder" } are not used.



In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from lightgbm import LGBMClassifier
from scipy.stats import randint, uniform
from pprint import pprint

#파라미터
N_TOP_FEATURES = 25 
N_ITER = 10 


df = pd.read_csv("data/all_m.csv", encoding='ISO-8859-1')

features = df[["GRADE", "e_s_rcrd", "e_res", "e_aid", "pr_ht", "pr_hd", "wc_mn", 
               "f_br", "f_fru", "f_drink", "f_ff", "f_wat", "pa_tot", "pa_msc", 
               "o_br_fq", "o_slnt", "hw", "ecz_dg_lt", "v_trt", "tc_lt", "tc_days",  
               "s_edu", "sp_t", "m_slp_en", "m_str", "m_lon", "GAD", "m_sui_con",  
               "m_sui_pln", "m_sui_att"]]
target = df["SP"]

#원-핫 인코딩
features = pd.get_dummies(features, drop_first=True)

#데이터 분할
X_train, X_temp, y_train, y_temp = train_test_split(features, target, test_size=0.4, random_state=42, stratify=target)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

#결측값 제거
X_train = X_train.dropna()
y_train = y_train.loc[X_train.index]
X_val = X_val.dropna()
y_val = y_val.loc[X_val.index]
X_test = X_test.dropna()
y_test = y_test.loc[X_test.index]

#Feature Selection
lgb_temp = LGBMClassifier(random_state=42, n_estimators=80, n_jobs=-1)
lgb_temp.fit(X_train, y_train)
feature_importances = pd.Series(lgb_temp.feature_importances_, index=X_train.columns)
top_features = feature_importances.sort_values(ascending=False).head(N_TOP_FEATURES).index

X_train = X_train[top_features]
X_val = X_val[top_features]
X_test = X_test[top_features]

#하이퍼파라미터 범위
param_dist = {
    'n_estimators': randint(80, 200),
    'max_depth': [3, 5, -1],
    'learning_rate': uniform(0.01, 0.09),
    'subsample': uniform(0.8, 0.2),
    'colsample_bytree': uniform(0.8, 0.2),
    'min_child_samples': randint(10, 21),
    'reg_alpha': uniform(0, 0.15),
    'reg_lambda': uniform(0, 0.15),
    'class_weight': [None, 'balanced']
}

#RandomizedSearchCV
lgb_model = LGBMClassifier(random_state=42, n_jobs=-1)
random_search = RandomizedSearchCV(lgb_model, param_distributions=param_dist, 
                                   n_iter=N_ITER, cv=3, scoring='roc_auc', n_jobs=-1, verbose=1, random_state=42)
random_search.fit(X_train, y_train)

print("LightGBM - Best Parameters:")
pprint(random_search.best_params_)

from lightgbm import early_stopping as lgb_early_stopping

#최적 파라미터로 재학습
best_lgb = LGBMClassifier(random_state=42, n_jobs=-1, **random_search.best_params_)
best_lgb.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='auc',
    callbacks=[lgb_early_stopping(15)]
)

#예측
val_preds = best_lgb.predict(X_val)
test_preds = best_lgb.predict(X_test)

#평가 지표 계산
val_accuracy = accuracy_score(y_val, val_preds)
test_accuracy = accuracy_score(y_test, test_preds)
precision = precision_score(y_test, test_preds, average='macro')
recall = recall_score(y_test, test_preds, average='macro')
f1 = f1_score(y_test, test_preds, average='macro')

#AUC 계산
val_probs = best_lgb.predict_proba(X_val)[:, 1]
test_probs = best_lgb.predict_proba(X_test)[:, 1]
val_auc = roc_auc_score(y_val, val_probs)
test_auc = roc_auc_score(y_test, test_probs)


print("\nEvaluation Metrics:")
print("Validation Accuracy: {:.4f}".format(val_accuracy))
print("Test Accuracy: {:.4f}".format(test_accuracy))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
print("F1 Score: {:.4f}".format(f1))
print("Validation AUC: {:.4f}".format(val_auc))
print("Test AUC: {:.4f}".format(test_auc))


[LightGBM] [Info] Number of positive: 2737, number of negative: 8597
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000736 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 278
[LightGBM] [Info] Number of data points in the train set: 11334, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.241486 -> initscore=-1.144551
[LightGBM] [Info] Start training from score -1.144551
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[LightGBM] [Info] Number of positive: 2737, number of negative: 8597
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000590 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 263
[LightGBM] [Info] Number of data points in the train set



Early stopping, best iteration is:
[147]	valid_0's auc: 0.679659	valid_0's binary_logloss: 0.515537

Evaluation Metrics:
Validation Accuracy: 0.7555
Test Accuracy: 0.7629
Precision: 0.6705
Recall: 0.5246
F1 Score: 0.4895
Validation AUC: 0.6797
Test AUC: 0.6692


In [16]:
#Female

In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from pprint import pprint


N_TOP_FEATURES = 35
N_ITER = 40


df = pd.read_csv("data/all_f.csv", encoding='ISO-8859-1')

#피처 및 타겟 지정
features = df[["GRADE", "e_s_rcrd", "e_res", "pr_ht", "BMI", "wc_mn", "f_br",  
               "f_fru", "f_drink", "f_ff", "f_edu", "f_wat", "pa_tot", "pa_msc",  
               "o_br_fq", "o_slnt", "hw", "hw_edu", "rh_dg_lt", "ecz_dg_lt", "sp_t",  
               "m_slp_en", "m_str", "m_lon", "GAD", "m_sui_con", "m_sui_pln"]]
target = df["SP"]

#원-핫 인코딩
features = pd.get_dummies(features, drop_first=True)

#데이터 분할
X_train, X_temp, y_train, y_temp = train_test_split(
    features, target, test_size=0.4, random_state=42, stratify=target
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

#결측값 제거
X_train = X_train.dropna()
y_train = y_train.loc[X_train.index]
X_val = X_val.dropna()
y_val = y_val.loc[X_val.index]
X_test = X_test.dropna()
y_test = y_test.loc[X_test.index]

#데이터 표준화
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

#상위 중요 변수 자동 선택
temp_lr = LogisticRegression(penalty='l2', solver='liblinear', random_state=42, max_iter=200)
temp_lr.fit(X_train_scaled, y_train)
feature_importances = np.abs(temp_lr.coef_[0])
top_indices = np.argsort(feature_importances)[::-1][:N_TOP_FEATURES]
X_train_selected = X_train_scaled[:, top_indices]
X_val_selected = X_val_scaled[:, top_indices]
X_test_selected = X_test_scaled[:, top_indices]
selected_feature_names = X_train.columns[top_indices]

#하이퍼파라미터 범위 및 RandomizedSearchCV
param_dist_lr = {
    "C": np.logspace(-4, 2, 30), 
    "max_iter": [200, 500, 1000, 2000],
    "class_weight": [None, "balanced"],
    "penalty": ["l1", "l2"],
    "solver": ["liblinear"], 
}

lr_model = LogisticRegression(random_state=42)
random_search = RandomizedSearchCV(
    lr_model, param_distributions=param_dist_lr,
    n_iter=N_ITER, cv=5, scoring='accuracy', n_jobs=-1, verbose=1, random_state=42
)
random_search.fit(X_train_selected, y_train)

print("Logistic Regression - Best Parameters:")
pprint(random_search.best_params_)

#예측
val_preds = random_search.best_estimator_.predict(X_val_selected)
test_preds = random_search.best_estimator_.predict(X_test_selected)

#평가 지표 계산
val_accuracy = accuracy_score(y_val, val_preds)
test_accuracy = accuracy_score(y_test, test_preds)
precision = precision_score(y_test, test_preds, average='macro')
recall = recall_score(y_test, test_preds, average='macro')
f1 = f1_score(y_test, test_preds, average='macro')

#AUC 계산
val_probs = random_search.best_estimator_.predict_proba(X_val_selected)[:, 1]
test_probs = random_search.best_estimator_.predict_proba(X_test_selected)[:, 1]
val_auc = roc_auc_score(y_val, val_probs)
test_auc = roc_auc_score(y_test, test_probs)


print("\nEvaluation Metrics:")
print("Validation Accuracy: {:.4f}".format(val_accuracy))
print("Test Accuracy: {:.4f}".format(test_accuracy))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
print("F1 Score: {:.4f}".format(f1))
print("Validation AUC: {:.4f}".format(val_auc))
print("Test AUC: {:.4f}".format(test_auc))


Fitting 5 folds for each of 40 candidates, totalling 200 fits
Logistic Regression - Best Parameters:
{'C': 0.20433597178569418,
 'class_weight': None,
 'max_iter': 1000,
 'penalty': 'l2',
 'solver': 'liblinear'}

Evaluation Metrics:
Validation Accuracy: 0.6907
Test Accuracy: 0.7166
Precision: 0.6782
Recall: 0.5950
F1 Score: 0.5937
Validation AUC: 0.6824
Test AUC: 0.7089


In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from pprint import pprint

# 파라미터: 상위 변수 개수 및 탐색 반복수
N_TOP_FEATURES = 25
N_ITER = 10


df = pd.read_csv("data/all_f.csv", encoding='ISO-8859-1')

#피처 및 타겟 지정
features = df[["GRADE", "e_s_rcrd", "e_res", "pr_ht", "BMI", "wc_mn", "f_br",  
               "f_fru", "f_drink", "f_ff", "f_edu", "f_wat", "pa_tot", "pa_msc",  
               "o_br_fq", "o_slnt", "hw", "hw_edu", "rh_dg_lt", "ecz_dg_lt", "sp_t",  
               "m_slp_en", "m_str", "m_lon", "GAD", "m_sui_con", "m_sui_pln"]]
target = df["SP"]

#원-핫 인코딩
features = pd.get_dummies(features, drop_first=True)

#데이터 분할
X_train, X_temp, y_train, y_temp = train_test_split(
    features, target, test_size=0.4, random_state=42, stratify=target
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

#결측값 제거
X_train = X_train.dropna()
y_train = y_train.loc[X_train.index]
X_val = X_val.dropna()
y_val = y_val.loc[X_val.index]
X_test = X_test.dropna()
y_test = y_test.loc[X_test.index]

#Feature Selection (상위 N_TOP_FEATURES 변수 선택)
rf_temp = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_temp.fit(X_train, y_train)
feature_importances = pd.Series(rf_temp.feature_importances_, index=X_train.columns)
top_features = feature_importances.sort_values(ascending=False).head(N_TOP_FEATURES).index

X_train = X_train[top_features]
X_val = X_val[top_features]
X_test = X_test[top_features]

#RandomizedSearchCV
param_dist_rf = {
    "n_estimators": [100, 200, 300],
    "max_depth": [5, 10, 15, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "class_weight": [None, "balanced"]
}

rf_model = RandomForestClassifier(random_state=42, n_jobs=-1)
random_search_rf = RandomizedSearchCV(
    rf_model,
    param_distributions=param_dist_rf,
    n_iter=N_ITER,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    random_state=42
)
random_search_rf.fit(X_train, y_train)

print("Random Forest - Best Parameters:")
pprint(random_search_rf.best_params_)

#예측
best_rf = random_search_rf.best_estimator_
val_preds = best_rf.predict(X_val)
test_preds = best_rf.predict(X_test)

#평가 지표 계산
val_accuracy = accuracy_score(y_val, val_preds)
test_accuracy = accuracy_score(y_test, test_preds)
precision = precision_score(y_test, test_preds, average='macro')
recall = recall_score(y_test, test_preds, average='macro')
f1 = f1_score(y_test, test_preds, average='macro')

#AUC 계산
val_probs = best_rf.predict_proba(X_val)[:, 1]
test_probs = best_rf.predict_proba(X_test)[:, 1]
val_auc = roc_auc_score(y_val, val_probs)
test_auc = roc_auc_score(y_test, test_probs)


print("\nEvaluation Metrics:")
print("Validation Accuracy: {:.4f}".format(val_accuracy))
print("Test Accuracy: {:.4f}".format(test_accuracy))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
print("F1 Score: {:.4f}".format(f1))
print("Validation AUC: {:.4f}".format(val_auc))
print("Test AUC: {:.4f}".format(test_auc))


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Random Forest - Best Parameters:
{'class_weight': None,
 'max_depth': 15,
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'n_estimators': 200}

Evaluation Metrics:
Validation Accuracy: 0.6929
Test Accuracy: 0.7032
Precision: 0.6527
Recall: 0.5724
F1 Score: 0.5627
Validation AUC: 0.6748
Test AUC: 0.7021


In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from xgboost import XGBClassifier
from scipy.stats import randint, uniform
from pprint import pprint

#파라미터
N_TOP_FEATURES = 25
N_ITER = 10


df = pd.read_csv("data/all_f.csv", encoding='ISO-8859-1')

#피처 및 타겟 지정
features = df[["GRADE", "e_s_rcrd", "e_res", "pr_ht", "BMI", "wc_mn", "f_br",  
               "f_fru", "f_drink", "f_ff", "f_edu", "f_wat", "pa_tot", "pa_msc",  
               "o_br_fq", "o_slnt", "hw", "hw_edu", "rh_dg_lt", "ecz_dg_lt", "sp_t",  
               "m_slp_en", "m_str", "m_lon", "GAD", "m_sui_con", "m_sui_pln"]]
target = df["SP"]

#원-핫 인코딩
features = pd.get_dummies(features, drop_first=True)

#데이터 분할
X_train, X_temp, y_train, y_temp = train_test_split(features, target, test_size=0.4, random_state=42, stratify=target)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

#결측값 제거
X_train = X_train.dropna()
y_train = y_train.loc[X_train.index]
X_val = X_val.dropna()
y_val = y_val.loc[X_val.index]
X_test = X_test.dropna()
y_test = y_test.loc[X_test.index]

#Feature Selection
xgb_temp = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_temp.fit(X_train, y_train)
feature_importances = pd.Series(xgb_temp.feature_importances_, index=X_train.columns)
top_features = feature_importances.sort_values(ascending=False).head(N_TOP_FEATURES).index

X_train = X_train[top_features]
X_val = X_val[top_features]
X_test = X_test[top_features]

#하이퍼파라미터
param_dist = {
    'n_estimators': randint(80, 200),
    'max_depth': [3, 5, 7],
    'learning_rate': uniform(0.01, 0.09),
    'subsample': uniform(0.8, 0.2),
    'colsample_bytree': uniform(0.8, 0.2),
    'min_child_weight': randint(1, 8),
    'gamma': uniform(0, 0.2),
    'reg_alpha': uniform(0, 0.15),
    'reg_lambda': uniform(0, 0.15),
    'scale_pos_weight': [1, sum(y_train == 0) / sum(y_train == 1)] 
}

#RandomizedSearchCV
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_dist,
    n_iter=N_ITER, cv=3, scoring='roc_auc', n_jobs=-1, verbose=1, random_state=42
)
random_search.fit(X_train, y_train)

print("XGBoost - Best Parameters:")
pprint(random_search.best_params_)

#예측
best_xgb = random_search.best_estimator_
val_preds = best_xgb.predict(X_val)
test_preds = best_xgb.predict(X_test)

#평가 지표 계산
val_accuracy = accuracy_score(y_val, val_preds)
test_accuracy = accuracy_score(y_test, test_preds)
precision = precision_score(y_test, test_preds, average='macro')
recall = recall_score(y_test, test_preds, average='macro')
f1 = f1_score(y_test, test_preds, average='macro')

#AUC 계산
val_probs = best_xgb.predict_proba(X_val)[:, 1]
test_probs = best_xgb.predict_proba(X_test)[:, 1]
val_auc = roc_auc_score(y_val, val_probs)
test_auc = roc_auc_score(y_test, test_probs)


print("\nEvaluation Metrics:")
print("Validation Accuracy: {:.4f}".format(val_accuracy))
print("Test Accuracy: {:.4f}".format(test_accuracy))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
print("F1 Score: {:.4f}".format(f1))
print("Validation AUC: {:.4f}".format(val_auc))
print("Test AUC: {:.4f}".format(test_auc))


Parameters: { "use_label_encoder" } are not used.



Fitting 3 folds for each of 10 candidates, totalling 30 fits


Parameters: { "use_label_encoder" } are not used.



XGBoost - Best Parameters:
{'colsample_bytree': 0.9570351922786028,
 'gamma': 0.03993475643167195,
 'learning_rate': 0.056281099457225044,
 'max_depth': 3,
 'min_child_weight': 3,
 'n_estimators': 180,
 'reg_alpha': 0.09113172778521575,
 'reg_lambda': 0.025578618553093728,
 'scale_pos_weight': 1,
 'subsample': 0.8026529922319734}

Evaluation Metrics:
Validation Accuracy: 0.6957
Test Accuracy: 0.7115
Precision: 0.6644
Recall: 0.5943
F1 Score: 0.5942
Validation AUC: 0.6902
Test AUC: 0.7075


In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from lightgbm import LGBMClassifier
from scipy.stats import randint, uniform
from pprint import pprint

#파라미터
N_TOP_FEATURES = 25 
N_ITER = 10   

df = pd.read_csv("data/all_f.csv", encoding='ISO-8859-1')

#피처 및 타겟 지정
features = df[["GRADE", "e_s_rcrd", "e_res", "pr_ht", "BMI", "wc_mn", "f_br",  
               "f_fru", "f_drink", "f_ff", "f_edu", "f_wat", "pa_tot", "pa_msc",  
               "o_br_fq", "o_slnt", "hw", "hw_edu", "rh_dg_lt", "ecz_dg_lt", "sp_t",  
               "m_slp_en", "m_str", "m_lon", "GAD", "m_sui_con", "m_sui_pln"]]
target = df["SP"]

#원-핫 인코딩
features = pd.get_dummies(features, drop_first=True)

#데이터 분할
X_train, X_temp, y_train, y_temp = train_test_split(features, target, test_size=0.4, random_state=42, stratify=target)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

#결측값 제거
X_train = X_train.dropna()
y_train = y_train.loc[X_train.index]
X_val = X_val.dropna()
y_val = y_val.loc[X_val.index]
X_test = X_test.dropna()
y_test = y_test.loc[X_test.index]

#Feature Selection
lgb_temp = LGBMClassifier(random_state=42, n_estimators=80, n_jobs=-1)
lgb_temp.fit(X_train, y_train)
feature_importances = pd.Series(lgb_temp.feature_importances_, index=X_train.columns)
top_features = feature_importances.sort_values(ascending=False).head(N_TOP_FEATURES).index

X_train = X_train[top_features]
X_val = X_val[top_features]
X_test = X_test[top_features]

#하이퍼파라미터 범위
param_dist = {
    'n_estimators': randint(80, 200),
    'max_depth': [3, 5, -1],
    'learning_rate': uniform(0.01, 0.09),
    'subsample': uniform(0.8, 0.2),
    'colsample_bytree': uniform(0.8, 0.2),
    'min_child_samples': randint(10, 21),
    'reg_alpha': uniform(0, 0.15),
    'reg_lambda': uniform(0, 0.15),
    'class_weight': [None, 'balanced']
}

#RandomizedSearchCV
lgb_model = LGBMClassifier(random_state=42, n_jobs=-1)
random_search = RandomizedSearchCV(lgb_model, param_distributions=param_dist, 
                                   n_iter=N_ITER, cv=3, scoring='roc_auc', n_jobs=-1, verbose=1, random_state=42)
random_search.fit(X_train, y_train)

print("LightGBM - Best Parameters:")
pprint(random_search.best_params_)

from lightgbm import early_stopping as lgb_early_stopping

#최적 파라미터로  재학습
best_lgb = LGBMClassifier(random_state=42, n_jobs=-1, **random_search.best_params_)
best_lgb.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='auc',
    callbacks=[lgb_early_stopping(15)]
)

#예측
val_preds = best_lgb.predict(X_val)
test_preds = best_lgb.predict(X_test)

#평가 지표 계산
val_accuracy = accuracy_score(y_val, val_preds)
test_accuracy = accuracy_score(y_test, test_preds)
precision = precision_score(y_test, test_preds, average='macro')
recall = recall_score(y_test, test_preds, average='macro')
f1 = f1_score(y_test, test_preds, average='macro')

#AUC 계산
val_probs = best_lgb.predict_proba(X_val)[:, 1]
test_probs = best_lgb.predict_proba(X_test)[:, 1]
val_auc = roc_auc_score(y_val, val_probs)
test_auc = roc_auc_score(y_test, test_probs)


print("\nEvaluation Metrics:")
print("Validation Accuracy: {:.4f}".format(val_accuracy))
print("Test Accuracy: {:.4f}".format(test_accuracy))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
print("F1 Score: {:.4f}".format(f1))
print("Validation AUC: {:.4f}".format(val_auc))
print("Test AUC: {:.4f}".format(test_auc))


[LightGBM] [Info] Number of positive: 2637, number of negative: 5707
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000829 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 495
[LightGBM] [Info] Number of data points in the train set: 8344, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.316035 -> initscore=-0.772052
[LightGBM] [Info] Start training from score -0.772052
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[LightGBM] [Info] Number of positive: 2637, number of negative: 5707
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000459 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 490
[LightGBM] [Info] Number of data points in the train set:

Early stopping, best iteration is:
[56]	valid_0's auc: 0.688417	valid_0's binary_logloss: 0.57676

Evaluation Metrics:
Validation Accuracy: 0.6982
Test Accuracy: 0.7043
Precision: 0.6885
Recall: 0.5509
F1 Score: 0.5190
Validation AUC: 0.6884
Test AUC: 0.7093
