# 구글 드라이브 연결

In [1]:
# Google Drive Mount
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 라이브러리 불러오기

In [2]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score

# 데이터프레임으로 저장

In [3]:
# read xlsx to dataframe from path
import pandas as pd
df = pd.read_excel('/content/drive/MyDrive/AI/pone.0197032.s001.xlsx')

df

Unnamed: 0,Sex_1male_2female,Age,Weather_1sunny_2cloudy_3rainy_4missing,Location_1outdoor_indoor,Functionaldependency_1notdisable_2disable,HT,HeartDisease,Pscyco,DM,CerevD,...,Plt,BUN,Cre,AST,ALT,CK,CRP,Admission1,ICU,Deadtodischarge
0,1.0,24.0,3.0,,1.0,0,0,0,0,0,...,23.2,17.0,1.69,29.0,48.0,506.0,17.4,1.0,,
1,,43.0,,1.0,,0,0,0,0,0,...,24.3,23.1,3.08,36.0,35.0,883.0,0.46,1.0,,
2,1.0,58.0,,1.0,1.0,0,0,0,0,0,...,31.2,19.4,1.42,33.0,16.0,815.0,0.16,1.0,,
3,1.0,46.0,1.0,1.0,1.0,0,0,0,0,0,...,33.3,32.9,4.20,33.0,54.0,173.0,1,1.0,,
4,1.0,57.0,1.0,1.0,1.0,0,0,0,0,0,...,14.1,21.0,1.33,77.0,36.0,386.0,0.103,1.0,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3170,1.0,54.0,,,1.0,0,0,0,0,0,...,26.1,33.4,1.41,104.0,46.0,3836.0,3.7,1.0,1.0,0.0
3171,1.0,62.0,3.0,,2.0,0,0,0,0,0,...,25.4,63.6,2.01,37.0,19.0,1606.0,1.29,1.0,1.0,1.0
3172,0.0,80.0,1.0,,1.0,0,1,0,1,0,...,8.9,31.5,2.25,140.0,53.0,4388.0,10.74,1.0,1.0,0.0
3173,1.0,84.0,2.0,,2.0,1,0,0,0,0,...,20.2,15.2,0.96,20.0,11.0,61.0,0.06,1.0,1.0,0.0


In [4]:
raw = df.copy()

# 컬럼명 출력

In [5]:
# print columns of raw
raw.columns

Index(['Sex_1male_2female', 'Age', 'Weather_1sunny_2cloudy_3rainy_4missing',
       'Location_1outdoor_indoor', 'Functionaldependency_1notdisable_2disable',
       'HT', 'HeartDisease', 'Pscyco', 'DM', 'CerevD', 'ParkinD', 'CKD',
       'Dementia', 'PreSBP', 'PreRR', 'PreGCSlessthan15', 'PreBT', 'PreHR',
       'Abdminal', 'Muscular', 'GCS', 'SBP', 'BT', 'HR', 'WBC', 'Ht', 'Plt',
       'BUN', 'Cre', 'AST', 'ALT', 'CK', 'CRP', 'Admission1', 'ICU',
       'Deadtodischarge'],
      dtype='object')

# 타겟 생성

In [6]:
# 타겟 생성 (Admission1, ICU, 사망 중 하나라도 1이면 위험)
raw['target'] = (
    (raw['Admission1'] == 1) |
    (raw['ICU'] == 1) |
    (raw['Deadtodischarge'] == 1)
).astype(int)

In [7]:
raw

Unnamed: 0,Sex_1male_2female,Age,Weather_1sunny_2cloudy_3rainy_4missing,Location_1outdoor_indoor,Functionaldependency_1notdisable_2disable,HT,HeartDisease,Pscyco,DM,CerevD,...,BUN,Cre,AST,ALT,CK,CRP,Admission1,ICU,Deadtodischarge,target
0,1.0,24.0,3.0,,1.0,0,0,0,0,0,...,17.0,1.69,29.0,48.0,506.0,17.4,1.0,,,1
1,,43.0,,1.0,,0,0,0,0,0,...,23.1,3.08,36.0,35.0,883.0,0.46,1.0,,,1
2,1.0,58.0,,1.0,1.0,0,0,0,0,0,...,19.4,1.42,33.0,16.0,815.0,0.16,1.0,,,1
3,1.0,46.0,1.0,1.0,1.0,0,0,0,0,0,...,32.9,4.20,33.0,54.0,173.0,1,1.0,,,1
4,1.0,57.0,1.0,1.0,1.0,0,0,0,0,0,...,21.0,1.33,77.0,36.0,386.0,0.103,1.0,1.0,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3170,1.0,54.0,,,1.0,0,0,0,0,0,...,33.4,1.41,104.0,46.0,3836.0,3.7,1.0,1.0,0.0,1
3171,1.0,62.0,3.0,,2.0,0,0,0,0,0,...,63.6,2.01,37.0,19.0,1606.0,1.29,1.0,1.0,1.0,1
3172,0.0,80.0,1.0,,1.0,0,1,0,1,0,...,31.5,2.25,140.0,53.0,4388.0,10.74,1.0,1.0,0.0,1
3173,1.0,84.0,2.0,,2.0,1,0,0,0,0,...,15.2,0.96,20.0,11.0,61.0,0.06,1.0,1.0,0.0,1


# 라벨이 전부 결측(NaN)인 행 제거

In [8]:
mask_all_missing = raw[['Admission1','ICU','Deadtodischarge']].isna().all(axis=1)
raw = raw[~mask_all_missing]

In [9]:
raw

Unnamed: 0,Sex_1male_2female,Age,Weather_1sunny_2cloudy_3rainy_4missing,Location_1outdoor_indoor,Functionaldependency_1notdisable_2disable,HT,HeartDisease,Pscyco,DM,CerevD,...,BUN,Cre,AST,ALT,CK,CRP,Admission1,ICU,Deadtodischarge,target
0,1.0,24.0,3.0,,1.0,0,0,0,0,0,...,17.0,1.69,29.0,48.0,506.0,17.4,1.0,,,1
1,,43.0,,1.0,,0,0,0,0,0,...,23.1,3.08,36.0,35.0,883.0,0.46,1.0,,,1
2,1.0,58.0,,1.0,1.0,0,0,0,0,0,...,19.4,1.42,33.0,16.0,815.0,0.16,1.0,,,1
3,1.0,46.0,1.0,1.0,1.0,0,0,0,0,0,...,32.9,4.20,33.0,54.0,173.0,1,1.0,,,1
4,1.0,57.0,1.0,1.0,1.0,0,0,0,0,0,...,21.0,1.33,77.0,36.0,386.0,0.103,1.0,1.0,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3170,1.0,54.0,,,1.0,0,0,0,0,0,...,33.4,1.41,104.0,46.0,3836.0,3.7,1.0,1.0,0.0,1
3171,1.0,62.0,3.0,,2.0,0,0,0,0,0,...,63.6,2.01,37.0,19.0,1606.0,1.29,1.0,1.0,1.0,1
3172,0.0,80.0,1.0,,1.0,0,1,0,1,0,...,31.5,2.25,140.0,53.0,4388.0,10.74,1.0,1.0,0.0,1
3173,1.0,84.0,2.0,,2.0,1,0,0,0,0,...,15.2,0.96,20.0,11.0,61.0,0.06,1.0,1.0,0.0,1


In [10]:
raw.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3028 entries, 0 to 3174
Data columns (total 37 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Sex_1male_2female                          2995 non-null   float64
 1   Age                                        3021 non-null   float64
 2   Weather_1sunny_2cloudy_3rainy_4missing     2519 non-null   float64
 3   Location_1outdoor_indoor                   1760 non-null   float64
 4   Functionaldependency_1notdisable_2disable  2701 non-null   float64
 5   HT                                         3028 non-null   int64  
 6   HeartDisease                               3028 non-null   int64  
 7   Pscyco                                     3028 non-null   int64  
 8   DM                                         3028 non-null   int64  
 9   CerevD                                     3028 non-null   int64  
 10  ParkinD                      

In [11]:
# Age 결측치 보완
raw['Age'] = pd.to_numeric(raw['Age'], errors='coerce')
raw['Age'] = raw['Age'].fillna(raw['Age'].median())

# HR 결측치 제거
raw['HR'] = pd.to_numeric(raw['HR'], errors='coerce')
raw = raw.dropna(subset=['HR'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw['Age'] = pd.to_numeric(raw['Age'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw['Age'] = raw['Age'].fillna(raw['Age'].median())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw['HR'] = pd.to_numeric(raw['HR'], errors='coerce')


# 학습용 피처 생성 (나이별 최대 심박수)

In [12]:
raw['HRmax'] = 220 - raw['Age']            # 나이별 최대 심박수
raw['hr_ratio'] = raw['HR'] / raw['HRmax'] # 최대 심박수 대비 현재 심박수 비율

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw['HRmax'] = 220 - raw['Age']            # 나이별 최대 심박수
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw['hr_ratio'] = raw['HR'] / raw['HRmax'] # 최대 심박수 대비 현재 심박수 비율


In [13]:
raw

Unnamed: 0,Sex_1male_2female,Age,Weather_1sunny_2cloudy_3rainy_4missing,Location_1outdoor_indoor,Functionaldependency_1notdisable_2disable,HT,HeartDisease,Pscyco,DM,CerevD,...,AST,ALT,CK,CRP,Admission1,ICU,Deadtodischarge,target,HRmax,hr_ratio
0,1.0,24.0,3.0,,1.0,0,0,0,0,0,...,29.0,48.0,506.0,17.4,1.0,,,1,196.0,0.637755
1,,43.0,,1.0,,0,0,0,0,0,...,36.0,35.0,883.0,0.46,1.0,,,1,177.0,0.491525
2,1.0,58.0,,1.0,1.0,0,0,0,0,0,...,33.0,16.0,815.0,0.16,1.0,,,1,162.0,0.654321
3,1.0,46.0,1.0,1.0,1.0,0,0,0,0,0,...,33.0,54.0,173.0,1,1.0,,,1,174.0,0.609195
4,1.0,57.0,1.0,1.0,1.0,0,0,0,0,0,...,77.0,36.0,386.0,0.103,1.0,1.0,,1,163.0,0.926380
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3170,1.0,54.0,,,1.0,0,0,0,0,0,...,104.0,46.0,3836.0,3.7,1.0,1.0,0.0,1,166.0,0.759036
3171,1.0,62.0,3.0,,2.0,0,0,0,0,0,...,37.0,19.0,1606.0,1.29,1.0,1.0,1.0,1,158.0,0.936709
3172,0.0,80.0,1.0,,1.0,0,1,0,1,0,...,140.0,53.0,4388.0,10.74,1.0,1.0,0.0,1,140.0,1.000000
3173,1.0,84.0,2.0,,2.0,1,0,0,0,0,...,20.0,11.0,61.0,0.06,1.0,1.0,0.0,1,136.0,0.941176


# 학습용 피처 선택 (라벨과 관계 없는 컬럼 제외)

In [14]:
feature_cols = ['Age', 'HR', 'HRmax', 'hr_ratio']
X = raw[feature_cols]
y = raw['target']

In [15]:
X

Unnamed: 0,Age,HR,HRmax,hr_ratio
0,24.0,125.0,196.0,0.637755
1,43.0,87.0,177.0,0.491525
2,58.0,106.0,162.0,0.654321
3,46.0,106.0,174.0,0.609195
4,57.0,151.0,163.0,0.926380
...,...,...,...,...
3170,54.0,126.0,166.0,0.759036
3171,62.0,148.0,158.0,0.936709
3172,80.0,140.0,140.0,1.000000
3173,84.0,128.0,136.0,0.941176


# 데이터 분할 Train / Test split

In [16]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 모델 학습 및 평가

In [17]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.4-py3-none-any.whl (247 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.0/247.0 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.4 colorlog-6.9.0 optuna-4.4.0


In [18]:
# optuna version
import optuna
optuna.__version__

'4.4.0'

In [20]:
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import make_scorer, recall_score, classification_report, precision_recall_curve
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# 양/음성 비율(불균형 보정용)
pos_weight = (y_train == 0).sum() / max(1, (y_train == 1).sum())

# 교차검증/스코어러: "위험(1) 재현율(recall) 극대화"
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
recall_scorer = make_scorer(recall_score, pos_label=1)

## 하이퍼파라미터 튜닝

### LogisticRegression

In [21]:
from sklearn.linear_model import LogisticRegression

def objective_lr(trial):
    # 하이퍼파라미터 정의
    C = trial.suggest_float("C", 1e-3, 1e+2, log=True)
    class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])
    solver = trial.suggest_categorical("solver", ["lbfgs", "liblinear"])

    # 파이프라인: (옵션) 스케일링 + 로지스틱
    pipe = Pipeline([
        ("scaler", StandardScaler(with_mean=True, with_std=True)),
        ("clf", LogisticRegression(
            C=C,
            class_weight=class_weight,
            solver=solver,
            penalty="l2",
            max_iter=2000,
            n_jobs=-1 if solver=="lbfgs" else None,  # liblinear는 n_jobs 옵션 없음
            random_state=42
        ))
    ])

    # 5-fold 교차검증으로 “위험(1) recall” 평균 점수
    score = cross_val_score(pipe, X_train, y_train, scoring=recall_scorer, cv=cv).mean()
    return score

In [22]:
study_lr = optuna.create_study(direction="maximize", study_name="LR_recall")
study_lr.optimize(objective_lr, n_trials=50, show_progress_bar=True)

[I 2025-08-08 02:42:03,799] A new study created in memory with name: LR_recall


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-08-08 02:42:07,082] Trial 0 finished with value: 0.582557221783741 and parameters: {'C': 1.7787114112259124, 'class_weight': None, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.582557221783741.
[I 2025-08-08 02:42:07,167] Trial 1 finished with value: 0.6706878756602513 and parameters: {'C': 0.019554396160945835, 'class_weight': 'balanced', 'solver': 'liblinear'}. Best is trial 1 with value: 0.6706878756602513.
[I 2025-08-08 02:42:07,278] Trial 2 finished with value: 0.660779551939773 and parameters: {'C': 1.7202273956535121, 'class_weight': 'balanced', 'solver': 'lbfgs'}. Best is trial 1 with value: 0.6706878756602513.
[I 2025-08-08 02:42:07,343] Trial 3 finished with value: 0.582557221783741 and parameters: {'C': 1.4179329569278076, 'class_weight': None, 'solver': 'liblinear'}. Best is trial 1 with value: 0.6706878756602513.
[I 2025-08-08 02:42:07,457] Trial 4 finished with value: 0.5825632930605307 and parameters: {'C': 0.5191422266513658, 'class_weight': None, 'solver': 

In [23]:
best_lr_params = study_lr.best_trial.params
print("LR Best Params:", best_lr_params)
print("LR Best Recall (CV mean):", study_lr.best_value)

LR Best Params: {'C': 0.0010154278904143644, 'class_weight': 'balanced', 'solver': 'liblinear'}
LR Best Recall (CV mean): 0.6805961993807298


In [24]:
# 최적 파라미터로 재학습
best_lr = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(
        C=best_lr_params["C"],
        class_weight=best_lr_params["class_weight"],
        solver=best_lr_params["solver"],
        penalty="l2",
        max_iter=2000,
        n_jobs=-1 if best_lr_params["solver"]=="lbfgs" else None,
        random_state=42
    ))
]).fit(X_train, y_train)

In [25]:
y_pred_lr = best_lr.predict(X_test)
print("\n[Logistic Regression] Test Report")
print(classification_report(y_test, y_pred_lr))


[Logistic Regression] Test Report
              precision    recall  f1-score   support

           0       0.78      0.73      0.75       333
           1       0.62      0.68      0.65       216

    accuracy                           0.71       549
   macro avg       0.70      0.70      0.70       549
weighted avg       0.71      0.71      0.71       549



### XGBoost

In [26]:
from xgboost import XGBClassifier

def objective_xgb(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 800),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 2.0),
        # pos_weight 주변 탐색(데이터 불균형 보정)
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", max(1.0, pos_weight*0.5), pos_weight*2.0)
    }

    model = XGBClassifier(
        eval_metric="aucpr",          # 불균형에서 유리한 PR-AUC
        tree_method="hist",           # 빠른 학습
        random_state=42,
        n_jobs=-1,
        **params
    )

    score = cross_val_score(model, X_train, y_train, scoring=recall_scorer, cv=cv, n_jobs=-1).mean()
    return score

In [27]:
study_xgb = optuna.create_study(direction="maximize", study_name="XGB_recall")
study_xgb.optimize(objective_xgb, n_trials=60, show_progress_bar=True)

[I 2025-08-08 02:45:41,346] A new study created in memory with name: XGB_recall


  0%|          | 0/60 [00:00<?, ?it/s]

[I 2025-08-08 02:45:42,315] Trial 0 finished with value: 0.5440228280007287 and parameters: {'n_estimators': 332, 'learning_rate': 0.012677691851628415, 'max_depth': 7, 'min_child_weight': 3, 'subsample': 0.8709121794866319, 'colsample_bytree': 0.9231188197378544, 'reg_alpha': 0.4206160910947234, 'reg_lambda': 1.7372488994607473, 'scale_pos_weight': 1.1024421441101058}. Best is trial 0 with value: 0.5440228280007287.
[I 2025-08-08 02:45:43,370] Trial 1 finished with value: 0.7004432032056342 and parameters: {'n_estimators': 508, 'learning_rate': 0.05199654392425807, 'max_depth': 8, 'min_child_weight': 10, 'subsample': 0.6905117294526582, 'colsample_bytree': 0.645195318533581, 'reg_alpha': 0.31915416647843675, 'reg_lambda': 0.7179683786444597, 'scale_pos_weight': 2.437351340527087}. Best is trial 1 with value: 0.7004432032056342.
[I 2025-08-08 02:45:44,645] Trial 2 finished with value: 0.708147653451521 and parameters: {'n_estimators': 764, 'learning_rate': 0.03271336283793149, 'max_dep

In [28]:
best_xgb_params = study_xgb.best_trial.params
print("XGB Best Params:", best_xgb_params)
print("XGB Best Recall (CV mean):", study_xgb.best_value)

XGB Best Params: {'n_estimators': 268, 'learning_rate': 0.011313662666941763, 'max_depth': 3, 'min_child_weight': 9, 'subsample': 0.8541687687839865, 'colsample_bytree': 0.8634776141143308, 'reg_alpha': 0.25555944765151806, 'reg_lambda': 1.4456900977532203, 'scale_pos_weight': 2.8210410323209003}
XGB Best Recall (CV mean): 0.8413818225972923


In [29]:
# 최적 파라미터로 재학습
best_xgb = XGBClassifier(
    eval_metric="aucpr",
    tree_method="hist",
    random_state=42,
    n_jobs=-1,
    **best_xgb_params
).fit(X_train, y_train)

In [30]:
from sklearn.metrics import classification_report
y_pred_xgb = best_xgb.predict(X_test)
print("\n[XGBoost] Test Report")
print(classification_report(y_test, y_pred_xgb))


[XGBoost] Test Report
              precision    recall  f1-score   support

           0       0.83      0.42      0.55       333
           1       0.49      0.87      0.63       216

    accuracy                           0.59       549
   macro avg       0.66      0.64      0.59       549
weighted avg       0.69      0.59      0.58       549



### LightGBM

In [31]:
import lightgbm as lgb

def objective_lgb(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 300, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 15, 255),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 60),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 10),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 2.0),
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", max(1.0, pos_weight*0.5), pos_weight*2.0),
        "objective": "binary",
        "random_state": 42,
        "n_jobs": -1
    }

    model = lgb.LGBMClassifier(**params)

    score = cross_val_score(model, X_train, y_train, scoring=recall_scorer, cv=cv, n_jobs=-1).mean()
    return score

In [32]:
study_lgb = optuna.create_study(direction="maximize", study_name="LGB_recall")
study_lgb.optimize(objective_lgb, n_trials=60, show_progress_bar=True)

[I 2025-08-08 02:49:02,689] A new study created in memory with name: LGB_recall


  0%|          | 0/60 [00:00<?, ?it/s]

[I 2025-08-08 02:49:12,929] Trial 0 finished with value: 0.5539979357658916 and parameters: {'n_estimators': 527, 'learning_rate': 0.15356288381869349, 'num_leaves': 51, 'min_child_samples': 49, 'feature_fraction': 0.7896637741446724, 'bagging_fraction': 0.6164086387374339, 'bagging_freq': 7, 'reg_alpha': 0.9304122226285476, 'reg_lambda': 0.5183244544764218, 'scale_pos_weight': 1.018184053481182}. Best is trial 0 with value: 0.5539979357658916.
[I 2025-08-08 02:49:28,104] Trial 1 finished with value: 0.5550968368647926 and parameters: {'n_estimators': 724, 'learning_rate': 0.19880649819204185, 'num_leaves': 128, 'min_child_samples': 21, 'feature_fraction': 0.8004610952396773, 'bagging_fraction': 0.6945440977715346, 'bagging_freq': 9, 'reg_alpha': 0.42667036370622513, 'reg_lambda': 0.13783033076138218, 'scale_pos_weight': 1.100342688556044}. Best is trial 1 with value: 0.5550968368647926.
[I 2025-08-08 02:49:39,292] Trial 2 finished with value: 0.5991378786958897 and parameters: {'n_est

In [33]:
best_lgb_params = study_lgb.best_trial.params
print("LGB Best Params:", best_lgb_params)
print("LGB Best Recall (CV mean):", study_lgb.best_value)

LGB Best Params: {'n_estimators': 564, 'learning_rate': 0.010182620128974302, 'num_leaves': 149, 'min_child_samples': 60, 'feature_fraction': 0.8221555366530124, 'bagging_fraction': 0.6494240374886435, 'bagging_freq': 1, 'reg_alpha': 0.9159139752384675, 'reg_lambda': 0.2740215360300978, 'scale_pos_weight': 2.8307762582982843}
LGB Best Recall (CV mean): 0.7874506708760853


In [34]:
# 최적 파라미터로 재학습
best_lgb = lgb.LGBMClassifier(**best_lgb_params).fit(
    X_train, y_train
    eval_set=[("train", X_train, y_train), ("valid", X_test, y_test)],
    eval_metric="recall",  # 위험(1) recall
    verbose=False
)

[LightGBM] [Info] Number of positive: 908, number of negative: 1286
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000192 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 539
[LightGBM] [Info] Number of data points in the train set: 2194, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.413856 -> initscore=-0.348048
[LightGBM] [Info] Start training from score -0.348048


In [35]:
y_pred_lgb = best_lgb.predict(X_test)
print("\n[LightGBM] Test Report")
print(classification_report(y_test, y_pred_lgb))


[LightGBM] Test Report
              precision    recall  f1-score   support

           0       0.81      0.52      0.63       333
           1       0.52      0.81      0.63       216

    accuracy                           0.63       549
   macro avg       0.66      0.66      0.63       549
weighted avg       0.69      0.63      0.63       549

