# 팀간 1차 전처리 완료 후 RF 모델 진행

In [2]:
import pandas as pd

df = pd.read_csv(r"C:\Users\nuwba\Desktop\Codeit_4\중급2_프로젝트\중급_2\02_preprocessed_data.csv")

df

Unnamed: 0,user_uuid,area_pyeong,stay_time_second_total,trial_day,trial_gap,first_trial_covid_level,last_trial_covid_level,check_in_total_count,check_out_total_count,is_payment,...,last_site_id_47,last_site_id_49,start_trial_year_2021,start_trial_year_2022,start_trial_year_2023,start_trial_year_2024,final_trial_year_2021,final_trial_year_2022,final_trial_year_2023,final_trial_year_2024
0,57521fee-05a3-4d93-9e22-9475a9740c19,0.904151,-1.139471,-0.960123,-1.464053,4.580077,4.662120,-1.486115,-1.452799,0,...,False,False,True,False,False,False,True,False,False,False
1,cc8ac908-3463-434c-8b4b-369d19f0e749,0.904151,1.418627,-0.960123,3.213976,4.580077,4.662120,-0.288033,-0.256400,1,...,False,False,True,False,False,False,True,False,False,False
2,25034325-b30f-4728-8475-4e9e2bceea5f,-0.280211,-0.560481,-0.960123,-1.464053,4.580077,4.662120,-0.288033,-0.256400,0,...,False,False,True,False,False,False,True,False,False,False
3,79e6b258-7d99-4c3c-85b2-fd954da86b83,0.904151,-0.645693,-0.960123,-1.464053,4.580077,4.662120,-1.486115,-1.452799,0,...,False,False,True,False,False,False,True,False,False,False
4,42360a06-abb0-4b3f-8c76-dfaadd0034ea,-0.280211,-1.051514,-0.960123,0.095290,4.580077,4.662120,-1.486115,-1.452799,0,...,False,False,True,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5477,516c8142-ae8d-4c6f-85d9-2025c14fbd33,0.904151,1.449524,1.591826,-1.464053,-0.593360,-0.597216,1.535608,1.558022,1,...,False,False,False,False,True,False,False,False,False,True
5478,d16cd121-9802-4880-8879-2eaf09fe99c5,-0.280211,-1.136596,0.315851,-1.464053,-0.593360,-0.597216,-0.288033,-0.256400,1,...,False,False,False,False,True,False,False,False,False,True
5479,06bbc34a-80ad-4fe7-af86-353204eda243,-1.464573,0.224382,0.315851,-1.464053,-0.593360,-0.597216,0.609715,0.637865,0,...,True,False,False,False,True,False,False,False,False,True
5480,aa4773c8-dbc9-4c80-849b-1ab8d1db7bda,0.904151,-0.716659,-0.960123,1.654633,-0.593360,-0.597216,-0.288033,-0.256400,0,...,False,False,False,False,False,True,False,False,False,True


In [4]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report

In [5]:
# X, y 분리 (is_payment는 목표변수, user_uuid 같은 ID 계열은 제거)
X = df.drop(columns=["is_payment", "user_uuid"], errors="ignore")
y = df["is_payment"]

# 학습/테스트 분할
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Optuna 목적 함수 정의
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 5, 50),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
        "class_weight": "balanced",
        "random_state": 42,
        "n_jobs": -1
    }

    model = RandomForestClassifier(**params)

    # 교차 검증 (f1-score 기준 최적화)
    score = cross_val_score(
        model, X_train, y_train,
        cv=5, scoring="f1", n_jobs=-1
    ).mean()

    return score

# Optuna 실행
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)  # n_trials=50 → 탐색 횟수

print("Best Trial Params:", study.best_trial.params)
print("Best CV f1 Score:", study.best_value)

# 최적 모델로 학습 후 테스트셋 평가
best_params = study.best_trial.params
best_model = RandomForestClassifier(**best_params)
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred, digits=4))

[32m[I 2025-09-05 18:34:12,687][0m A new study created in memory with name: no-name-21b01d49-4581-4d90-aa46-9a00048995b1[0m


[32m[I 2025-09-05 18:34:26,875][0m Trial 0 finished with value: 0.43286902273092015 and parameters: {'n_estimators': 732, 'max_depth': 34, 'min_samples_split': 11, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.43286902273092015.[0m
[32m[I 2025-09-05 18:34:50,843][0m Trial 1 finished with value: 0.4546392417036914 and parameters: {'n_estimators': 582, 'max_depth': 22, 'min_samples_split': 12, 'min_samples_leaf': 6, 'max_features': None}. Best is trial 1 with value: 0.4546392417036914.[0m
[32m[I 2025-09-05 18:34:54,648][0m Trial 2 finished with value: 0.4828870276460712 and parameters: {'n_estimators': 358, 'max_depth': 35, 'min_samples_split': 3, 'min_samples_leaf': 9, 'max_features': 'sqrt'}. Best is trial 2 with value: 0.4828870276460712.[0m
[32m[I 2025-09-05 18:34:56,937][0m Trial 3 finished with value: 0.4765808527318584 and parameters: {'n_estimators': 229, 'max_depth': 49, 'min_samples_split': 14, 'min_samples_leaf': 7, 'max_features': 'l

Best Trial Params: {'n_estimators': 697, 'max_depth': 5, 'min_samples_split': 3, 'min_samples_leaf': 10, 'max_features': None}
Best CV f1 Score: 0.48942568146652865
              precision    recall  f1-score   support

           0     0.6533    0.9192    0.7638       693
           1     0.5410    0.1634    0.2510       404

    accuracy                         0.6408      1097
   macro avg     0.5972    0.5413    0.5074      1097
weighted avg     0.6120    0.6408    0.5749      1097

