<a href="https://colab.research.google.com/github/Hanbin-git/Dacon_cacer/blob/main/Untitled6_%EC%8B%A4%ED%97%98%EC%BD%94%EB%93%9C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!unzip -o "/content/drive/MyDrive/open_1.zip" -d "/content/open_1"


Archive:  /content/drive/MyDrive/open_1.zip
  inflating: /content/open_1/sample_submission.csv  
  inflating: /content/open_1/test.csv  
  inflating: /content/open_1/train.csv  


In [3]:
import os

def get_path(filename):
    return "/content/open_1/" + filename


In [4]:
# SMOTE 설치
!pip install -U imbalanced-learn
!pip install lightgbm optuna
!pip install -q lightgbm catboost xgboost



Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.2-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.7/242.7 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.2 colorlog-6.9.0 optuna-4.4.0
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
# ✅ Import
import pandas as pd
import numpy as np
import optuna

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# ✅ Data Load
train = pd.read_csv('/content/open_1/train.csv')
test = pd.read_csv('/content/open_1/test.csv')
submission = pd.read_csv('/content/open_1/sample_submission.csv')

X = train.drop(columns=['ID', 'Cancer'])
y = train['Cancer']
X_test = test.drop(columns=['ID'])

# ✅ Preprocessing
def preprocess(X, X_test):
    categorical_cols = X.select_dtypes(include='object').columns
    encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str))
        X_test[col] = le.transform(X_test[col].astype(str))
        encoders[col] = le

    # 수치형 결측치 처리
    imputer = SimpleImputer(strategy='mean')
    X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
    X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)
    return X, X_test

X, X_test = preprocess(X, X_test)

# ✅ Optuna 튜닝용 Objective 함수
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 800),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
    }

    smote = SMOTE(random_state=42)
    X_res, y_res = smote.fit_resample(X, y)

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    f1_scores = []

    for train_idx, valid_idx in kf.split(X_res, y_res):
        X_tr, X_val = X_res.iloc[train_idx], X_res.iloc[valid_idx]
        y_tr, y_val = y_res.iloc[train_idx], y_res.iloc[valid_idx]

        model = XGBClassifier(**params, random_state=42, use_label_encoder=False, eval_metric='logloss')
        model.fit(X_tr, y_tr)
        preds = model.predict(X_val)
        f1_scores.append(f1_score(y_val, preds))

    return np.mean(f1_scores)

# ✅ Optuna 수행
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30, timeout=600)
print("Best F1:", study.best_value)
print("Best Params:", study.best_params)

# ✅ Cross-validation + Threshold 최적화 + 예측
best_params = study.best_params
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))
y_true = np.zeros(len(X))

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y)):
    X_tr, X_val = X.iloc[train_idx], X.iloc[valid_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[valid_idx]

    sm = SMOTE(random_state=42)
    X_tr, y_tr = sm.fit_resample(X_tr, y_tr)

    model = XGBClassifier(**best_params, random_state=42, use_label_encoder=False, eval_metric='logloss')
    model.fit(X_tr, y_tr)

    oof_preds[valid_idx] = model.predict_proba(X_val)[:, 1]
    test_preds += model.predict_proba(X_test)[:, 1] / 5
    y_true[valid_idx] = y_val

# ✅ Threshold 최적화
thresholds = np.arange(0.1, 0.9, 0.01)
f1_scores = [f1_score(y_true, oof_preds > t) for t in thresholds]
best_thresh = thresholds[np.argmax(f1_scores)]
print(f"Best Threshold: {best_thresh:.2f} / F1: {max(f1_scores):.4f}")

# ✅ 최종 예측 및 제출 파일 생성
submission['Cancer'] = (test_preds > best_thresh).astype(int)
submission.to_csv("submission_optuna_cv_threshold.csv", index=False)

# ✅ Colab 다운로드
from google.colab import files
files.download("submission_optuna_cv_threshold.csv")


[I 2025-06-17 06:17:14,851] A new study created in memory with name: no-name-f25b7234-85ba-4803-8a54-7b982ca35064
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2025-06-17 06:17:37,454] Trial 0 finished with value: 0.9258690271976763 and parameters: {'n_estimators': 373, 'max_depth': 4, 'learning_rate': 0.14124934951626097, 'subsample': 0.5595041098312006, 'colsample_bytree': 0.6565643302545194}. Best is trial 0 with value: 0.9258690271976763.
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2025-06-17 06:18:10,635] Trial 1 finished with value: 0.9282032564360716 and parame

Best F1: 0.9312812999597118
Best Params: {'n_estimators': 800, 'max_depth': 12, 'learning_rate': 0.20834472133568432, 'subsample': 0.9852221431794483, 'colsample_bytree': 0.8851738607189914}


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Best Threshold: 0.10 / F1: 0.4103


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [6]:
from google.colab import files
files.download("submission_optuna_cv_threshold.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>