<a href="https://colab.research.google.com/github/Hanbin-git/Dacon_cacer/blob/main/SMOTE%2BstackingClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!unzip -o "/content/drive/MyDrive/open_1.zip" -d "/content/open_1"


Archive:  /content/drive/MyDrive/open_1.zip
  inflating: /content/open_1/sample_submission.csv  
  inflating: /content/open_1/test.csv  
  inflating: /content/open_1/train.csv  


In [3]:
import os

def get_path(filename):
    return "/content/open_1/" + filename


In [4]:
# SMOTE 설치
!pip install -U imbalanced-learn
!pip install lightgbm optuna
!pip install -q lightgbm catboost xgboost



Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.2-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.7/242.7 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.2 colorlog-6.9.0 optuna-4.4.0
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# 경로
def get_path(filename):
    return "/content/open_1/" + filename

# 데이터 로딩
train = pd.read_csv(get_path("train.csv"))
test = pd.read_csv(get_path("test.csv"))
submission = pd.read_csv(get_path("sample_submission.csv"))

y = train["Cancer"]
X = train.drop(columns=["ID", "Cancer"])
X_test = test.drop(columns=["ID"])

# 파생 변수 생성
def add_derived_features(df):
    df = df.copy()
    for col in df.select_dtypes(include='number'):
        df[f"{col}_squared"] = df[col] ** 2
        df[f"{col}_sqrt"] = np.sqrt(np.abs(df[col]))
        df[f"{col}_log"] = np.log1p(np.abs(df[col]))

    if "T4_Result" in df.columns and "TSH_Result" in df.columns:
        df["T4_TSH_ratio"] = df["T4_Result"] / (df["TSH_Result"] + 1e-3)
    if "T3_Result" in df.columns and "Nodule_Size" in df.columns:
        df["T3_times_Nodule"] = df["T3_Result"] * df["Nodule_Size"]
    return df

X = add_derived_features(X)
X_test = add_derived_features(X_test)

# 전처리
def preprocess(df, encoders=None):
    df = df.copy()
    encoders = encoders or {}
    for col in df.columns:
        if df[col].dtype == "object":
            df[col] = df[col].astype(str)
            if col not in encoders:
                encoders[col] = LabelEncoder().fit(df[col])
            df[col] = encoders[col].transform(df[col])
        else:
            df[col] = SimpleImputer(strategy="mean").fit_transform(df[[col]])
    return df, encoders

X, encoders = preprocess(X)
X_test, _ = preprocess(X_test, encoders)

# 앙상블 모델
model1 = XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
model2 = LGBMClassifier(random_state=42)
model3 = CatBoostClassifier(verbose=0, random_state=42)

ensemble_model = VotingClassifier(
    estimators=[("xgb", model1), ("lgbm", model2), ("cat", model3)],
    voting="soft"
)

# OOF 예측 및 threshold 최적화
oof_preds = cross_val_predict(ensemble_model, X, y, method="predict_proba")[:, 1]
thresholds = np.arange(0.1, 0.9, 0.01)
f1s = [f1_score(y, oof_preds > t) for t in thresholds]
best_thresh = thresholds[np.argmax(f1s)]

# 최종 학습 및 예측
ensemble_model.fit(X, y)
test_preds = ensemble_model.predict_proba(X_test)[:, 1]

# 저장
submission["Cancer"] = (test_preds > best_thresh).astype(int)
submission.to_csv("submission_voting_rescue.csv", index=False)

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 8367, number of negative: 61360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011685 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4911
[LightGBM] [Info] Number of data points in the train set: 69727, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.119997 -> initscore=-1.992463
[LightGBM] [Info] Start training from score -1.992463


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 8367, number of negative: 61360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012625 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4915
[LightGBM] [Info] Number of data points in the train set: 69727, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.119997 -> initscore=-1.992463
[LightGBM] [Info] Start training from score -1.992463


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 8367, number of negative: 61360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011395 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4915
[LightGBM] [Info] Number of data points in the train set: 69727, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.119997 -> initscore=-1.992463
[LightGBM] [Info] Start training from score -1.992463


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 8367, number of negative: 61360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011039 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4915
[LightGBM] [Info] Number of data points in the train set: 69727, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.119997 -> initscore=-1.992463
[LightGBM] [Info] Start training from score -1.992463


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 8368, number of negative: 61360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011367 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4911
[LightGBM] [Info] Number of data points in the train set: 69728, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120009 -> initscore=-1.992343
[LightGBM] [Info] Start training from score -1.992343


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 10459, number of negative: 76700
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.034781 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4915
[LightGBM] [Info] Number of data points in the train set: 87159, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.119999 -> initscore=-1.992439
[LightGBM] [Info] Start training from score -1.992439


In [12]:
from google.colab import files
files.download("submission_voting_rescue.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
from sklearn.ensemble import VotingClassifier  # ✅ 추가 필요
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# 5-Fold TTA용
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
test_preds_tta = np.zeros(len(X_test))
oof_preds_tta = np.zeros(len(X))

for train_idx, val_idx in skf.split(X, y):
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model1 = XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
    model2 = LGBMClassifier(random_state=42)
    model3 = CatBoostClassifier(verbose=0, random_state=42)

    ensemble_model = VotingClassifier(
        estimators=[("xgb", model1), ("lgbm", model2), ("cat", model3)],
        voting="soft"
    )

    ensemble_model.fit(X_tr, y_tr)

    # OOF 예측 저장
    oof_preds_tta[val_idx] = ensemble_model.predict_proba(X_val)[:, 1]

    # TTA 예측 평균 누적
    test_preds_tta += ensemble_model.predict_proba(X_test)[:, 1] / skf.n_splits

# Threshold 최적화
thresholds = np.arange(0.1, 0.9, 0.01)
f1s = [f1_score(y, oof_preds_tta > t) for t in thresholds]
best_thresh = thresholds[np.argmax(f1s)]
print(f"Best F1: {np.max(f1s):.4f}, Best Threshold: {best_thresh:.2f}")

# 제출 파일 저장
submission["Cancer"] = (test_preds_tta > best_thresh).astype(int)
submission.to_csv("submission_voting_tta.csv", index=False)


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 8367, number of negative: 61360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012461 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4915
[LightGBM] [Info] Number of data points in the train set: 69727, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.119997 -> initscore=-1.992463
[LightGBM] [Info] Start training from score -1.992463


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 8367, number of negative: 61360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011569 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4915
[LightGBM] [Info] Number of data points in the train set: 69727, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.119997 -> initscore=-1.992463
[LightGBM] [Info] Start training from score -1.992463


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 8367, number of negative: 61360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012863 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4911
[LightGBM] [Info] Number of data points in the train set: 69727, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.119997 -> initscore=-1.992463
[LightGBM] [Info] Start training from score -1.992463


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 8367, number of negative: 61360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012993 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4915
[LightGBM] [Info] Number of data points in the train set: 69727, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.119997 -> initscore=-1.992463
[LightGBM] [Info] Start training from score -1.992463


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 8368, number of negative: 61360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011423 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4911
[LightGBM] [Info] Number of data points in the train set: 69728, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120009 -> initscore=-1.992343
[LightGBM] [Info] Start training from score -1.992343
Best F1: 0.4854, Best Threshold: 0.24


In [11]:
from google.colab import files
files.download("submission_voting_tta.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [13]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from imblearn.over_sampling import SMOTE

# 경로 설정
def get_path(filename):
    return "/content/open_1/" + filename

# 데이터 로딩
train = pd.read_csv(get_path("train.csv"))
test = pd.read_csv(get_path("test.csv"))
submission = pd.read_csv(get_path("sample_submission.csv"))

y = train["Cancer"]
X = train.drop(columns=["ID", "Cancer"])
X_test = test.drop(columns=["ID"])

# 파생변수 생성
def add_derived_features(df):
    df = df.copy()
    for col in df.select_dtypes(include='number'):
        df[f"{col}_squared"] = df[col] ** 2
        df[f"{col}_sqrt"] = np.sqrt(np.abs(df[col]))
        df[f"{col}_log"] = np.log1p(np.abs(df[col]))
    if "T4_Result" in df.columns and "TSH_Result" in df.columns:
        df["T4_TSH_ratio"] = df["T4_Result"] / (df["TSH_Result"] + 1e-3)
    if "T3_Result" in df.columns and "Nodule_Size" in df.columns:
        df["T3_times_Nodule"] = df["T3_Result"] * df["Nodule_Size"]
    return df

X = add_derived_features(X)
X_test = add_derived_features(X_test)

# 전처리
def preprocess(df, encoders=None):
    df = df.copy()
    encoders = encoders or {}
    for col in df.columns:
        if df[col].dtype == "object":
            df[col] = df[col].astype(str)
            if col not in encoders:
                encoders[col] = LabelEncoder().fit(df[col])
            df[col] = encoders[col].transform(df[col])
        else:
            df[col] = SimpleImputer(strategy="mean").fit_transform(df[[col]])
    return df, encoders

X, encoders = preprocess(X)
X_test, _ = preprocess(X_test, encoders)

# 메타 모델용 stacking 학습
base_models = [
    ("xgb", XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)),
    ("lgbm", LGBMClassifier(random_state=42)),
    ("cat", CatBoostClassifier(verbose=0, random_state=42))
]
meta_model = LogisticRegression()

stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
    n_jobs=-1,
    passthrough=False
)

# 5-Fold + SMOTE + stacking
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"Fold {fold+1}")
    X_tr, y_tr = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    smote = SMOTE(random_state=42)
    X_tr_sm, y_tr_sm = smote.fit_resample(X_tr, y_tr)

    stacking_model.fit(X_tr_sm, y_tr_sm)

    oof_preds[val_idx] = stacking_model.predict_proba(X_val)[:, 1]
    test_preds += stacking_model.predict_proba(X_test)[:, 1] / skf.n_splits

# threshold 최적화
thresholds = np.arange(0.1, 0.9, 0.01)
f1s = [f1_score(y, oof_preds > t) for t in thresholds]
best_thresh = thresholds[np.argmax(f1s)]
print(f"Best F1: {np.max(f1s):.4f}, Best Threshold: {best_thresh:.2f}")

# 최종 결과 저장
submission["Cancer"] = (test_preds > best_thresh).astype(int)
submission.to_csv("submission_stacking_smote.csv", index=False)


Fold 1




Fold 2
Fold 3
Fold 4




Fold 5




Best F1: 0.3914, Best Threshold: 0.41


In [14]:
from google.colab import files
files.download("submission_stacking_smote.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>