<a href="https://colab.research.google.com/github/Hanbin-git/Dacon_cacer/blob/main/Untitled5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!unzip -o "/content/drive/MyDrive/open_1.zip" -d "/content/open_1"


Archive:  /content/drive/MyDrive/open_1.zip
  inflating: /content/open_1/sample_submission.csv  
  inflating: /content/open_1/test.csv  
  inflating: /content/open_1/train.csv  


In [3]:
import os

def get_path(filename):
    return "/content/open_1/" + filename


In [4]:
# SMOTE 설치
!pip install -U imbalanced-learn
!pip install lightgbm optuna
!pip install -q lightgbm catboost xgboost



Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.2-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.7/242.7 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.2 colorlog-6.9.0 optuna-4.4.0
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
# Weighted Voting 앙상블 코드 (with 5-Fold)
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# ✅ 경로 함수
def get_path(filename):
    return "/content/open_1/" + filename

# ✅ 데이터 로드
train = pd.read_csv(get_path("train.csv"))
test = pd.read_csv(get_path("test.csv"))
submission = pd.read_csv(get_path("sample_submission.csv"))

# ✅ Target 분리
y = train['Cancer']
X = train.drop(columns=['ID', 'Cancer'])
X_test = test.drop(columns=['ID'])

# ✅ 파생변수 생성 함수
def add_derived_features(df):
    df = df.copy()
    if 'TSH' in df.columns and 'T4' in df.columns:
        df['T4_TSH_ratio'] = df['T4'] / (df['TSH'] + 1e-3)
    if 'T3' in df.columns and 'Nodule_Size' in df.columns:
        df['T3_times_Nodule'] = df['T3'] * df['Nodule_Size']
    return df

X = add_derived_features(X)
X_test = add_derived_features(X_test)

# ✅ 전처리 함수
def preprocess(df, fit_encoders=None):
    df = df.copy()
    encoders = fit_encoders if fit_encoders else {}
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].astype(str)
            if col not in encoders:
                encoders[col] = LabelEncoder().fit(df[col])
            df[col] = encoders[col].transform(df[col])
        else:
            df[col] = SimpleImputer(strategy='mean').fit_transform(df[[col]])
    return df, encoders

X, encoders = preprocess(X)
X_test, _ = preprocess(X_test, encoders)

# ✅ 5-Fold Stratified CV + Weighted Voting
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))

# ✅ 가중치 설정 (성능 기준 수동 조정 또는 추후 자동화 가능)
weights = [2.0, 1.2, 1.0]  # [XGB, LGBM, CAT]

for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y)):
    X_tr, y_tr = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[valid_idx], y.iloc[valid_idx]

    model1 = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=fold)
    model2 = LGBMClassifier(random_state=fold)
    model3 = CatBoostClassifier(verbose=0, random_state=fold)

    ensemble_model = VotingClassifier(
        estimators=[('xgb', model1), ('lgbm', model2), ('cat', model3)],
        voting='soft',
        weights=weights
    )
    ensemble_model.fit(X_tr, y_tr)

    oof_preds[valid_idx] = ensemble_model.predict_proba(X_val)[:, 1]
    test_preds += ensemble_model.predict_proba(X_test)[:, 1] / kf.n_splits

# ✅ Threshold 최적화
thresholds = np.arange(0.1, 0.9, 0.01)
f1s = [f1_score(y, oof_preds > t) for t in thresholds]
best_thresh = thresholds[np.argmax(f1s)]
print(f"📌 Best F1: {max(f1s):.4f} at threshold {best_thresh:.2f}")

# ✅ 제출 파일 생성
submission['Cancer'] = (test_preds > best_thresh).astype(int)
submission.to_csv("submission_weighted_voting.csv", index=False)


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 8367, number of negative: 61360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010088 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1123
[LightGBM] [Info] Number of data points in the train set: 69727, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.119997 -> initscore=-1.992463
[LightGBM] [Info] Start training from score -1.992463


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 8367, number of negative: 61360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009371 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1123
[LightGBM] [Info] Number of data points in the train set: 69727, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.119997 -> initscore=-1.992463
[LightGBM] [Info] Start training from score -1.992463


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 8367, number of negative: 61360
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014822 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1122
[LightGBM] [Info] Number of data points in the train set: 69727, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.119997 -> initscore=-1.992463
[LightGBM] [Info] Start training from score -1.992463


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 8367, number of negative: 61360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008952 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1123
[LightGBM] [Info] Number of data points in the train set: 69727, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.119997 -> initscore=-1.992463
[LightGBM] [Info] Start training from score -1.992463


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 8368, number of negative: 61360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010061 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1122
[LightGBM] [Info] Number of data points in the train set: 69728, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120009 -> initscore=-1.992343
[LightGBM] [Info] Start training from score -1.992343
📌 Best F1: 0.4841 at threshold 0.23


In [6]:
from google.colab import files
files.download("submission_weighted_voting.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [7]:
# 자동 가중치 조정 기반 VotingClassifier 전체 코드
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier

# ✅ 데이터 로딩
def get_path(filename):
    return "/content/open_1/" + filename

train = pd.read_csv(get_path("train.csv"))
test = pd.read_csv(get_path("test.csv"))
submission = pd.read_csv(get_path("sample_submission.csv"))

y = train['Cancer']
X = train.drop(columns=['ID', 'Cancer'])
X_test = test.drop(columns=['ID'])

# ✅ 파생변수 생성
def add_derived_features(df):
    df = df.copy()
    if 'TSH' in df.columns and 'T4' in df.columns:
        df['T4_TSH_ratio'] = df['T4'] / (df['TSH'] + 1e-3)
    if 'T3' in df.columns and 'Nodule_Size' in df.columns:
        df['T3_times_Nodule'] = df['T3'] * df['Nodule_Size']
    return df

X = add_derived_features(X)
X_test = add_derived_features(X_test)

# ✅ 전처리 (문자형: Label Encoding, 수치형: 평균 대체)
def preprocess(df, fit_encoders=None):
    df = df.copy()
    encoders = fit_encoders if fit_encoders else {}
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].astype(str)
            if col not in encoders:
                encoders[col] = LabelEncoder().fit(df[col])
            df[col] = encoders[col].transform(df[col])
        else:
            df[col] = SimpleImputer(strategy='mean').fit_transform(df[[col]])
    return df, encoders

X, encoders = preprocess(X)
X_test, _ = preprocess(X_test, encoders)

# ✅ 개별 모델 F1 평가 함수
def get_model_f1(model_cls, X, y, name='model'):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    oof = np.zeros(len(X))
    for train_idx, valid_idx in skf.split(X, y):
        model = model_cls()
        model.fit(X.iloc[train_idx], y.iloc[train_idx])
        oof[valid_idx] = model.predict(X.iloc[valid_idx])
    f1 = f1_score(y, oof)
    print(f"✅ {name} F1 Score: {f1:.4f}")
    return f1

# ✅ 각 모델별 F1 계산
f1_xgb = get_model_f1(lambda: XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=0), X, y, 'XGB')
f1_lgb = get_model_f1(lambda: LGBMClassifier(random_state=0), X, y, 'LGBM')
f1_cat = get_model_f1(lambda: CatBoostClassifier(verbose=0, random_state=0), X, y, 'CatBoost')

# ✅ F1 기준 가중치 설정
model_f1s = np.array([f1_xgb, f1_lgb, f1_cat])
weights = model_f1s / model_f1s.sum() * 3  # 총합 3 기준 정규화 (VotingClassifier에 넣을 weight)
print("📌 자동 설정된 weights:", weights.round(3).tolist())

# ✅ VotingClassifier 앙상블 (Soft Voting with weights)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))

for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y)):
    X_tr, y_tr = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[valid_idx], y.iloc[valid_idx]

    model1 = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=fold)
    model2 = LGBMClassifier(random_state=fold)
    model3 = CatBoostClassifier(verbose=0, random_state=fold)

    ensemble = VotingClassifier(
        estimators=[('xgb', model1), ('lgbm', model2), ('cat', model3)],
        voting='soft',
        weights=weights.tolist()
    )
    ensemble.fit(X_tr, y_tr)
    oof_preds[valid_idx] = ensemble.predict_proba(X_val)[:, 1]
    test_preds += ensemble.predict_proba(X_test)[:, 1] / kf.n_splits

# ✅ Threshold 최적화
thresholds = np.arange(0.1, 0.9, 0.01)
f1s = [f1_score(y, oof_preds > t) for t in thresholds]
best_thresh = thresholds[np.argmax(f1s)]
print(f"\n📊 Best Threshold: {best_thresh:.2f}, Best F1: {max(f1s):.4f}")

# ✅ 제출 파일 생성
submission['Cancer'] = (test_preds > best_thresh).astype(int)
submission.to_csv("submission_weighted_auto.csv", index=False)


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



✅ XGB F1 Score: 0.3098
[LightGBM] [Info] Number of positive: 8367, number of negative: 61360
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015546 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1123
[LightGBM] [Info] Number of data points in the train set: 69727, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.119997 -> initscore=-1.992463
[LightGBM] [Info] Start training from score -1.992463
[LightGBM] [Info] Number of positive: 8367, number of negative: 61360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009474 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1123
[LightGBM] [Info] Number of data points in the train set: 69727, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.119997 -> i

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 8367, number of negative: 61360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010561 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1123
[LightGBM] [Info] Number of data points in the train set: 69727, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.119997 -> initscore=-1.992463
[LightGBM] [Info] Start training from score -1.992463


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 8367, number of negative: 61360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010326 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1123
[LightGBM] [Info] Number of data points in the train set: 69727, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.119997 -> initscore=-1.992463
[LightGBM] [Info] Start training from score -1.992463


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 8367, number of negative: 61360
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015175 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1122
[LightGBM] [Info] Number of data points in the train set: 69727, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.119997 -> initscore=-1.992463
[LightGBM] [Info] Start training from score -1.992463


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 8367, number of negative: 61360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012208 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1123
[LightGBM] [Info] Number of data points in the train set: 69727, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.119997 -> initscore=-1.992463
[LightGBM] [Info] Start training from score -1.992463


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 8368, number of negative: 61360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009893 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1122
[LightGBM] [Info] Number of data points in the train set: 69728, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120009 -> initscore=-1.992343
[LightGBM] [Info] Start training from score -1.992343

📊 Best Threshold: 0.23, Best F1: 0.4854


In [8]:
from google.colab import files
files.download("submission_weighted_auto.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# 파일 경로 함수
def get_path(filename):
    return "/content/open_1/" + filename

# 데이터 로드
train = pd.read_csv(get_path("train.csv"))
test = pd.read_csv(get_path("test.csv"))
submission = pd.read_csv(get_path("sample_submission.csv"))

# Target 구분
y = train['Cancer']
X = train.drop(columns=['ID', 'Cancer'])
X_test = test.drop(columns=['ID'])

# 파생변수 생성 함수
def add_derived_features(df):
    df = df.copy()
    num_cols = df.select_dtypes(include='number').columns.tolist()
    for col in num_cols:
        df[f'{col}_squared'] = df[col] ** 2
        df[f'{col}_sqrt'] = np.sqrt(np.abs(df[col]))
        df[f'{col}_log'] = np.log1p(np.abs(df[col]))
    if 'TSH' in df.columns and 'T4' in df.columns:
        df['T4_TSH_ratio'] = df['T4'] / (df['TSH'] + 1e-3)
    if 'T3' in df.columns and 'Nodule_Size' in df.columns:
        df['T3_times_Nodule'] = df['T3'] * df['Nodule_Size']
    return df

X = add_derived_features(X)
X_test = add_derived_features(X_test)

# 전체 전처리
def preprocess(df, encoders=None):
    df = df.copy()
    encoders = encoders or {}
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].astype(str)
            if col not in encoders:
                encoders[col] = LabelEncoder().fit(df[col])
            df[col] = encoders[col].transform(df[col])
        else:
            df[col] = SimpleImputer(strategy='mean').fit_transform(df[[col]])
    return df, encoders

X, encoders = preprocess(X)
X_test, _ = preprocess(X_test, encoders)

# 기본 모델들
base_learners = [
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)),
    ('lgbm', LGBMClassifier(random_state=42)),
    ('cat', CatBoostClassifier(verbose=0, random_state=42))
]
meta_model = LogisticRegression(max_iter=1000, random_state=42)

# Stacking 메타 모델
stack_model = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_model,
    cv=5,
    passthrough=True,
    n_jobs=-1
)

# 학습
stack_model.fit(X, y)

# 예측
test_probs = stack_model.predict_proba(X_test)[:, 1]

# Threshold 권절 및 저장
thresholds = np.arange(0.1, 0.9, 0.01)
oof_preds = stack_model.predict_proba(X)[:, 1]
f1s = [f1_score(y, oof_preds > t) for t in thresholds]
best_thresh = thresholds[np.argmax(f1s)]
print(f"Best threshold: {best_thresh:.2f}, Best F1: {max(f1s):.4f}")

submission['Cancer'] = (test_probs > best_thresh).astype(int)
submission.to_csv("submission_stacking.csv", index=False)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Best threshold: 0.14, Best F1: 0.5263


In [10]:
from google.colab import files
files.download("submission_stacking.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>