<a href="https://colab.research.google.com/github/Hanbin-git/Dacon_cacer/blob/main/%EA%B8%B0%EB%B3%B8%EC%95%99%EC%83%81%EB%B8%94.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!unzip -o "/content/drive/MyDrive/open_1.zip" -d "/content/open_1"


Archive:  /content/drive/MyDrive/open_1.zip
  inflating: /content/open_1/sample_submission.csv  
  inflating: /content/open_1/test.csv  
  inflating: /content/open_1/train.csv  


In [4]:
import os

def get_path(filename):
    return "/content/open_1/" + filename


In [5]:
import pandas as pd

train = pd.read_csv(get_path("train.csv"))
test = pd.read_csv(get_path("test.csv"))
submission = pd.read_csv(get_path("sample_submission.csv"))

print(train.shape, test.shape)


(87159, 16) (46204, 15)


In [6]:
# 결측치 확인
print(train.isnull().sum())

ID                   0
Age                  0
Gender               0
Country              0
Race                 0
Family_Background    0
Radiation_History    0
Iodine_Deficiency    0
Smoke                0
Weight_Risk          0
Diabetes             0
Nodule_Size          0
TSH_Result           0
T4_Result            0
T3_Result            0
Cancer               0
dtype: int64


In [7]:
#범주형 변수 인코딩 준비
categorical_cols = train.select_dtypes(include='object').columns.tolist()
print("범주형 변수:", categorical_cols)


범주형 변수: ['ID', 'Gender', 'Country', 'Race', 'Family_Background', 'Radiation_History', 'Iodine_Deficiency', 'Smoke', 'Weight_Risk', 'Diabetes']


In [14]:
# SMOTE 설치
!pip install -U imbalanced-learn
!pip install catboost


Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [15]:
!pip install xgboost lightgbm




In [16]:
drop_cols = ['ID']  # 분석 제외
X = train.drop(columns=drop_cols + ['Cancer'])
y = train['Cancer']
X_test = test.drop(columns=drop_cols)


In [17]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

categorical_cols = X.select_dtypes(include='object').columns
encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    encoders[col] = le  # 저장

for col in categorical_cols:
    le = encoders[col]
    # 테스트셋에 학습에 없던 label은 '<UNK>'로 처리
    X_test[col] = X_test[col].map(lambda s: '<UNK>' if s not in le.classes_ else s)
    le.classes_ = np.append(le.classes_, '<UNK>')
    X_test[col] = le.transform(X_test[col])


In [18]:
#  Step 3. SMOTE 오버샘플링 + Train/Val 분할
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)


In [19]:
# Step 4. 모델 앙상블 정의 및 학습 (XGB + LGBM + CatBoost)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

xgb = XGBClassifier(random_state=42, eval_metric='logloss')
lgbm = LGBMClassifier(random_state=42)
cat = CatBoostClassifier(verbose=0, random_state=42)

xgb.fit(X_train_res, y_train_res)
lgbm.fit(X_train_res, y_train_res)
cat.fit(X_train_res, y_train_res)


[LightGBM] [Info] Number of positive: 61360, number of negative: 61360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011056 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1125
[LightGBM] [Info] Number of data points in the train set: 122720, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


<catboost.core.CatBoostClassifier at 0x7bcdab5ea990>

In [20]:
# Step 5. Threshold 최적화 + Soft Voting
from sklearn.metrics import precision_recall_curve
import numpy as np

xgb_val = xgb.predict_proba(X_val)[:, 1]
lgbm_val = lgbm.predict_proba(X_val)[:, 1]
cat_val = cat.predict_proba(X_val)[:, 1]

weights = {'xgb': 1.0, 'lgbm': 1.0, 'cat': 1.5}
total_weight = sum(weights.values())

ensemble_val = (
    xgb_val * weights['xgb'] +
    lgbm_val * weights['lgbm'] +
    cat_val * weights['cat']
) / total_weight

precisions, recalls, thresholds = precision_recall_curve(y_val, ensemble_val)
f1s = 2 * (precisions * recalls) / (precisions + recalls + 1e-8)
best_idx = np.argmax(f1s)
best_threshold = thresholds[best_idx]
print(f"Best Threshold: {best_threshold:.4f}, Best F1 Score: {f1s[best_idx]:.4f}")


Best Threshold: 0.5873, Best F1 Score: 0.3451


In [21]:
#  Step 6. 예측 및 제출 저장
xgb_test = xgb.predict_proba(X_test)[:, 1]
lgbm_test = lgbm.predict_proba(X_test)[:, 1]
cat_test = cat.predict_proba(X_test)[:, 1]

ensemble_test = (
    xgb_test * weights['xgb'] +
    lgbm_test * weights['lgbm'] +
    cat_test * weights['cat']
) / total_weight

final_pred = (ensemble_test >= best_threshold).astype(int)

submission = pd.read_csv(get_path("sample_submission.csv"))
submission['Cancer'] = final_pred
submission.to_csv(get_path("submission.csv"), index=False)

# 다운로드
from google.colab import files
files.download(get_path("submission.csv"))


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>