# In [None]:

# Colab-only setup: mount Google Drive at /content/drive so that the CSV
# files stored under /content/drive/MyDrive can be read like local files.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier

# ------------------ Data loading ------------------
# Training rows that contain any missing value are discarded up front.
df = pd.read_csv("/content/drive/MyDrive/Gapsangsun/train.csv").dropna()

# Stop early (with a message) when the target column 'Cancer' is absent.
if 'Cancer' not in df.columns:
    print("🚨 'Cancer' 컬럼이 데이터에 없습니다. CSV 파일을 확인해주세요.")
else:
    # Every object-typed column except the identifier is label-encoded.
    # errors='ignore' keeps this working when 'ID' is numeric and therefore
    # not among the object columns.
    categorical_cols = df.select_dtypes(include='object').columns.drop(['ID'], errors='ignore')
    label_encoders = {}

    for col in categorical_cols:
        le = LabelEncoder()
        # Cast to str so the training data uses the same representation
        # that is applied to the test data further down.
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le

    # Categorical *feature* columns only. If the target itself was
    # object-typed it has an encoder too, but it is what we predict for the
    # test set, so it must not be looked up there.
    feature_cat_cols = [col for col in categorical_cols if col != 'Cancer']

    X = df.drop(columns=['ID', 'Cancer'])
    y = df['Cancer']
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

    # Ratio of negative to positive training samples, handed to XGBoost as
    # scale_pos_weight. Falls back to 1.0 instead of dividing by zero when
    # the training split has no positive sample.
    n_negative = int((y_train == 0).sum())
    n_positive = int((y_train == 1).sum())
    scale_pos_weight = n_negative / n_positive if n_positive else 1.0

    # ------------------ Model training ------------------
    # NOTE: the deprecated `use_label_encoder` argument is intentionally not
    # passed; recent XGBoost releases only warn that it is unused.
    model = XGBClassifier(
        random_state=42,
        eval_metric='logloss',
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=scale_pos_weight
    )
    model.fit(X_train, y_train)

    # ------------------ Threshold tuning ------------------
    # Pick the probability cut-off in [0.10, 0.89] (step 0.01) that gives the
    # highest F1 on the validation split. The grid is built from integers so
    # the thresholds are not affected by fractional-step accumulation in
    # np.arange.
    y_probs = model.predict_proba(X_valid)[:, 1]
    best_f1 = 0
    best_th = 0.5
    for th in np.arange(10, 90) / 100:
        preds = (y_probs >= th).astype(int)
        score = f1_score(y_valid, preds)
        if score > best_f1:
            best_f1 = score
            best_th = float(th)

    print("최적 threshold:", best_th)

    # ------------------ Validation metrics ------------------
    # NOTE: the threshold was selected on this same validation split, so the
    # numbers printed below are optimistic estimates.
    y_pred_opt = (y_probs >= best_th).astype(int)
    print("F1-score (threshold 적용):", f1_score(y_valid, y_pred_opt))
    print("정확도:", accuracy_score(y_valid, y_pred_opt))
    print("분류 리포트:\n", classification_report(y_valid, y_pred_opt))
    print("혼동 행렬:\n", confusion_matrix(y_valid, y_pred_opt))

    # ------------------ Test prediction ------------------
    test_df = pd.read_csv("/content/drive/MyDrive/Gapsangsun/test.csv")
    test_ids = test_df["ID"]
    for col in feature_cat_cols:
        # LabelEncoder.transform raises on labels it never saw during fit
        # (including the 'nan' produced by missing test values, since only
        # the training data was dropna()'d). Map through the fitted classes
        # instead: known labels get the same codes as transform() would give,
        # unknown ones become NaN, which XGBoost treats as missing.
        codes = {label: code for code, label in enumerate(label_encoders[col].classes_)}
        encoded = test_df[col].astype(str).map(codes)
        n_unseen = int(encoded.isna().sum())
        if n_unseen:
            print(f"⚠️ '{col}' 컬럼에 학습 데이터에 없는 값 {n_unseen}건 → 결측치(NaN)로 처리합니다.")
        test_df[col] = encoded
    # Select exactly the training feature columns, in training order, so the
    # model sees the layout it was fitted on (also drops 'ID').
    X_test = test_df[X.columns]
    test_probs = model.predict_proba(X_test)[:, 1]
    test_preds = (test_probs >= best_th).astype(int)

    # If the target had to be label-encoded, report the original labels.
    if 'Cancer' in label_encoders:
        test_preds = label_encoders['Cancer'].inverse_transform(test_preds)

    submission = pd.DataFrame({"ID": test_ids, "Cancer": test_preds})
    submission.to_csv("submission.csv", index=False)
    print(submission.head())
