### Test_03 XGBoost

In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve

- train.csv [파일]
```
ID : 샘플별 고유 ID
Age : 환자의 나이
Gender : 성별
Country : 국적
Race : 인종
Family_Background : 가족력 여부
Radiation_History : 방사선 노출 이력
Iodine_Deficiency : 요오드 결핍 여부
Smoke : 흡연 여부
Weight_Risk : 체중 관련 위험도
Diabetes : 당뇨병 여부
Nodule_Size : 갑상선 결절 크기
TSH_Result : TSH 호르몬 검사 결과
T4_Result : T4 호르몬 검사 결과
T3_Result : T3 호르몬 검사 결과
Cancer : 갑상선암 여부 (0: 양성, 1: 악성)
```

### 범주형 데이터인 변수
```
Gender : 성별
Country : 국적
Race : 인종
Family_Background : 가족력 여부
Radiation_History : 방사선 노출 이력
Iodine_Deficiency : 요오드 결핍 여부
Smoke : 흡연 여부
Weight_Risk : 체중 관련 위험도
Diabetes : 당뇨병 여부
```

In [8]:
# 학습 데이터
train = pd.read_csv('open/train.csv')
# 검증 데이터
test = pd.read_csv('open/test.csv')

In [10]:
train['Cancer'].value_counts(normalize=True)
# 0 : 양성, 1: 악성

Cancer
0    0.880001
1    0.119999
Name: proportion, dtype: float64

In [12]:
print('갑상선암 진단 학습 데이터셋 크기 : ', train.shape)
train.info()

갑상선암 진단 학습 데이터셋 크기 :  (87159, 16)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87159 entries, 0 to 87158
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ID                 87159 non-null  object 
 1   Age                87159 non-null  int64  
 2   Gender             87159 non-null  object 
 3   Country            87159 non-null  object 
 4   Race               87159 non-null  object 
 5   Family_Background  87159 non-null  object 
 6   Radiation_History  87159 non-null  object 
 7   Iodine_Deficiency  87159 non-null  object 
 8   Smoke              87159 non-null  object 
 9   Weight_Risk        87159 non-null  object 
 10  Diabetes           87159 non-null  object 
 11  Nodule_Size        87159 non-null  float64
 12  TSH_Result         87159 non-null  float64
 13  T4_Result          87159 non-null  float64
 14  T3_Result          87159 non-null  float64
 15  Cancer             87159 non-null  i

### 데이터 라벨링

In [18]:
all_data = pd.concat([train.drop(columns=['Cancer']), test], axis=0)
label_cols = ['Gender', 'Family_Background', 'Radiation_History',
              'Iodine_Deficiency', 'Smoke', 'Diabetes', 'Weight_Risk']

# 1. Label Encoding 적용할 컬럼
le = LabelEncoder()
for col in label_cols:
    all_data[col] = le.fit_transform(all_data[col].astype(str))

# 2. One-Hot Encoding 적용할 컬럼
one_hot_cols = ['Country', 'Race']

all_data = pd.get_dummies(all_data, columns=one_hot_cols)

# 다시 train/test로 분할
train_encoded = all_data.iloc[:len(train), :]
train_encoded['Cancer'] = train['Cancer'].values
test_encoded = all_data.iloc[len(train):, :]

<class 'pandas.core.frame.DataFrame'>
Index: 87159 entries, 0 to 87158
Data columns (total 29 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ID                 87159 non-null  object 
 1   Age                87159 non-null  int64  
 2   Gender             87159 non-null  int64  
 3   Family_Background  87159 non-null  int64  
 4   Radiation_History  87159 non-null  int64  
 5   Iodine_Deficiency  87159 non-null  int64  
 6   Smoke              87159 non-null  int64  
 7   Weight_Risk        87159 non-null  int64  
 8   Diabetes           87159 non-null  int64  
 9   Nodule_Size        87159 non-null  float64
 10  TSH_Result         87159 non-null  float64
 11  T4_Result          87159 non-null  float64
 12  T3_Result          87159 non-null  float64
 13  Country_BRA        87159 non-null  bool   
 14  Country_CHN        87159 non-null  bool   
 15  Country_DEU        87159 non-null  bool   
 16  Country_GBR        87159 no

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_encoded['Cancer'] = train['Cancer'].values


In [10]:
from sklearn.ensemble import VotingClassifier

x = train_encoded.drop(['ID', 'Cancer'], axis = 1)
y = train_encoded['Cancer']
dum_test = test_encoded.drop('ID', axis=1)


x_train, x_test, y_train, y_test = train_test_split(x, y)

ensemble_model = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=300, max_depth=10, min_samples_split=5, class_weight='balanced_subsample', random_state=42)),
        ('xgb', XGBClassifier(n_estimators=300, max_depth=6, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8, use_label_encoder=False, eval_metric='logloss', random_state=42)),
        ('lgb', LGBMClassifier(n_estimators=300, max_depth=6, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8, class_weight='balanced', random_state=42))
    ],
    voting='soft',  # 'hard'는 투표 기반, 'soft'는 확률 평균
    n_jobs=-1
)

ensemble_model.fit(x_train, y_train)
pred = ensemble_model.predict(x_test)
print('Ensemble f1:', f1_score(y_test, pred, average='macro'))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Ensemble f1: 0.7055656103850362


### 앙상블 2

In [12]:
# 1. 데이터 분리 및 스케일링
x = train_encoded.drop(['ID', 'Cancer'], axis=1)
y = train_encoded['Cancer']
dum_test = test_encoded.drop(['ID'], axis=1)

scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)
test_scaled = scaler.transform(dum_test)

# 2. 개별 모델 정의
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_split=5,
    class_weight='balanced_subsample',
    random_state=42
)

xgb = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

lgb = LGBMClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight='balanced',
    random_state=42
)

# 3. VotingClassifier 앙상블 구성
ensemble_model = VotingClassifier(
    estimators=[('rf', rf), ('xgb', xgb), ('lgb', lgb)],
    voting='soft',
    n_jobs=-1
)

# 4. StratifiedKFold 교차검증 설정
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_scores = []
acc_scores = []
test_preds = []

# 5. Cross-Validation 루프
for fold, (train_idx, val_idx) in enumerate(skf.split(x_scaled, y)):
    x_train, x_val = x_scaled[train_idx], x_scaled[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    ensemble_model.fit(x_train, y_train)
    val_pred = ensemble_model.predict(x_val)

    f1 = f1_score(y_val, val_pred, average='macro')
    acc = accuracy_score(y_val, val_pred)

    f1_scores.append(f1)
    acc_scores.append(acc)

    print(f'Fold {fold+1} F1: {f1:.4f}, Acc: {acc:.4f}')

    # 테스트 예측 누적
    test_preds.append(ensemble_model.predict_proba(test_scaled))

# 6. 평균 성능 출력
print(f'\n[최종 결과]')
print(f'평균 F1 score: {np.mean(f1_scores):.4f}')
print(f'평균 Accuracy: {np.mean(acc_scores):.4f}')

# 7. 테스트 데이터 예측 결과 집계 (소프트보팅 평균)
mean_test_pred = np.mean(test_preds, axis=0)
final_test_pred = np.argmax(mean_test_pred, axis=1)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 8367, number of negative: 61360
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004250 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1161
[LightGBM] [Info] Number of data points in the train set: 69727, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of positive: 7907, number of negative: 57462
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003892 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1138
[LightGBM] [Info] Number of data points in the train set: 65369, number of used features: 27
[LightGBM] [Info] [b

KeyboardInterrupt: 

# 현재 가장 좋은 성능 !!! 

In [16]:
# 1. Feature, Target 설정
x = train_encoded.drop(['ID', 'Cancer'], axis=1)
y = train_encoded['Cancer']
dum_test = test_encoded.drop(['ID'], axis=1)

# 2. 스케일링
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)
test_scaled = scaler.transform(dum_test)

# 3. 모델 정의
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_split=5,
    class_weight='balanced_subsample',
    random_state=42
)

xgb = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

lgb = LGBMClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight='balanced',
    random_state=42
)

ensemble_model = VotingClassifier(
    estimators=[('rf', rf), ('xgb', xgb), ('lgb', lgb)],
    voting='soft',
    n_jobs=-1
)

# 4. 전체 데이터로 모델 학습
ensemble_model.fit(x_scaled, y)


# 5. 테스트 예측
pred = ensemble_model.predict(test_scaled)


# 6. 제출 파일 생성
submission = pd.DataFrame({
    'ID': test['ID'],
    'Cancer': pred
})
# 저장
#submission.to_csv('final_model_submission.csv', index=False)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


# 성능 너무 안좋아서 버림 (smote+Threshold)

In [39]:
# 1. Feature, Target 설정
x = train_encoded.drop(['ID', 'Cancer'], axis=1)
y = train_encoded['Cancer']
dum_test = test_encoded.drop(['ID'], axis=1)

# 2. 데이터 분리 (테스트용)
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, test_size=0.2, random_state=42)

# 3. 스케일링
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)
dum_test_scaled = scaler.transform(dum_test)

# 4. SMOTE 적용
smote = SMOTE(random_state=42)
x_sm, y_sm = smote.fit_resample(x_train_scaled, y_train)

# 5. 모델 정의
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_split=5,
    class_weight='balanced_subsample',
    random_state=42
)

xgb = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

lgb = LGBMClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight='balanced',
    random_state=42
)

ensemble_model = VotingClassifier(
    estimators=[('rf', rf), ('xgb', xgb), ('lgb', lgb)],
    voting='soft',
    n_jobs=-1
)

# 6. 모델 학습
ensemble_model.fit(x_sm, y_sm)

# 7. 확률 예측 후 threshold 최적화
y_proba = ensemble_model.predict_proba(x_test_scaled)[:, 1]

precisions, recalls, thresholds = precision_recall_curve(y_test, y_proba)
f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-10)
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]
best_f1 = f1_scores[best_idx]

print(f"\n🎯 Best Threshold for F1: {best_threshold:.3f}")
print(f"🏆 Best F1 Score: {best_f1:.3f}")

# 8. 최적 threshold로 재예측
y_pred_best = (y_proba >= best_threshold).astype(int)

# 9. 평가 출력
print("\n[최적 threshold 적용한 Classification Report]")
print(classification_report(y_test, y_pred_best))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_best))

# 10. 실제 제출용 예측 (선택)
final_pred = (ensemble_model.predict_proba(dum_test_scaled)[:, 1] >= best_threshold).astype(int)
submission = pd.DataFrame({
    'ID': test['ID'],
    'Cancer': final_pred
})
# submission.to_csv('final_model_submission.csv', index=False)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



🎯 Best Threshold for F1: 0.482
🏆 Best F1 Score: 0.453

[최적 threshold 적용한 Classification Report]
              precision    recall  f1-score   support

           0       0.92      0.94      0.93     15340
           1       0.48      0.42      0.45      2092

    accuracy                           0.88     17432
   macro avg       0.70      0.68      0.69     17432
weighted avg       0.87      0.88      0.87     17432

Confusion Matrix:
[[14395   945]
 [ 1203   889]]
