In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, TargetEncoder
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from sklearn.calibration import CalibratedClassifierCV
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import joblib
import subprocess

# Read the data
train_origin = pd.read_csv('/Users/jaesolshin/내 드라이브/2024-2/Google ML Bootcamp2024/data/playground1/train.csv')

In [3]:
results=[]

def modeling(model, X_train, y_train, X_valid, y_valid):
    model.fit(X_train, y_train)
    valid_preds = model.predict_proba(X_valid)[:, 1]
    valid_auc = roc_auc_score(y_valid, valid_preds)
    print("ROC AUC:", valid_auc)

    results.append({"ROC AUC":valid_auc,"valid_preds":valid_preds})
    return valid_auc, valid_preds

def calibrated_modeling(model, X_train, y_train, X_valid, y_valid):
    model.fit(X_train, y_train)
    calibrated = CalibratedClassifierCV(model, cv=5).fit(X_train, y_train)
    valid_preds = calibrated.predict_proba(X_valid)[:, 1]
    valid_auc = roc_auc_score(y_valid, valid_preds)
    print("ROC AUC:", valid_auc)


#테스트 절차
1. train, valid, test set 분할
2. train으로 훈련(X_train, y_train) => valid로 검증(X_valid, y_valid)
3. 2에서 검증한 모델에 전체 X_test를 넣어 Y_test_pred 생성.
4. Y_test, Y_test_pred -> 성능지표(auc) 확인 : baseline approach

5. test 중 일부(36%) 샘플링 => X_test_sampled, y_test_sampled
6. 2에서 검증한 모델로 X_test_sampled => y_test_pseudolable 생성
7. 생성한 레이블이 신뢰할 만한지 y_test_sampled와 비교(정확도, 재현율)

8. train 데이터에 (X_test_sampled, y_test_pseudolable) 샘플을 추가하여 pooled 데이터 구성(X_pooled, y_pooled).
10. pooled에서 훈련 뒤 valid로 검증.
11. 7에서 검증한 모델에 전체 X_test를 넣어 Y_test_pred 생성.
12. Y_test, Y_test_pred -> 성능지표(auc) 확인 : pseudo-labling approach

13. pooled+valid로 훈련, 검증 X.
14. 13에서 훈련한 모델에 전체 X_test를 넣어 Y_test_pred 생성.
15. Y_test, Y_test_pred -> 성능지표(auc) 확인 : pseudo-labling approach

In [5]:
### 0. 준비 ###

# 전체 데이터 전처리
train = train_origin.set_index('id').astype(str)

# 예측변수 분리
X = train.drop(['Response'], axis=1)
y = train['Response'].astype(float)

### 1. train, valid, test set 분할 ###

# train:test 0.5:0.5 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, stratify=y, random_state=240726)

# train:valid 0.8:0.2 split => train:valid:test = 0.4:0.1:0.5
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train, random_state=240726)

# 타겟 인코딩
enc = TargetEncoder()
X_train_tf = enc.fit_transform(X_train, y_train)
X_valid_tf = enc.transform(X_valid)  # 학습된 인코더를 valid 데이터에 적용
X_train_tf = X_train_tf.astype(float)
X_valid_tf = X_valid_tf.astype(float)

In [15]:
### 2. train으로 훈련(X_train, y_train) => valid로 검증(X_valid, y_valid) ###

# LightGBM
print("\nLightGBM:")
best_param = {'lambda_l1': 0.1, 'lambda_l2': 0.1, 'learning_rate': 0.1, 'max_depth': -1, 'n_estimators': 100, 'num_leaves': 20, 'verbose':-1}
lgbm_model = LGBMClassifier(**best_param, random_state=42)

modeling(lgbm_model, X_train_tf, y_train, X_valid_tf, y_valid)
calibrated_modeling(lgbm_model, X_train_tf, y_train, X_valid_tf, y_valid)


LightGBM:
ROC AUC: 0.8848090032754495
ROC AUC: 0.8850564516068894


In [16]:
### 3. 2에서 검증한 모델에 전체 X_test를 넣어 Y_test_pred 생성. ###
### 4. Y_test, Y_test_pred -> 성능지표(auc) 확인 : baseline approach ###

X_test_tf = enc.transform(X_test)  # 학습된 인코더를 테스트 데이터에 적용
X_test_tf  = X_test_tf.astype(float)

modeling(lgbm_model, X_train_tf, y_train, X_test_tf, y_test)
calibrated_modeling(lgbm_model, X_train_tf, y_train, X_test_tf, y_test)

ROC AUC: 0.8850589032106315
ROC AUC: 0.8853068166707659


In [18]:
### 5. test 중 일부(36%) 샘플링 => X_test_sampled, y_test_sampled ### 
X_test_sampled, X_test_rest, y_test_sampled, y_test_rest = train_test_split(X_test, y_test, test_size=0.36, stratify=y_test, random_state=240726)

X_test_sampled_tf = enc.transform(X_test_sampled)  # 학습된 인코더를 테스트 데이터에 적용
X_test_sampled_tf  = X_test_sampled_tf.astype(float)

### 6. 2에서 검증한 모델로 X_test_sampled => y_test_pseudolable 생성 ### 
y_test_pseudolable = lgbm_model.predict(X_test_sampled_tf)[:, 1]

### 7. 생성한 레이블이 신뢰할 만한지 y_test_sampled와 비교(정확도, 재현율) ### 
print(classification_report(y_test_sampled, y_test_pseudolable))

              precision    recall  f1-score   support

         0.0       0.89      0.98      0.94   3228716
         1.0       0.59      0.16      0.26    452819

    accuracy                           0.88   3681535
   macro avg       0.74      0.57      0.60   3681535
weighted avg       0.86      0.88      0.85   3681535



In [21]:
### 8. train 데이터에 (X_test_sampled_tf, y_test_pseudolable) 샘플을 추가하여 pooled 데이터 구성(X_pooled, y_pooled).
X_pooled = np.concatenate((X_train_tf, X_test_sampled_tf), axis=0)
y_pooled = np.concatenate((y_train, y_test_pseudolable), axis=0)

### 10. pooled에서 훈련 뒤 valid로 검증.
modeling(lgbm_model, X_pooled, y_pooled, X_valid_tf, y_valid)
calibrated_modeling(lgbm_model, X_pooled, y_pooled, X_valid_tf, y_valid)

ROC AUC: 0.8702029944992574
ROC AUC: 0.8511197296691382


In [30]:
### 11. 7에서 검증한 모델에 전체 X_test를 넣어 Y_test_pred 생성.
### 12. Y_test, Y_test_pred -> 성능지표(auc) 확인 : pseudo-labling approach
modeling(lgbm_model, X_pooled, y_pooled, X_test_tf, y_test)
calibrated_modeling(lgbm_model, X_pooled, y_pooled, X_test_tf, y_test)

ROC AUC: 0.8703705179199774
ROC AUC: 0.8511262806706722


In [None]:
#13. pooled+valid로 훈련, 검증 X.
X_pooled = np.concatenate((X_pooled, X_valid_tf), axis=0)
y_pooled = np.concatenate((y_pooled, y_valid), axis=0)

#14. 13에서 훈련한 모델에 전체 X_test를 넣어 Y_test_pred 생성.
#15. Y_test, Y_test_pred -> 성능지표(auc) 확인 : pseudo-labling approach
modeling(lgbm_model, X_pooled, y_pooled, X_test_tf, y_test)
calibrated_modeling(lgbm_model, X_pooled, y_pooled, X_test_tf, y_test)
subprocess.run(['say', '-v', 'Kyoko', 'プロセスが完了しました。'])