In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, TargetEncoder
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from sklearn.calibration import CalibratedClassifierCV
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import joblib

# Load the raw competition training table into memory.
# NOTE(review): the location is an absolute, machine-specific path; consider
# a configurable DATA_DIR so the notebook can run on another machine.
train_origin = pd.read_csv(
    filepath_or_buffer='/Users/jaesolshin/내 드라이브/2024-2/Google ML Bootcamp2024/data/playground1/train.csv'
)

In [5]:
# Build the modelling frame: index rows by 'id' and cast every column to str
# so that the target encoder below handles each feature as a set of category
# labels. (No sampling happens here; the full training table is used.)
train = train_origin.set_index('id').astype(str)

# Separate predictors from the target. The target was stringified by the
# astype(str) above, so it is cast back to float here.
X = train.drop(['Response'], axis=1)
y = train['Response'].astype(float)

# Stratified train/validation split; only 0.2% of the rows are held out.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.002, stratify=y, random_state=42)

# Target encoding.
# The training rows are encoded with fit_transform(), NOT fit() followed by
# transform(): scikit-learn documents that the two are not equivalent, because
# fit_transform() cross-fits (each row is encoded from folds that exclude it),
# whereas fit().transform() would encode every training row with statistics
# that include that row's own label, leaking the target into the features.
# After fit_transform() the encoder holds encodings learned on the whole
# training split, which is what transform() applies to validation/test rows.
# random_state pins the internal fold shuffling so re-runs are reproducible.
enc = TargetEncoder(random_state=42)
X_train = pd.DataFrame(enc.fit_transform(X_train, y_train), index=X_train.index, columns=X_train.columns)
X_valid = pd.DataFrame(enc.transform(X_valid), index=X_valid.index, columns=X_valid.columns)
X_train = X_train.astype(float)
X_valid = X_valid.astype(float)


In [None]:
import subprocess

# CatBoost base model
print("\nCatBoost:")

# Hyper-parameters, one per line so individual settings are easy to diff.
cat_param = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'class_names': [0, 1],
    'learning_rate': 0.075,
    'iterations': 1000,
    'depth': 9,
    'random_strength': 0,
    'l2_leaf_reg': 0.5,
    'max_leaves': 512,
    'fold_permutation_block': 64,
    'allow_writing_files': False,
    'verbose': 1,
}
cat_model = CatBoostClassifier(random_state=42, **cat_param)

# Fit on the target-encoded training split.
cat_model.fit(X_train, y_train)

# Audible notification through the `say` command-line tool; the spoken
# sentence announces that training of the base model has finished.
subprocess.run(['say', '-v', 'Kyoko', '基本モデルの学習プロセスが完了しました。'])

In [9]:
# Wrap the CatBoost model in a 5-fold probability calibrator and fit it on
# the training split.
# NOTE(review): with cv=5 the calibrator presumably refits copies of the
# estimator on each fold, which would make the standalone cat_model.fit()
# in the previous cell redundant work -- confirm against the sklearn docs.
calibrated = CalibratedClassifierCV(estimator=cat_model, cv=5)
calibrated.fit(X_train, y_train)

# Validation score: ROC AUC computed from column 1 of predict_proba on the
# held-out rows.
valid_preds = calibrated.predict_proba(X_valid)[:, 1]
valid_auc = roc_auc_score(y_true=y_valid, y_score=valid_preds)
print("ROC AUC:", valid_auc)

# Audible notification: announces that training of the calibrated model is done.
subprocess.run(['say', '-v', 'Kyoko', '補正モデルの学習プロセスが完了しました。'])

0:	total: 8.28s	remaining: 2h 17m 55s
1:	total: 23.9s	remaining: 3h 19m 10s
2:	total: 37.4s	remaining: 3h 27m 6s
3:	total: 41.4s	remaining: 2h 51m 37s
4:	total: 44s	remaining: 2h 25m 59s
5:	total: 46.2s	remaining: 2h 7m 27s
6:	total: 48.3s	remaining: 1h 54m 7s
7:	total: 51.1s	remaining: 1h 45m 34s
8:	total: 53.6s	remaining: 1h 38m 25s
9:	total: 56.4s	remaining: 1h 33m 2s
10:	total: 58.8s	remaining: 1h 28m 5s
11:	total: 1m 1s	remaining: 1h 23m 53s
12:	total: 1m 2s	remaining: 1h 19m 37s
13:	total: 1m 4s	remaining: 1h 15m 50s
14:	total: 1m 6s	remaining: 1h 13m 1s
15:	total: 1m 10s	remaining: 1h 11m 57s
16:	total: 1m 11s	remaining: 1h 9m 22s
17:	total: 1m 13s	remaining: 1h 7m 5s
18:	total: 1m 15s	remaining: 1h 5m 18s
19:	total: 1m 17s	remaining: 1h 3m 16s
20:	total: 1m 19s	remaining: 1h 1m 30s
21:	total: 1m 20s	remaining: 59m 55s
22:	total: 1m 22s	remaining: 58m 20s
23:	total: 1m 24s	remaining: 56m 57s
24:	total: 1m 27s	remaining: 57m 2s
25:	total: 1m 29s	remaining: 56m 5s
26:	total: 1m 31

CompletedProcess(args=['say', '-v', 'Kyoko', 'プロセスが完了しました。'], returncode=0)

In [10]:
# Load the test set and push it through the same preparation as the training
# features: index by 'id', cast every column to str, then apply the already
# fitted target encoder.
test_origin = pd.read_csv(
    filepath_or_buffer='/Users/jaesolshin/내 드라이브/2024-2/Google ML Bootcamp2024/data/playground1/test.csv'
)
X_test = test_origin.set_index('id').astype(str)
X_test = pd.DataFrame(
    data=enc.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

# Audible notification that the cell has finished.
subprocess.run(['say', '-v', 'Kyoko', 'プロセスが完了しました。'])

CompletedProcess(args=['say', '-v', 'Kyoko', 'プロセスが完了しました。'], returncode=0)

In [13]:
# Generate predictions: column 1 of predict_proba from the calibrated model,
# paired with the test ids.
y_test_pred = calibrated.predict_proba(X_test)[:, 1]
submission1 = pd.DataFrame({'id': X_test.index, 'Response': y_test_pred })

# Save the predictions to CSV.
# The status message is built from the same path that is written, so it can
# no longer disagree with the file on disk (it previously announced
# 'calib_cat_predictions.csv' while writing 'cat_calib_predictions.csv').
submission1_path = '/Users/jaesolshin/Documents/GitHub/kgpg_binary/submission/cat_calib_predictions.csv'
submission1.to_csv(submission1_path, index=False)
print(f"Predictions saved to '{submission1_path}'")

# Audible notification that the cell has finished.
subprocess.run(['say', '-v', 'Kyoko', 'プロセスが完了しました。'])


Predictions saved to 'calib_cat_predictions.csv'


CompletedProcess(args=['say', '-v', 'Kyoko', 'プロセスが完了しました。'], returncode=0)

In [15]:
# Re-save the calibrated CatBoost predictions to CSV (same destination as the
# cell that created submission1).
# The status message is built from the path actually written; the previous
# literal announced 'calib_cat_predictions.csv', which is not the file name.
submission1_path = '/Users/jaesolshin/Documents/GitHub/kgpg_binary/submission/cat_calib_predictions.csv'
submission1.to_csv(submission1_path, index=False)
print(f"Predictions saved to '{submission1_path}'")

# Audible notification that the cell has finished.
subprocess.run(['say', '-v', 'Kyoko', 'プロセスが完了しました。'])

Predictions saved to 'calib_cat_predictions.csv'


CompletedProcess(args=['say', '-v', 'Kyoko', 'プロセスが完了しました。'], returncode=0)

In [16]:
# Second set of predictions to blend with submission1.
submission2 = pd.read_csv('/Users/jaesolshin/Documents/GitHub/kgpg_binary/submission/lgbm_pseudo_predictions.csv')

# Merge the two prediction frames on 'id'.
# validate='one_to_one' makes pandas raise if either side has repeated ids,
# instead of silently multiplying rows.
submission = pd.merge(submission1, submission2, on='id', suffixes=('_1', '_2'), validate='one_to_one')

# The merge is an inner join, so ids missing from submission2 would be dropped
# without notice; fail loudly rather than write a short submission file.
if len(submission) != len(submission1):
    raise ValueError(
        f"Merged submission has {len(submission)} rows but submission1 has "
        f"{len(submission1)}; the two prediction files do not cover the same ids."
    )

# Blend: the new 'Response' is the row-wise mean of the two predictions.
submission['Response'] = submission[['Response_1', 'Response_2']].mean(axis=1)

# Drop the per-model columns, leaving only 'id' and the blended 'Response'.
submission = submission.drop(columns=['Response_1', 'Response_2'])

# Save the blended predictions to CSV.
# NOTE(review): the output file is named '..._keras_...' although the second
# input read above is 'lgbm_pseudo_predictions.csv' -- confirm the intended name.
submission.to_csv('/Users/jaesolshin/Documents/GitHub/kgpg_binary/submission/calib_cat_keras_predictions.csv', index=False)
print("Predictions saved to 'calib_cat_keras_predictions.csv'")

# Audible notification that the cell has finished.
subprocess.run(['say', '-v', 'Kyoko', 'プロセスが完了しました。'])

Predictions saved to 'calib_cat_keras_predictions.csv'


CompletedProcess(args=['say', '-v', 'Kyoko', 'プロセスが完了しました。'], returncode=0)