In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from tqdm import tqdm

In [16]:
# 데이터 로드
train = pd.read_csv('/Users/jaesolshin/내 드라이브/2024-2/Google ML Bootcamp2024/data/playground1/train.csv')
train = train.sample(frac=0.01, random_state = 42)

In [17]:
# 범주형 변수를 팩터로 변환 (카테고리형)
train.iloc[:,[1,3,4,5,6,7,9]] = train.iloc[:,[1,3,4,5,6,7,9]].astype('category')

# 최소-최대 정규화 (Min-Max 스케일링)
scaler = StandardScaler()
train.iloc[:,[2,8,10]] = scaler.fit_transform(train.iloc[:,[2,8,10]])

# 이분변수 생성: "Annual_Premium" == 2630.0 인 경우
train['Annual_Premium_Binary'] = (train['Annual_Premium'] == 2630.0).astype('category')

# 로그 변환된 "Annual_Premium" 변수 생성
train['Annual_Premium_Log'] = np.where(train['Annual_Premium'] > 0, np.log1p(train['Annual_Premium']), 0)

# 예측에 필요 없는 'id'와 'Annual_Premium' 변수를 드롭
train = train.drop(columns=['id', 'Annual_Premium'])

# 원-핫 인코딩 (One-Hot Encoding)
category_columns = ['Gender', 'Driving_License', 'Region_Code', 'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Policy_Sales_Channel', 'Annual_Premium_Binary']
train = pd.get_dummies(train, columns=category_columns, drop_first=True, dtype=int)

# XGBoost에서 발생하는 문제 해결
train.columns = train.columns.str.replace('[', '').str.replace(']', '').str.replace('<', '')


3854901     1
7377384     1
10901782    1
8099641     1
           ..
7582229     1
6945714     1
2404410     1
7484099     1
700051      1
Name: Driving_License, Length: 115048, dtype: category
Categories (2, int64): [0, 1]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  train.iloc[:,[1,3,4,5,6,7,9]] = train.iloc[:,[1,3,4,5,6,7,9]].astype('category')
3854901     28.0
7377384     18.0
10901782    37.0
8099641     28.0
            ... 
7582229     46.0
6945714     33.0
2404410     33.0
7484099     46.0
700051      30.0
Name: Region_Code, Length: 115048, dtype: category
Categories (53, float64): [0.0, 1.0, 2.0, 3.0, ..., 49.0, 50.0, 51.0, 52.0]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  train.iloc[:,[1,3,4,5,6,7,9]] = train.iloc[:,[1,3,4,5,6,7,9]].astype('category')
3854901     0
7377384     0
10901782    1
8099641     1
           ..
7582229     1
6945714     1
2404410     0
7484099     1
700051  

In [18]:
# 모델 리스트
models = [
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=42)),
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('CatBoost', CatBoostClassifier(random_state=42, verbose=0)),
    ('LightGBM', LGBMClassifier(random_state=42)),
    ('XGBoost', XGBClassifier(random_state=42))
]

# 평가 지표 계산 함수
def evaluate_model(y_true, y_pred, y_proba):
    conf_matrix = confusion_matrix(y_true, y_pred)
    TP = conf_matrix[1, 1]
    FN = conf_matrix[1, 0]
    TN = conf_matrix[0, 0]
    FP = conf_matrix[0, 1]

    accuracy = (TP + TN) / (TP + TN + FP + FN)
    precision = TP / (TP + FP) if (TP + FP) != 0 else 0
    recall = TP / (TP + FN) if (TP + FN) != 0 else 0
    specificity = TN / (TN + FP) if (TN + FP) != 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
    roc_auc = roc_auc_score(y_true, y_proba)

    return {
        'Confusion Matrix': conf_matrix.tolist(),
        'Accuracy': accuracy,
        'Precision': precision,
        'Positive Recall': recall,
        'Specificity': specificity,
        'F1-Score': f1,
        'AUC': roc_auc
    }


In [19]:
# 교차 검증 설정
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
X = train.drop(columns=['Response'])
y = train['Response']

# 결과를 저장할 리스트 초기화
results = []
probas = []

# 각 모델 학습 및 평가
for name, model in models:
    fold_metrics = []
    fold_probas = []
    for train_index, valid_index in tqdm(skf.split(X, y), desc=f"Training {name}"):
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        # 모델 학습
        model.fit(X_train, y_train)

        # 예측
        valid_y_pred = model.predict(X_valid)
        valid_y_proba = model.predict_proba(X_valid)[:, 1]  # 양성 클래스의 확률만 저장

        # 평가
        metrics = evaluate_model(y_valid, valid_y_pred, valid_y_proba)
        metrics.update({
            'Model': name,
            'y_true': y_valid,
            'y_scores': valid_y_proba
        })
        
        fold_metrics.append(metrics)
        fold_probas.append(valid_y_proba)

    results.extend(fold_metrics)
    probas.extend(fold_probas)

# DataFrame 생성
results_df = pd.DataFrame(results)
results_df

Training Logistic Regression: 5it [00:18,  3.67s/it]
Training Random Forest: 5it [02:42, 32.52s/it]
Training CatBoost: 5it [02:30, 30.08s/it]
Training LightGBM: 0it [00:00, ?it/s]

[LightGBM] [Info] Number of positive: 11313, number of negative: 80725
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025032 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 803
[LightGBM] [Info] Number of data points in the train set: 92038, number of used features: 122
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122917 -> initscore=-1.965096
[LightGBM] [Info] Start training from score -1.965096


Training LightGBM: 1it [00:02,  2.72s/it]

[LightGBM] [Info] Number of positive: 11313, number of negative: 80725
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011959 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 814
[LightGBM] [Info] Number of data points in the train set: 92038, number of used features: 123
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122917 -> initscore=-1.965096
[LightGBM] [Info] Start training from score -1.965096


Training LightGBM: 2it [00:05,  2.53s/it]

[LightGBM] [Info] Number of positive: 11312, number of negative: 80726
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008190 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 806
[LightGBM] [Info] Number of data points in the train set: 92038, number of used features: 123
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122906 -> initscore=-1.965197
[LightGBM] [Info] Start training from score -1.965197


Training LightGBM: 3it [00:09,  3.31s/it]

[LightGBM] [Info] Number of positive: 11313, number of negative: 80726
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.254613 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 807
[LightGBM] [Info] Number of data points in the train set: 92039, number of used features: 124
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122915 -> initscore=-1.965108
[LightGBM] [Info] Start training from score -1.965108


Training LightGBM: 4it [00:21,  6.97s/it]

[LightGBM] [Info] Number of positive: 11313, number of negative: 80726
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005607 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 816
[LightGBM] [Info] Number of data points in the train set: 92039, number of used features: 124
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122915 -> initscore=-1.965108
[LightGBM] [Info] Start training from score -1.965108


Training LightGBM: 5it [00:24,  4.82s/it]
Training XGBoost: 5it [00:21,  4.23s/it]


Unnamed: 0,Confusion Matrix,Accuracy,Precision,Positive Recall,Specificity,F1-Score,AUC,Model,y_true,y_scores
0,"[[20168, 14], [2824, 4]]",0.876662,0.222222,0.001414,0.999306,0.002811,0.851574,Logistic Regression,184360 1 294750 0 9397603 1 1227194...,"[0.0952848723491479, 0.00032033116215664367, 0..."
1,"[[20161, 21], [2824, 4]]",0.876358,0.16,0.001414,0.998959,0.002804,0.85103,Logistic Regression,4123482 0 53254 0 5137627 0 1022...,"[0.00032023988035724055, 0.0002929728907044894..."
2,"[[20170, 11], [2821, 8]]",0.876923,0.421053,0.002828,0.999455,0.005618,0.852002,Logistic Regression,2899125 0 10901782 0 5911933 0 5169...,"[0.32264754639342663, 0.00029161582772650496, ..."
3,"[[20161, 20], [2825, 3]]",0.876353,0.130435,0.001061,0.999009,0.002105,0.849458,Logistic Regression,3854901 0 9356601 0 9969942 0 9835...,"[0.25711251861437723, 0.1729949490237273, 0.00..."
4,"[[20162, 19], [2814, 14]]",0.876874,0.424242,0.00495,0.999059,0.009787,0.850723,Logistic Regression,7377384 0 8099641 0 1128905 0 1023...,"[0.16752872952877843, 0.0012602046393913658, 0..."
5,"[[19467, 715], [2390, 438]]",0.865059,0.379879,0.15488,0.964572,0.220045,0.838442,Random Forest,184360 1 294750 0 9397603 1 1227194...,"[0.01, 0.0, 0.26, 0.22, 0.14, 0.0, 0.0, 0.08, ..."
6,"[[19424, 758], [2371, 457]]",0.864016,0.376132,0.161598,0.962442,0.22607,0.833862,Random Forest,4123482 0 53254 0 5137627 0 1022...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.23, 0.0, 0.0, 0.0,..."
7,"[[19470, 711], [2381, 448]]",0.865624,0.38654,0.15836,0.964769,0.224674,0.836639,Random Forest,2899125 0 10901782 0 5911933 0 5169...,"[0.28, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.31, 0.0..."
8,"[[19472, 709], [2368, 460]]",0.86627,0.393499,0.162659,0.964868,0.230173,0.834376,Random Forest,3854901 0 9356601 0 9969942 0 9835...,"[0.37, 0.12, 0.04, 0.0, 0.35, 0.45, 0.43, 0.19..."
9,"[[19445, 736], [2394, 434]]",0.863966,0.37094,0.153465,0.96353,0.217109,0.833686,Random Forest,7377384 0 8099641 0 1128905 0 1023...,"[0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.51, 0.07..."


In [10]:
mean_auc_df = result_df.groupby('Model', as_index=False).mean()
print(mean_auc_df)

                 Model       AUC
0             CatBoost  0.866207
1             LightGBM  0.865017
2  Logistic Regression  0.850957
3        Random Forest  0.835401
4              XGBoost  0.863330
