In [10]:
import pandas as pd
import numpy as np
import keras
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.preprocessing import OneHotEncoder, TargetEncoder
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score
from scikeras.wrappers import KerasClassifier

In [13]:
# Read and split the data
train_origin = pd.read_csv('/Users/jaesolshin/내 드라이브/2024-2/Google ML Bootcamp2024/data/playground1/train.csv')
train = train_origin.sample(frac=0.01, random_state = 42)

# 범주형 변수를 팩터로 변환 (카테고리형)
train.iloc[:,[1,3,4,5,6,7,9]] = train.iloc[:,[1,3,4,5,6,7,9]].astype('category')

# 최소-최대 정규화 (Min-Max 스케일링)
scaler = MinMaxScaler()
train.iloc[:,[2,8,10]] = scaler.fit_transform(train.iloc[:,[2,8,10]])

# 이분변수 생성: "Annual_Premium" == 2630.0 인 경우
train['Annual_Premium_Binary'] = (train['Annual_Premium'] == 2630.0).astype('category')

# 로그 변환된 "Annual_Premium" 변수 생성
train['Annual_Premium_Log'] = np.where(train['Annual_Premium'] > 0, np.log1p(train['Annual_Premium']), 0)

# 예측에 필요 없는 'id'와 'Annual_Premium' 변수를 드롭
train = train.drop(columns=['id', 'Annual_Premium'])

# 원-핫 인코딩 (One-Hot Encoding)
category_columns = ['Gender', 'Driving_License', 'Region_Code', 'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Policy_Sales_Channel', 'Annual_Premium_Binary']
train = pd.get_dummies(train, columns=category_columns, drop_first=True, dtype=int)

# 특징과 레이블 분리
X = train.drop(columns=['Response'])
y = train['Response']

# 훈련 세트와 테스트 세트로 데이터 분할
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# pandas DataFrame을 numpy ndarray로 변환
X_train = X_train.to_numpy()
X_valid = X_valid.to_numpy()
y_train = y_train.to_numpy()
y_valid = y_valid.to_numpy()

Using Auto-Keras Framework

In [None]:
#pip install autokeras
#pip install autokeras --upgrade

In [14]:
import autokeras as ak

# AutoKeras 모델 정의
input_node = ak.Input()
output_node = ak.ClassificationHead()(input_node)
auto_model = ak.AutoModel(
    inputs=input_node,
    outputs=output_node,
    overwrite=True,
    max_trials=10
)

# AutoKeras 모델 훈련
auto_model.fit(X_train, y_train, epochs=50)

# 평가
accuracy = auto_model.evaluate(X_valid, y_valid)

Trial 10 Complete [00h 01m 46s]
val_loss: 0.26931753754615784

Best val_loss So Far: 0.2685414254665375
Total elapsed time: 01h 10m 43s
Epoch 1/50
[1m2877/2877[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 4ms/step - accuracy: 0.8767 - loss: 0.3730
Epoch 2/50
[1m2877/2877[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.8774 - loss: 0.2807
Epoch 3/50
[1m2877/2877[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.8773 - loss: 0.2720
Epoch 4/50
[1m2877/2877[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.8772 - loss: 0.2694
Epoch 5/50
[1m2877/2877[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.8773 - loss: 0.2685
Epoch 6/50
[1m2877/2877[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.8772 - loss: 0.2680
Epoch 7/50
[1m2877/2877[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.8772 - loss: 0.2678
Epoch 8/5

  saveable.load_own_variables(weights_store.get(inner_path))


[1m720/720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8797 - loss: 0.2618


In [22]:
# 최적화된 모델을 Keras 모델로 내보내기
best_model = auto_model.export_model()

# 모델 구조 출력
best_model.summary()

# 예측 수행
predictions = best_model.predict(X_valid)

# ROC AUC 계산
y_pred_proba = predictions.flatten()  # AutoKeras 모델의 예측 결과를 확률로 변환
auc_score = roc_auc_score(y_valid, y_pred_proba)
print(auc_score)

  saveable.load_own_variables(weights_store.get(inner_path))


[1m720/720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
0.8494775671769346


In [24]:
import joblib

#모델 저장
filepath = "best_model.joblib"
joblib.dump(best_model, filepath)

['best_model.joblib']