In [None]:
# 오류에 참고 부분 
# https://github.com/adapt-python/adapt/issues/118
# https://velog.io/@cjkangme/TIL-%EB%94%A5%EB%9F%AC%EB%8B%9D-Grid-Search

In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
import numpy as np
import tensorflow as tf
from scikeras.wrappers import KerasRegressor
from sklearn.metrics import r2_score
from scipy.stats import uniform
import matplotlib.pyplot as plt
import shap

# 데이터 로드
df = pd.read_csv('./encoded_data.csv')

# change_date 열을 제외한 나머지 열 선택
X = df.drop(columns=['change_date'])
y = df['change_date'].values  # 예측값

# 결측값 처리 (예: 평균값으로 대체)
X.replace('-', np.nan, inplace=True)
X = X.astype(np.float32)
X.fillna(X.mean(), inplace=True)

y = y.astype(np.float32)

# NaN 값이 있는지 확인
if np.any(np.isnan(X)) or np.any(np.isnan(y)):
    print("결측값이 존재합니다.")
else:
    print("결측값이 없습니다.")

# train/test 데이터셋 나누기 (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# train 데이터셋의 일부를 다시 train(sub)/val 데이터셋으로 나누기 (80% sub(train), 20% val)
X_sub, X_val, y_sub, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=123)

# 스케일링
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_sub = scaler.transform(X_sub)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# 모델 생성 함수 정의
def create_model(learning_rate=0.01, l1=0.01, l2=0.01, dropout_rate=0.2):
    inputs = tf.keras.Input(shape=(X.shape[1],))
    x = tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l2))(inputs)  # 릿지 규제 적용
    x = tf.keras.layers.Dropout(dropout_rate)(x)
    x = tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l1(l1))(x)  # 라쏘 규제 적용
    x = tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l1_l2(l1=l1, l2=l2))(x)  # L1과 L2 규제 함께 적용
    x = tf.keras.layers.Dense(16, activation='relu')(x)
    outputs = tf.keras.layers.Dense(1, activation='linear')(x)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mean_absolute_error'])
    return model

# KerasRegressor 래퍼 사용
model = KerasRegressor(build_fn=create_model, verbose=0)

# 하이퍼파라미터 검색 공간 정의
param_distributions = {
    'model__learning_rate': uniform(0.001, 0.1),
    'model__l1': uniform(0.01, 0.1),
    'model__l2': uniform(0.01, 0.1),
    'model__dropout_rate': uniform(0.1, 0.5),
    'batch_size': [100, 500, 1000],
    'epochs': [500, 1000, 1500]
}


결측값이 없습니다.


  df = pd.read_csv('./encoded_data.csv')


In [33]:
# 데이터 타입 확인
print(f'X_train dtype: {X_train.dtype}')
print(f'y_train dtype: {y_train.dtype}')
print(f'X_sub dtype: {X_sub.dtype}')
print(f'y_sub dtype: {y_sub.dtype}')
print(f'X_val dtype: {X_val.dtype}')
print(f'y_val dtype: {y_val.dtype}')


# 모델 입력 타입 확인
#print(f'Model input dtype: {model.input.dtype}')

X_train dtype: float32
y_train dtype: float32
X_sub dtype: float32
y_sub dtype: float32
X_val dtype: float32
y_val dtype: float32


In [31]:
y_train = y_train.astype(np.float32)
y_sub = y_sub.astype(np.float32)
y_val = y_val.astype(np.float32)
y_test = y_test.astype(np.float32)

# RandomizedSearchCV 설정
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_distributions, n_iter=10, cv=3, verbose=1, random_state=123, n_jobs=-1)

# 랜덤 서치 실행
random_search_result = random_search.fit(X_train, y_train)

# 최적의 하이퍼파라미터 출력
print(f'Best Score: {random_search_result.best_score_}')
print(f'Best Params: {random_search_result.best_params_}')

# 최적의 모델로 재학습
best_model = random_search_result.best_estimator_
history = best_model.fit(X_sub, y_sub, validation_data=(X_val, y_val))

# 학습 결과 시각화 (학습 곡선 시각화)
plt.figure(figsize=(10, 5))

# subplot 1: Mean Squared Error 손실 함수의 변화
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.title('Mean Squared Error')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

# subplot 2: Mean Absolute Error 매트릭스의 변화
plt.subplot(1, 2, 2)
plt.plot(history.history['mean_absolute_error'], label='Train MAE')
plt.plot(history.history['val_mean_absolute_error'], label='Val MAE')
plt.title('Mean Absolute Error')
plt.xlabel('Epochs')
plt.ylabel('Error')
plt.legend()

plt.show()

# 모델 평가 - 테스트 데이터셋 사용
loss_test, mae_test = best_model.model.evaluate(X_test, y_test)
print(f'Test Loss: {loss_test}')
print(f'Test MAE: {mae_test}')

# 모델 평가 - 검증 데이터셋 사용
loss_val, mae_val = best_model.model.evaluate(X_val, y_val)
print(f'Validation Loss: {loss_val}')
print(f'Validation MAE: {mae_val}')


Fitting 3 folds for each of 10 candidates, totalling 30 fits


ValueError: 
All the 30 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "D:\ANACONDA3_ENVS\DR\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\ANACONDA3_ENVS\DR\Lib\site-packages\scikeras\wrappers.py", line 770, in fit
    self._fit(
  File "D:\ANACONDA3_ENVS\DR\Lib\site-packages\scikeras\wrappers.py", line 938, in _fit
    self._fit_keras_model(
  File "D:\ANACONDA3_ENVS\DR\Lib\site-packages\scikeras\wrappers.py", line 535, in _fit_keras_model
    hist = self.model_.fit(x=X, y=y, **fit_args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\ANACONDA3_ENVS\DR\Lib\site-packages\keras\src\utils\traceback_utils.py", line 122, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "D:\ANACONDA3_ENVS\DR\Lib\site-packages\keras\src\regularizers\regularizers.py", line 236, in __call__
    return self.l1 * ops.sum(ops.absolute(x))
           ~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~
TypeError: Input 'y' of 'Mul' Op has type float32 that does not match type float64 of argument 'x'.


In [None]:
# X_sub > X_train and X_val > X_test 로 바꿈.

# 예측값 생성
predictions = best_model.model.predict(X_test)

# NaN 값이 있는지 확인 및 처리
if np.any(np.isnan(predictions)):
    print("예측값에 NaN이 존재합니다. NaN 값을 0으로 대체합니다.")
    predictions = np.nan_to_num(predictions)

# 결정계수 계산
r2 = r2_score(y_test, predictions)
print(f'R2 Score: {r2}')



# 예측값 vs 실제값의 산점도
plt.figure(figsize=(10, 5))
plt.scatter(y_test, predictions, alpha=0.3)
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs Predicted Values')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red')
plt.show()

# 잔차 플롯
residuals = y_test - predictions.flatten()
plt.figure(figsize=(10, 5))
plt.scatter(predictions, residuals, alpha=0.3)
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residuals vs Predicted Values')
plt.hlines(0, min(predictions), max(predictions), colors='red')
plt.show()

# SHAP을 사용한 특성 중요도 시각화
shap.initjs()
explainer = shap.Explainer(best_model.model.predict, X_train)
shap_values = explainer(X_test)
shap.summary_plot(shap_values, X_test, feature_names=X.columns.tolist())
