In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from numpy.random import default_rng

# 데이터 로드
df = pd.read_csv('./encoded_data.csv')

# change_date 열을 제외한 나머지 열 선택
X = df.drop(columns=['change_date'])
y = df['change_date'].values  # 예측값

# 결측값 처리 (예: 평균값으로 대체)
X.replace('-', np.nan, inplace=True)
X = X.astype(np.float32)
X.fillna(X.mean(), inplace=True)
y = y.astype(np.float32)

# NaN 값이 있는지 확인
if np.any(np.isnan(X)) or np.any(np.isnan(y)):
    print("결측값이 존재합니다.")
else:
    print("결측값이 없습니다.")

# train/test 데이터셋 나누기 (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# train 데이터셋의 일부를 다시 train(sub)/val 데이터셋으로 나누기 (80% sub(train), 20% val)
X_sub, X_val, y_sub, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=123)

# 스케일링
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_sub = scaler.transform(X_sub)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# 모델 생성 함수 정의
def create_model(params):
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(128, input_dim=X_train.shape[1], activation='relu', kernel_regularizer=tf.keras.regularizers.l2(params['l2'])))
    model.add(tf.keras.layers.Dropout(params['dropout_rate']))
    model.add(tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l1(params['l1'])))
    model.add(tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l1_l2(l1=params['l1'], l2=params['l2'])))
    model.add(tf.keras.layers.Dense(16, activation='relu'))
    model.add(tf.keras.layers.Dense(1, activation='linear'))
    
    optimizer = tf.keras.optimizers.Adam(learning_rate=params['learning_rate'])
    model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mean_absolute_error'])
    
    return model

# 목적 함수 정의
def objective(params):
    model = create_model(params)
    history = model.fit(X_sub, y_sub, validation_data=(X_val, y_val), epochs=500, batch_size=100, verbose=0)
    
    # 검증 손실 값 반환
    val_loss = min(history.history['val_loss'])
    return {'loss': val_loss, 'status': STATUS_OK}

# 하이퍼파라미터 검색 공간 정의
space = {
    'learning_rate': hp.uniform('learning_rate', 0.001, 0.1),
    'l1': hp.uniform('l1', 0.01, 0.1),
    'l2': hp.uniform('l2', 0.01, 0.1),
    'dropout_rate': hp.uniform('dropout_rate', 0.1, 0.5)
}

# 최적화 실행
rng = default_rng(123)  # 새로운 난수 생성기 인터페이스 사용
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=10, trials=trials, rstate=rng)

print(f'Best Params: {best}')

# 최적의 하이퍼파라미터로 모델 생성 및 재학습
best_model = create_model(best)
history = best_model.fit(X_sub, y_sub, validation_data=(X_val, y_val), epochs=500, batch_size=100, verbose=1)

# 학습 결과 시각화 (학습 곡선 시각화)
plt.figure(figsize=(10, 5))

# subplot 1: Mean Squared Error 손실 함수의 변화
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.title('Mean Squared Error')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

# subplot 2: Mean Absolute Error 매트릭스의 변화
plt.subplot(1, 2, 2)
plt.plot(history.history['mean_absolute_error'], label='Train MAE')
plt.plot(history.history['val_mean_absolute_error'], label='Val MAE')
plt.title('Mean Absolute Error')
plt.xlabel('Epochs')
plt.ylabel('Error')
plt.legend()

plt.show()

# 모델 평가 - 테스트 데이터셋 사용
loss_test, mae_test = best_model.evaluate(X_test, y_test)
print(f'Test Loss: {loss_test}')
print(f'Test MAE: {mae_test}')

# 모델 평가 - 검증 데이터셋 사용
loss_val, mae_val = best_model.evaluate(X_val, y_val)
print(f'Validation Loss: {loss_val}')
print(f'Validation MAE: {mae_val}')


결측값이 없습니다.
  0%|                                                                           | 0/10 [00:00<?, ?trial/s, best loss=?]

  df = pd.read_csv('./encoded_data.csv')
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



100%|████████████████████████████████████████████████| 10/10 [05:18<00:00, 31.88s/trial, best loss: 0.5928509831428528]
Best Params: {'dropout_rate': 0.3524873662919549, 'l1': 0.05204920162761399, 'l2': 0.08967643727120045, 'learning_rate': 0.0016324623333292404}
Epoch 1/500


TypeError: Input 'y' of 'Mul' Op has type float32 that does not match type float64 of argument 'x'.

In [None]:
# Best Params: {'dropout_rate': 0.3524873662919549, 'l1': 0.05204920162761399, 'l2': 0.08967643727120045, 'learning_rate': 0.0016324623333292404}