In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import shap

# Load the encoded dataset.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids the mixed-type DtypeWarning that
# read_csv otherwise emits for columns mixing numbers with the '-' placeholder.
df = pd.read_csv('./encoded_data.csv', low_memory=False)

# Features: every column except the target `change_date`.
# '-' marks a missing value in the file, so it is mapped to NaN before the
# numeric cast. The chained (non-inplace) form keeps this cell idempotent.
X = df.drop(columns=['change_date']).replace('-', np.nan).astype(np.float32)

# Impute missing feature values with the per-column mean.
# NOTE(review): the means are computed on the full dataset, i.e. before the
# train/test split, so test-set statistics influence the imputation.
X = X.fillna(X.mean())

# Regression target as a float32 array.
y = df['change_date'].values.astype(np.float32)

# Sanity check: NaNs can still remain (e.g. a feature column that is entirely
# missing has a NaN mean, or the target itself contains NaN).
if X.isna().to_numpy().any() or np.isnan(y).any():
    print("결측값이 존재합니다.")
else:
    print("결측값이 없습니다.")

# Hold out 20% of the data as the final test set (80% train, 20% test).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Split the training portion again into the subset actually used for fitting
# (80%) and a validation set (20%).
X_sub, X_val, y_sub, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=123)

# Standardise the features.
# The scaler is fitted ONLY on X_sub (the rows the model is trained on).
# Fitting on X_train would also include the validation rows, leaking their
# mean/variance into preprocessing and biasing validation metrics.
scaler = StandardScaler()
scaler.fit(X_sub)
X_sub = scaler.transform(X_sub)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)
# X_train is still exposed in scaled form for any later use.
X_train = scaler.transform(X_train)

# Build the regression network with the Keras Functional API.
keras_layers = tf.keras.layers
keras_regularizers = tf.keras.regularizers

n_features = X.shape[1]
inputs = tf.keras.Input(shape=(n_features,))

# 128-unit ReLU layer with an L2 (ridge) weight penalty, followed by 20% dropout.
x = keras_layers.Dense(
    units=128,
    activation='relu',
    kernel_regularizer=keras_regularizers.l2(0.01),
)(inputs)
x = keras_layers.Dropout(rate=0.2)(x)

# 64-unit ReLU layer with an L1 (lasso) weight penalty.
# (A second Dropout(0.2) after this layer is intentionally left disabled.)
x = keras_layers.Dense(
    units=64,
    activation='relu',
    kernel_regularizer=keras_regularizers.l1(0.01),
)(x)

# 32-unit ReLU layer with combined L1 + L2 penalties.
x = keras_layers.Dense(
    units=32,
    activation='relu',
    kernel_regularizer=keras_regularizers.l1_l2(l1=0.01, l2=0.01),
)(x)

# Unregularised 16-unit ReLU layer, then a single linear output for regression.
x = keras_layers.Dense(units=16, activation='relu')(x)
outputs = keras_layers.Dense(units=1, activation='linear')(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

# Compile for regression: MSE as the training loss, MAE as the reported metric.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(
    optimizer=optimizer,
    loss='mean_squared_error',
    metrics=['mean_absolute_error'],
)

# Early-stopping callback (overfitting guard) - kept available but disabled,
# matching the original run configuration.
# early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Printing a progress bar for each of the 10000 epochs floods the notebook's
# output channel and can abort the run with "IOPub data rate exceeded".
# Keras' built-in logging is therefore silenced (verbose=0) and replaced by a
# throttled logger that prints one summary line every LOG_EVERY_N_EPOCHS epochs.
LOG_EVERY_N_EPOCHS = 100


def _print_progress(epoch, logs=None):
    """Print the epoch's metrics on the first epoch and every LOG_EVERY_N_EPOCHS-th epoch.

    Args:
        epoch: Zero-based epoch index supplied by Keras.
        logs: Mapping of metric name to value for the finished epoch (may be None).
    """
    epoch_number = epoch + 1
    if epoch_number == 1 or epoch_number % LOG_EVERY_N_EPOCHS == 0:
        metrics_text = " - ".join(f"{name}: {value:.4f}" for name, value in (logs or {}).items())
        print(f"Epoch {epoch_number}: {metrics_text}")


progress_logger = tf.keras.callbacks.LambdaCallback(on_epoch_end=_print_progress)

# Train the model. `history` still records every epoch for the learning curves.
history = model.fit(
    X_sub,
    y_sub,
    validation_data=(X_val, y_val),
    epochs=10000,
    batch_size=10000,
    verbose=0,
    callbacks=[progress_logger],  # add early_stopping here to re-enable it
)

# Learning curves: one panel per tracked quantity, train vs. validation.
# Each entry: (train key, val key, train label, val label, panel title, y-axis label).
curve_panels = [
    # Panel 1: mean squared error (the training loss).
    ('loss', 'val_loss', 'Train Loss', 'Val Loss', 'Mean Squared Error', 'Loss'),
    # Panel 2: mean absolute error (the reported metric).
    ('mean_absolute_error', 'val_mean_absolute_error', 'Train MAE', 'Val MAE', 'Mean Absolute Error', 'Error'),
]

plt.figure(figsize=(10, 5))

for panel_index, (train_key, val_key, train_label, val_label, panel_title, y_label) in enumerate(curve_panels, start=1):
    plt.subplot(1, 2, panel_index)
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.title(panel_title)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.legend()

plt.show()

# Evaluate on the held-out test split; evaluate() returns the loss followed by
# the compiled metric (MAE).
test_scores = model.evaluate(X_test, y_test)
loss_test, mae_test = test_scores
print(f'Test Loss: {loss_test}')
print(f'Test MAE: {mae_test}')

# Evaluate on the validation split in the same way.
val_scores = model.evaluate(X_val, y_val)
loss_val, mae_val = val_scores
print(f'Validation Loss: {loss_val}')
print(f'Validation MAE: {mae_val}')

# Predict on the validation split.
predictions = model.predict(X_val)

# Guard against NaN predictions: report them, then sanitise with nan_to_num so
# that the R^2 computation below does not fail.
nan_mask = np.isnan(predictions)
if nan_mask.any():
    print("예측값에 NaN이 존재합니다. NaN 값을 0으로 대체합니다.")
    predictions = np.nan_to_num(predictions)

# Coefficient of determination (R^2) on the validation split.
r2 = r2_score(y_val, predictions)
print(f'R2 Score: {r2}')


결측값이 없습니다.
Epoch 1/10000


  df = pd.read_csv('./encoded_data.csv')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 805ms/step - loss: 554.2072 - mean_absolute_error: 22.8918 - val_loss: 526.6799 - val_mean_absolute_error: 22.3146
Epoch 2/10000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - loss: 519.1559 - mean_absolute_error: 22.1293 - val_loss: 459.1898 - val_mean_absolute_error: 20.7562
Epoch 3/10000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - loss: 451.2389 - mean_absolute_error: 20.5433 - val_loss: 354.8072 - val_mean_absolute_error: 18.0433
Epoch 4/10000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - loss: 346.6325 - mean_absolute_error: 17.7721 - val_loss: 214.0639 - val_mean_absolute_error: 13.4919
Epoch 5/10000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - loss: 208.9651 - mean_absolute_error: 13.2615 - val_loss: 78.8683 - val_mean_absolute_error: 7.1625
Epoch 6/10000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [None]:
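# # Do not run cells 1 and 2 together -> [Error] "IOPub data rate exceeded" (Jupyter's output data-rate limit was exceeded; this is not a Python error).
# # However, it still happens even when the cells are run separately....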

# 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 34ms/step - loss: 0.2825 - mean_absolute_error: 0.1058 - val_loss: 0.2670 - val_mean_absolute_error: 0.0663
# Epoch 9702/10000
# 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 34ms/step - loss: 0.2848 - mean_absolute_error: 0.1093 - val_loss: 0.2735 - val_mean_absolute_error: 0.0759
# Epoch 9703/10000
# 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 35ms/step - loss: 0.2909 - mean_absolute_error: 0.1169 - val_loss: 0.2652 - val_mean_absolute_error: 0.0685
# Epoch 9704/10000
# 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 35ms/step - loss: 0.2867 - mean_absolute_error: 0.1168 - val_loss: 0.2664 - val_mean_absolute_error: 0.0593
# Epoch 9705/10000
# 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 33ms/step - loss: 0.2866 - mean_absolute_error: 0.1082 - val_loss: 0.2628 - val_mean_absolute_error: 0.0502
# Epoch 9706/10000
# 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 36ms/step - loss: 0.2826 - mean_absolute_error: 0.1049 - val_loss: 0.2644 - val_mean_absolute_error: 0.0611
# Epoch 9707/10000
# 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 45ms/step - loss: 0.2854 - mean_absolute_error: 0.1142 - val_loss: 0.2683 - val_mean_absolute_error: 0.0701
# Epoch 9708/10000
# 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 41ms/step - loss: 0.2887 - mean_absolute_error: 0.1133 - val_loss: 0.2678 - val_mean_absolute_error: 0.0791
# Epoch 9709/10000
# 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 36ms/step - loss: 0.2852 - mean_absolute_error: 0.1204 - val_loss: 0.2667 - val_mean_absolute_error: 0.0734
# Epoch 9710/10000
# 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 35ms/step - loss: 0.2846 - mean_absolute_error: 0.1164 - val_loss: 0.2653 - val_mean_absolute_error: 0.0600
# Epoch 9711/10000
# 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 33ms/step - loss: 0.2873 - mean_absolute_error: 0.1092 - val_loss: 0.2628 - val_mean_absolute_error: 0.0449
# Epoch 9712/10000
# 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 17ms/step - loss: 0.2819 - mean_absolute_error: 0.0999

# # The run stalled when training was almost finished.. something looks wrong
# https://discourse.jupyter.org/t/iopub-data-rate-exceeded-jupyterlab/19404

# It happened again:

# l_mean_absolute_error: 0.1521
# Epoch 6259/10000
# 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 39ms/step - loss: 0.2422 - mean_absolute_error: 0.1420 - val_loss: 0.2201 - val_mean_absolute_error: 0.1104
# Epoch 6260/10000
# 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 41ms/step - loss: 0.2430 - mean_absolute_error: 0.1485 - val_loss: 0.2683 - val_mean_absolute_error: 0.2027
# Epoch 6261/10000
# 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 41ms/step - loss: 0.2549 - mean_absolute_error: 0.1602 - val_loss: 0.2323 - val_mean_absolute_error: 0.1405
# Epoch 6262/10000
# 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 38ms/step - loss: 0.2608 - mean_absolute_error: 0.1738 - val_loss: 0.2743 - val_mean_absolute_error: 0.2287
# Epoch 6263/10000
# 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 37ms/step - loss: 0.2666 - mean_absolute_error: 0.1871 - val_loss: 0.2369 - val_mean_absolute_error: 0.1555
# Epoch 6264/10000
# 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 44ms/step - loss: 0.2658 - mean_absolute_error: 0.1794 - val_loss: 0.2625 - val_mean_absolute_error: 0.2053
# Epoch 6265/10000
# 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 40ms/step - loss: 0.2615 - mean_absolute_error: 0.1740 - val_loss: 0.2191 - val_mean_absolute_error: 0.1154
# Epoch 6266/10000
# 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 34ms/step - loss: 0.2522 - mean_absolute_error: 0.1658 - val_loss: 0.2445 - val_mean_absolute_error: 0.1729
# Epoch 6267/10000
