In [1]:
import numpy as np
import pandas as pd
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# 소수를 e로 표현하지 않도록 하기
pd.options.display.float_format = "{:.5f}".format

In [2]:
lib_2007 = pd.read_csv("../final_data/lib_2007.csv", index_col=0)
lib_2008 = pd.read_csv("../final_data/lib_2008.csv", index_col=0)
lib_2009 = pd.read_csv("../final_data/lib_2009.csv", index_col=0)
lib_2010 = pd.read_csv("../final_data/lib_2010.csv", index_col=0)
lib_2011 = pd.read_csv("../final_data/lib_2011.csv", index_col=0)
lib_2012 = pd.read_csv("../final_data/lib_2012.csv", index_col=0)
lib_2013 = pd.read_csv("../final_data/lib_2013.csv", index_col=0)
lib_2014 = pd.read_csv("../final_data/lib_2014.csv", index_col=0)
lib_2015 = pd.read_csv("../final_data/lib_2015.csv", index_col=0)
lib_2016 = pd.read_csv("../final_data/lib_2016.csv", index_col=0)
lib_2017 = pd.read_csv("../final_data/lib_2017.csv", index_col=0)
lib_2018 = pd.read_csv("../final_data/lib_2018.csv", index_col=0)
lib_2019 = pd.read_csv("../final_data/lib_2019.csv", index_col=0)
lib_2020 = pd.read_csv("../final_data/lib_2020.csv", index_col=0)
lib_2021 = pd.read_csv("../final_data/lib_2021.csv", index_col=0)

In [3]:
train_lib = pd.concat([lib_2007,lib_2008,lib_2009,lib_2010,lib_2011,lib_2012,lib_2013,lib_2014,lib_2015, lib_2016, lib_2017])
valid_lib = pd.concat([lib_2018, lib_2019])
test_lib = pd.concat([lib_2020,lib_2021])

In [4]:
# train, test set의 데이터 수 구하기
print(f"train set의 데이터 수 : {len(train_lib)}")
print(f"valid set의 데이터 수 : {len(valid_lib)}")
print(f"test set의 데이터 수 : {len(test_lib)}")

train set의 데이터 수 : 9026
valid set의 데이터 수 : 2222
test set의 데이터 수 : 2370


In [5]:
# 사분위수를 활용하여 비율 충족도의 이상치 행 제거하기
def del_outlier(df, col):
    q1 = df[col].quantile(0.10)
    q3 = df[col].quantile(0.90)
    iqr = q3 - q1 
    boundary = 1.5 * iqr 

    upper_index = df[df[col] > q3 + boundary].index
    lower_index = df[df[col] < q1 - boundary].index 

    df.drop(upper_index, inplace = True)
    df.drop(lower_index, inplace = True)

    return df

del_outlier(train_lib, "future_acq_budget_settlement")
del_outlier(valid_lib, "future_acq_budget_settlement")
del_outlier(test_lib, "future_acq_budget_settlement") 

# 이상치 제거 후 train, test set의 데이터 수 구하기
print(f"이상치 제거 후 train set의 데이터 수 : {len(train_lib)}")
print(f"이상치 제거 후 valid set의 데이터 수 : {len(valid_lib)}")
print(f"이상치 제거 후 test set의 데이터 수 : {len(test_lib)}")

이상치 제거 후 train set의 데이터 수 : 8184
이상치 제거 후 valid set의 데이터 수 : 2178
이상치 제거 후 test set의 데이터 수 : 2310


In [6]:
from sklearn.preprocessing import MinMaxScaler

# Min-Max Scaler 생성
scaler = MinMaxScaler()

# 모든 변수에 대해 Min-Max Scaling 적용
# 스케일러를 학습 데이터에 대해 fit
scaler.fit(train_lib.drop(columns=["future_acq_budget_settlement"]))

# 학습 데이터에 대해 변환
train_lib_scaled = pd.DataFrame(scaler.transform(train_lib.drop(columns=["future_acq_budget_settlement"])), 
                                columns=train_lib.columns[:-1])

# 타겟 변수에 대해 스케일링 적용
scaler_y = MinMaxScaler()
train_y_scaled = scaler_y.fit_transform(train_lib[['future_acq_budget_settlement']])

# 학습 데이터에 스케일된 타겟 변수 추가
train_lib_scaled['future_acq_budget_settlement_scaled'] = train_y_scaled

# 검증 데이터에 대해 변환
valid_lib_scaled = pd.DataFrame(scaler.transform(valid_lib.drop(columns=["future_acq_budget_settlement"])), 
                                columns=train_lib.columns[:-1])
valid_y_scaled = scaler_y.transform(valid_lib[['future_acq_budget_settlement']])
valid_lib_scaled['future_acq_budget_settlement_scaled'] = valid_y_scaled

# 테스트 데이터에 대해 변환
test_lib_scaled = pd.DataFrame(scaler.transform(test_lib.drop(columns=["future_acq_budget_settlement"])), 
                               columns=train_lib.columns[:-1])
test_y_scaled = scaler_y.transform(test_lib[['future_acq_budget_settlement']])
test_lib_scaled['future_acq_budget_settlement_scaled'] = test_y_scaled

# X_train, Y_train 구성하기 (스케일된 타겟 사용)
X_train = train_lib_scaled.drop(columns=["future_acq_budget_settlement_scaled"])
Y_train = train_lib_scaled["future_acq_budget_settlement_scaled"]

# X_valid, Y_valid 구성하기 (스케일된 타겟 사용)
X_valid = valid_lib_scaled.drop(columns=["future_acq_budget_settlement_scaled"])
Y_valid = valid_lib_scaled["future_acq_budget_settlement_scaled"]

# X_test, Y_test 구성하기 (스케일된 타겟 사용)
X_test = test_lib_scaled.drop(columns=["future_acq_budget_settlement_scaled"])
Y_test = test_lib_scaled["future_acq_budget_settlement_scaled"]


In [7]:
columns_to_drop = ["prog_sessions",	"local_mat", "prog_partic", "service_recip", "self_srv_machines", "interlib_loans", 
                   "reg_members", "borrowers", "user_ed_partic", "web_access", "open_days", 
                   "avg_week_hours", "disab_mat", "ann_weeding", "support_staff", "info_serv_requests"]

X_train.drop(columns=columns_to_drop, inplace=True)
X_valid.drop(columns=columns_to_drop, inplace=True)
X_test.drop(columns=columns_to_drop, inplace=True)

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

model = Sequential()
model.add(LSTM(100, input_shape=(X_train.shape[1], 1), return_sequences=True)) 
model.add(Dropout(0.2)) 
model.add(LSTM(100)) 
model.add(Dropout(0.2))
  
model.add(Dense(1, activation='linear'))  

2024-09-15 14:13:28.857068: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-15 14:13:29.031418: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-15 14:13:29.031477: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-15 14:13:29.031490: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-15 14:13:29.074809: I tensorflow/core/platform/cpu_feature_g

In [9]:
# 모델 컴파일
optimizer = Adam(learning_rate=0.001)  # Adam 옵티마이저와 학습률 설정
model.compile(optimizer=optimizer, loss='mean_squared_error', metrics='mse')  # 회귀 문제의 경우 MSE 손실 함수 사용

# 모델 학습
history = model.fit(X_train, Y_train, epochs=50, batch_size=32, validation_data=(X_valid, Y_valid))

Epoch 1/50


2024-09-15 14:13:51.088519: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8907
2024-09-15 14:13:54.401592: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f6998211e10 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-09-15 14:13:54.401665: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Tesla V100-SXM2-32GB, Compute Capability 7.0
2024-09-15 14:13:54.412343: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-09-15 14:13:54.836520: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [10]:
from tensorflow.keras.metrics import MeanSquaredError, MeanAbsoluteError
from sklearn.metrics import r2_score

# 예측
y_pred = model.predict(X_train)

# y_pred와 Y_test를 1차원으로 변환
y_pred = y_pred.flatten()
Y_train = Y_train.to_numpy().flatten()

# MSE
mse = MeanSquaredError()
mse.update_state(Y_train, y_pred)
mse_result = mse.result().numpy()

# RMSE
rmse_result = np.sqrt(mse_result)

# MAE
mae = MeanAbsoluteError()
mae.update_state(Y_train, y_pred)
mae_result = mae.result().numpy()

# SMAPE (Symmetric Mean Absolute Percentage Error)
def smape(y_true, y_pred):
    return 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_pred) + np.abs(y_true)))

smape_result = smape(Y_train, y_pred)

# R² (R Squared)
r2_result = r2_score(Y_train, y_pred)

print(f'MSE: {mse_result}')
print(f'RMSE: {rmse_result}')
print(f'MAE: {mae_result}')
print(f'SMAPE: {smape_result}')
print(f'R²: {r2_result}')

MSE: 0.0066236406564712524
RMSE: 0.08138575404882431
MAE: 0.050687018781900406
SMAPE: 33.751403079396304
R²: 0.7459173580412783


In [11]:
from tensorflow.keras.metrics import MeanSquaredError, MeanAbsoluteError
from sklearn.metrics import r2_score

# 예측
y_pred = model.predict(X_valid)

# y_pred와 Y_valid를 1차원으로 변환
y_pred = y_pred.flatten()
Y_valid = Y_valid.to_numpy().flatten()

# MSE
mse = MeanSquaredError()
mse.update_state(Y_valid, y_pred)
mse_result = mse.result().numpy()

# RMSE
rmse_result = np.sqrt(mse_result)

# MAE
mae = MeanAbsoluteError()
mae.update_state(Y_valid, y_pred)
mae_result = mae.result().numpy()

# SMAPE (Symmetric Mean Absolute Percentage Error)
def smape(y_true, y_pred):
    return 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_pred) + np.abs(y_true)))

smape_result = smape(Y_valid, y_pred)

# R² (R Squared)
r2_result = r2_score(Y_valid, y_pred)

print(f'MSE: {mse_result}')
print(f'RMSE: {rmse_result}')
print(f'MAE: {mae_result}')
print(f'SMAPE: {smape_result}')
print(f'R²: {r2_result}')

MSE: 0.012783690355718136
RMSE: 0.11306498199701309
MAE: 0.06760044395923615
SMAPE: 37.611163811906614
R²: 0.5358405362290457


In [12]:
from tensorflow.keras.metrics import MeanSquaredError, MeanAbsoluteError
from sklearn.metrics import r2_score

# 예측
y_pred = model.predict(X_test)

# y_pred와 Y_test를 1차원으로 변환
y_pred = y_pred.flatten()
Y_test = Y_test.to_numpy().flatten()

# MSE
mse = MeanSquaredError()
mse.update_state(Y_test, y_pred)
mse_result = mse.result().numpy()

# RMSE
rmse_result = np.sqrt(mse_result)

# MAE
mae = MeanAbsoluteError()
mae.update_state(Y_test, y_pred)
mae_result = mae.result().numpy()

# SMAPE (Symmetric Mean Absolute Percentage Error)
def smape(y_true, y_pred):
    return 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_pred) + np.abs(y_true)))

smape_result = smape(Y_test, y_pred)

# R² (R Squared)
r2_result = r2_score(Y_test, y_pred)

print(f'MSE: {mse_result}')
print(f'RMSE: {rmse_result}')
print(f'MAE: {mae_result}')
print(f'SMAPE: {smape_result}')
print(f'R²: {r2_result}')

MSE: 0.008723324164748192
RMSE: 0.09339873492717743
MAE: 0.059668801724910736
SMAPE: 36.050837006058636
R²: 0.6721522495062036


In [13]:
import shap
import matplotlib.pyplot as plt

# 모델과 데이터 준비 (예: 모델과 X_train)
explainer = shap.DeepExplainer(model, X_train)  # 모델과 데이터
shap_values = explainer.shap_values(X_train)

# Summary plot
shap.summary_plot(shap_values, X_train)
plt.show()

  from .autonotebook import tqdm as notebook_tqdm


ValueError: Exception encountered when calling layer 'sequential' (type Sequential).

Input 0 of layer "lstm" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (8184, 20)

Call arguments received by layer 'sequential' (type Sequential):
  • inputs=['      dom_books  for_books  nonbook_mat  elec_mat  serials  ann_growth  \\\n0       0.00000    0.00000      0.00000   0.00000  0.00431     0.00033   \n1       0.08417    0.00000      0.01129   0.00026  0.00280     0.00142   \n2       0.04455    0.00000      0.00452   0.00000  0.00374     0.00670   \n3       0.33166    0.00000      0.01953   0.00107  0.01013     0.01156   \n4       0.11052    0.00000      0.02472   0.00026  0.00676     0.00309   \n...         ...        ...          ...       ...      ...         ...   \n8179    0.05988    0.03277      0.00641   0.00039  0.00359     0.00306   \n8180    0.04843    0.00229      0.00786   0.00049  0.00036     0.00171   \n8181    0.02278    0.00318      0.00471   0.00255  0.00122     0.00192   \n8182    0.00745    0.01823      0.00000   0.00000  0.02817     0.00066   \n8183    0.04277    0.01639      0.00998   0.00000  0.00539     0.00090   \n\n      lib_site_area  lib_build_area  total_seats  user_comps  full_time  \\\n0           0.00610         0.00725      0.09126     0.00000    0.00000   \n1           0.00516         0.01455      0.19263     0.09023    0.15714   \n2           0.00284         0.00496      0.04816     0.06391    0.07143   \n3           0.00291         0.00314      0.04816     0.05639    0.12857   \n4           0.00157         0.01015      0.19263     0.11278    0.21429   \n...             ...             ...          ...         ...        ...   \n8179        0.00507         0.01481      0.05851     0.08647    0.07143   \n8180        0.00227         0.00531      0.05177     0.12030    0.10000   \n8181        0.00027         0.00115      0.02071     0.00376    0.05714   \n8182        0.00154         0.00062      0.01156     0.00000    0.05714   \n8183        0.00195         0.00156      0.03299     0.00752    0.07143   \n\n      part_time  total_budget  acq_budget  lib_visitors   loans  \\\n0       0.00000       0.00000     0.00000       0.00000 0.00000   \n1       0.00000       0.19895     0.16693       0.08073 0.12341   \n2       0.00000       0.01445     0.10711       0.00692 0.02567   \n3       0.00847       0.03151     0.18009       0.01516 0.01830   \n4       0.08475       0.19895     0.16693       0.11382 0.05283   \n...         ...           ...         ...           ...     ...   \n8179    0.05085       0.03138     0.03883       0.02526 0.01344   \n8180    0.00000       0.06657     0.39589       0.02020 0.01080   \n8181    0.00847       0.01535     0.02627       0.00998 0.01166   \n8182    0.01695       0.00000     0.00000       0.00027 0.00070   \n8183    0.00847       0.02020     0.01354       0.00881 0.00960   \n\n      user_ed_sessions  vuln_group_budget  total_settlement  \\\n0              0.00000            0.00000           0.01151   \n1              0.00022            0.00000           0.04425   \n2              0.00000            0.00000           0.00519   \n3              0.00001            0.00000           0.01436   \n4              0.00022            0.00000           0.04425   \n...                ...                ...               ...   \n8179           0.00011            0.00043           0.00591   \n8180           0.00020            0.00001           0.00427   \n8181           0.00044            0.00000           0.00294   \n8182           0.00000            0.00000           0.00000   \n8183           0.00165            0.00000           0.00391   \n\n      acq_budget_settlement  \n0                   0.02815  \n1                   0.23447  \n2                   0.01188  \n3                   0.20793  \n4                   0.23447  \n...                     ...  \n8179                0.05916  \n8180                0.01520  \n8181                0.02523  \n8182                0.00000  \n8183                0.01329  \n\n[8184 rows x 20 columns]']
  • training=None
  • mask=None

In [None]:
sample_index = 0  

shap.force_plot(explainer.expected_value[0], shap_values[0][sample_index], X_train.iloc[sample_index], matplotlib=True, figsize=(42, 5))
plt.show()