In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense

from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor

### Feature Selection
- Feature Importance
- Drop-Column Importance
- Permutation Importance

## Top15 Select

In [None]:
# Location of the daily patient CSV; read it into a DataFrame.
path = '/home/hwchoi/001.project/data/T_PATIENTS_DAILY_WHOLE_0731.csv'
df = pd.read_csv(filepath_or_buffer=path)
# Show the first two rows as the cell output.
df.head(n=2)

In [None]:
# Parse create_date as datetime, order the rows by it, then replace every
# missing value with 0. Index labels are kept as they were (no reset).
df = (
    df.assign(create_date=pd.to_datetime(df['create_date']))
      .sort_values(by='create_date')
      .fillna(0)
)

In [None]:
# Candidate input columns, listed in the order they are fed to the models.
# Grouped by column-name pattern purely for readability.
features = [
    'weekend_yn', 'sido_cd',
    # *_ta / *_tafeel / *_hm / *_wbtemp: min, max, mean, gap per variable
    'min_ta', 'max_ta', 'mean_ta', 'gap_ta',
    'min_tafeel', 'max_tafeel', 'mean_tafeel', 'gap_tafeel',
    'min_hm', 'max_hm', 'mean_hm', 'gap_hm',
    'min_wbtemp', 'max_wbtemp', 'mean_wbtemp', 'gap_wbtemp',
    # *_ws: min, max, mean only
    'min_ws', 'max_ws', 'mean_ws',
    'tropical_3days', 'heatwave_temp', 'heatalert_temp',
    'ta_min_3days', 'ta_max_3days', 'gap_ta_minmax',
    'popular_man', 'popular_woman', 'agriculture_man', 'agriculture_woman',
    # ta_* by am / pm
    'ta_min_am', 'ta_max_am', 'ta_mean_am',
    'ta_min_pm', 'ta_max_pm', 'ta_mean_pm',
    # ta_*_six_* by window (am1, am2, pm1, pm2)
    'ta_min_six_am1', 'ta_max_six_am1', 'ta_mean_six_am1',
    'ta_min_six_am2', 'ta_max_six_am2', 'ta_mean_six_am2',
    'ta_min_six_pm1', 'ta_max_six_pm1', 'ta_mean_six_pm1',
    'ta_min_six_pm2', 'ta_max_six_pm2', 'ta_mean_six_pm2',
]
# Column the models are trained to predict.
target = 'patientsCnt'
print(len(features))

In [None]:
# Split into train and test sets BEFORE scaling. Fitting the scaler on the
# whole frame would let the test rows' min/max shape the transformation that
# is applied to the training rows (data leakage).
# NOTE(review): the rows were sorted by create_date, yet shuffle=True mixes
# dates across train and test — confirm a random split is what is intended.
X_train, X_test, y_train, y_test = train_test_split(
    df[features], df[target], test_size=0.2, random_state=11, shuffle=True
)

# Fit the min-max scaler on the training rows only, then apply the same
# fitted transformation to the test rows. The results are wrapped back into
# DataFrames so later cells can still select columns by name. `df` itself is
# left unscaled, which also makes this cell safe to re-run.
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=features, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=features, index=X_test.index)

In [None]:
# Fit a 100-tree random forest on every candidate feature.
rf_model = RandomForestRegressor(random_state=11, n_estimators=100)
rf_model.fit(X_train, y_train)

# Predictions of the full-feature model on the held-out rows.
rf_predictions = rf_model.predict(X_test)

# Importance scores exposed by the fitted forest, one per input column.
feature_importances = rf_model.feature_importances_

# Pair each feature name with its score and order from most to least important.
importance_df = pd.DataFrame(dict(Feature=features, Importance=feature_importances))
importance_df = importance_df.sort_values(by='Importance', ascending=False)

In [None]:
# Horizontal bar chart of the feature importances.
importance_fig, importance_ax = plt.subplots(figsize=(28, 16))
importance_ax.barh(importance_df['Feature'], importance_df['Importance'])
importance_ax.set_title('Feature Importance')
importance_ax.set_xlabel('Importance')
importance_ax.set_ylabel('Feature')
# Flip the y axis so the first row of importance_df is drawn at the top.
importance_ax.invert_yaxis()
plt.show()

In [None]:
# Keep the N highest-ranked features from importance_df.
N = 15
top_features = importance_df['Feature'].iloc[:N].tolist()

# Restrict both splits to those columns for retraining.
X_train_top = X_train.loc[:, top_features]
X_test_top = X_test.loc[:, top_features]

### RandomForest Modeling

In [None]:
# Retrain a random forest restricted to the selected top features.
rf_model_top = RandomForestRegressor(random_state=11, n_estimators=100)
rf_model_top.fit(X_train_top, y_train)

# Predict on both splits.
y_pred_train_rf_top = rf_model_top.predict(X_train_top)
y_pred_test_rf_top = rf_model_top.predict(X_test_top)

# Training-split metrics (RMSE is the square root of the MSE).
mae_train_rf_top = mean_absolute_error(y_true=y_train, y_pred=y_pred_train_rf_top)
rmse_train_rf_top = np.sqrt(mean_squared_error(y_true=y_train, y_pred=y_pred_train_rf_top))
r2_train_rf_top = r2_score(y_true=y_train, y_pred=y_pred_train_rf_top)

# Test-split metrics.
mae_test_rf_top = mean_absolute_error(y_true=y_test, y_pred=y_pred_test_rf_top)
rmse_test_rf_top = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_test_rf_top))
r2_test_rf_top = r2_score(y_true=y_test, y_pred=y_pred_test_rf_top)

# Report: header, training metrics, blank line, test metrics.
print("Random Forest Regressor with Top Features:")
print(
    f'Training MAE: {mae_train_rf_top}',
    f'Training RMSE: {rmse_train_rf_top}',
    f'Training R2: {r2_train_rf_top}',
    sep='\n',
)
print()
print(
    f'Test MAE: {mae_test_rf_top}',
    f'Test RMSE: {rmse_test_rf_top}',
    f'Test R2: {r2_test_rf_top}',
    sep='\n',
)

### XGBoost Modeling

In [None]:
# Train XGBoost on the same top-feature frames used for the random forest.
xgb_model_top = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=11)
xgb_model_top.fit(X_train_top, y_train)

# Predict on both splits.
y_pred_train_xgb_top = xgb_model_top.predict(X_train_top)
y_pred_test_xgb_top = xgb_model_top.predict(X_test_top)

# Evaluate on the training split.
mae_train_xgb_top = mean_absolute_error(y_train, y_pred_train_xgb_top)
rmse_train_xgb_top = np.sqrt(mean_squared_error(y_train, y_pred_train_xgb_top))
# Fixed: this previously scored the random forest predictions
# (y_pred_train_rf_top), so the reported XGBoost training R2 was wrong.
r2_train_xgb_top = r2_score(y_train, y_pred_train_xgb_top)

# Evaluate on the test split.
mae_test_xgb_top = mean_absolute_error(y_test, y_pred_test_xgb_top)
rmse_test_xgb_top = np.sqrt(mean_squared_error(y_test, y_pred_test_xgb_top))
r2_test_xgb_top = r2_score(y_test, y_pred_test_xgb_top)

# Fixed: the header previously labelled these results as Random Forest.
print("XGBoost Regressor with Top Features:")
print(f'Training MAE: {mae_train_xgb_top}')
print(f'Training RMSE: {rmse_train_xgb_top}')
print(f'Training R2: {r2_train_xgb_top}')
print()
print(f'Test MAE: {mae_test_xgb_top}')
print(f'Test RMSE: {rmse_test_xgb_top}')
print(f'Test R2: {r2_test_xgb_top}')

In [None]:
# Test-split metrics for each model, in the same order as `metrics`.
metrics = ['MAE', 'RMSE', 'R^2']
rf_values = [mae_test_rf_top, rmse_test_rf_top, r2_test_rf_top]
xgb_values = [mae_test_xgb_top, rmse_test_xgb_top, r2_test_xgb_top]

# Layout: one tick position per metric, one bar per model at each position.
x = np.arange(len(metrics))
width = 0.2

fig, ax = plt.subplots(figsize=(10, 4))
bars1 = ax.bar(x + 0.0 * width, rf_values, width, color='yellow', label='RF')
bars2 = ax.bar(x + 1.0 * width, xgb_values, width, color='orange', label='XGB')


def add_labels(bars):
    """Annotate every bar in `bars` with its height rounded to 4 decimals.

    The text is horizontally centered on the bar and anchored at the bar's
    height. Draws on the module-level `ax`.
    """
    for rect in bars:
        height = rect.get_height()
        center_x = rect.get_x() + rect.get_width() / 2
        ax.text(center_x, height, round(height, 4), va='bottom', ha='center')


add_labels(bars1)
add_labels(bars2)

# Axis labels, title, tick labels and legend.
ax.set_xlabel('Metrics')
ax.set_ylabel('Values')
ax.set_title('Prediction Model Performance Metrics')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()

# plt.savefig('Prediction Model Performance Metrics.png')
plt.show()

## Top10 Select

### RandomForest

In [None]:
# Read the daily patient CSV again into `df`.
path = '/home/hwchoi/001.project/data/T_PATIENTS_DAILY_WHOLE_0731.csv'
df = pd.read_csv(filepath_or_buffer=path)
# Show the first two rows as the cell output.
df.head(n=2)

In [None]:
# Convert create_date to datetime, sort the rows by it, and fill every
# missing value with 0. The original index labels are preserved.
df = (
    df.assign(create_date=pd.to_datetime(df['create_date']))
      .sort_values(by='create_date')
      .fillna(0)
)

In [None]:
# Candidate input columns, in the order they are passed to the models.
# Line grouping follows the column-name patterns and is for readability only.
features = [
    'weekend_yn', 'sido_cd',
    # *_ta / *_tafeel / *_hm / *_wbtemp: min, max, mean, gap per variable
    'min_ta', 'max_ta', 'mean_ta', 'gap_ta',
    'min_tafeel', 'max_tafeel', 'mean_tafeel', 'gap_tafeel',
    'min_hm', 'max_hm', 'mean_hm', 'gap_hm',
    'min_wbtemp', 'max_wbtemp', 'mean_wbtemp', 'gap_wbtemp',
    # *_ws: min, max, mean only
    'min_ws', 'max_ws', 'mean_ws',
    'tropical_3days', 'heatwave_temp', 'heatalert_temp',
    'ta_min_3days', 'ta_max_3days', 'gap_ta_minmax',
    'popular_man', 'popular_woman', 'agriculture_man', 'agriculture_woman',
    # ta_* by am / pm
    'ta_min_am', 'ta_max_am', 'ta_mean_am',
    'ta_min_pm', 'ta_max_pm', 'ta_mean_pm',
    # ta_*_six_* by window (am1, am2, pm1, pm2)
    'ta_min_six_am1', 'ta_max_six_am1', 'ta_mean_six_am1',
    'ta_min_six_am2', 'ta_max_six_am2', 'ta_mean_six_am2',
    'ta_min_six_pm1', 'ta_max_six_pm1', 'ta_mean_six_pm1',
    'ta_min_six_pm2', 'ta_max_six_pm2', 'ta_mean_six_pm2',
]
# Column the models are trained to predict.
target = 'patientsCnt'
print(len(features))

In [None]:
# Split into train and test sets BEFORE scaling. Fitting the scaler on the
# whole frame would let the test rows' min/max shape the transformation that
# is applied to the training rows (data leakage).
# NOTE(review): the rows were sorted by create_date, yet shuffle=True mixes
# dates across train and test — confirm a random split is what is intended.
X_train, X_test, y_train, y_test = train_test_split(
    df[features], df[target], test_size=0.2, random_state=11, shuffle=True
)

# Fit the min-max scaler on the training rows only, then apply the same
# fitted transformation to the test rows. The results are wrapped back into
# DataFrames so later cells can still select columns by name. `df` itself is
# left unscaled, which also makes this cell safe to re-run.
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=features, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=features, index=X_test.index)

In [None]:
# Fit a 100-tree random forest on every candidate feature.
rf_model = RandomForestRegressor(random_state=11, n_estimators=100)
rf_model.fit(X_train, y_train)

# Predictions of the full-feature model on the held-out rows.
rf_predictions = rf_model.predict(X_test)

# Importance scores exposed by the fitted forest, one per input column.
feature_importances = rf_model.feature_importances_

# Pair each feature name with its score and order from most to least important.
importance_df = pd.DataFrame(dict(Feature=features, Importance=feature_importances))
importance_df = importance_df.sort_values(by='Importance', ascending=False)

In [None]:
# Keep the N highest-ranked features from importance_df.
N = 10
top_features = importance_df['Feature'].iloc[:N].tolist()

# Restrict both splits to those columns for retraining.
X_train_top_10 = X_train.loc[:, top_features]
X_test_top_10 = X_test.loc[:, top_features]

In [None]:
# Retrain a random forest restricted to the top-10 feature frames.
rf_model_top = RandomForestRegressor(random_state=11, n_estimators=100)
rf_model_top.fit(X_train_top_10, y_train)

# Predict on both splits.
y_pred_train_rf_top = rf_model_top.predict(X_train_top_10)
y_pred_test_rf_top = rf_model_top.predict(X_test_top_10)

# Training-split metrics (RMSE is the square root of the MSE).
mae_train_rf_top = mean_absolute_error(y_true=y_train, y_pred=y_pred_train_rf_top)
rmse_train_rf_top = np.sqrt(mean_squared_error(y_true=y_train, y_pred=y_pred_train_rf_top))
r2_train_rf_top = r2_score(y_true=y_train, y_pred=y_pred_train_rf_top)

# Test-split metrics.
mae_test_rf_top = mean_absolute_error(y_true=y_test, y_pred=y_pred_test_rf_top)
rmse_test_rf_top = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_test_rf_top))
r2_test_rf_top = r2_score(y_true=y_test, y_pred=y_pred_test_rf_top)

# Report: header, training metrics, blank line, test metrics.
print("Random Forest Regressor with Top Features:")
print(
    f'Training MAE: {mae_train_rf_top}',
    f'Training RMSE: {rmse_train_rf_top}',
    f'Training R2: {r2_train_rf_top}',
    sep='\n',
)
print()
print(
    f'Test MAE: {mae_test_rf_top}',
    f'Test RMSE: {rmse_test_rf_top}',
    f'Test R2: {r2_test_rf_top}',
    sep='\n',
)

### XGBoost

In [None]:
# Train XGBoost on the top-10 feature frame.
# Fixed: the model was previously fitted on X_train_top (the frame built in
# the Top15 section) while predicting on the top-10 frames, so training and
# prediction used different feature sets and the cell depended on state left
# over from an earlier section.
xgb_model_top = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=11)
xgb_model_top.fit(X_train_top_10, y_train)

# Predict on both splits.
y_pred_train_xgb_top = xgb_model_top.predict(X_train_top_10)
y_pred_test_xgb_top = xgb_model_top.predict(X_test_top_10)

# Evaluate on the training split.
mae_train_xgb_top = mean_absolute_error(y_train, y_pred_train_xgb_top)
rmse_train_xgb_top = np.sqrt(mean_squared_error(y_train, y_pred_train_xgb_top))
# Fixed: this previously scored the random forest predictions
# (y_pred_train_rf_top), so the reported XGBoost training R2 was wrong.
r2_train_xgb_top = r2_score(y_train, y_pred_train_xgb_top)

# Evaluate on the test split.
mae_test_xgb_top = mean_absolute_error(y_test, y_pred_test_xgb_top)
rmse_test_xgb_top = np.sqrt(mean_squared_error(y_test, y_pred_test_xgb_top))
r2_test_xgb_top = r2_score(y_test, y_pred_test_xgb_top)

# Fixed: the header previously labelled these results as Random Forest.
print("XGBoost Regressor with Top Features:")
print(f'Training MAE: {mae_train_xgb_top}')
print(f'Training RMSE: {rmse_train_xgb_top}')
print(f'Training R2: {r2_train_xgb_top}')
print()
print(f'Test MAE: {mae_test_xgb_top}')
print(f'Test RMSE: {rmse_test_xgb_top}')
print(f'Test R2: {r2_test_xgb_top}')