In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Lift pandas' display truncation: a value of None removes the limit on the
# number of rows and on the number of columns rendered for a DataFrame.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from keras.models import Sequential
from keras.layers import SimpleRNN, LSTM, Dense
import xgboost as xgb  # xgboost 라이브러리 import
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings('ignore')

### Feature selection
#### Feature selection의 목적
- 통계기반 변수 선택 VS 모형기반 변수 선택

- 영향력 있는 feature만 추출하기 위함

- 변수들 간에 상관관계가 있는 경우 성능 영향 고려

- 덜 중요한 feature는 예측모델의 불확실성을 높임

#### 1. SelectKBest

- scikit-learn에서 제공하는 라이브러리

- 각 특성(feature)과 종속 변수(target) 간의 통계적 유의성을 기반으로 상위 K개의 최적 특성을 선택함

#### 2. SelectFromModel

- 모델을 훈련한 뒤, 모델이 산출한 특성 중요도(예: `feature_importances_`, `coef_`)가 사용자가 지정한 임계값 이상인 특성만 선택함

In [3]:
# Source CSV for the notebook.
# NOTE(review): this is a hard-coded absolute path; the notebook only runs on
# a machine where this exact location exists.
path1 = 'D:/Work/002.code/jupyterNotebook/data/HEAT_TARGET/T_PATIENTS_DAILY_WHOLE_0731.csv'

# Read the whole file into the raw DataFrame.
df_raw = pd.read_csv(filepath_or_buffer=path1)

# Preview the first two rows to confirm the load.
df_raw.head(n=2)

Unnamed: 0,sido,create_date,sido_cd,weekend_yn,grid_x,grid_y,sat_x,sat_y,min_ta,max_ta,...,ta_mean_six_am2,ta_min_six_pm1,ta_max_six_pm1,ta_mean_six_pm1,ta_min_six_pm2,ta_max_six_pm2,ta_mean_six_pm2,patientsCnt,instDate,updtDate
0,경기도,2014-05-01,31,0,60,120,,,9.6,24.2,...,15,21,24,22,14,21,17,0,2024-07-31,2024-07-31
1,충청북도,2014-05-01,33,0,69,107,,,10.5,23.7,...,15,21,23,22,15,22,18,0,2024-07-31,2024-07-31


In [4]:
# Parse the 'create_date' column into datetime values.
df_raw['create_date'] = pd.to_datetime(df_raw['create_date'])

# Derive the calendar columns (year, month, day) from the parsed date.
date_parts = df_raw['create_date'].dt
df_raw['year'] = date_parts.year
df_raw['month'] = date_parts.month
df_raw['day'] = date_parts.day

# Binary class label: 0 where patientsCnt equals 0, otherwise 1.
df_raw['patientsCD'] = df_raw['patientsCnt'].ne(0).astype('int64')

# Replace every remaining missing value with 0.
df_raw = df_raw.fillna(value=0)

In [5]:
# Columns carried forward from df_raw, grouped by theme. The order is the
# column order of the resulting frame.
lst = [
    # date / calendar
    'create_date', 'weekend_yn', 'year', 'month', 'day',
    # *_ta columns
    'min_ta', 'max_ta', 'mean_ta', 'gap_ta',
    # *_tafeel columns
    'min_tafeel', 'max_tafeel', 'mean_tafeel', 'gap_tafeel',
    # *_hm columns
    'min_hm', 'max_hm', 'mean_hm', 'gap_hm',
    # *_wbtemp columns
    'min_wbtemp', 'max_wbtemp', 'mean_wbtemp', 'gap_wbtemp',
    # *_ws columns
    'min_ws', 'max_ws', 'mean_ws',
    # multi-day / alert columns
    'tropical_3days', 'heatwave_temp', 'heatalert_temp',
    'ta_min_3days', 'ta_max_3days', 'gap_ta_minmax',
    # popular_* / agriculture_* columns
    'popular_man', 'popular_woman', 'agriculture_man', 'agriculture_woman',
    # am / pm columns
    'ta_min_am', 'ta_max_am', 'ta_mean_am',
    'ta_min_pm', 'ta_max_pm', 'ta_mean_pm',
    # six_am1 / six_am2 / six_pm1 / six_pm2 columns
    'ta_min_six_am1', 'ta_max_six_am1', 'ta_mean_six_am1',
    'ta_min_six_am2', 'ta_max_six_am2', 'ta_mean_six_am2',
    'ta_min_six_pm1', 'ta_max_six_pm1', 'ta_mean_six_pm1',
    'ta_min_six_pm2', 'ta_max_six_pm2', 'ta_mean_six_pm2',
    # targets: raw count and its binary class label
    'patientsCnt', 'patientsCD',
]

# Report how many columns are kept.
n_selected = len(lst)
print('raw의 변수 개수: ', n_selected, '개')

# Working frame restricted to the listed columns.
df = df_raw[lst]

raw의 변수 개수:  54 개


In [6]:
# Preview the first two rows of the column-restricted frame.
df.head(n=2)

Unnamed: 0,create_date,weekend_yn,year,month,day,min_ta,max_ta,mean_ta,gap_ta,min_tafeel,...,ta_max_six_am2,ta_mean_six_am2,ta_min_six_pm1,ta_max_six_pm1,ta_mean_six_pm1,ta_min_six_pm2,ta_max_six_pm2,ta_mean_six_pm2,patientsCnt,patientsCD
0,2014-05-01,0,2014,5,1,9.6,24.2,16.6,14.6,12.35,...,21,15,21,24,22,14,21,17,0,0
1,2014-05-01,0,2014,5,1,10.5,23.7,17.1,13.2,12.78,...,21,15,21,23,22,15,22,18,0,0


#### 1. SelectKBest

#### 1-1. patientsCnt 기준

- target과 변수 사이의 상관관계 분석

- 가장 상관관계가 높은 K개의 변수 선정

In [8]:
# 변수선택(feature selection) 하기
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [9]:
# Separate predictors from the target; this section uses patientsCnt.
# The date column and both target columns are excluded from the predictors.
X = df.drop(labels=['create_date', 'patientsCnt', 'patientsCD'], axis='columns')
y = df.loc[:, 'patientsCnt']

In [10]:
# Train-test split. This is done BEFORE scaling so that the scaler never sees
# the held-out rows. X_train / X_test remain unscaled (same objects the
# original split produced for random_state=42), so cells that rely on their
# column names keep working.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features. The scaler is fit on the training split only and
# then applied to the test split, so no test-set statistics leak into the
# preprocessing step.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Kept for backward compatibility with any cell that reads X_scaled: the full
# feature matrix, scaled with the statistics learned from the training split.
X_scaled = scaler.transform(X)

In [13]:
# Univariate feature selection with SelectKBest (target: patientsCnt).
# X holds 51 candidate features: the 54 selected columns minus create_date,
# patientsCnt and patientsCD.
k = 15  # number of features to keep (arbitrary choice, not tuned)
selector = SelectKBest(f_regression, k=k)

# Score every feature against the target, then reduce the training matrix to
# the k best-scoring columns.
selector.fit(X_train, y_train)
X_new = selector.transform(X_train)

# Integer positions of the retained columns
selected_indices = selector.get_support(indices=True)

# Map those positions back to column names
selected_features = X.columns.take(selected_indices)
print("Selected Features:", selected_features)

Selected Features: Index(['max_ta', 'mean_ta', 'max_tafeel', 'heatwave_temp', 'heatalert_temp',
       'ta_max_am', 'ta_max_pm', 'ta_mean_pm', 'ta_max_six_am2',
       'ta_mean_six_am2', 'ta_min_six_pm1', 'ta_max_six_pm1',
       'ta_mean_six_pm1', 'ta_max_six_pm2', 'ta_mean_six_pm2'],
      dtype='object')


In [12]:
# Preview the first three rows of the working frame.
df.head(n=3)

Unnamed: 0,create_date,weekend_yn,year,month,day,min_ta,max_ta,mean_ta,gap_ta,min_tafeel,max_tafeel,mean_tafeel,gap_tafeel,min_hm,max_hm,mean_hm,gap_hm,min_wbtemp,max_wbtemp,mean_wbtemp,gap_wbtemp,min_ws,max_ws,mean_ws,tropical_3days,heatwave_temp,heatalert_temp,ta_min_3days,ta_max_3days,gap_ta_minmax,popular_man,popular_woman,agriculture_man,agriculture_woman,ta_min_am,ta_max_am,ta_mean_am,ta_min_pm,ta_max_pm,ta_mean_pm,ta_min_six_am1,ta_max_six_am1,ta_mean_six_am1,ta_min_six_am2,ta_max_six_am2,ta_mean_six_am2,ta_min_six_pm1,ta_max_six_pm1,ta_mean_six_pm1,ta_min_six_pm2,ta_max_six_pm2,ta_mean_six_pm2,patientsCnt,patientsCD
0,2014-05-01,0,2014,5,1,9.6,24.2,16.6,14.6,12.35,22.04,17.0,9.7,30.0,98.0,61.1,5.4,8.8,14.2,11.6,5.4,0.3,3.5,1.7,0,0,0,9.6,24.2,14.6,6219813,6138017,186278,188594,9,21,13,14,24,20,9.0,13.0,10.0,9,21,15,21,24,22,14,21,17,0,0
1,2014-05-01,0,2014,5,1,10.5,23.7,17.1,13.2,12.78,22.16,17.5,9.4,33.0,91.0,60.0,5.6,9.3,14.9,12.0,5.6,0.0,3.7,1.6,0,0,0,10.5,23.7,13.2,796141,782792,92687,95065,10,21,14,15,23,20,10.0,14.0,12.0,10,21,15,21,23,22,15,22,18,0,0
2,2014-05-01,0,2014,5,1,13.5,25.3,19.0,11.8,16.29,23.66,19.6,7.4,27.0,97.0,64.7,3.4,12.7,16.1,14.2,3.4,0.1,3.7,1.9,0,0,0,13.5,25.3,11.8,1241119,1252145,25045,24142,13,21,16,18,25,22,13.0,14.0,14.0,14,21,17,21,25,23,18,24,20,0,0


#### 1-2. patientsCD 기준

In [15]:
# Separate predictors from the target; this section uses patientsCD
# (the binary class label), not patientsCnt.
# The date column and both target columns are excluded from the predictors.
X = df.drop(labels=['create_date', 'patientsCnt', 'patientsCD'], axis='columns')
y = df.loc[:, 'patientsCD']

In [16]:
# Train-test split. This is done BEFORE scaling so that the scaler never sees
# the held-out rows. X_train / X_test remain unscaled (same objects the
# original split produced for random_state=42), so cells that rely on their
# column names keep working.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features. The scaler is fit on the training split only and
# then applied to the test split, so no test-set statistics leak into the
# preprocessing step.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Kept for backward compatibility with any cell that reads X_scaled: the full
# feature matrix, scaled with the statistics learned from the training split.
X_scaled = scaler.transform(X)

In [17]:
# Univariate feature selection with SelectKBest (target: patientsCD).
# patientsCD is a 0/1 class label, so the classification score function
# f_classif (ANOVA F-test) is used here rather than the regression score
# function f_regression.
from sklearn.feature_selection import f_classif

# X holds 51 candidate features: the 54 selected columns minus create_date,
# patientsCnt and patientsCD.
k = 15  # number of features to keep (arbitrary choice, not tuned)
selector = SelectKBest(score_func=f_classif, k=k)
X_new = selector.fit_transform(X_train, y_train)

# Integer positions of the retained columns
selected_indices = selector.get_support(indices=True)

# Names of the retained columns, read from the frame the selector was fit on
selected_features = X_train.columns[selected_indices]
print("Selected Features:", selected_features)

Selected Features: Index(['max_ta', 'mean_ta', 'max_tafeel', 'mean_tafeel', 'ta_max_am',
       'ta_mean_am', 'ta_max_pm', 'ta_mean_pm', 'ta_max_six_am2',
       'ta_mean_six_am2', 'ta_min_six_pm1', 'ta_max_six_pm1',
       'ta_mean_six_pm1', 'ta_max_six_pm2', 'ta_mean_six_pm2'],
      dtype='object')
