In [1]:
import pandas as pd
import numpy as np

In [2]:
from preprocess import *
from Data_Load import *
from parameters import *

In [3]:
from pycaret.regression import *

## 1. 기본 데이터 
#### (1시간 단위의 교통 카드 이력 데이터, 정류장별 위/경도 좌표 포함)

In [4]:
# Name of the column the models will predict.
target_col = "totalcnt"
# Base dataset for the notebook.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
base_data = pd.read_pickle("/home/seho/Passenger_Demand/data/base_data.pkl")

In [5]:
# Year of the first record as a 4-digit string; passed as `start_year` to the
# event and festival loaders. Only the first row is formatted, and it is picked
# by position, so this neither stringifies the whole column nor requires the
# index to contain the label 0.
base_year = base_data["transdate"].iloc[:1].dt.strftime("%Y").iloc[0]
# Target city (Ulsan); passed as `select_region` to the region-aware loaders.
city = "울산"

## 2. 외부 데이터 수집

In [6]:
# Collect public-holiday data (save_tf=True with the api_data directory as save_path;
# holiday_data.csv is read back from that directory later in the notebook).
Load_Holiday_Data(
    holiday_params_dict,
    save_tf=True,
    save_path="/home/seho/Passenger_Demand/data/api_data/",
)

In [7]:
# Collect weather data (save_tf=True with the api_data directory as save_path;
# weather_data.csv is read back from that directory later in the notebook).
Load_Weather_Data(
    weather_params_dict,
    save_tf=True,
    save_path="/home/seho/Passenger_Demand/data/api_data/",
)

n_rows : 900, total_count : 8064, max_page = 9


In [8]:
# Collect particulate-matter (fine dust) alert data (save_tf=True with the api_data
# directory as save_path; pm_data.csv is read back from that directory later).
Load_Particulate_Matter_Data(
    pm_params_dict,
    save_tf=True,
    save_path="/home/seho/Passenger_Demand/data/api_data/",
)

n_rows : 1000, total_count : 304, max_page = 1


In [9]:
# (Disabled) Collect commercial-district (trading area) data.
# The notebook instead loads a pre-existing trading-area CSV in the load section.
# SECURITY: never hard-code the Google API key here. A literal key was previously
# committed in this cell and should be rotated. Read it from the environment
# instead (requires `import os`).
# Load_Trading_Data(trading_params_dict,
#                   google_key = os.environ["GOOGLE_API_KEY"],
#                   select_region = '울산')

In [10]:
# Collect hospital data (save_tf=True with the api_data directory as save_path;
# hospital_data.csv is read back from that directory later in the notebook).
Load_Hospital_Data(
    hospital_params_dict,
    save_tf=True,
    save_path="/home/seho/Passenger_Demand/data/api_data/",
)

n_rows : 1000, total_count : 1406, max_page = 2


In [11]:
# (Disabled) Collect school data (elementary / middle / high schools).
# The notebook instead loads a pre-existing school_data.csv in the load section.
# Load_School_Data(school_params_dict,
#                  select_region = city)

In [12]:
# (Disabled) Collect university data.
# SECURITY: never hard-code the Google API key here. A literal key was previously
# committed in this cell and should be rotated. Read it from the environment
# instead (requires `import os`).
# The original snippet was also missing the comma after `save_tf = True`.
# Load_University_Data(university_params_dict,
#                      google_key = os.environ["GOOGLE_API_KEY"],
#                      select_region = city,
#                      save_tf = True,
#                      save_path = "/home/seho/Passenger_Demand/data/api_data/")

In [13]:
# Collect event data for the selected city, starting from the base year
# (save_tf=True with the api_data directory as save_path; event_data.csv is
# read back from that directory later in the notebook).
Load_Event_Data(
    event_params_dict,
    start_year=base_year,
    select_region=city,
    save_tf=True,
    save_path="/home/seho/Passenger_Demand/data/api_data/",
)

n_rows : 1000, total_count : 6396, max_page = 7


In [14]:
# Collect festival data for the selected city, starting from the base year
# (save_tf=True with the api_data directory as save_path; festival_data.csv is
# read back from that directory later in the notebook).
Load_Festival_Data(
    festival_params_dict,
    start_year=base_year,
    select_region=city,
    save_tf=True,
    save_path="/home/seho/Passenger_Demand/data/api_data/",
)

n_rows : 1000, total_count : 1026, max_page = 2


## 3. 외부 데이터 Load

In [15]:
# Temporal sources: public holidays, weather, particulate-matter alerts.
holiday_data, weather_data, pm_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/home/seho/Passenger_Demand/data/api_data/holiday_data.csv",
        "/home/seho/Passenger_Demand/data/api_data/weather_data.csv",
        "/home/seho/Passenger_Demand/data/api_data/pm_data.csv",
    )
)

# Spatial sources: trading areas (pre-existing file), hospitals,
# schools (pre-existing file).
trading_area_data, hospital_data, school_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/home/seho/Passenger_Demand/data/울산광역시_상권정보_201231.csv",
        "/home/seho/Passenger_Demand/data/api_data/hospital_data.csv",
        "/home/seho/Passenger_Demand/data/school_data.csv",
    )
)

# Spatio-temporal sources: events and festivals.
event_data, festival_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/home/seho/Passenger_Demand/data/api_data/event_data.csv",
        "/home/seho/Passenger_Demand/data/api_data/festival_data.csv",
    )
)

In [16]:
# Build the preprocessing object from the base data plus every external source
# loaded above; the pipeline itself is executed by `preprocessing.run()`.
preprocessing = preprocess(
    # base data and the columns that identify time, target and stop
    data=base_data,
    date_col="transdate",
    target_cols=target_col,
    stop_id_col="stop_id",
    # temporal sources
    holiday_data=holiday_data,
    weather_data=weather_data,
    pm_data=pm_data,
    # spatial sources
    trading_area_data=trading_area_data,
    hospital_data=hospital_data,
    school_data=school_data,
    # spatio-temporal sources
    event_data=event_data,
    festival_data=festival_data,
    # presumably the number of worker processes - confirm in preprocess.py
    num_cores=12,
)

In [17]:
# Run the preprocessing pipeline. The recorded output lists five stages:
# missing-value imputation, time-series feature generation, temporal features
# (holidays, weather, particulate-matter alerts), spatial features (trading
# areas, schools, hospitals) and spatio-temporal features (events, festivals),
# ending with a frame of shape (563300, 177).
ml_data = preprocessing.run()

1. 결측치 Impute ... Finished ((563300, 14))
2. 시계열 변수 생성 : Finished ((563300, 66))
3. 시간적 특성 변수 추가 (특일, 날씨, 미세먼지 경보) ... Finished ((563300, 74))
4. 공간적 특성 정보 추가 (상권정보, 학교정보, 병원정보) ... Finished ((563300, 175))
5. 시공간적 특성 정보 추가 (행사정보, 축제정보) ... Finished ((563300, 177))


In [18]:
# Remove the `base_data` name now that `ml_data` has been built.
# NOTE: this cell is not idempotent - running it a second time raises NameError,
# as does re-running any earlier cell that reads `base_data`.
del base_data

## 4. 모델 학습

In [19]:
# Columns passed to PyCaret's setup() as categorical features.
categorical_feature = [
    "dayofweek",
    "hour",
    "ntl_holi",
    "holi",
    "seq_holi",
]

In [20]:
# Every column that is neither categorical nor the target is declared numeric,
# keeping the original column order.
# NOTE(review): this list also contains any columns later passed to setup() as
# ignore_features (e.g. "month", "longitude"), if they are present in ml_data.
numerical_feature = ml_data.columns[
    ~ml_data.columns.isin(categorical_feature + [target_col])
].tolist()

#### 데이터 Setup

In [21]:
# Initialise the PyCaret regression experiment on the preprocessed frame.
# The recorded output reports 160 numeric and 5 categorical features and a
# transformed training set of (394309, 189).
reg_experiment = setup(
    ml_data,
    # Use the same variable that numerical_feature was built from, so the
    # declared target and the excluded column can never diverge.
    target=target_col,
    categorical_features=categorical_feature,
    numeric_features=numerical_feature,
    # Columns excluded from modelling.
    ignore_features=[
        "month",
        "weekofyear",
        "longitude",
        "latitude",
        "studentcnt",
        "childcnt",
        "normalcnt",
    ],
    train_size=0.7,    # 70 % of the rows go to the training split
    n_jobs=14,
    session_id=12345,  # fixed session id, shown in the recorded setup summary
    # Optional experiment logging / profiling flags, currently disabled:
    # log_experiment=True,
    # log_plots=True,
    # log_profile=True,
    # log_data=True,
    # profile=True,
    silent=True,       # presumably skips the interactive dtype confirmation (PyCaret 2.x) - confirm
    experiment_name="Passenger_Demand",
)

Unnamed: 0,Description,Value
0,session_id,12345
1,Target,totalcnt
2,Original Data,"(563300, 173)"
3,Missing Values,True
4,Numeric Features,160
5,Categorical Features,5
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(394309, 189)"


#### 초도 모델링

In [None]:
%%time
# Train and compare a shortlist of regressors. The recorded output shows the
# LightGBM row (MAE 1.587, RMSE 2.8686, R2 0.7677).
base_models = compare_models(
    include=[
        "lightgbm",
        "rf",
        "gbr",
        "lr",
        "ridge",
        "lasso",
        "en",
        "knn",
    ]
)

IntProgress(value=0, description='Processing: ', max=44)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,1.587,8.23,2.8686,0.7677,0.552,0.5649,7.273


#### 모델 선택 및 생성

In [None]:
%%time
# Create the LightGBM model through PyCaret's create_model().
# NOTE(review): the variable name `lightgbm` would shadow the lightgbm package
# if that package were ever imported in this notebook.
lightgbm = create_model('lightgbm')

#### 모델 튜닝

In [None]:
%%time
# Hyper-parameter search for the LightGBM model: Optuna with the TPE algorithm,
# 10 iterations, 10 folds, optimising RMSE.
# NOTE(review): PyCaret documents return_tuner=True as returning a
# (model, tuner) tuple, so this name is presumably bound to a tuple rather than
# the bare model - confirm how later cells use it.
tuned_lightgbm_optuna_tpe = tune_model(
    lightgbm,
    optimize="RMSE",
    search_library="optuna",
    search_algorithm="tpe",
    fold=10,
    return_tuner=True,
    n_iter=10,
)