In [1]:
import pandas as pd
import numpy as np
import re
from matplotlib import pyplot as plt

# Use the AppleGothic font family for all figure text.
# NOTE(review): presumably chosen so Korean labels render; the font may be missing on non-macOS machines - confirm.
plt.rc('font', family='AppleGothic')
# Disable the Unicode minus glyph for axis tick labels (matplotlib falls back to an ASCII hyphen).
plt.rcParams['axes.unicode_minus'] = False

In [2]:
# Read the raw CSV and parse the '일시' timestamp column strictly:
# any value that does not match TIMESTAMP_FORMAT raises instead of becoming NaT.
DATA_PATH = './data/주차예측(한맥+다래)_정기권.csv'
TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S'

dataset = pd.read_csv(DATA_PATH)
dataset['일시'] = pd.to_datetime(
    dataset['일시'],
    format=TIMESTAMP_FORMAT,
    errors='raise',
)


In [3]:
# Preview the first rows to sanity-check the load and the parsed timestamp column.
dataset.head()

Unnamed: 0,일시,주차장,입차대수,출차대수,입출차 차이,입출차 누계,차량구분,연,월,일,시각,기온(°C),강수량(mm),풍속(m/s),습도(%),적설(cm)
0,2020-10-01 00:00:00,"주차1동, 주차2동",0.0,0.0,0.0,96.0,정기권,2020,10,1,0,17.3,0.0,0.2,92.0,0.0
1,2020-10-01 01:00:00,"주차1동, 주차2동",0.0,0.0,0.0,96.0,정기권,2020,10,1,1,16.8,0.0,0.5,94.0,0.0
2,2020-10-01 02:00:00,"주차1동, 주차2동",0.0,0.0,0.0,96.0,정기권,2020,10,1,2,16.9,0.0,1.1,93.0,0.0
3,2020-10-01 03:00:00,"주차1동, 주차2동",0.0,0.0,0.0,96.0,정기권,2020,10,1,3,16.8,0.0,0.4,94.0,0.0
4,2020-10-01 04:00:00,"주차1동, 주차2동",0.0,0.0,0.0,96.0,정기권,2020,10,1,4,16.1,0.0,0.1,95.0,0.0


# 데이터 전처리

In [4]:
# Substitute (make-up) holidays must be specified separately.
# NOTE(review): whether holidays.KR() already covers them depends on the installed `holidays` version - confirm.
import holidays
# Holiday calendar object for South Korea; holiday_check tests membership against it.
kr_holidays = holidays.KR()

In [5]:
def holiday_check(x):
    """Return 1 when `x` is contained in `kr_holidays`, otherwise 0.

    Parameters
    ----------
    x : date-like
        Value tested with the `in` operator against the module-level
        `kr_holidays` calendar.

    Returns
    -------
    int
        1 for a holiday, 0 for any other day.
    """
    return 1 if x in kr_holidays else 0

In [6]:
# Derive calendar features from the '일시' timestamp column.
timestamps = dataset['일시']
# 1 if the timestamp falls on a holiday known to `kr_holidays`, else 0.
dataset['공휴일'] = timestamps.map(holiday_check)
# Day of week as an integer, Monday=0 ... Sunday=6.
dataset['요일'] = timestamps.dt.dayofweek

In [7]:
# Keep only the rows of the combined '주차1동, 주차2동' lot.
dataset = dataset[dataset['주차장']=='주차1동, 주차2동']

# The former '시각2'..'시각6' copies of '시각' were removed: they were assigned on a
# filtered slice (triggering SettingWithCopyWarning) and then immediately
# discarded by the column selection below, so they never reached the model.

# Column order matters downstream: the target '입차대수' comes first (after the
# '일시' index is set), followed by the model features.
dataset = dataset[['일시','입차대수','시각','기온(°C)','강수량(mm)','풍속(m/s)','습도(%)','적설(cm)','공휴일','요일','연','월','일']]
dataset = dataset.set_index('일시')
# Features passed to LightGBM as categorical rather than numeric.
CATEGORICAL_COLUMN = ['공휴일','요일','시각','연','월','일']


In [8]:
# Disabled experiments kept for reference:
#   - add a monotonically increasing '트렌드' (trend) column
#   - restrict the frame to hours 7 through 19
#dataset['트렌드'] = range(2020,2020+len(dataset))
#dataset[(dataset['시각'] >= 7) & (dataset['시각'] <= 19)]
# Display the prepared frame (target column first, timestamp as index).
dataset

Unnamed: 0_level_0,입차대수,시각,기온(°C),강수량(mm),풍속(m/s),습도(%),적설(cm),공휴일,요일,연,월,일
일시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2020-10-01 00:00:00,0.0,0,17.3,0.0,0.2,92.0,0.0,1,3,2020,10,1
2020-10-01 01:00:00,0.0,1,16.8,0.0,0.5,94.0,0.0,1,3,2020,10,1
2020-10-01 02:00:00,0.0,2,16.9,0.0,1.1,93.0,0.0,1,3,2020,10,1
2020-10-01 03:00:00,0.0,3,16.8,0.0,0.4,94.0,0.0,1,3,2020,10,1
2020-10-01 04:00:00,0.0,4,16.1,0.0,0.1,95.0,0.0,1,3,2020,10,1
...,...,...,...,...,...,...,...,...,...,...,...,...
2022-10-01 19:00:00,1.0,19,21.1,0.0,0.1,54.0,0.0,0,5,2022,10,1
2022-10-01 20:00:00,0.0,20,19.6,0.0,0.6,63.0,0.0,0,5,2022,10,1
2022-10-01 21:00:00,0.0,21,18.8,0.0,0.7,73.0,0.0,0,5,2022,10,1
2022-10-01 22:00:00,1.0,22,18.1,0.0,0.8,75.0,0.0,0,5,2022,10,1


# LGBM
##### 생각해야 할 것

In [9]:
from datetime import datetime

In [10]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
from lightgbm import LGBMRegressor

# Chronological hold-out split: everything before SPLIT_DATE trains the model,
# everything from SPLIT_DATE onward is used for evaluation.
SPLIT_DATE = datetime(2022, 5, 1)
TARGET_COLUMN = '입차대수'

train_dataset = dataset[dataset.index < SPLIT_DATE]
train_x = train_dataset.drop(columns=TARGET_COLUMN)
train_y = train_dataset[TARGET_COLUMN]

# `>=` (not `>`) so the row stamped exactly at SPLIT_DATE lands in the test set;
# with strict `<` / `>` on both sides that row was silently dropped from both.
test_dataset = dataset[dataset.index >= SPLIT_DATE]
test_x = test_dataset.drop(columns=TARGET_COLUMN)
test_y = test_dataset[TARGET_COLUMN]



In [11]:
# Hyper-parameters forwarded verbatim to LGBMRegressor(**params).
params = dict(
    learning_rate=0.01,
    max_depth=20,
    boosting='gbdt',
    objective='regression',
    metric='mse',
    is_training_metric=True,
    num_leaves=169,
    feature_fraction=0.9,
    bagging_fraction=0.7,
    bagging_freq=10,
    n_estimators=500,
    seed=1111,
)

In [None]:
# Train the regressor on the training window, treating the calendar columns
# listed in CATEGORICAL_COLUMN as categorical features.
model = LGBMRegressor(**params)
# The former `verbose=100` keyword was dropped: `fit()` no longer accepts it in
# LightGBM >= 4.0 (TypeError), and without an `eval_set` it produced no output.
# To log evaluation results, pass an `eval_set` together with
# `callbacks=[lightgbm.log_evaluation(100)]`.
model.fit(train_x, train_y, categorical_feature=CATEGORICAL_COLUMN)





In [None]:
# Predict on the hold-out features, then post-process in one vectorised step:
# negative predictions are clamped to 0 and the rest are rounded to whole numbers
# (same result as the former element-by-element loop).
predict_test = model.predict(test_x)
predict_test = np.round(np.clip(predict_test, 0, None))
predict_test

In [None]:
# Evaluate the post-processed predictions against the held-out actuals.
residuals = test_y - predict_test

MAE = mean_absolute_error(test_y, predict_test)
# Total absolute error as a percentage of the total actual volume.
MAE_per = residuals.abs().sum() / test_y.sum() * 100
print('MAE =', MAE)
print('MAE_per =', MAE_per)

RMSE = np.sqrt(mean_squared_error(test_y, predict_test))
# A per-row percentage error is undefined where the actual value is 0 (division
# by zero gave inf/NaN for the whole metric), so it is computed over the rows
# with a non-zero actual only.
nonzero_actual = test_y != 0
RMSE_per = np.sqrt(np.mean(np.square(residuals[nonzero_actual] / test_y[nonzero_actual]))) * 100
print('RMSE =', RMSE)
print('RMSE(per) =', RMSE_per)


In [None]:
# Overlay actual and predicted values on one time-indexed line chart.
predicted_series = pd.Series(predict_test, index=test_y.index, name='예측값')
actual_vs_predicted = pd.concat([test_y, predicted_series], axis=1)
actual_vs_predicted.plot()

In [None]:
# Side-by-side table of actual vs. predicted values, then show only the rows
# whose prediction is at least 5.
predicted_column = pd.Series(predict_test, index=test_y.index, name='예측값')
result = pd.concat([test_y, predicted_column], axis=1)
result[result['예측값'].ge(5)]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Pair each importance with the column the model was actually fitted on.
# `train_x` excludes the target, so its columns line up one-to-one with
# `model.feature_importances_`. `dataset.columns` still has the target as its
# first column, which shifted every label by one and dropped the last feature
# when `zip` truncated to the shorter sequence.
feature_imp = pd.DataFrame(sorted(zip(model.feature_importances_, train_x.columns)), columns=['Value','Feature'])

plt.figure(figsize=(20, 10))
# Largest importance at the top of the bar chart.
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
# The model is fitted once (no cross-validation folds), so the title no longer
# claims an average over folds.
plt.title('LightGBM Feature Importance')
plt.tight_layout()
plt.show()