## 구글 코랩 사용 시 구글 드라이브 연결

In [None]:
# Mount Google Drive so files stored there are reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

# Dataset directory inside the mounted Drive -> change this to the path that matches your own setup.
DATA_PATH = '/content/gdrive/MyDrive/미세먼지예측공모전/dataset/'

Mounted at /content/gdrive


## Import & Install

In [None]:
!pip install prophet

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
#Base & visualization
import os
import glob
import random
import pandas as pd
import numpy as np
import warnings
import matplotlib.pylab as plt
import seaborn as sns

#Feature engineering
from datetime import datetime

#Modeling
import prophet as pr

## Fix Seed

In [None]:
# Reproducibility settings
class CFG:
    """Notebook-wide configuration constants."""
    SEED = 42


def seed_everything(seed):
    """Seed the random number sources used in this notebook.

    Args:
        seed: Value passed to Python's ``random`` module and to NumPy's
            global generator; its string form is also written to the
            ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(CFG.SEED)  # apply the fixed seed once, up front

## Data Load

In [None]:
# Load the training and test tables from DATA_PATH.
# os.path.join inserts exactly one separator, so the paths are correct whether
# or not DATA_PATH ends with '/' (plain concatenation with '/train.csv'
# produced a doubled slash when DATA_PATH already ended with one).
train = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'), encoding='utf-8')
test = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'), encoding='utf-8')

In [None]:
# Load answer_sample.csv (presumably the submission template — confirm).
# os.path.join keeps this working even when DATA_PATH is configured without a
# trailing '/', which bare string concatenation would silently break.
submit = pd.read_csv(os.path.join(DATA_PATH, 'answer_sample.csv'))

## Modeling

In [None]:
# Translate the encoded year index (0-3) into the calendar-year string it stands for.
# NOTE: this overwrites the column, so re-running the cell on already-mapped
# values yields NaN; reload `train` before running it again.
train_year_by_code = {
    0: "2017",
    1: "2018",
    2: "2019",
    3: "2020",
}
train['연도'] = train['연도'].map(train_year_by_code)

In [None]:
# Prefix the year to the existing date/time text and parse the result into timestamps.
train_timestamp_text = train['연도'] + '-' + train['일시']
train['일시'] = pd.to_datetime(train_timestamp_text, format='%Y-%m-%d %H:%M')
# The year is now part of the timestamp, so the separate column is removed.
train = train.drop(columns=['연도'])

In [None]:
# Display the training frame after the timestamp conversion (pandas truncates the middle rows).
train

Unnamed: 0,일시,측정소,기온(°C),풍향(deg),풍속(m/s),강수량(mm),습도(%),PM2.5
0,2017-01-01 00:00:00,공주,0.173776,0.201944,0.023018,0.0,0.828,0.056
1,2017-01-01 01:00:00,공주,0.176935,0.168611,0.030691,0.0,0.831,0.060
2,2017-01-01 02:00:00,공주,0.180095,0.087222,0.033248,0.0,0.784,0.068
3,2017-01-01 03:00:00,공주,0.178515,0.087222,0.025575,0.0,0.745,0.060
4,2017-01-01 04:00:00,공주,0.164297,0.113889,0.020460,0.0,0.750,0.068
...,...,...,...,...,...,...,...,...
596083,2020-12-31 19:00:00,홍성읍,0.273302,0.832222,0.086957,0.0,0.671,0.060
596084,2020-12-31 20:00:00,홍성읍,0.271722,0.831667,0.043478,0.0,0.692,0.052
596085,2020-12-31 21:00:00,홍성읍,0.268562,0.832500,0.066496,0.0,0.706,0.044
596086,2020-12-31 22:00:00,홍성읍,0.262243,0.866944,0.043478,0.0,0.725,0.052


In [None]:
# train 데이터를 Prophet 모델에 맞게 가공한다.
train_prophet = train[["일시", "PM2.5"]]
train_prophet = train_prophet.rename(columns={"일시": "ds", "PM2.5": "y"})
train_prophet["기온"] = train["기온(°C)"]
train_prophet["풍향"] = train["풍향(deg)"]
train_prophet["풍속"] = train["풍속(m/s)"]
train_prophet["강수량"] = train["강수량(mm)"]
train_prophet["습도"] = train["습도(%)"]

In [None]:
# Create a Prophet model with default settings.
# NOTE(review): add_regressor() is never called on this model, so the weather
# columns prepared in train_prophet are presumably not used when fitting —
# confirm whether they were meant to be registered as extra regressors.
model = pr.Prophet()

In [None]:
# Fit the model on the prepared training frame (the recorded run took roughly eight minutes).
model.fit(train_prophet)

DEBUG:cmdstanpy:input tempfile: /tmp/tmp2y_1y99c/szbltyyh.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp2y_1y99c/633mq9sf.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.10/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=89256', 'data', 'file=/tmp/tmp2y_1y99c/szbltyyh.json', 'init=/tmp/tmp2y_1y99c/633mq9sf.json', 'output', 'file=/tmp/tmp2y_1y99c/prophet_modelgstwhcmi/prophet_model-20230505090648.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
09:06:48 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
09:14:34 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


<prophet.forecaster.Prophet at 0x7f13e95bc880>

In [None]:
# The test set uses a single encoded year (4); translate it to its calendar-year string.
# NOTE: re-running this cell on already-mapped values yields NaN.
test_year_by_code = {4: "2021"}
test['연도'] = test['연도'].map(test_year_by_code)

In [None]:
# Prefix the year to the existing date/time text and parse the result into timestamps.
test_timestamp_text = test['연도'] + '-' + test['일시']
test['일시'] = pd.to_datetime(test_timestamp_text, format='%Y-%m-%d %H:%M')
# The year is now part of the timestamp, so the separate column is removed.
test = test.drop(columns=['연도'])

In [None]:
# Hourly timestamps of the 72-hour window to forecast (2021-01-03 to 2021-01-05).
# The year has to be 2021, the calendar year the test set is mapped to; the
# previous value 2001 asked the model to extrapolate far outside the
# 2017-2020 training range.
# The lowercase 'h' alias is used because the uppercase 'H' frequency alias is
# deprecated in recent pandas releases.
dates = pd.date_range('2021-01-03 00:00', '2021-01-05 23:00', freq='h')

In [None]:
# Take the first 48 rows of the test set and give the timestamp and weather
# columns the same short names used in train_prophet.
test_column_names = {
    "일시": "ds",
    "기온(°C)": "기온",
    "풍향(deg)": "풍향",
    "풍속(m/s)": "풍속",
    "강수량(mm)": "강수량",
    "습도(%)": "습도",
}
test_prophet = test.iloc[:48][list(test_column_names)].rename(columns=test_column_names)

In [None]:
# Build the frame handed to model.predict(): the forecast timestamps in `ds`
# plus zero-filled placeholder columns for the weather measurements.
# NOTE(review): these placeholder names differ from the short names used in
# train_prophet, and the values are all 0 — confirm they are not meant to feed
# the model.
placeholder_columns = ['기온(°C)', '풍향(deg)', '풍속(m/s)', '강수량(mm)', '습도(%)']
future = pd.DataFrame({
    'ds': dates,
    **{column: [0] * len(dates) for column in placeholder_columns},
})

In [None]:
# Display the frame of forecast timestamps and placeholder columns.
future

Unnamed: 0,ds,기온(°C),풍향(deg),풍속(m/s),강수량(mm),습도(%)
0,2001-01-03 00:00:00,0,0,0,0,0
1,2001-01-03 01:00:00,0,0,0,0,0
2,2001-01-03 02:00:00,0,0,0,0,0
3,2001-01-03 03:00:00,0,0,0,0,0
4,2001-01-03 04:00:00,0,0,0,0,0
...,...,...,...,...,...,...
67,2001-01-05 19:00:00,0,0,0,0,0
68,2001-01-05 20:00:00,0,0,0,0,0
69,2001-01-05 21:00:00,0,0,0,0,0
70,2001-01-05 22:00:00,0,0,0,0,0


In [None]:
# Run the fitted model over the future timestamps; the result is kept in `forecast`.
forecast = model.predict(future)

In [None]:
# List the columns of the forecast result (`yhat` holds the point prediction).
forecast.columns

Index(['ds', 'trend', 'yhat_lower', 'yhat_upper', 'trend_lower', 'trend_upper',
       'additive_terms', 'additive_terms_lower', 'additive_terms_upper',
       'daily', 'daily_lower', 'daily_upper', 'weekly', 'weekly_lower',
       'weekly_upper', 'yearly', 'yearly_lower', 'yearly_upper',
       'multiplicative_terms', 'multiplicative_terms_lower',
       'multiplicative_terms_upper', 'yhat'],
      dtype='object')

In [None]:
# Show each forecast timestamp with its point prediction. A bare expression
# uses the notebook's rich DataFrame rendering instead of plain-text print().
forecast[['ds', 'yhat']]

                    ds      yhat
0  2001-01-03 00:00:00  7.579765
1  2001-01-03 01:00:00  7.580989
2  2001-01-03 02:00:00  7.580424
3  2001-01-03 03:00:00  7.577949
4  2001-01-03 04:00:00  7.575342
..                 ...       ...
67 2001-01-05 19:00:00  7.568879
68 2001-01-05 20:00:00  7.570593
69 2001-01-05 21:00:00  7.570344
70 2001-01-05 22:00:00  7.569536
71 2001-01-05 23:00:00  7.569724

[72 rows x 2 columns]
