<a href="https://colab.research.google.com/github/KOO-96/Average_temp_Seoul-Prophet-/blob/main/%EB%8D%B0%EC%9D%B4%EC%BD%98_%EC%84%9C%EC%9A%B8%EC%8B%9C_%ED%8F%89%EA%B7%A0_%EA%B8%B0%EC%98%A8_%EC%98%88%EC%B8%A1_%ED%95%B4%EC%BB%A4%ED%86%A4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv]
!rm ~/.cache/matplotlib -rf

# Library

In [2]:
import pandas as pd
from prophet import Prophet
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from prophet.diagnostics import cross_validation, performance_metrics

# Use the Nanum font family for all matplotlib text.
plt.rcParams['font.family'] = 'NanumBarunGothic'

In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV files read below live under MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


# Load

In [4]:
# Read the training data and the submission template from the mounted Drive.
train, submission = (
    pd.read_csv('/content/drive/MyDrive/기온/train.csv'),
    pd.read_csv('/content/drive/MyDrive/기온/sample_submission.csv'),
)

In [28]:
# Preview the first five rows.
# NOTE(review): the saved output shows the renamed ds/y columns, so this cell was
# last executed after the rename step; re-run top to bottom to refresh it.
train.head(n=5)

Unnamed: 0,ds,최고기온,최저기온,일교차,강수량,평균습도,평균풍속,일조합,일사합,일조율,y
0,1960-01-01,2.2,-5.2,7.4,0.4,68.3,1.7,6.7,4.81,28.1,-1.6
1,1960-01-02,1.2,-5.6,6.8,0.4,87.7,1.3,0.0,4.81,28.1,-1.9
2,1960-01-03,8.7,-2.1,10.8,0.0,81.3,3.0,0.0,4.81,28.1,4.0
3,1960-01-04,10.8,1.2,9.6,0.0,79.7,4.4,2.6,4.81,28.1,7.5
4,1960-01-05,1.3,-8.2,9.5,0.0,44.0,5.1,8.2,4.81,28.1,-4.6


In [7]:
# Show column dtypes and non-null counts of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23011 entries, 0 to 23010
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   일시      23011 non-null  object 
 1   최고기온    23008 non-null  float64
 2   최저기온    23008 non-null  float64
 3   일교차     23007 non-null  float64
 4   강수량     9150 non-null   float64
 5   평균습도    23011 non-null  float64
 6   평균풍속    23007 non-null  float64
 7   일조합     22893 non-null  float64
 8   일사합     18149 non-null  float64
 9   일조율     22645 non-null  float64
 10  평균기온    23011 non-null  float64
dtypes: float64(10), object(1)
memory usage: 1.9+ MB


In [8]:
# Count missing values per column.
train.isna().sum()

일시          0
최고기온        3
최저기온        3
일교차         4
강수량     13861
평균습도        0
평균풍속        4
일조합       118
일사합      4862
일조율       366
평균기온        0
dtype: int64

일조합 : 하루 동안의 일조시간(구름이나 안개 따위에 가려지지 아니하고 햇볕이 실제로 내리쬐는 시간)의 합계  
일사합 : 하루 동안의 일사량(태양의 복사 에너지가 땅에 닿는 양)의 합계  
일교차 : 최고기온 - 최저기온  
일조율 : 햇볕이 실제로 내리쬐는 시간(일조시간) / 해가 떠 있는 시간(가조시간)  
-> 가조시간이 같다면 일조합이 클수록 일조율도 커진다.

# Preprocessing

In [9]:
# Baseline preprocessing

# Parse the date column into datetimes and make it the index in one chained step.
train = (
    train
    .assign(**{'일시': pd.to_datetime(train['일시'])})
    .set_index('일시')
)

# Declare the time step of the index as daily.
train.index.freq = 'D'

In [20]:
# Preprocessing
# Missing values: filling from the previous row was tried first, but 강수량 kept
# missing values because the preceding rows were themselves missing. The gaps are
# therefore filled from the next valid observation (back-fill) instead.

nu = ['최고기온', '최저기온', '일교차', '강수량', '평균풍속', '일조합', '일사합', '일조율']
# Assign the filled columns back explicitly: calling fillna(inplace=True) on
# train[i] is a chained in-place update that stops modifying `train` under pandas
# copy-on-write, and fillna(method=...) is deprecated in favour of bfill().
train[nu] = train[nu].bfill()
train.isnull().sum()

최고기온    0
최저기온    0
일교차     0
강수량     0
평균습도    0
평균풍속    0
일조합     0
일사합     0
일조율     0
평균기온    0
dtype: int64

In [21]:
# Move the date index back to a column and use the names Prophet expects (ds, y).
train = train.reset_index().rename(columns={'일시': 'ds', '평균기온': 'y'})

In [22]:
# Preview the first five rows after the rename.
train.head()

Unnamed: 0,ds,최고기온,최저기온,일교차,강수량,평균습도,평균풍속,일조합,일사합,일조율,y
0,1960-01-01,2.2,-5.2,7.4,0.4,68.3,1.7,6.7,4.81,28.1,-1.6
1,1960-01-02,1.2,-5.6,6.8,0.4,87.7,1.3,0.0,4.81,28.1,-1.9
2,1960-01-03,8.7,-2.1,10.8,0.0,81.3,3.0,0.0,4.81,28.1,4.0
3,1960-01-04,10.8,1.2,9.6,0.0,79.7,4.4,2.6,4.81,28.1,7.5
4,1960-01-05,1.3,-8.2,9.5,0.0,44.0,5.1,8.2,4.81,28.1,-4.6


# modeling

In [None]:
# Baseline Prophet model with default hyperparameters, fitted on the full history.
prophet = Prophet()
prophet.fit(df=train)

In [23]:
# Predict: extend the training timeline by one daily step per submission row.
future_data = prophet.make_future_dataframe(periods=len(submission), freq='D')
forecast_data = prophet.predict(future_data)

# The forecast is copied into the submission by position, so first verify that
# the rows appended after the history carry exactly the submission dates.
forecast_tail = forecast_data.tail(len(submission))
assert (
    forecast_tail['ds'].values == pd.to_datetime(submission['일시']).values
).all(), 'forecast dates do not match submission dates'

# Submission
submission['평균기온'] = forecast_tail['yhat'].values
# submission.to_csv("prophet_submission.csv", index=False)
submission.tail(5)

INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmpeolr8yat/ngqfzwus.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpeolr8yat/qebk10uz.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.10/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=58053', 'data', 'file=/tmp/tmpeolr8yat/ngqfzwus.json', 'init=/tmp/tmpeolr8yat/qebk10uz.json', 'output', 'file=/tmp/tmpeolr8yat/prophet_modelnac9_qs9/prophet_model-20240102071910.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
07:19:10 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
07:19:15 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


             일시      평균기온
353  2023-12-20  0.400586
354  2023-12-21  0.293537
355  2023-12-22  0.161249
356  2023-12-23  0.096136
357  2023-12-24 -0.091857


> 2000년도 이후 데이터를 사용한 예측시도

In [29]:
# Reload the raw CSVs so the post-2000 experiment starts from unmodified data.
train, submission = map(
    pd.read_csv,
    (
        '/content/drive/MyDrive/기온/train.csv',
        '/content/drive/MyDrive/기온/sample_submission.csv',
    ),
)

In [30]:
# Baseline preprocessing (repeated for the reloaded data)

# Parse the date column and use it as the index.
train['일시'] = pd.to_datetime(train['일시'])
train = train.set_index('일시')

# Declare the time step of the index as daily.
train.index.freq = 'D'

# Preprocessing
# Missing values: filling from the previous row was tried first, but 강수량 kept
# missing values because the preceding rows were themselves missing. The gaps are
# therefore filled from the next valid observation (back-fill) instead.

nu = ['최고기온', '최저기온', '일교차', '강수량', '평균풍속', '일조합', '일사합', '일조율']
# Assign the filled columns back explicitly: calling fillna(inplace=True) on
# train[i] is a chained in-place update that stops modifying `train` under pandas
# copy-on-write, and fillna(method=...) is deprecated in favour of bfill().
train[nu] = train[nu].bfill()
train.isnull().sum()

최고기온    0
최저기온    0
일교차     0
강수량     0
평균습도    0
평균풍속    0
일조합     0
일사합     0
일조율     0
평균기온    0
dtype: int64

In [31]:
# Move the date index back to a column and use the names Prophet expects (ds, y).
prophet_names = {'일시': 'ds', '평균기온': 'y'}
train = train.reset_index().rename(columns=prophet_names)

In [32]:
# Keep only observations from 2000 onward. The cutoff is inclusive so that
# 2000-01-01 itself is kept, and the stray tab inside the date literal was removed.
train = train[train['ds'] >= '2000-01-01']

In [33]:
pip install optuna

Collecting optuna
  Downloading optuna-3.5.0-py3-none-any.whl (413 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.4/413.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.0-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.0-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.0 alembic-1.13.1 colorlog-6.8.0 optuna-3.5.0


In [None]:
import optuna
from prophet import Prophet
from sklearn.metrics import mean_absolute_error

def objective(trial, n_valid=365):
    """Optuna objective: hold-out MAE of a Prophet model on the last rows of `train`.

    The last `n_valid` rows of the global `train` frame are held out for
    validation (the original note describes them as the 2022 data) and the model
    is fitted on the rows before them only. Fitting on the full frame, as before,
    scored the model on data it had already seen.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        n_valid: Number of trailing rows held out for validation; must be positive.

    Returns:
        Mean absolute error between `y` and `yhat` on the held-out rows.
    """
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform,
    # and suggest_float replaces suggest_uniform; parameter names are unchanged.
    # NOTE(review): interval_width should only affect the uncertainty band, not
    # yhat, so MAE likely cannot distinguish its values. It is still sampled so
    # that study.best_params keeps the 'interval_width' key read after the search.
    model = Prophet(
        changepoint_prior_scale=trial.suggest_float(
            'changepoint_prior_scale', 0.001, 0.5, log=True
        ),
        seasonality_prior_scale=trial.suggest_float(
            'seasonality_prior_scale', 0.1, 100.0, log=True
        ),
        interval_width=trial.suggest_float('interval_width', 0.7, 0.9),
    )

    # Chronological split: everything before the validation window is training data.
    fit_df = train.iloc[:-n_valid]
    valid = train.iloc[-n_valid:]

    # Training on the pre-validation rows only.
    model.fit(fit_df)

    # Objective value: MAE on the held-out validation rows (to be minimised).
    y_true = valid['y'].values
    forecast = model.predict(valid[['ds']])
    y_pred = forecast['yhat'].values

    return mean_absolute_error(y_true, y_pred)

# Run the Optuna search. TPE is Optuna's default sampler; seeding it makes the
# sequence of sampled hyperparameters repeatable across runs.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# Report the best trial and its parameters.
print('Best trial:')
trial = study.best_trial
print(f'Value: {trial.value:.4f}')
print('Params: ')
for key, value in trial.params.items():
    print(f'{key}: {value}')


In [35]:
# Collect the tuned hyperparameters from the finished study.
optimal_params = {
    name: study.best_params[name]
    for name in (
        'changepoint_prior_scale',
        'seasonality_prior_scale',
        'interval_width',
    )
}

# Refit Prophet on the whole (post-2000) training frame with the tuned settings.
model = Prophet(**optimal_params)
model.fit(train)

INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmpeolr8yat/0yjdg2co.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpeolr8yat/2nmkjo5n.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.10/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=71176', 'data', 'file=/tmp/tmpeolr8yat/0yjdg2co.json', 'init=/tmp/tmpeolr8yat/2nmkjo5n.json', 'output', 'file=/tmp/tmpeolr8yat/prophet_modeli6ot14i7/prophet_model-20240102083547.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
08:35:47 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
08:35:48 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


<prophet.forecaster.Prophet at 0x7c4161dcf4c0>

In [37]:
# Predict: extend the training timeline by one daily step per submission row.
future_data = model.make_future_dataframe(periods=len(submission), freq='D')
forecast_data = model.predict(future_data)

# The forecast is copied into the submission by position, so first verify that
# the rows appended after the history carry exactly the submission dates.
forecast_tail = forecast_data.tail(len(submission))
assert (
    forecast_tail['ds'].values == pd.to_datetime(submission['일시']).values
).all(), 'forecast dates do not match submission dates'

# Submission
submission['평균기온'] = forecast_tail['yhat'].values
# submission.to_csv("prophet_submission2.csv", index=False)
submission.tail(5)