In [1]:
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from pmdarima import auto_arima
from statsmodels.tsa.arima.model import ARIMA
from prophet import Prophet
from prophet.diagnostics import cross_validation, performance_metrics
from prophet.plot import plot_cross_validation_metric
import itertools

In [3]:
# 파일 경로
train_path = 'C:/Users/systj/bigdata/dacon/train.csv' # 학습 데이터 파일 경로
submission_path = 'C:/Users/systj/bigdata/dacon/sample_submission.csv' # 제출 양식 파일 경로

# 데이터 불러오기
train_df = pd.read_csv(train_path)
submission_df = pd.read_csv(submission_path)

display(train_df.head())
display(submission_df.head())

Unnamed: 0,일시,최고기온,최저기온,일교차,강수량,평균습도,평균풍속,일조합,일사합,일조율,평균기온
0,1960-01-01,2.2,-5.2,7.4,,68.3,1.7,6.7,,,-1.6
1,1960-01-02,1.2,-5.6,6.8,0.4,87.7,1.3,0.0,,,-1.9
2,1960-01-03,8.7,-2.1,10.8,0.0,81.3,3.0,0.0,,,4.0
3,1960-01-04,10.8,1.2,9.6,0.0,79.7,4.4,2.6,,,7.5
4,1960-01-05,1.3,-8.2,9.5,,44.0,5.1,8.2,,,-4.6


Unnamed: 0,일시,평균기온
0,2023-01-01,0
1,2023-01-02,0
2,2023-01-03,0
3,2023-01-04,0
4,2023-01-05,0


In [4]:
# 결측값 체크
train_df.isna().sum()

일시          0
최고기온        3
최저기온        3
일교차         4
강수량     13861
평균습도        0
평균풍속        4
일조합       118
일사합      4862
일조율       366
평균기온        0
dtype: int64

In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23011 entries, 0 to 23010
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   일시      23011 non-null  object 
 1   최고기온    23008 non-null  float64
 2   최저기온    23008 non-null  float64
 3   일교차     23007 non-null  float64
 4   강수량     9150 non-null   float64
 5   평균습도    23011 non-null  float64
 6   평균풍속    23007 non-null  float64
 7   일조합     22893 non-null  float64
 8   일사합     18149 non-null  float64
 9   일조율     22645 non-null  float64
 10  평균기온    23011 non-null  float64
dtypes: float64(10), object(1)
memory usage: 1.9+ MB


# 전처리

In [6]:
# 뒤의 값으로 대체
train_df['최고기온'].fillna(method='ffill', inplace=True)
train_df['최저기온'].fillna(method='ffill', inplace=True)
train_df['일교차'].fillna(method='ffill', inplace=True)
train_df['평균풍속'].fillna(method='ffill', inplace=True)
train_df['일조합'].fillna(method='ffill', inplace=True)

In [7]:
# 매년 같은 날짜의 평균 값으로 대체
train_df['일사합'] = train_df.groupby(train_df['일시'].str[5:])['일사합'].transform(lambda x: x.fillna(x.mean()))
train_df['일조율'] = train_df.groupby(train_df['일시'].str[5:])['일조율'].transform(lambda x: x.fillna(x.mean()))
train_df

Unnamed: 0,일시,최고기온,최저기온,일교차,강수량,평균습도,평균풍속,일조합,일사합,일조율,평균기온
0,1960-01-01,2.2,-5.2,7.4,,68.3,1.7,6.7,6.1932,47.535484,-1.6
1,1960-01-02,1.2,-5.6,6.8,0.4,87.7,1.3,0.0,5.9464,50.775806,-1.9
2,1960-01-03,8.7,-2.1,10.8,0.0,81.3,3.0,0.0,6.7872,55.535484,4.0
3,1960-01-04,10.8,1.2,9.6,0.0,79.7,4.4,2.6,7.2344,58.427419,7.5
4,1960-01-05,1.3,-8.2,9.5,,44.0,5.1,8.2,6.8800,55.600000,-4.6
...,...,...,...,...,...,...,...,...,...,...,...
23006,2022-12-27,3.3,-7.3,10.6,,69.8,1.8,8.8,10.2500,91.700000,-2.6
23007,2022-12-28,0.1,-6.0,6.1,0.1,58.1,2.5,8.7,10.8600,90.600000,-3.3
23008,2022-12-29,2.1,-7.8,9.9,0.0,56.3,1.7,9.0,10.8800,93.800000,-2.9
23009,2022-12-30,2.3,-4.4,6.7,0.0,65.6,1.9,7.9,10.8400,82.300000,-1.8


In [8]:
train_df.isna().sum()

일시          0
최고기온        0
최저기온        0
일교차         0
강수량     13861
평균습도        0
평균풍속        0
일조합         0
일사합         0
일조율         0
평균기온        0
dtype: int64

In [9]:
# RandomForest 모델 학습
x = train_df.drop(['일시'],axis=1).dropna()
x = x.drop('강수량', axis=1)
y = train_df.dropna()['강수량']

x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2)

md = RandomForestRegressor(n_estimators=300)
md.fit(x_train, y_train)

pred = md.predict(x_valid)
print(mean_squared_error(y_valid, pred, squared=False))

md.fit(x, y)

16.958224444735738


In [10]:
# 강수량 예측
precipitation_is_na = train_df[train_df['강수량'].isna()]
precipitation_x = precipitation_is_na.drop(['일시','강수량'], axis=1)
precipitation_y = md.predict(precipitation_x)

precipitation = precipitation_is_na.copy()
precipitation['강수량'] = precipitation_y
train_df = pd.concat([train_df.dropna(), precipitation], axis=0).sort_index()
train_df

Unnamed: 0,일시,최고기온,최저기온,일교차,강수량,평균습도,평균풍속,일조합,일사합,일조율,평균기온
0,1960-01-01,2.2,-5.2,7.4,0.420667,68.3,1.7,6.7,6.1932,47.535484,-1.6
1,1960-01-02,1.2,-5.6,6.8,0.400000,87.7,1.3,0.0,5.9464,50.775806,-1.9
2,1960-01-03,8.7,-2.1,10.8,0.000000,81.3,3.0,0.0,6.7872,55.535484,4.0
3,1960-01-04,10.8,1.2,9.6,0.000000,79.7,4.4,2.6,7.2344,58.427419,7.5
4,1960-01-05,1.3,-8.2,9.5,0.260000,44.0,5.1,8.2,6.8800,55.600000,-4.6
...,...,...,...,...,...,...,...,...,...,...,...
23006,2022-12-27,3.3,-7.3,10.6,0.382333,69.8,1.8,8.8,10.2500,91.700000,-2.6
23007,2022-12-28,0.1,-6.0,6.1,0.100000,58.1,2.5,8.7,10.8600,90.600000,-3.3
23008,2022-12-29,2.1,-7.8,9.9,0.000000,56.3,1.7,9.0,10.8800,93.800000,-2.9
23009,2022-12-30,2.3,-4.4,6.7,0.000000,65.6,1.9,7.9,10.8400,82.300000,-1.8


In [11]:
train_df.isna().sum()

일시      0
최고기온    0
최저기온    0
일교차     0
강수량     0
평균습도    0
평균풍속    0
일조합     0
일사합     0
일조율     0
평균기온    0
dtype: int64

In [12]:
# 날짜 데이터 변환
train_df['일시'] = pd.to_datetime(train_df['일시'])
train_df

Unnamed: 0,일시,최고기온,최저기온,일교차,강수량,평균습도,평균풍속,일조합,일사합,일조율,평균기온
0,1960-01-01,2.2,-5.2,7.4,0.420667,68.3,1.7,6.7,6.1932,47.535484,-1.6
1,1960-01-02,1.2,-5.6,6.8,0.400000,87.7,1.3,0.0,5.9464,50.775806,-1.9
2,1960-01-03,8.7,-2.1,10.8,0.000000,81.3,3.0,0.0,6.7872,55.535484,4.0
3,1960-01-04,10.8,1.2,9.6,0.000000,79.7,4.4,2.6,7.2344,58.427419,7.5
4,1960-01-05,1.3,-8.2,9.5,0.260000,44.0,5.1,8.2,6.8800,55.600000,-4.6
...,...,...,...,...,...,...,...,...,...,...,...
23006,2022-12-27,3.3,-7.3,10.6,0.382333,69.8,1.8,8.8,10.2500,91.700000,-2.6
23007,2022-12-28,0.1,-6.0,6.1,0.100000,58.1,2.5,8.7,10.8600,90.600000,-3.3
23008,2022-12-29,2.1,-7.8,9.9,0.000000,56.3,1.7,9.0,10.8800,93.800000,-2.9
23009,2022-12-30,2.3,-4.4,6.7,0.000000,65.6,1.9,7.9,10.8400,82.300000,-1.8


# 평균 기온 예측 모델 생성

In [13]:
# PROPHET 하이퍼파라미터 튜닝
train_df.rename(columns={'일시': 'ds', '평균기온': 'y'}, inplace=True)

In [14]:
# 튜닝할 파라미터 설정
param_grid = {  
    'changepoint_prior_scale': [0.01, 0.1, 0.5],
    'seasonality_prior_scale': [0.01, 0.1, 0.5],
    'holidays_prior_scale': [0.01, 0.1, 0.5],
    'seasonality_mode': ['additive', 'multiplicative']
}

# 가능한 모든 파라미터 조합 생성
all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]
rmses = []  # RMSE 값 저장을 위한 리스트

# 각 파라미터 조합에 대한 모델 학습 및 성능 평가
for params in all_params:
    model = Prophet(**params).fit(train_df)  # 모델 초기화 및 학습
    cv_results = cross_validation(model, initial='730 days', period='180 days', horizon = '365 days')
    df_p = performance_metrics(cv_results, rolling_window=1)
    rmses.append(df_p['rmse'].values[0])

# 최적의 파라미터 조합 찾기
best_params = all_params[rmses.index(min(rmses))]

print(best_params)

15:16:54 - cmdstanpy - INFO - Chain [1] start processing
15:16:55 - cmdstanpy - INFO - Chain [1] done processing
  0%|                                                                                          | 0/122 [00:00<?, ?it/s]15:16:58 - cmdstanpy - INFO - Chain [1] start processing
15:16:58 - cmdstanpy - INFO - Chain [1] done processing
  1%|▋                                                                                 | 1/122 [00:00<00:29,  4.08it/s]15:16:58 - cmdstanpy - INFO - Chain [1] start processing
15:16:58 - cmdstanpy - INFO - Chain [1] done processing
  2%|█▎                                                                                | 2/122 [00:00<00:27,  4.39it/s]15:16:58 - cmdstanpy - INFO - Chain [1] start processing
15:16:59 - cmdstanpy - INFO - Chain [1] done processing
  2%|██                                                                                | 3/122 [00:00<00:27,  4.31it/s]15:16:59 - cmdstanpy - INFO - Chain [1] start processing
15:16:59 - cmds

{'changepoint_prior_scale': 0.01, 'seasonality_prior_scale': 0.1, 'holidays_prior_scale': 0.01, 'seasonality_mode': 'additive'}





In [15]:
# PROPHET 모델 초기화
model = Prophet(
    changepoint_prior_scale=0.01,
    seasonality_prior_scale=0.1,
    holidays_prior_scale=0.01,
    seasonality_mode='additive'
)
# 모델 학습
model.fit(train_df)

# 미래 날짜 프레임 생성 (예: 365일 앞까지 예측)
future = model.make_future_dataframe(periods=358)

# 예측 수행
forecast = model.predict(future)

# 예측 결과 확인
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]

19:56:10 - cmdstanpy - INFO - Chain [1] start processing
19:56:10 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,ds,yhat,yhat_lower,yhat_upper
0,1960-01-01,-2.918076,-6.796784,1.094631
1,1960-01-02,-2.939561,-6.969915,1.450996
2,1960-01-03,-3.082399,-6.966887,0.653478
3,1960-01-04,-3.174794,-7.225421,0.672917
4,1960-01-05,-3.281428,-7.461188,0.807069
...,...,...,...,...
23364,2023-12-20,0.106393,-4.159299,4.488446
23365,2023-12-21,-0.000739,-4.014603,3.848129
23366,2023-12-22,-0.133006,-4.421867,3.677361
23367,2023-12-23,-0.198194,-4.389844,3.623955


In [16]:
pred = forecast[['ds','yhat']].tail(358)
pred.reset_index(inplace=True, drop=True)
pred

Unnamed: 0,ds,yhat
0,2023-01-01,-1.312731
1,2023-01-02,-1.414411
2,2023-01-03,-1.530519
3,2023-01-04,-1.595596
4,2023-01-05,-1.634156
...,...,...
353,2023-12-20,0.106393
354,2023-12-21,-0.000739
355,2023-12-22,-0.133006
356,2023-12-23,-0.198194


In [17]:
submission_df['평균기온'] = pred['yhat']
submission_df

Unnamed: 0,일시,평균기온
0,2023-01-01,-1.312731
1,2023-01-02,-1.414411
2,2023-01-03,-1.530519
3,2023-01-04,-1.595596
4,2023-01-05,-1.634156
...,...,...
353,2023-12-20,0.106393
354,2023-12-21,-0.000739
355,2023-12-22,-0.133006
356,2023-12-23,-0.198194


In [18]:
# 예측 결과 저장
submission_df.to_csv('C:/Users/systj/bigdata/dacon/baseline_submit.csv', index=False)

In [19]:
pd.read_csv('baseline_submit.csv')

Unnamed: 0,일시,평균기온
0,2023-01-01,-1.312731
1,2023-01-02,-1.414411
2,2023-01-03,-1.530519
3,2023-01-04,-1.595596
4,2023-01-05,-1.634156
...,...,...
353,2023-12-20,0.106393
354,2023-12-21,-0.000739
355,2023-12-22,-0.133006
356,2023-12-23,-0.198194
