In [1]:
# Shared notebook setup: warning suppression, imports, and display/plot defaults.

# Silence every warning; done first so it also covers the imports below.
import warnings
warnings.filterwarnings('ignore')

# Libraries used throughout the notebook.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import font_manager as fm
# `display` renders data frames as rich output inside a cell.
from IPython.display import display

# Register the font file (path is relative to the working directory) with matplotlib.
fm.fontManager.addfont('./TaipeiSansTCBeta-Regular.ttf')

# NumPy: four decimals, no scientific notation when printing floats.
np.set_printoptions(precision=4, suppress=True)
# pandas: four-decimal float formatting, and show every column of a frame.
pd.set_option("display.float_format", '{:.4f}'.format)
pd.set_option("display.max_columns", None)

# Default font size and font family for all figures.
plt.rcParams.update({
    "font.size": 14,
    'font.family': 'Taipei Sans TC Beta',
})

# Random seed value.
random_seed = 2277

In [2]:
# Load the stock index data and take a first look at it.

# Labels applied to the frame's columns:
# index code, date, open, high, low, close, adjusted close, volume.
column = ['指數代碼', '日期', '開盤價', '最高價', '最低價', '收盤價', '調整後收盤價', '交易量']

# The column at position 1 is parsed as dates while reading.
df = pd.read_csv('stockData/stockData.csv', parse_dates=[1])
df.columns = column

# Show the first and the last five rows.
for preview in (df.head(), df.tail()):
    display(preview)

Unnamed: 0,指數代碼,日期,開盤價,最高價,最低價,收盤價,調整後收盤價,交易量
0,NYA,1965-12-31,528.69,528.69,528.69,528.69,528.69,0.0
1,NYA,1966-01-03,527.21,527.21,527.21,527.21,527.21,0.0
2,NYA,1966-01-04,527.84,527.84,527.84,527.84,527.84,0.0
3,NYA,1966-01-05,531.12,531.12,531.12,531.12,531.12,0.0
4,NYA,1966-01-06,532.07,532.07,532.07,532.07,532.07,0.0


Unnamed: 0,指數代碼,日期,開盤價,最高價,最低價,收盤價,調整後收盤價,交易量
112452,N100,2021-05-27,1241.12,1251.91,1241.12,1247.0699,1247.0699,379696400.0
112453,N100,2021-05-28,1249.47,1259.21,1249.03,1256.6,1256.6,160773400.0
112454,N100,2021-05-31,1256.08,1258.88,1248.14,1248.9301,1248.9301,91173700.0
112455,N100,2021-06-01,1254.61,1265.66,1254.61,1258.58,1258.58,155179900.0
112456,N100,2021-06-02,1258.49,1263.71,1258.24,1263.62,1263.62,148465000.0


In [3]:
# Inspect missing values, then fill them.

# Rows that contain at least one missing value.
missing_rows = df[df.isnull().any(axis=1)]
# Show the last five rows that have missing values.
display(missing_rows.tail())

# Forward-fill the value columns separately for each index code.
# A plain df.ffill() (same as df.fillna(method='ffill')) walks the whole stacked
# table from top to bottom, so a gap at the start of one index would be filled
# with the last row of the index stored just above it.
fill_columns = [name for name in df.columns if name not in ('指數代碼', '日期')]
df_filled = df.copy()
df_filled[fill_columns] = df.groupby('指數代碼')[fill_columns].ffill()
# Gaps that precede the first valid row of an index remain NaN;
# df_filled.isnull().sum() shows how many are left.

Unnamed: 0,指數代碼,日期,開盤價,最高價,最低價,收盤價,調整後收盤價,交易量
108316,N100,2005-03-28,,,,,,
108511,N100,2005-12-26,,,,,,
110051,N100,2012-01-02,,,,,,
110643,N100,2014-05-01,,,,,,
112090,N100,2019-12-25,,,,,,


In [21]:

# Keep only the rows whose index code is IXIC.
is_ixic = df['指數代碼'] == 'IXIC'
df_ixic = df[is_ixic]

# Count the IXIC rows whose adjusted close equals zero and report it.
zero_adj_close_rows = df_ixic[df_ixic['調整後收盤價'] == 0]
print(f"調整後收盤價 為 0 的筆數: {len(zero_adj_close_rows)}")

# Show the first and the last five IXIC rows.
for preview in (df_ixic.head(), df_ixic.tail()):
    display(preview)

調整後收盤價 為 0 的筆數: 0


Unnamed: 0,指數代碼,日期,開盤價,最高價,最低價,收盤價,調整後收盤價,交易量
13948,IXIC,1971-02-05,100.0,100.0,100.0,100.0,100.0,0.0
13949,IXIC,1971-02-08,100.84,100.84,100.84,100.84,100.84,0.0
13950,IXIC,1971-02-09,100.76,100.76,100.76,100.76,100.76,0.0
13951,IXIC,1971-02-10,100.69,100.69,100.69,100.69,100.69,0.0
13952,IXIC,1971-02-11,101.45,101.45,101.45,101.45,101.45,0.0


Unnamed: 0,指數代碼,日期,開盤價,最高價,最低價,收盤價,調整後收盤價,交易量
26633,IXIC,2021-05-24,13557.21,13708.8496,13551.0098,13661.1699,13661.1699,3490650000.0
26634,IXIC,2021-05-25,13721.54,13751.1396,13631.7998,13657.1699,13657.1699,4084480000.0
26635,IXIC,2021-05-26,13693.9404,13750.1602,13679.5898,13738.0,13738.0,4231140000.0
26636,IXIC,2021-05-27,13742.5898,13776.5195,13701.6299,13736.2803,13736.2803,5057550000.0
26637,IXIC,2021-05-28,13792.0498,13820.8701,13747.6104,13748.7402,13748.7402,4435220000.0


In [10]:
# Reduce the IXIC frame to date and adjusted close, relabelled as 'ds' and 'y'
# (the column names the Prophet model is later fitted on).
prophet_column_names = {'日期': 'ds', '調整後收盤價': 'y'}
df_ixic2 = df_ixic[['日期', '調整後收盤價']].rename(columns=prophet_column_names)
display(df_ixic2.head())

Unnamed: 0,ds,y
13948,1971-02-05,100.0
13949,1971-02-08,100.84
13950,1971-02-09,100.76
13951,1971-02-10,100.69
13952,1971-02-11,101.45


In [11]:
# Chronological split of df_ixic2 into train / validation / test by the 'ds' date.
train_end_date = pd.Timestamp('2005-12-31')
validation_end_date = pd.Timestamp('2015-12-31')

ds = df_ixic2['ds']
# Train: everything up to and including 2005-12-31.
in_train = ds <= train_end_date
# Validation: after 2005-12-31, up to and including 2015-12-31.
# pandas combines boolean masks with '&'; each comparison needs its own
# parentheses because '&' binds tighter than the comparison operators.
in_validation = (ds > train_end_date) & (ds <= validation_end_date)
# Test: everything after 2015-12-31.
in_test = ds > validation_end_date

train_df_ixic2 = df_ixic2[in_train]
validation_df_ixic2 = df_ixic2[in_validation]
test_df_ixic2 = df_ixic2[in_test]

# Show the rows on both sides of each boundary to confirm the split.
for preview in (
    train_df_ixic2.tail(),
    validation_df_ixic2.head(),
    validation_df_ixic2.tail(),
    test_df_ixic2.head(),
):
    display(preview)

Unnamed: 0,ds,y
22755,2005-12-23,2249.4199
22756,2005-12-27,2226.8899
22757,2005-12-28,2228.9399
22758,2005-12-29,2218.1599
22759,2005-12-30,2205.3201


Unnamed: 0,ds,y
22760,2006-01-03,2243.74
22761,2006-01-04,2263.46
22762,2006-01-05,2276.8701
22763,2006-01-06,2305.6201
22764,2006-01-09,2318.6899


Unnamed: 0,ds,y
25272,2015-12-24,5048.4902
25273,2015-12-28,5040.9902
25274,2015-12-29,5107.9399
25275,2015-12-30,5065.8501
25276,2015-12-31,5007.4102


Unnamed: 0,ds,y
25277,2016-01-04,4903.0898
25278,2016-01-05,4891.4302
25279,2016-01-06,4835.7598
25280,2016-01-07,4689.4302
25281,2016-01-08,4643.6299


In [16]:
from prophet import Prophet

# Prophet configuration: linear trend, multiplicative seasonality with yearly and
# weekly components (no daily component), and a changepoint prior scale of 0.5.
# NOTE(review): the validation scores recorded further down for this configuration
# are poor (R2 below zero) — these settings probably need revisiting.
prophet_settings = dict(
    growth='linear',
    seasonality_mode='multiplicative',
    changepoint_prior_scale=0.5,
    yearly_seasonality=True,
    weekly_seasonality=True,
    daily_seasonality=False,
)
model = Prophet(**prophet_settings)

# Fit the model on the training split.
model.fit(train_df_ixic2)
print('模型已訓練完成')

09:35:26 - cmdstanpy - INFO - Chain [1] start processing
09:35:29 - cmdstanpy - INFO - Chain [1] done processing


模型已訓練完成


In [17]:
# Dates of the validation split, as an independent single-column frame.
futuer_validation = validation_df_ixic2.loc[:, ['ds']].copy()

# Run the fitted model on those dates and show the first rows of the forecast.
pred_validation = model.predict(futuer_validation)
display(pred_validation.head())

Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,weekly,weekly_lower,weekly_upper,yearly,yearly_lower,yearly_upper,additive_terms,additive_terms_lower,additive_terms_upper,yhat
0,2006-01-03,843.6477,1241.1242,2049.4693,843.6477,843.6477,0.9557,0.9557,0.9557,0.9184,0.9184,0.9184,0.0373,0.0373,0.0373,0.0,0.0,0.0,1649.9303
1,2006-01-04,843.4054,1245.841,2077.2402,843.4054,843.4054,0.9616,0.9616,0.9616,0.9233,0.9233,0.9233,0.0383,0.0383,0.0383,0.0,0.0,0.0,1654.4546
2,2006-01-05,843.1631,1297.2677,2091.8301,843.1631,843.1631,0.9645,0.9645,0.9645,0.925,0.925,0.925,0.0395,0.0395,0.0395,0.0,0.0,0.0,1656.3687
3,2006-01-06,842.9208,1251.3585,2046.8481,842.9208,842.9208,0.9674,0.9674,0.9674,0.9266,0.9266,0.9266,0.0408,0.0408,0.0408,0.0,0.0,0.0,1658.3654
4,2006-01-09,842.194,1267.0621,2057.7099,842.194,842.194,0.9657,0.9657,0.9657,0.92,0.92,0.92,0.0457,0.0457,0.0457,0.0,0.0,0.0,1655.4697


In [18]:
# Join the validation-split actuals with the model's forecast for the same dates.
forecast_columns = ['ds', 'yhat', 'yhat_lower', 'yhat_upper']
df_validation_results = pd.merge(
    validation_df_ixic2,
    pred_validation[forecast_columns],
    how='left',  # keep every validation row even if no forecast row matches
    on='ds',
).rename(columns={'y': 'y_true'})  # the observed value becomes 'y_true'

display(df_validation_results.head())

Unnamed: 0,ds,y_true,yhat,yhat_lower,yhat_upper
0,2006-01-03,2243.74,1649.9303,1241.1242,2049.4693
1,2006-01-04,2263.46,1654.4546,1245.841,2077.2402
2,2006-01-05,2276.8701,1656.3687,1297.2677,2091.8301
3,2006-01-06,2305.6201,1658.3654,1251.3585,2046.8481
4,2006-01-09,2318.6899,1655.4697,1267.0621,2057.7099


In [19]:
# Score the validation forecast: R2 (coefficient of determination) and
# RMSE (root mean squared error).
from sklearn.metrics import mean_squared_error, r2_score

# Observed values and point forecasts from the joined results frame.
y_true, y_pred = (df_validation_results[name] for name in ('y_true', 'yhat'))

r2score = r2_score(y_true, y_pred)
print(f'驗證集 R2_score: {r2score:.4f}')

# RMSE is the square root of the mean squared error.
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
print(f'驗證集 RMSE: {rmse:.4f}')

驗證集 R2_score: -6.0786
驗證集 RMSE: 2618.3250
