In [1]:
# Shared notebook setup
# Silence warnings so they do not clutter the rendered output
import warnings
warnings.filterwarnings('ignore')

# Libraries used throughout the notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import font_manager as fm

# Rich display helper for DataFrames
from IPython.display import display

# Register the TTF file at this relative path with matplotlib's font manager
# so the family name selected below can be resolved
fm.fontManager.addfont('./TaipeiSansTCBeta-Regular.ttf')

# Display options
# NumPy: 4 decimal places, no scientific notation
np.set_printoptions(precision=4, suppress=True)
# pandas: render floats with 4 decimal places and never truncate columns
pd.set_option("display.float_format", '{:.4f}'.format)
pd.set_option("display.max_columns", None)
# matplotlib: default font size and font family for every figure
plt.rcParams.update({
    "font.size": 14,
    'font.family': 'Taipei Sans TC Beta',
})

# Random seed
random_seed = 2277

In [2]:
# Load the CSV and take a first look at it.
# Column labels applied to the frame, in file order.
column = ['指數代碼', '日期', '開盤價', '最高價', '最低價', '收盤價', '調整後收盤價', '交易量']

# parse_dates=[1] parses the second column of the file as datetimes.
df = pd.read_csv('stockData/stockData.csv', parse_dates=[1])
# Replace the file's header names with the labels above
# (raises if the number of columns does not match).
df = df.set_axis(column, axis=1)

# Show the first and last five rows.
display(df.head(), df.tail())

Unnamed: 0,指數代碼,日期,開盤價,最高價,最低價,收盤價,調整後收盤價,交易量
0,NYA,1965-12-31,528.69,528.69,528.69,528.69,528.69,0.0
1,NYA,1966-01-03,527.21,527.21,527.21,527.21,527.21,0.0
2,NYA,1966-01-04,527.84,527.84,527.84,527.84,527.84,0.0
3,NYA,1966-01-05,531.12,531.12,531.12,531.12,531.12,0.0
4,NYA,1966-01-06,532.07,532.07,532.07,532.07,532.07,0.0


Unnamed: 0,指數代碼,日期,開盤價,最高價,最低價,收盤價,調整後收盤價,交易量
112452,N100,2021-05-27,1241.12,1251.91,1241.12,1247.0699,1247.0699,379696400.0
112453,N100,2021-05-28,1249.47,1259.21,1249.03,1256.6,1256.6,160773400.0
112454,N100,2021-05-31,1256.08,1258.88,1248.14,1248.9301,1248.9301,91173700.0
112455,N100,2021-06-01,1254.61,1265.66,1254.61,1258.58,1258.58,155179900.0
112456,N100,2021-06-02,1258.49,1263.71,1258.24,1263.62,1263.62,148465000.0


In [3]:
# Inspect missing values and build the gap-filled frame used downstream.
# Rows that contain at least one missing value
missing_rows = df[df.isnull().any(axis=1)]
# Show the last five rows that have missing values
display(missing_rows.tail())

# Forward-fill missing values (same idea as df.fillna(method='ffill')), but
# separately for each index code. A plain df.ffill() runs over the whole table,
# so a missing row at the start of one index would be filled with the last row
# of the previous index. Leading gaps inside a group are left as NaN.
# NOTE(review): assumes '指數代碼' itself is never missing - rows with a missing
# key are not part of any group; confirm against the data.
df_filled = df.copy()
value_columns = df.columns.drop('指數代碼')
df_filled[value_columns] = df.groupby('指數代碼', sort=False)[value_columns].ffill()

Unnamed: 0,指數代碼,日期,開盤價,最高價,最低價,收盤價,調整後收盤價,交易量
108316,N100,2005-03-28,,,,,,
108511,N100,2005-12-26,,,,,,
110051,N100,2012-01-02,,,,,,
110643,N100,2014-05-01,,,,,,
112090,N100,2019-12-25,,,,,,


In [80]:

# Keep only the rows whose index code is 'IXIC', then preview both ends.
df_ixic = df_filled.loc[df_filled['指數代碼'].eq('IXIC')]

display(df_ixic.head(), df_ixic.tail())

Unnamed: 0,指數代碼,日期,開盤價,最高價,最低價,收盤價,調整後收盤價,交易量
13948,IXIC,1971-02-05,100.0,100.0,100.0,100.0,100.0,0.0
13949,IXIC,1971-02-08,100.84,100.84,100.84,100.84,100.84,0.0
13950,IXIC,1971-02-09,100.76,100.76,100.76,100.76,100.76,0.0
13951,IXIC,1971-02-10,100.69,100.69,100.69,100.69,100.69,0.0
13952,IXIC,1971-02-11,101.45,101.45,101.45,101.45,101.45,0.0


Unnamed: 0,指數代碼,日期,開盤價,最高價,最低價,收盤價,調整後收盤價,交易量
26633,IXIC,2021-05-24,13557.21,13708.8496,13551.0098,13661.1699,13661.1699,3490650000.0
26634,IXIC,2021-05-25,13721.54,13751.1396,13631.7998,13657.1699,13657.1699,4084480000.0
26635,IXIC,2021-05-26,13693.9404,13750.1602,13679.5898,13738.0,13738.0,4231140000.0
26636,IXIC,2021-05-27,13742.5898,13776.5195,13701.6299,13736.2803,13736.2803,5057550000.0
26637,IXIC,2021-05-28,13792.0498,13820.8701,13747.6104,13748.7402,13748.7402,4435220000.0


In [81]:
# Two-column frame in Prophet's naming convention: 'ds' (date) and 'y' (value).
# Take an explicit copy so that columns added to df_ixic2 later operate on an
# independent frame instead of a slice of df_ixic (avoids SettingWithCopy issues),
# and rename by label rather than overwriting .columns positionally.
df_ixic2 = (
    df_ixic[['日期', '調整後收盤價']]
    .copy()
    .rename(columns={'日期': 'ds', '調整後收盤價': 'y'})
)
display(df_ixic2.head())

Unnamed: 0,ds,y
13948,1971-02-05,100.0
13949,1971-02-08,100.84
13950,1971-02-09,100.76
13951,1971-02-10,100.69
13952,1971-02-11,101.45


In [None]:
# Build the modelling frame: daily log return as the target plus its one-day lag.
# Assumes df_ixic2 holds the IXIC rows with columns 'ds' and 'y' (adjusted close).
# NOTE(review): a zero or negative price makes the log return -inf/NaN; the
# original note says zeros were already handled - confirm, no such step is visible here.
#
# This cell is idempotent: both new columns are recomputed from 'y', and rows are
# dropped only when deriving df_ixic3. Reassigning df_ixic2 to its dropna() result
# would remove two more rows every time the cell is re-executed.

# 1. current_log_return at t = log(y_t / y_{t-1})
# 2. lagged_return at t = current_log_return at t-1 (the previous day's return)
df_ixic2 = df_ixic2.assign(
    current_log_return=lambda frame: np.log(frame['y'] / frame['y'].shift(1)),
    lagged_return=lambda frame: frame['current_log_return'].shift(1),
)

# 3. Drop rows without a lagged return (normally the first two rows), then
#    replace the price column 'y' with the log return so 'y' is the return target.
df_ixic3 = (
    df_ixic2
    .dropna(subset=['lagged_return'])
    .drop(['y'], axis=1)
    .rename(columns={'current_log_return': 'y'})
)

display(df_ixic3.head())
print("lagged_return 欄位已成功創建並清理 NaN。")


Unnamed: 0,ds,y,lagged_return
13956,1971-02-18,-0.0032,-0.0044
13957,1971-02-19,-0.0071,-0.0032
13958,1971-02-22,-0.0102,-0.0071
13959,1971-02-23,0.0004,-0.0102
13960,1971-02-24,0.0092,0.0004


lagged_return 欄位已成功創建並清理 NaN。


In [100]:
# Chronological split boundaries
train_start_date = pd.Timestamp('1990-01-01')
train_end_date = pd.Timestamp('2010-12-31')
validation_end_date = pd.Timestamp('2016-12-31')

# Training set: train_start_date <= ds <= train_end_date (both ends included)
train_df_ixic2 = df_ixic3[df_ixic3['ds'].between(train_start_date, train_end_date)]
# Validation set: train_end_date < ds <= validation_end_date
# (boolean masks are combined with '&', not 'and')
validation_df_ixic2 = df_ixic3[
    df_ixic3['ds'].gt(train_end_date) & df_ixic3['ds'].le(validation_end_date)
]
# Test set: ds strictly after validation_end_date
test_df_ixic2 = df_ixic3[df_ixic3['ds'].gt(validation_end_date)]


# Frames in the layout passed to Prophet: the training frame carries the target
# 'y' and the regressor; the validation frame carries only the date and regressor.
train_df_ixic3 = train_df_ixic2.loc[:, ['ds', 'y', 'lagged_return']].copy()
validation_df_ixic3 = validation_df_ixic2.loc[:, ['ds', 'lagged_return']].copy()
display(train_df_ixic3.head())
print("資料切分和 Prophet 資料準備完成。")


Unnamed: 0,ds,y,lagged_return
18724,1990-01-02,0.0098,0.0106
18725,1990-01-03,0.0035,0.0098
18726,1990-01-04,-0.0033,0.0035
18727,1990-01-05,-0.0026,-0.0033
18728,1990-01-08,0.0011,-0.0026


資料切分和 Prophet 資料準備完成。


In [105]:
from prophet import Prophet

# Model settings collected in one place: linear trend, additive yearly and
# weekly seasonality, no daily seasonality.
prophet_settings = dict(
    growth='linear',
    changepoint_prior_scale=0.15,
    seasonality_mode='additive',
    yearly_seasonality=True,
    weekly_seasonality=True,
    daily_seasonality=False,
)
model = Prophet(**prophet_settings)
# Use the previous day's return as an extra regressor; it must be registered
# before fitting and be present in every frame passed to fit/predict.
model.add_regressor('lagged_return')

# Fit on the training frame (columns: ds, y, lagged_return)
model.fit(train_df_ixic3)
print('模型已訓練完成')

18:37:50 - cmdstanpy - INFO - Chain [1] start processing
18:37:51 - cmdstanpy - INFO - Chain [1] done processing


模型已訓練完成


In [106]:
# Predict over the validation dates; work on a copy so the prepared
# validation frame itself is not handed to the model.
future_validation = validation_df_ixic3.copy(deep=True)
pred_validation = model.predict(df=future_validation)
print("已完成驗證集預測。")

已完成驗證集預測。


In [107]:
# Assemble the validation-set results: attach the predicted value 'yhat' to the
# actual validation rows by date, keeping every validation row (left join),
# and rename the actual return column to 'y_true'.
validation_forecast = pred_validation[['ds', 'yhat']]
df_validation_results = pd.merge(
    validation_df_ixic2,
    validation_forecast,
    how='left',
    on='ds',
)
df_validation_results = df_validation_results.rename(columns={'y': 'y_true'})

display(df_validation_results.head())

Unnamed: 0,ds,y_true,lagged_return,yhat
0,2011-01-03,0.0145,-0.0038,0.0013
1,2011-01-04,-0.0038,0.0145,0.0016
2,2011-01-05,0.0078,-0.0038,0.0027
3,2011-01-06,0.0028,0.0078,0.0021
4,2011-01-07,-0.0025,0.0028,0.0015


In [108]:
# Validation metrics: R2 (coefficient of determination) and
# RMSE (root mean squared error).
from sklearn.metrics import mean_squared_error, r2_score

# Actual and predicted returns on the validation set
y_true = df_validation_results.loc[:, 'y_true']
y_pred = df_validation_results.loc[:, 'yhat']

r2score = r2_score(y_true, y_pred)
print(f'驗證集 R2_score: {r2score:.4f}')

# RMSE is the square root of the mean squared error
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
print(f'驗證集 RMSE: {rmse:.4f}')

驗證集 R2_score: -0.0017
驗證集 RMSE: 0.0108
