In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 讀取 CSV 文件，去除缺失值
MacroData = pd.read_csv('MacroData.csv').dropna()

# 確認數據類型轉換
MacroData.iloc[:, 1:] = MacroData.iloc[:, 1:].apply(pd.to_numeric)

# 分割數據集，使用 98.5% 的數據作為訓練集
split_point = int(0.985 * len(MacroData))
train = MacroData[:split_point]
test = MacroData[split_point:]


In [5]:
import statsmodels.api as sm
from statsmodels.tsa.api import ExponentialSmoothing
from statsmodels.tsa.arima.model import ARIMA

# 創建時間序列
y_ts = pd.Series(train['市半導體'].values, index=pd.date_range(start='2011-01', periods=len(train), freq='ME'))

# 建立 ARIMA 模型
model = ARIMA(y_ts, order=(1,1,1))
model_fit = model.fit()

# 預測
forecast_result = model_fit.forecast(steps=len(test))


In [7]:
forecast_result

2024-01-31    410.759204
2024-02-29    413.590223
2024-03-31    415.174397
Freq: ME, Name: predicted_mean, dtype: float64

In [15]:
import statsmodels.api as sm
import pandas as pd
import numpy as np

# 读取数据
MacroData = pd.read_csv("MacroData.csv")
MacroData.dropna(inplace=True)

# 将除日期列外的所有列转换为数值类型
MacroData.iloc[:, 1:] = MacroData.iloc[:, 1:].apply(pd.to_numeric, errors='coerce')

# 解析日期列，根据实际的日期格式进行修改
# 假设日期格式为 'Jan-11'，即月份缩写和两位数年份
MacroData['日期'] = pd.to_datetime(MacroData['日期'], format='%b-%y')

# 按日期排序（确保数据的时间顺序正确）
MacroData.sort_values('日期', inplace=True)

# 划分训练集和测试集（例如，按80%和20%划分）
split_point = int(0.8 * len(MacroData))
train = MacroData.iloc[:split_point]
test = MacroData.iloc[split_point:]

# 构建逐步回归的函数，基于 BIC 准则
def stepwise_selection(X, y, 
                       initial_list=[], 
                       threshold_in=0.01, 
                       threshold_out=0.05, 
                       verbose=True):
    """基于p-value和BIC的逐步回归"""
    included = list(initial_list)
    while True:
        changed = False
        # forward step
        excluded = list(set(X.columns) - set(included))
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        if new_pval.empty:
            break
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print(f'Add  {best_feature} with p-value {best_pval}')
        
        # backward step
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max()  # 最大 p-value
        if worst_pval > threshold_out:
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            changed = True
            if verbose:
                print(f'Remove {worst_feature} with p-value {worst_pval}')
        
        if not changed:
            break

    return included

# 使用逐步回归
X_train = train.drop(columns=['市半導體', '日期'])
y_train = train['市半導體']

# 进行逐步回归
resulting_features = stepwise_selection(X_train, y_train)

# 使用最终选定的变量重新建模
final_model = sm.OLS(y_train, sm.add_constant(X_train[resulting_features])).fit()
print(final_model.summary())

# 对测试集进行预测并计算MSE
X_test = test.drop(columns=['市半導體', '日期'])
y_test = test['市半導體']

# 确保测试集包含训练时选定的特征
X_test = X_test[resulting_features]

# 进行预测
y_pred = final_model.predict(sm.add_constant(X_test))

# 计算MSE
mse = np.mean((y_test - y_pred) ** 2)
print(f'测试集的MSE为: {mse}')


Add  費城半導體指數 with p-value 2.2227290583584877e-98
Add  美國央行利率 with p-value 1.775860019778248e-14
Add  股價指數 with p-value 3.219594242938782e-12
Add  失業率 with p-value 6.317182106511963e-05
Add  消費者物價指數年增率 with p-value 0.0002965418662342792
Add  台灣消費者信心指數 with p-value 0.006399010756409681
Add  VIX with p-value 0.00021561289218154556
Add  美國EPU指數 with p-value 2.21698084095959e-06
Add  半導體內銷品物價指數 with p-value 0.0057995347531844665
                            OLS Regression Results                            
Dep. Variable:                   市半導體   R-squared:                       0.994
Model:                            OLS   Adj. R-squared:                  0.993
Method:                 Least Squares   F-statistic:                     2123.
Date:                Thu, 19 Sep 2024   Prob (F-statistic):          3.57e-125
Time:                        19:54:13   Log-Likelihood:                -417.03
No. Observations:                 127   AIC:                             854.1
Df Residuals:      

In [17]:
import statsmodels.api as sm
import pandas as pd
import numpy as np

# 構建逐步回歸的函數，基於 BIC 準則
def stepwise_selection(X, y, 
                       initial_list=[], 
                       threshold_in=0.01, 
                       threshold_out=0.05, 
                       verbose=True):
    """基於p-value和BIC的逐步回歸"""
    included = list(initial_list)
    while True:
        changed = False
        # forward step
        excluded = list(set(X.columns) - set(included))
        new_pval = pd.Series(index=excluded)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print(f'Add  {best_feature} with p-value {best_pval}')
        
        # backward step
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max()  # 最大 p-value
        if worst_pval > threshold_out:
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            changed = True
            if verbose:
                print(f'Remove {worst_feature} with p-value {worst_pval}')
        
        if not changed:
            break

    return included

# 使用逐步回歸
X_train = train.drop(columns=['市半導體', '日期'])
y_train = train['市半導體']

# 進行逐步回歸
resulting_features = stepwise_selection(X_train, y_train)

# 使用最終選定的變量重新建模
final_model = sm.OLS(y_train, sm.add_constant(X_train[resulting_features])).fit()
print(final_model.summary())


Add  股價指數 with p-value 2.687574224643153e-133
Add  那斯達克指數 with p-value 4.453208512974947e-16
Add  CBOE黃金ETF波動率指數 with p-value 1.2558787266509822e-14
Add  台灣消費者信心指數 with p-value 8.60831037979936e-05
Add  美國全品項CPI with p-value 3.6986008302133595e-09
Add  失業率 with p-value 7.975510037374047e-06
Add  電力(企業)總用電量 with p-value 0.00029008992932445045
Add  消費者物價指數年增率 with p-value 0.003243002114582758
                            OLS Regression Results                            
Dep. Variable:                   市半導體   R-squared:                       0.995
Model:                            OLS   Adj. R-squared:                  0.995
Method:                 Least Squares   F-statistic:                     3936.
Date:                Wed, 18 Sep 2024   Prob (F-statistic):          2.43e-167
Time:                        12:27:30   Log-Likelihood:                -543.60
No. Observations:                 156   AIC:                             1105.
Df Residuals:                     147   BIC:         

In [39]:
bic_pred = final_model.predict(test.drop(columns=['市半導體', '日期']))

ValueError: shapes (3,35) and (9,) not aligned: 35 (dim 1) != 9 (dim 0)

In [37]:
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import numpy as np

# 分離訓練數據中的特徵和目標變量
X_train = train.drop(columns=['市半導體', '日期'])
y_train = train['市半導體']

# 對特徵進行標準化
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# 使用 LassoCV 進行交叉驗證來選擇 alpha 值
lasso = LassoCV(cv=5).fit(X_train_scaled, y_train)

# 獲取非零係數的特徵
lasso_features = X_train.columns[lasso.coef_ != 0]
print("LASSO 選擇的特徵:", lasso_features)

# 對測試數據進行標準化並進行預測
X_test = test.drop(columns=['市半導體', '日期'])
X_test_scaled = scaler.transform(X_test)  # 確保使用與訓練數據相同的 scaler

# LASSO 預測
lasso_pred = lasso.predict(X_test_scaled)

# 計算 RMSE
rmse_lasso = np.sqrt(mean_squared_error(test['市半導體'], lasso_pred))
print(f'LASSO 模型的 RMSE: {rmse_lasso}')



LASSO 選擇的特徵: Index(['製造業營業氣候測驗點', '股價指數', '工業及服務業受僱員工淨進入率', '實質半導體設備進口值', '電力(企業)總用電量',
       '失業率', '消費者物價指數年增率', '台灣消費者信心指數', '美國央行利率', 'VIX', 'CBOE黃金ETF波動率指數',
       '那斯達克指數', '費城半導體指數'],
      dtype='object')
LASSO 模型的 RMSE: 18.214016787887065


In [19]:
from sklearn.decomposition import PCA

# 進行PCA
pca = PCA(n_components=6)
pca_data = pca.fit_transform(train.drop(columns=['市半導體', '日期']))

# 將主成分數據放入數據框
pca_df = pd.DataFrame(pca_data, columns=[f'PC{i+1}' for i in range(6)])
pca_df['市半導體'] = train['市半導體']

# 建立GLM
pca_model = smf.ols('市半導體 ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6', data=pca_df).fit()


NameError: name 'smf' is not defined

In [25]:
from sklearn.metrics import mean_squared_error
import numpy as np

# BIC 預測
bic_pred = final_model.predict(test.drop(columns=['市半導體', '日期']))
rmse_bic = np.sqrt(mean_squared_error(test['市半導體'], bic_pred))

# LASSO 預測
X_test_scaled = scaler.transform(test.drop(columns=['市半導體', '日期']))
lasso_pred = lasso.predict(X_test_scaled)
rmse_lasso = np.sqrt(mean_squared_error(test['市半導體'], lasso_pred))

# PCA 預測
pca_test_data = pca.transform(test.drop(columns=['市半導體', '日期']))
pca_test_df = pd.DataFrame(pca_test_data, columns=[f'PC{i+1}' for i in range(6)])
pca_pred = pca_model.predict(pca_test_df)
rmse_pca = np.sqrt(mean_squared_error(test['市半導體'], pca_pred))

# 時間序列預測 RMSE
rmse_ts = np.sqrt(mean_squared_error(test['市半導體'], forecast_result))

# 輸出RMSE
print(f'RMSE: BIC={rmse_bic}, LASSO={rmse_lasso}, TS={rmse_ts}, PCA={rmse_pca}')


ValueError: shapes (3,35) and (9,) not aligned: 35 (dim 1) != 9 (dim 0)

In [29]:
from sklearn.metrics import mean_squared_error
import numpy as np

# BIC 預測
X_test_bic = sm.add_constant(test.drop(columns=['市半導體', '日期']))[resulting_features]  # 使用逐步選擇的變量
bic_pred = final_model.predict(X_test_bic)
rmse_bic = np.sqrt(mean_squared_error(test['市半導體'], bic_pred))

# LASSO 預測
X_test_scaled = scaler.transform(test.drop(columns=['市半導體', '日期']))  # 確保使用相同的變量進行標準化
lasso_pred = lasso.predict(X_test_scaled)
rmse_lasso = np.sqrt(mean_squared_error(test['市半導體'], lasso_pred))

# PCA 預測
pca_test_data = pca.transform(test.drop(columns=['市半導體', '日期']))  # 確保PCA轉換與訓練一致
pca_test_df = pd.DataFrame(pca_test_data[:, :6], columns=[f'PC{i+1}' for i in range(6)])  # 使用相同數量的主成分
pca_pred = pca_model.predict(pca_test_df)
rmse_pca = np.sqrt(mean_squared_error(test['市半導體'], pca_pred))

# 時間序列預測 RMSE
forecast_values = forecast_result.mean[:len(test)]  # 檢查 forecast_result 是否為正確長度
rmse_ts = np.sqrt(mean_squared_error(test['市半導體'], forecast_values))

# 輸出RMSE
print(f'RMSE: BIC={rmse_bic}, LASSO={rmse_lasso}, TS={rmse_ts}, PCA={rmse_pca}')


ValueError: shapes (3,8) and (9,) not aligned: 8 (dim 1) != 9 (dim 0)

In [35]:
print(f'X_test_bic shape: {X_test_bic.shape}')  # BIC
print(f'X_test_scaled shape: {X_test_scaled.shape}')  # LASSO
print(f'pca_test_df shape: {pca_test_df.shape}')  # PCA


X_test_bic shape: (3, 8)


NameError: name 'X_test_scaled' is not defined