<a href="https://colab.research.google.com/github/Kathy42xu/DL_TA/blob/main/LSTM_rf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 启用 GPU
import tensorflow as tf
device = tf.test.gpu_device_name()
print("GPU:", device)

# 安装依赖（若需要）
!pip install yfinance ta

# 下载数据 (示例：S&P500)
import yfinance as yf
df = yf.download("^GSPC", start="2002-08-01", end="2018-06-28")
df.to_csv("SP500.csv")


GPU: 
Collecting ta
  Downloading ta-0.11.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: ta
  Building wheel for ta (setup.py) ... [?25l[?25hdone
  Created wheel for ta: filename=ta-0.11.0-py3-none-any.whl size=29412 sha256=a2de3205221c8c405b52147f9888b7b78c702a80215eb09fd36f46cea1e13d8a
  Stored in directory: /root/.cache/pip/wheels/a1/d7/29/7781cc5eb9a3659d032d7d15bdd0f49d07d2b24fec29f44bc4
Successfully built ta
Installing collected packages: ta
Successfully installed ta-0.11.0
YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed


#preprocessing_indicator

In [2]:
import pandas as pd

# 1️⃣ 重新读取 CSV —— 明确告诉 pandas 用第一列作 index
df = pd.read_csv("SP500.csv", index_col=0, parse_dates=True)

# 2️⃣ 确认列名正确，只保留 Open/High/Low/Close/Volume
print(df.columns)
df = df[['Open','High','Low','Close','Volume']]

# 3️⃣ 强制转成浮点数（会把任何非数字变成 NaN）
df = df.apply(pd.to_numeric, errors='coerce')

# 4️⃣ 删除因转换失败产生的 NaN 行
df.dropna(inplace=True)

print(df.head())


Index(['Close', 'High', 'Low', 'Open', 'Volume'], dtype='object')
                  Open        High         Low       Close        Volume
Price                                                                   
2002-08-01  911.619995  911.619995  882.479980  884.659973  1.672200e+09
2002-08-02  884.400024  884.719971  853.950012  864.239990  1.538100e+09
2002-08-05  864.239990  864.239990  833.440002  834.599976  1.425500e+09
2002-08-06  834.599976  874.440002  834.599976  859.570007  1.514100e+09
2002-08-07  859.570007  878.739990  854.150024  876.770020  1.490400e+09


  df = pd.read_csv("SP500.csv", index_col=0, parse_dates=True)


In [3]:


import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import ta

# —— 1️⃣ 读取 CSV ——
df = pd.read_csv("SP500.csv", index_col=0, parse_dates=True)

# —— 2️⃣ 保留 OHLCV 并转换为 float ——
df = df[['Open','High','Low','Close','Volume']]
df = df.apply(pd.to_numeric, errors='coerce')
df.dropna(inplace=True)

# —— 3️⃣ 计算全部 43 技术指标 ——
# （示例仅列举部分；务必按论文 Table 1 补全其余指标）
import ta

# —— 全量技术指标列表 ——
df['SMA_5'] = ta.trend.sma_indicator(df['Close'], window=5)
df['SMA_10'] = ta.trend.sma_indicator(df['Close'], window=10)
df['SMA_20'] = ta.trend.sma_indicator(df['Close'], window=20)

df['EMA_6'] = ta.trend.ema_indicator(df['Close'], window=6)
df['EMA_10'] = ta.trend.ema_indicator(df['Close'], window=10)
df['EMA_14'] = ta.trend.ema_indicator(df['Close'], window=14)

df['MACD'] = ta.trend.macd_diff(df['Close'], window_slow=20, window_fast=6)
df['RSI_10'] = ta.momentum.rsi(df['Close'], window=10)
df['RSI_14'] = ta.momentum.rsi(df['Close'], window=14)
df['CCI_20'] = ta.trend.cci(df['High'], df['Low'], df['Close'], window=20)

df['BOLL_UB'] = ta.volatility.bollinger_hband(df['Close'], window=20)
df['BOLL_LB'] = ta.volatility.bollinger_lband(df['Close'], window=20)

df['ATR_14'] = ta.volatility.average_true_range(df['High'], df['Low'], df['Close'], window=14)
df['H-L']   = df['High'] - df['Low']
df['H-Cp']  = (df['High'] - df['Close'].shift(1)).abs()
df['L-Cp']  = (df['Low']  - df['Close'].shift(1)).abs()
df['TR']    = df[['H-L','H-Cp','L-Cp']].max(axis=1)


df['OBV'] = ta.volume.on_balance_volume(df['Close'], df['Volume'])
df['MFI'] = ta.volume.money_flow_index(df['High'], df['Low'], df['Close'], df['Volume'], window=14)
df['ForceIndex'] = ta.volume.force_index(df['Close'], df['Volume'], window=1)
df['FI_5'] = ta.volume.force_index(df['Close'], df['Volume'], window=5)

df['ROC_12'] = ta.momentum.roc(df['Close'], window=12)
df['Williams_%R'] = ta.momentum.williams_r(df['High'], df['Low'], df['Close'], lbp=14)
df['DPO_20'] = ta.trend.dpo(df['Close'], window=20)

df['ADX_7'] = ta.trend.adx(df['High'], df['Low'], df['Close'], window=7)
df['ADX_14'] = ta.trend.adx(df['High'], df['Low'], df['Close'], window=14)
df['DX'] = ta.trend.adx_pos(df['High'], df['Low'], df['Close'], window=14) - ta.trend.adx_neg(df['High'], df['Low'], df['Close'], window=14)


df['KST'] = ta.trend.kst(df['Close'])
# Signal line = 9‑day SMA of KST
df['KST_9'] = df['KST'].rolling(window=9).mean()


df['SEMV'] = ta.volume.ease_of_movement(df['High'], df['Low'], df['Volume'])
df['NVI'] = ta.volume.negative_volume_index(df['Close'], df['Volume'])

# True range components
df['H-L'] = df['High'] - df['Low']
df['H-Cp'] = (df['High'] - df['Close'].shift(1)).abs()
df['L-Cp'] = (df['Low'] - df['Close'].shift(1)).abs()

# Final drop NaN
df.dropna(inplace=True)

print("Computed all 43 technical indicators; resulting shape:", df.shape)


df.dropna(inplace=True)

# —— 4️⃣ 归一化 ——
scaler = MinMaxScaler()
features = df.columns.tolist()
df[features] = scaler.fit_transform(df[features])

# —— 5️⃣ 构造监督序列 ——
window = 50
returns = 100 * np.log(df['Close'] / df['Close'].shift(1))
returns = returns.dropna().values

X, y_reg, y_clf = [], [], []
for i in range(len(returns) - window):
    X.append(df.iloc[i:i+window].values)
    y_reg.append(returns[i+window])
    y_clf.append(int(returns[i+window] > 0))


X = np.array(X)
y_reg = np.array(y_reg)
y_clf = np.array(y_clf)

# —— 6️⃣ 保存结果 ——
np.savez("SP500_preprocessed.npz", X=X, y_reg=y_reg, y_clf=y_clf)

print("Done! Shapes → X:", X.shape, "y_reg:", y_reg.shape, "y_clf:", y_clf.shape)


  df = pd.read_csv("SP500.csv", index_col=0, parse_dates=True)


Computed all 43 technical indicators; resulting shape: (3978, 36)


  result = getattr(ufunc, method)(*inputs, **kwargs)


Done! Shapes → X: (3927, 50, 36) y_reg: (3927,) y_clf: (3927,)


#Train LSTM

In [3]:
import numpy as np
from sklearn.model_selection import train_test_split

data = np.load("SP500_preprocessed.npz")
X, y_reg, y_clf = data['X'], data['y_reg'], data['y_clf']

# 按 70% train / 10% validation / 20% test 切分
X_temp, X_test, y_reg_temp, y_reg_test, y_clf_temp, y_clf_test = train_test_split(
    X, y_reg, y_clf, test_size=0.2, shuffle=False)

X_train, X_val, y_reg_train, y_reg_val, y_clf_train, y_clf_val = train_test_split(
    X_temp, y_reg_temp, y_clf_temp, test_size=0.125, shuffle=False)

print("Shapes → Train:", X_train.shape, "Val:", X_val.shape, "Test:", X_test.shape)


Shapes → Train: (2748, 50, 36) Val: (393, 50, 36) Test: (786, 50, 36)


In [4]:
mask = (~np.isnan(y_reg)) & (~np.isinf(y_reg))
X, y_reg, y_clf = X[mask], y_reg[mask], y_clf[mask]


##LFM

In [6]:
# 1️⃣ Split data exactly as authors do
from sklearn.model_selection import train_test_split
X_temp, X_test, y_reg_temp, y_reg_test, y_clf_temp, y_clf_test = train_test_split(X, y_reg, y_clf, test_size=0.2, shuffle=False)
X_train, X_val, y_reg_train, y_reg_val, y_clf_train, y_clf_val = train_test_split(X_temp, y_reg_temp, y_clf_temp, test_size=0.125, shuffle=False)

# 2️⃣ Build exactly as original script
def build_model():
    inp = Input(shape=(50, X.shape[2]))
    x = LSTM(15, return_sequences=False)(inp)
    x = Dense(30, activation="relu")(x)
    x = Dropout(0.3)(x)
    x = Dense(30, activation="relu")(x)
    out_reg = Dense(1, name="regression")(x)
    out_clf = Dense(2, activation="softmax", name="classification")(x)
    model = Model(inp, [out_reg, out_clf])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
        loss={"regression":"mse", "classification":"sparse_categorical_crossentropy"},
        loss_weights={"regression":0.1, "classification":1.0}
    )
    return model

model = build_model()

# 3️⃣ Validate no NaNs in training batches
print("Train X NaNs:", np.isnan(X_train).sum(), "y_reg NaNs:", np.isnan(y_reg_train).sum())

# 4️⃣ Train
history = model.fit(
    X_train, {"regression":y_reg_train, "classification":y_clf_train},
    validation_data=(X_val, {"regression":y_reg_val, "classification":y_clf_val}),
    batch_size=256, epochs=300, callbacks=[EarlyStopping(patience=20, restore_best_weights=True)]
)


Train X NaNs: 0 y_reg NaNs: 0
Epoch 1/300
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 92ms/step - classification_loss: 0.6887 - loss: 5.0865 - regression_loss: 43.8788 - val_classification_loss: 0.6907 - val_loss: 0.8137 - val_regression_loss: 1.2653
Epoch 2/300
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - classification_loss: 0.6914 - loss: 5.1222 - regression_loss: 44.2889 - val_classification_loss: 0.6910 - val_loss: 0.8129 - val_regression_loss: 1.2580
Epoch 3/300
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 43ms/step - classification_loss: 0.6890 - loss: 4.1000 - regression_loss: 34.0164 - val_classification_loss: 0.6914 - val_loss: 0.8134 - val_regression_loss: 1.2609
Epoch 4/300
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - classification_loss: 0.6871 - loss: 3.7978 - regression_loss: 30.9953 - val_classification_loss: 0.6912 - val_loss: 0.8128 - val_regression_loss: 1.2579
Epoch 

## paper logic:100 LSTM base model + learning rate halved every 50 epochs + 300 epochs fixed training + 13 randomly drawn features + bootstrap samples + final average prediction


In [None]:
import numpy as np
import random
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.models import clone_model

def build_model(input_dim):
    inp = Input(shape=(50, input_dim))
    x = LSTM(15)(inp)
    x = Dense(30, activation="relu")(x)
    x = Dropout(0.3)(x)
    x = Dense(30, activation="relu")(x)
    out_reg = Dense(1, name="regression")(x)
    out_clf = Dense(2, activation="softmax", name="classification")(x)
    model = Model(inp, [out_reg, out_clf])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
        loss={"regression":"mse", "classification":"sparse_categorical_crossentropy"},
        loss_weights={"regression":0.1, "classification":1.0}
    )
    return model





# Learning rate schedule: halve every 50 epochs
def lr_schedule(epoch):
    return 1e-3 * (0.5 ** (epoch // 50))

lr_cb = LearningRateScheduler(lr_schedule)

ensemble_preds_reg = []
ensemble_preds_clf = []

for i in range(100):
    # Randomly sample 13 features
    features_idx = random.sample(range(X_train.shape[2]), 13)
    X_tr_sub = X_train[:,:,features_idx]
    X_val_sub = X_val[:,:,features_idx]
    X_test_sub = X_test[:,:,features_idx]

    # Bootstrap sample training data
    idx = np.random.choice(len(X_tr_sub), size=len(X_tr_sub), replace=True)
    X_boot, y_reg_boot, y_clf_boot = X_tr_sub[idx], y_reg_train[idx], y_clf_train[idx]

    # Build & train
    model_i = build_model(input_dim=X_train.shape[2] if False else X_boot.shape[2])
    model_i.fit(
        X_boot, {"regression": y_reg_boot, "classification": y_clf_boot},
        validation_data=(X_val_sub, {"regression": y_reg_val, "classification": y_clf_val}),
        epochs=300, batch_size=256, callbacks=[lr_cb], verbose=0
    )

    # Predict on test
    reg_pred, clf_pred = model_i.predict(X_test_sub, verbose=0)
    ensemble_preds_reg.append(reg_pred.flatten())
    ensemble_preds_clf.append(clf_pred)

# Average ensemble outputs
final_reg = np.mean(np.vstack(ensemble_preds_reg), axis=0)
final_clf = np.mean(np.stack(ensemble_preds_clf), axis=0).argmax(axis=1)

print("Ensemble Test RMSE:", np.sqrt(((final_reg - y_reg_test)**2).mean()))
from sklearn.metrics import balanced_accuracy_score
print("Ensemble Test BACC:", balanced_accuracy_score(y_clf_test, final_clf))
