1. Data Preparation
- Load OHLCV data for all selected tickers (e.g., AMZN, META, AVGO, ETFs)
- Normalize features per ticker (z-score or min-max scaling)
- Add metadata: ticker_id, sector, day_of_week, macro regime (optional)
- Create rolling windows for time series modeling (e.g., 10-day sequences)
2. Feature Engineering
- Technical indicators: RSI, MACD, Bollinger Bands, ATR
- Candle features: range, body size, wick ratios
- Volume features: OBV, VWAP, volume spikes
- Lagged returns, volatility, momentum scores
3. Labeling Strategy
- Define swing trade targets:
- Binary: Will price rise >x% in next n days?
- Multi-class: Uptrend / Downtrend / Sideways
- Regression: Expected return over next n days

#### Dependencies

In [None]:
import yfinance as yf
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from scipy.stats import spearmanr, pearsonr
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
from datetime import datetime, timedelta
from ta.momentum import RSIIndicator, StochRSIIndicator
from ta.trend import MACD, SMAIndicator, EMAIndicator
from ta.volatility import BollingerBands, AverageTrueRange
from ta.volume import OnBalanceVolumeIndicator, ChaikinMoneyFlowIndicator

In [109]:
print(torch.__version__)
print(torch.version.cuda) 
print(torch.cuda.is_available())   # True if a GPU is detected
print(torch.cuda.device_count())   # Number of GPUs available
print(torch.cuda.get_device_name(0))  # Name of the first GPU

2.9.0+cu128
12.8
True
1
NVIDIA GeForce RTX 4080


#### Datacollation

Equities taken from realistic stock portfolio:
"AMZN", "META", "AVGO", "LLY", "ETN", "CYBR", "LIN", "WM", "SLNO", "CYTK", "XLV"

In [201]:
leadup_days = 30
start_date = (datetime.strptime("2015-01-01", "%Y-%m-%d") - timedelta(days=leadup_days)).strftime("%Y-%m-%d")
amzn = yf.Ticker("AMZN")
raw_data = amzn.history(start=start_date, end="2025-01-01", interval="1d", auto_adjust=True, actions=False)

In [202]:
raw_data.info()
raw_data.describe()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2537 entries, 2014-12-02 00:00:00-05:00 to 2024-12-31 00:00:00-05:00
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    2537 non-null   float64
 1   High    2537 non-null   float64
 2   Low     2537 non-null   float64
 3   Close   2537 non-null   float64
 4   Volume  2537 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 118.9 KB


Unnamed: 0,Open,High,Low,Close,Volume
count,2537.0,2537.0,2537.0,2537.0,2537.0
mean,100.546931,101.687477,99.302719,100.522593,76160160.0
std,54.533373,55.155739,53.855255,54.506998,40582250.0
min,14.314,14.5395,14.2625,14.3475,15007500.0
25%,48.0,48.365501,47.702499,47.992001,50776000.0
50%,94.18,95.25,93.139999,94.230003,65262000.0
75%,153.692505,155.630005,151.550507,153.729996,90000000.0
max,232.389999,233.0,228.009995,232.929993,477122000.0


#### Initialize important functions

In [116]:
def add_indicators(df):
    df = df.copy()
    
    # General price dynamics
    df["returns"] = df["Close"].pct_change()
    df["logReturns"] = np.log(df["Close"] / df["Close"].shift(1))
    df["Volatility"] = df["returns"].rolling(10).std()
    df['Range'] = df['High'] - df['Low']
    df['Body'] = abs(df['Close'] - df['Open'])
    df['Wick'] = df['Range'] - df['Body']

    # Momentum
    df['RSI'] = RSIIndicator(df['Close'], window=14).rsi()
    df['StochRSI'] = StochRSIIndicator(df['Close'], window=14).stochrsi()
    
    # Short-term moving averages (faster response)
    df['SMA_10'] = SMAIndicator(df['Close'], window=10).sma_indicator()
    df['EMA_10'] = EMAIndicator(df['Close'], window=10).ema_indicator()

    # Medium-term moving averages (faster than previous 50)
    df['SMA_20'] = SMAIndicator(df['Close'], window=20).sma_indicator()
    df['EMA_20'] = EMAIndicator(df['Close'], window=20).ema_indicator()

    # Optional: slightly faster “long-term” averages for crossovers
    df['SMA_30'] = SMAIndicator(df['Close'], window=30).sma_indicator()
    df['EMA_30'] = EMAIndicator(df['Close'], window=30).ema_indicator()

    # Faster MACD for earlier crossovers
    macd = MACD(df['Close'], window_slow=13, window_fast=6, window_sign=5)
    df['MACD'] = macd.macd()
    df['MACD_Signal'] = macd.macd_signal()
    
    df['MACD_rel'] = df['MACD'] / df['Close']
    df['MACD_Signal_rel'] = df['MACD_Signal'] / df['Close']
    
    # Volatility
    bb = BollingerBands(df['Close'], window=20)
    df['BB_High'] = bb.bollinger_hband()
    df['BB_Low'] = bb.bollinger_lband()
    df['ATR'] = AverageTrueRange(df['High'], df['Low'], df['Close'], window=14).average_true_range()
    
    # Volume
    df['OBV'] = OnBalanceVolumeIndicator(df['Close'], df['Volume']).on_balance_volume()
    df['OBV_prev'] = pd.to_numeric(df['OBV'].shift(1), errors='coerce').fillna(0).astype(int)
    df['CMF'] = ChaikinMoneyFlowIndicator(df['High'], df['Low'], df['Close'], df['Volume'], window=20).chaikin_money_flow()
    
    return df

In [117]:
""" | Indicator(s)             | Rule                                                                            | Label |
    | ------------------------ | ------------------------------------------------------------------------------- | ----- |
    | MACD & MACD_Signal       | If `MACD > MACD_Signal` → **Buy**, if `MACD < MACD_Signal` → **Sell**           | ±1    |
    | RSI                      | If `RSI < 30` → **Buy** (oversold), if `RSI > 70` → **Sell** (overbought)       | ±1    |
    | StochRSI                 | If `StochRSI < 0.2` → **Buy**, if `StochRSI > 0.8` → **Sell**                   | ±1    |
    | SMA / EMA                | If `Close > SMA_20` → **Buy**, if `Close < SMA_20` → **Sell**                   | ±1    |
    | Bollinger Bands          | If `Close < BB_Low` → **Buy**, if `Close > BB_High` → **Sell**                  | ±1    |
    | OBV (On-Balance Volume)  | If `OBV > OBV_prev` → **Buy**, if `OBV < OBV_prev` → **Sell**                   | ±1    |
    | CMF (Chaikin Money Flow) | If `CMF > 0` → **Buy**, if `CMF < 0` → **Sell**                                 | ±1    |
    | Volatility / ATR         | If `Volatility` or `ATR` is high, reduce conviction (set to `0` to avoid noise) | 0     |
"""

def signal_engine(df):
    df = df.copy()

    # Initialize vote counter
    votes = np.zeros(len(df), dtype=float)

    # ---- Weighted voting system ----
    # Stronger weights = more historically reliable / consistent indicator
    w = {
        # Core momentum/trend indicators
        'MACD': 1.0,
        'RSI': 0.8,
        'StochRSI': 0.6,

        # Short / medium / long-term trend indicators
        'SMA_10': 0.9,           # Short-term trend
        'SMA_20': 1.0,           # Medium-term trend
        'SMA_30': 1.1,           # Long-term anchor
        'SMA_cross': 1.0,        # 10 vs 20 crossover
        'SMA_long_cross': 1.0,   # 20 vs 30 crossover

        'EMA_10': 0.9,
        'EMA_20': 1.0,
        'EMA_30': 1.1,
        'EMA_cross': 1.0,
        'EMA_long_cross': 1.0,

        # Volatility and confirmation indicators
        'BB': 0.75,
        'OBV': 0.9,
        'CMF': 0.8,
        'Body': 0.6,
        'Wick': 0.8,
        'Volatility': 0.4,
    }

    # --- Apply rules ---
     # Shift indicators by 1 bar to avoid lookahead
    df_shift = df.shift(1)

    # --- Apply rules with shifted indicators ---
    
    # MACD crossover
    votes += w['MACD'] * np.where(df_shift['MACD'] > df_shift['MACD_Signal'], 1, 
                                  np.where(df_shift['MACD'] < df_shift['MACD_Signal'], -1, 0))
    
    # RSI thresholds
    votes += w['RSI'] * np.where(df_shift['RSI'] < 30, 1, 
                                 np.where(df_shift['RSI'] > 70, -1, 0))
    
    # StochRSI thresholds
    votes += w['StochRSI'] * np.where(df_shift['StochRSI'] < 0.2, 1, 
                                      np.where(df_shift['StochRSI'] > 0.8, -1, 0))
    
    # RSI divergence (optional)
    rsi_div = (df_shift['RSI'] > df_shift['RSI'].shift(1)) & (df_shift['Close'] < df_shift['Close'].shift(1))
    votes += 0.6 * np.where(rsi_div, -1, 0)
    
    # --- Short-term and medium-term trend signals ---

    # # SMA trend signals
    # votes += w['SMA_10'] * np.where(df_shift['Close'] > df_shift['SMA_10'], 1,
    #                                 np.where(df_shift['Close'] < df_shift['SMA_10'], -1, 0))
    # votes += w['SMA_20'] * np.where(df_shift['Close'] > df_shift['SMA_20'], 1,
    #                                 np.where(df_shift['Close'] < df_shift['SMA_20'], -1, 0))

    # # SMA crossover (short vs medium)
    # votes += w['SMA_cross'] * np.where(df_shift['SMA_10'] > df_shift['SMA_20'], 1,
    #                                 np.where(df_shift['SMA_10'] < df_shift['SMA_20'], -1, 0))

    # EMA trend signals
    votes += w['EMA_10'] * np.where(df_shift['Close'] > df_shift['EMA_10'], 1,
                                    np.where(df_shift['Close'] < df_shift['EMA_10'], -1, 0))
    votes += w['EMA_20'] * np.where(df_shift['Close'] > df_shift['EMA_20'], 1,
                                    np.where(df_shift['Close'] < df_shift['EMA_20'], -1, 0))

    # EMA crossover (short vs medium)
    votes += w['EMA_cross'] * np.where(df_shift['EMA_10'] > df_shift['EMA_20'], 1,
                                    np.where(df_shift['EMA_10'] < df_shift['EMA_20'], -1, 0))

    # Optional: long-term trend anchors
    votes += w['SMA_30'] * np.where(df_shift['Close'] > df_shift['SMA_30'], 1,
                                    np.where(df_shift['Close'] < df_shift['SMA_30'], -1, 0))
    votes += w['EMA_30'] * np.where(df_shift['Close'] > df_shift['EMA_30'], 1,
                                    np.where(df_shift['Close'] < df_shift['EMA_30'], -1, 0))
    
    # Long-term crossover (medium vs long)
    votes += w['SMA_long_cross'] * np.where(df_shift['SMA_20'] > df_shift['SMA_30'], 1,
                                            np.where(df_shift['SMA_20'] < df_shift['SMA_30'], -1, 0))
    votes += w['EMA_long_cross'] * np.where(df_shift['EMA_20'] > df_shift['EMA_30'], 1,
                                            np.where(df_shift['EMA_20'] < df_shift['EMA_30'], -1, 0))
    
    # Bollinger Bonds breakout
    votes += w['BB'] * np.where(df_shift['Close'] < df_shift['BB_Low'], 1, 
                                np.where(df_shift['Close'] > df_shift['BB_High'], -1, 0))
    
    # OBV momentum
    votes += w['OBV'] * np.where(df_shift['OBV'] > df_shift['OBV_prev'], 1, 
                                 np.where(df_shift['OBV'] < df_shift['OBV_prev'], -1, 0))
    
    # CNF accumulation/distribution
    votes += w['CMF'] * np.where(df_shift['CMF'] > 0, 1, np.where(df_shift['CMF'] < 0, -1, 0))

    # Candle body momentum
    body_avg = df_shift['Body'].rolling(5, min_periods=1).mean()
    votes += w['Body'] * np.where(df_shift['Body'] > body_avg, 1, np.where(df_shift['Body'] < body_avg, -1, 0))

    # Wick exhaustion
    votes += w['Wick'] * np.where(df_shift['Wick'] > df_shift['Range'] * 0.6, -1, 
                                  np.where(df_shift['Wick'] < df_shift['Range'] * 0.1, 1, 0))
    
    # Bearish engulfing candle
    bear_engulf = (df_shift['Open'] < df_shift['Close']) & (df_shift['Close'] < df_shift['Open'].shift(1)) & (df_shift['Open'] > df_shift['Close'].shift(1))
    votes += 0.8 * np.where(bear_engulf, -1, 0)


    # Volatility contraction
    vol_avg = df_shift['Volatility'].rolling(10, min_periods=1).mean()
    votes += w['Volatility'] * np.where(df_shift['Volatility'] < vol_avg, 1, 0)

    # --- Targeted volatility soft gate ---
    atr_mean = df_shift['ATR'].rolling(50, min_periods=10).mean()
    atr_std = df_shift['ATR'].rolling(50, min_periods=10).std()
    high_atr = df_shift['ATR'] > (atr_mean + 2 * atr_std)

    scale = np.ones(len(df))
    scale[high_atr] = np.clip((atr_mean[high_atr] + 2 * atr_std[high_atr]) / df_shift['ATR'][high_atr], 0.3, 1.0)
    votes *= scale

    # --- Preliminary label ---
    prelim_label = np.where(votes > 4, 1, np.where(votes < -4, -1, 0))

    # --- Soft fallback logic for weak votes ---
    fallback_zone = (prelim_label == 0) & (np.abs(votes) <= 2)
    soft_vote_strength = 0.5
    low_vol = df_shift['ATR'] < atr_mean
    fallback_bear = (df_shift['EMA_10'] < df_shift['EMA_20']) & fallback_zone & low_vol
    fallback_bull = (df_shift['EMA_10'] > df_shift['EMA_20']) & fallback_zone & low_vol

    prelim_label[fallback_bull] = soft_vote_strength
    prelim_label[fallback_bear] = -soft_vote_strength

    # list the vote_totals
    df['vote_total'] = votes

    return df

In [None]:
def create_regression_target(df, horizon=1):
    df[f'f_return_{horizon}'] = (
        (1 + df['returns']).shift(-1).rolling(horizon).apply(lambda x: np.prod(x) - 1, raw=True)
    )
    return df.dropna()

In [182]:
# added Z-score normalization
def normalize_features(df):
    df = df.copy()
    
    # Select numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    
    # Exclude regression targets and bounded indicators
    exclude_cols = [c for c in numeric_cols if c.startswith("f_return_")] + [
        "RSI", "StochRSI", "CMF"
    ]
    
    cols_to_scale = [c for c in numeric_cols if c not in exclude_cols]
    
    scaler = StandardScaler()
    df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])
    
    return df, scaler


In [206]:
# reshape the data into sequences that an LSTM can consume.
def build_sequences(df, target_col, window_size=30):
    """
    Convert a feature DataFrame into sequences for LSTM training.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame with features and target column(s).
    target_col : str
        Name of the regression target column (e.g., 'f_return_1').
    window_size : int
        Number of past timesteps to include in each sequence.

    Returns
    -------
    X : np.ndarray
        Feature sequences of shape (samples, timesteps, features).
    y : np.ndarray
        Target values aligned with each sequence.
    feature_names : list
        Names of the features used (for reference).
    """
    # Features = all numeric columns except the target
    feature_cols = [c for c in df.select_dtypes(include=[np.number]).columns if c != target_col]
    
    Xs, ys = [], []
    for i in range(len(df) - window_size):
        Xs.append(df[feature_cols].iloc[i:(i+window_size)].values)
        ys.append(df[target_col].iloc[i+window_size])
    
    X = np.array(Xs)
    y = np.array(ys)
    
    return X, y, feature_cols


In [304]:
def walk_forward_split_and_scale(X, y, n_splits=5, train_ratio=0.8):
    """
    Perform walk-forward splits and normalize each fold using only its training data.
    
    Parameters
    ----------
    X : np.ndarray
        Input features of shape (n_samples, window, n_features).
    y : np.ndarray
        Target values of shape (n_samples,).
    n_splits : int
        Number of folds (default=5).
    train_ratio : float
        Initial fraction of data to use for training (default=0.8).
    
    Returns
    -------
    folds : list of tuples
        Each tuple = (X_train_scaled, y_train, X_test_scaled, y_test, scaler)
    """
    folds = []
    n_samples = len(X)
    split_size = int(n_samples * (1 - train_ratio) / n_splits)
    start_train = int(n_samples * train_ratio)

    for i in range(n_splits):
        # Define train/test indices
        train_end = start_train + i * split_size
        test_end = train_end + split_size
        if test_end > n_samples:
            break

        X_train, y_train = X[:train_end], y[:train_end]
        X_test, y_test = X[train_end:test_end], y[train_end:test_end]

        # Fit scaler on training fold only
        n_train, window, n_features = X_train.shape
        X_train_flat = X_train.reshape(-1, n_features)
        scaler = StandardScaler()
        scaler.fit(X_train_flat)

        # Transform train/test
        X_train_scaled = scaler.transform(X_train_flat).reshape(n_train, window, n_features)
        X_test_scaled = scaler.transform(X_test.reshape(-1, n_features)).reshape(X_test.shape[0], window, n_features)

        folds.append((X_train_scaled, y_train, X_test_scaled, y_test, scaler))

    return folds


##### LSTM + train Function

In [219]:
class LSTMModel(nn.Module):
    def __init__(self, input_size, units1=64, units2=32, dense_units=32, dropout=0.2):
        super(LSTMModel, self).__init__()
        
        # First LSTM layer (returns sequences)
        self.lstm1 = nn.LSTM(input_size=input_size, hidden_size=units1, batch_first=True)
        
        # Second LSTM layer (returns final hidden state)
        self.lstm2 = nn.LSTM(input_size=units1, hidden_size=units2, batch_first=True)
        
        # Dense + Dropout + Output
        self.fc1 = nn.Linear(units2, dense_units)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc_out = nn.Linear(dense_units, 1)  # regression output

    def forward(self, x):
        # x shape: (batch, seq_len, features)
        out, _ = self.lstm1(x)
        out, (h_n, _) = self.lstm2(out)
        
        # Take last hidden state from second LSTM
        out = h_n[-1]  # shape: (batch, units2)
        
        out = self.fc1(out)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc_out(out)
        return out

In [218]:
def train_lstm(X_train, y_train, device, epochs=50, batch_size=32, patience=5, val_split=0.1):
    """
    Train LSTM on given train set using PyTorch with validation and early stopping.
    """
    n_features = X_train.shape[2]
    model = LSTMModel(input_size=n_features).to(device)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Convert numpy arrays to torch tensors
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32).to(device)

    # Train/validation split
    n_val = int(len(X_train_tensor) * val_split)
    X_val_tensor, y_val_tensor = X_train_tensor[-n_val:], y_train_tensor[-n_val:]
    X_train_tensor, y_train_tensor = X_train_tensor[:-n_val], y_train_tensor[:-n_val]

    train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=False, drop_last=True)

    best_loss = float("inf")
    patience_counter = 0
    history = {"train_loss": [], "val_loss": []}

    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0

        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch).squeeze()
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item() * X_batch.size(0)

        epoch_loss /= len(train_loader.dataset)
        history["train_loss"].append(epoch_loss)

        # Validation
        model.eval()
        with torch.no_grad():
            val_outputs = model(X_val_tensor).squeeze()
            val_loss = criterion(val_outputs, y_val_tensor).item()
        history["val_loss"].append(val_loss)

        # Early stopping
        if val_loss < best_loss:
            best_loss = val_loss
            patience_counter = 0
            best_model_state = model.state_dict()
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                break

    model.load_state_dict(best_model_state)
    return model, history


#### Evaluation functions

#### Operation pipeline

In [None]:
threshold = 0.002  # 0.2% cutoff
horizon = 15
PATIENCE = 5
BATCH_SIZE = 32
WINDOW = 60
EPOCHS = 20
TRAIN_RATIO = 0.8

# 1. Prepare data
df = raw_data.copy()
df = add_indicators(df)
df = signal_engine(df)
df = df.dropna()
df = create_regression_target(df, horizon)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
# 2. Build sequences
X, y, features = build_sequences(df, target_col=f"f_return_{horizon}", window_size=30)
folds = walk_forward_split_and_scale(X, y, n_splits=5)

for i, (X_train, y_train, X_test, y_test, scaler) in enumerate(folds):
    print(f"Fold {i}: Train {X_train.shape}, Test {X_test.shape}")

# 3. Train LSTM
    model, history = train_lstm(
        X_train, y_train,
        device=device,
        epochs=50,
        batch_size=32,
        patience=5,
        val_split=0.1
    )
    model.eval()

# 4. Predict on test set
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
    with torch.no_grad():
        preds = model(X_test_tensor).squeeze().cpu().numpy()

# 5. Metrics
    rmse = np.sqrt(np.mean((preds - y_test) ** 2))
    mae = np.mean(np.abs(preds - y_test))
    dir_acc = np.mean((preds >= 0) == (y_test >= 0))

    print(f"Fold {i} Metrics: RMSE={rmse:.4f}, MAE={mae:.4f}, DirAcc={dir_acc:.2%}")








Fold 0: Train (1970, 30, 30), Test (98, 30, 30)
Early stopping at epoch 10
Fold 0 Metrics: RMSE=0.0498, MAE=0.0378, DirAcc=93.88%
Fold 1: Train (2068, 30, 30), Test (98, 30, 30)
Early stopping at epoch 9
Fold 1 Metrics: RMSE=0.0328, MAE=0.0268, DirAcc=81.63%
Fold 2: Train (2166, 30, 30), Test (98, 30, 30)
Early stopping at epoch 13
Fold 2 Metrics: RMSE=0.0355, MAE=0.0260, DirAcc=86.73%
Fold 3: Train (2264, 30, 30), Test (98, 30, 30)
Early stopping at epoch 10
Fold 3 Metrics: RMSE=0.0428, MAE=0.0376, DirAcc=68.37%
Fold 4: Train (2362, 30, 30), Test (98, 30, 30)
Early stopping at epoch 7
Fold 4 Metrics: RMSE=0.0777, MAE=0.0644, DirAcc=67.35%
