1. Data Preparation
- Load OHLCV data for all selected tickers (e.g., AMZN, META, AVGO, ETFs)
- Normalize features per ticker (z-score or min-max scaling)
- Add metadata: ticker_id, sector, day_of_week, macro regime (optional)
- Create rolling windows for time series modeling (e.g., 10-day sequences)
2. Feature Engineering
- Technical indicators: RSI, MACD, Bollinger Bands, ATR
- Candle features: range, body size, wick ratios
- Volume features: OBV, VWAP, volume spikes
- Lagged returns, volatility, momentum scores
3. Labeling Strategy
- Define swing trade targets:
  - Binary: Will price rise >x% in next n days?
  - Multi-class: Uptrend / Downtrend / Sideways
  - Regression: Expected return over next n days

#### Dependencies

In [35]:
import yfinance as yf
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from scipy.stats import spearmanr, pearsonr
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
from datetime import datetime, timedelta
from ta.momentum import RSIIndicator, StochasticOscillator
from ta.trend import SMAIndicator, MACD, CCIIndicator
from ta.momentum import WilliamsRIndicator
from ta.volume import ChaikinMoneyFlowIndicator


In [36]:
# Report the PyTorch build and what CUDA hardware this session can see.
print(torch.__version__)
print(torch.version.cuda)
print(torch.cuda.is_available())   # True if a GPU is detected
print(torch.cuda.device_count())   # Number of GPUs available
# get_device_name(0) raises when no CUDA device is present, so only query
# it when a GPU is actually available; this keeps the cell runnable on
# CPU-only machines.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))  # Name of the first GPU

2.9.0+cu128
12.8
True
1
NVIDIA GeForce RTX 4080


#### Data collation

Equities taken from a realistic stock portfolio:
"AMZN", "META", "AVGO", "LLY", "ETN", "CYBR", "LIN", "WM", "SLNO", "CYTK", "XLV"

In [50]:
# Pull daily AMZN bars for 2015-2024, starting a little before 2015-01-01 so
# that rolling indicators have some history to warm up on.
# NOTE(review): 30 calendar days is roughly 21 trading sessions, which is
# shorter than the 26-period slow window MACD uses in add_indicators --
# confirm the lead-up is long enough for the intended analysis window.
leadup_days = 30
start_date = (datetime(2015, 1, 1) - timedelta(days=leadup_days)).date().isoformat()
amzn = yf.Ticker("AMZN")
raw_data = amzn.history(
    start=start_date,
    end="2025-01-01",
    interval="1d",
    auto_adjust=False,  # keep the raw close alongside the adjusted close column
    actions=False,
)

In [57]:
# NOTE(review): the captured output below lists lowercase column names, so
# the rename on the next line appears to have been executed before it was
# commented out. On a fresh top-to-bottom run the labels would presumably
# keep whatever casing yfinance returns -- confirm before relying on them.
# raw_data.columns = raw_data.columns.str.lower()
raw_data.info()  # prints the index range, per-column non-null counts and dtypes
# Last expression in the cell, so the notebook renders the summary table.
raw_data.describe()


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2537 entries, 2014-12-02 00:00:00-05:00 to 2024-12-31 00:00:00-05:00
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   open       2537 non-null   float64
 1   high       2537 non-null   float64
 2   low        2537 non-null   float64
 3   close      2537 non-null   float64
 4   adj close  2537 non-null   float64
 5   volume     2537 non-null   int64  
dtypes: float64(5), int64(1)
memory usage: 138.7 KB


Unnamed: 0,open,high,low,close,adj close,volume
count,2537.0,2537.0,2537.0,2537.0,2537.0,2537.0
mean,100.546931,101.687477,99.302719,100.522593,100.522593,76160160.0
std,54.533373,55.155739,53.855255,54.506998,54.506998,40582250.0
min,14.314,14.5395,14.2625,14.3475,14.3475,15007500.0
25%,48.0,48.365501,47.702499,47.992001,47.992001,50776000.0
50%,94.18,95.25,93.139999,94.230003,94.230003,65262000.0
75%,153.692505,155.630005,151.550507,153.729996,153.729996,90000000.0
max,232.389999,233.0,228.009995,232.929993,232.929993,477122000.0


#### Useful functions

In [68]:
def add_indicators(df, sma_vals=(10, 20)):
    """Return a copy of ``df`` with technical-indicator columns appended.

    Column labels are lower-cased and spaces are replaced with underscores
    before anything is computed (``"Adj Close"`` becomes ``"adj_close"``).
    The caller's frame is never modified.

    Args:
        df: OHLCV price frame. After label normalisation it must contain
            ``high``, ``low``, ``close`` and ``volume`` columns.
        sma_vals: Iterable of window lengths; one ``sma_<n>`` column is
            added for each entry.

    Returns:
        A new DataFrame holding the normalised original columns plus
        ``sma_<n>``, ``stoch_k``, ``stoch_d``, ``macd``, ``macd_signal``,
        ``macd_hist``, ``cci``, ``williams_r``, ``rsi`` and ``adosc``.

    Raises:
        KeyError: If any of the required price/volume columns is missing.
    """
    df = df.copy()
    # ensure all column labels are lowercase
    df.columns = df.columns.str.lower().str.replace(' ', '_')

    # Fail early with a message naming every missing input, instead of a
    # bare KeyError from whichever indicator happens to run first.
    missing = [c for c in ('high', 'low', 'close', 'volume') if c not in df.columns]
    if missing:
        raise KeyError(f"add_indicators: missing required column(s): {missing}")

    # 1. Simple Moving Average (SMA)
    for i in sma_vals:
        df[f'sma_{i}'] = SMAIndicator(df['close'], window=i).sma_indicator()

    # 2. Stochastic Oscillator (KD)
    stoch = StochasticOscillator(df['high'], df['low'], df['close'], window=14, smooth_window=3)
    df['stoch_k'] = stoch.stoch()         # %K line
    df['stoch_d'] = stoch.stoch_signal()  # %D line

    # 3. MACD
    macd = MACD(df['close'], window_slow=26, window_fast=12, window_sign=9)
    df['macd'] = macd.macd()
    df['macd_signal'] = macd.macd_signal()
    df['macd_hist'] = macd.macd_diff()

    # 4. Commodity Channel Index (CCI)
    df['cci'] = CCIIndicator(df['high'], df['low'], df['close'], window=20).cci()

    # 5. Williams %R
    df['williams_r'] = WilliamsRIndicator(df['high'], df['low'], df['close'], lbp=14).williams_r()

    # 6. Relative Strength Index (RSI)
    df['rsi'] = RSIIndicator(df['close'], window=14).rsi()

    # 7. Chaikin Money Flow (CMF)
    # NOTE: the column keeps the name 'adosc' for compatibility with
    # existing consumers, but the value computed here is Chaikin Money Flow,
    # not the Chaikin A/D Oscillator.
    df['adosc'] = ChaikinMoneyFlowIndicator(
        high=df['high'],
        low=df['low'],
        close=df['close'],
        volume=df['volume'],
        window=20
    ).chaikin_money_flow()

    return df



#### Operation Pipeline

In [65]:
# Build the working frame: hand add_indicators a private copy of the raw
# download and request 5- and 19-period simple moving averages.
df = add_indicators(raw_data.copy(), sma_vals=[5, 19])

In [67]:
# Preview the first 30 rows. As the rendered table shows, the earliest rows
# are NaN for an indicator until enough bars have accumulated to fill its
# look-back window (e.g. sma_5 first appears on the fifth row).
df.head(30)

Unnamed: 0_level_0,open,high,low,close,adj_close,volume,sma_5,sma_19,stoch_k,stoch_d,macd,macd_signal,macd_hist,cci,williams_r,rsi,adosc
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2014-12-02 00:00:00-05:00,16.375,16.3965,16.1625,16.3155,16.3155,55806000,,,,,,,,,,,
2014-12-03 00:00:00-05:00,16.286501,16.338499,15.718,15.825,15.825,113620000,,,,,,,,,,,
2014-12-04 00:00:00-05:00,15.7765,15.9295,15.6735,15.8465,15.8465,65806000,,,,,,,,,,,
2014-12-05 00:00:00-05:00,15.84,15.8465,15.542,15.6315,15.6315,65304000,,,,,,,,,,,
2014-12-08 00:00:00-05:00,15.5785,15.828,15.241,15.332,15.332,72784000,15.7901,,,,,,,,,,
2014-12-09 00:00:00-05:00,15.1495,15.682,15.057,15.625,15.625,80990000,15.652,,,,,,,,,,
2014-12-10 00:00:00-05:00,15.6,15.6595,15.234,15.292,15.292,64918000,15.5454,,,,,,,,,,
2014-12-11 00:00:00-05:00,15.3945,15.632,15.3005,15.368,15.368,65258000,15.4497,,,,,,,,,,
2014-12-12 00:00:00-05:00,15.1995,15.532,15.1505,15.366,15.366,63070000,15.3966,,,,,,,,,,
2014-12-15 00:00:00-05:00,15.4435,15.543,15.1075,15.3035,15.3035,76832000,15.3909,,,,,,,,,,
