In [1]:
import os
import pickle
from pathlib import Path

import alpaca_trade_api as tradeapi
import neptune
import numpy as np
import pandas as pd
import tensorflow as tf
from fredapi import Fred
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw price table and index it by calendar date.
df = pd.read_csv('stock_data_full_data.csv')
# Strip the timezone first, then keep only the date part of each timestamp.
naive_timestamps = pd.to_datetime(df['timestamp']).dt.tz_localize(None)
df['timestamp'] = naive_timestamps.dt.date
df = df.set_index('timestamp')

In [3]:
# One CSV per bin; the same reader call is applied to each file in order.
bin_files = (
    'bin1_stocks.csv',
    'bin2_stocks.csv',
    'bin3_stocks.csv',
    'bin4_stocks.csv',
    'bin5_stocks.csv',
)
df_bin1, df_bin2, df_bin3, df_bin4, df_bin5 = map(pd.read_csv, bin_files)

In [4]:
# Collect the five bin frames, in bin order, so later cells can loop over them.
bins = [df_bin1, df_bin2, df_bin3, df_bin4, df_bin5]

In [5]:
# Remove the 'Unnamed: 0' column from every bin frame.
# The drop stays in place because df_bin1..df_bin5 and `bins` share the same
# objects. errors='ignore' makes the cell safe to re-run: without it a second
# execution raises KeyError because the column is already gone.
# The loop variable is not called `bin`, which would shadow the builtin.
for stock_bin in bins:
    stock_bin.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

In [6]:
# Prepare (mostly macro-economic) features that are shared by every stock or ETF.

# The FRED API key is read from the environment so the secret never lives in
# the notebook or its version history. A key that was ever committed here
# must be treated as leaked and rotated.
fred_api_key = os.environ.get('FRED_API_KEY')
if not fred_api_key:
    raise RuntimeError(
        "FRED_API_KEY is not set; export your FRED API key before running this cell."
    )
fred = Fred(api_key=fred_api_key)
start_date = '2020-05-08'
end_date = '2025-05-07'

# Column name in the feature table -> FRED series id.
indicators = {
    "real_GDP": "GDPC1",
    "unemployment_rate": "UNRATE",
    "CPI": "CPIAUCSL",
    "personal_consumption": "PCE",
    "industrial_production": "INDPRO",
    "federal_funds_rate": "FEDFUNDS",
    "10Y_treasury_rate": "DGS10",
    "retail_sales": "RSAFS",
    "housing_starts": "HOUST",
    "usd_eur": "DEXUSEU",
    "usd_gbp": "DEXUSUK",
    "usd_jpy": "DEXJPUS"
}

# Download every indicator as a one-column frame named after the feature.
data = {}
for name, series_id in indicators.items():
    print(f"Downloading {name}...")
    series_data = fred.get_series(series_id, observation_start=start_date, observation_end=end_date)
    series_data = series_data.to_frame(name=name)
    data[name] = series_data

# Combine all macro-economic indicators into a single DataFrame. The series
# have different observation dates, so the outer-joined frame contains gaps
# that are filled by linear interpolation in both directions.
# NOTE(review): interpolating between two releases uses the *later* release to
# fill earlier dates, which looks ahead in time for a forecasting model.
# Consider a forward fill instead - confirm with the modelling owner.
df_macro = pd.concat(data.values(), axis=1)
df_macro = df_macro.interpolate(
    method='linear',
    axis=0,
    limit_direction='both'
)
# Reuse start_date rather than repeating the literal so the two cannot drift apart.
df_macro = df_macro[df_macro.index >= start_date]
df_macro.to_csv("macro_data.csv", index=True)

Downloading real_GDP...
Downloading unemployment_rate...
Downloading CPI...
Downloading personal_consumption...
Downloading industrial_production...
Downloading federal_funds_rate...
Downloading 10Y_treasury_rate...
Downloading retail_sales...
Downloading housing_starts...
Downloading usd_eur...
Downloading usd_gbp...
Downloading usd_jpy...


In [7]:
def _wilder_adx(dx: pd.Series, window: int = 14) -> pd.Series:
    """Smooth the DX series into ADX.

    The first ADX value is the first available `window`-period simple mean of
    `dx`; every later value is
    ``(previous_adx * (window - 1) + dx) / window``.

    Returns an all-NaN series when `dx` never yields a full window (for
    example a very short or empty price history) instead of raising.
    """
    seed = dx.rolling(window=window).mean().to_numpy(dtype=float)
    dx_values = dx.to_numpy(dtype=float)
    adx_values = np.full(len(dx_values), np.nan)

    valid_positions = np.flatnonzero(~np.isnan(seed))
    if valid_positions.size == 0:
        return pd.Series(adx_values, index=dx.index)

    start = valid_positions[0]
    adx_values[start] = seed[start]
    # The recursion is inherently sequential; running it on a plain array
    # avoids one DataFrame.loc write per row.
    for pos in range(start + 1, len(dx_values)):
        adx_values[pos] = (adx_values[pos - 1] * (window - 1) + dx_values[pos]) / window
    return pd.Series(adx_values, index=dx.index)


def _add_technical_indicators(df_ticker: pd.DataFrame) -> pd.DataFrame:
    """Append the technical-indicator columns to `df_ticker` and return it.

    Reads the 'close', 'high', 'low' and 'volume' columns. Columns are added
    in a fixed order (RSI, ROC, MACD, stochastic, CCI, moving averages, ATR,
    DI/DX/ADX, Bollinger bands, OBV, VPT) and the frame is modified in place.
    """
    close = df_ticker['close']
    high = df_ticker['high']
    low = df_ticker['low']
    volume = df_ticker['volume']

    # Relative Strength Index (RSI)
    rsi_window = 14
    delta = close.diff()
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)
    avg_gain = gain.ewm(alpha=1/rsi_window, min_periods=rsi_window).mean()
    avg_loss = loss.ewm(alpha=1/rsi_window, min_periods=rsi_window).mean()
    rs = avg_gain / avg_loss
    df_ticker['rsi'] = 100 - (100 / (1 + rs))

    # Rate of Change (ROC) over 9, 14 and 21 rows
    for period in (9, 14, 21):
        prev_price = close.shift(period)
        df_ticker[f'roc_{period}'] = (close - prev_price) / prev_price * 100

    # Moving Average Convergence Divergence (MACD)
    ema_short = close.ewm(span=12, adjust=False).mean()
    ema_long = close.ewm(span=26, adjust=False).mean()
    df_ticker['macd'] = ema_short - ema_long
    df_ticker['macd_signal'] = df_ticker['macd'].ewm(span=9, adjust=False).mean()
    df_ticker['macd_hist'] = df_ticker['macd'] - df_ticker['macd_signal']

    # Stochastic Oscillator (%K over 14 rows, %D = 3-row mean of %K)
    highest_high = high.rolling(window=14).max()
    lowest_low = low.rolling(window=14).min()
    df_ticker['stoch_%K'] = (close - lowest_low) / (highest_high - lowest_low) * 100
    df_ticker['stoch_%D'] = df_ticker['stoch_%K'].rolling(window=3).mean()
    df_ticker['stoch_diff'] = df_ticker['stoch_%K'] - df_ticker['stoch_%D']

    # Commodity Channel Index (CCI)
    # NOTE(review): `mad` is a rolling mean of each row's absolute deviation
    # from its *own* SMA, not the mean absolute deviation of the window around
    # the current SMA. Kept as-is to leave the feature unchanged.
    cci_window = 20
    typical_price = (high + low + close) / 3
    sma = typical_price.rolling(window=cci_window, min_periods=int(cci_window/2)).mean()
    mad = (typical_price - sma).abs().rolling(window=cci_window).mean()
    df_ticker['cci'] = (typical_price - sma) / (0.015 * mad)

    # Simple moving averages: window -> minimum number of observations
    for span, min_obs in {15: 5, 30: 10, 60: 15, 100: 20, 200: 30}.items():
        df_ticker[f'ma_{span}'] = close.rolling(window=span, min_periods=min_obs).mean()

    # Average True Range (ATR), simple 14-row mean of the true range
    prev_close = close.shift()
    true_range = pd.concat(
        [high - low, (high - prev_close).abs(), (low - prev_close).abs()],
        axis=1,
    ).max(axis=1)
    atr = true_range.rolling(window=14).mean()
    df_ticker['atr'] = atr

    # Directional indicators and Average Directional Index (ADX)
    up_move = high - high.shift()
    down_move = low.shift() - low
    plus_dm = pd.Series(np.where((up_move > down_move) & (up_move > 0), up_move, 0), index=df_ticker.index)
    minus_dm = pd.Series(np.where((down_move > up_move) & (down_move > 0), down_move, 0), index=df_ticker.index)
    window_dx = 14
    df_ticker['plus_di'] = 100 * (plus_dm.rolling(window=window_dx).mean() / atr)
    df_ticker['minus_di'] = 100 * (minus_dm.rolling(window=window_dx).mean() / atr)
    df_ticker['dx'] = 100 * (
        (df_ticker['plus_di'] - df_ticker['minus_di']).abs()
        / (df_ticker['plus_di'] + df_ticker['minus_di'])
    )
    df_ticker['adx'] = _wilder_adx(df_ticker['dx'], window=14)

    # Bollinger Bands (20-row mean +/- 2 standard deviations)
    bb_middle = close.rolling(window=20).mean()
    bb_std = close.rolling(window=20).std()
    bb_upper = bb_middle + (bb_std * 2)
    bb_lower = bb_middle - (bb_std * 2)
    df_ticker['bb_middle'] = bb_middle
    df_ticker['bb_std'] = bb_std
    df_ticker['bb_upper'] = bb_upper
    df_ticker['bb_lower'] = bb_lower
    df_ticker['bb_pct'] = (close - bb_lower) / (bb_upper - bb_lower)
    df_ticker['bb_width'] = (bb_upper - bb_lower) / bb_middle
    df_ticker['bb_d_to_upper'] = close - bb_upper
    df_ticker['bb_d_to_lower'] = close - bb_lower

    # On-Balance Volume (OBV): running sum of volume signed by the direction
    # of the close-to-close move. The first row and unchanged closes add 0.
    # Vectorised replacement for the former per-row loop.
    direction = np.sign(close.diff()).fillna(0)
    obv = (direction.astype(volume.dtype) * volume).cumsum()
    df_ticker['obv'] = obv
    df_ticker['obv_norm'] = (obv - obv.min()) / (obv.max() - obv.min())
    df_ticker['obv_momentum'] = obv - obv.shift(10)

    # Volume Price Trend (VPT): cumulative sum of volume * percentage change.
    # The previous code added the change to a shifted column of zeros, which
    # produced the daily change rather than a running total. The first row
    # stays NaN because it has no previous close.
    vpt_change = close.pct_change(fill_method=None) * volume
    vpt = vpt_change.cumsum()
    df_ticker['vpt'] = vpt
    df_ticker['vpt_norm'] = (vpt - vpt.min()) / (vpt.max() - vpt.min())
    df_ticker['vpt_momentum'] = vpt - vpt.shift(10)

    return df_ticker


# SPY closes are the same for every ticker, so select them once up front.
spy_close = df.loc[df['symbol'] == 'SPY', 'close']

# chosen_stocks[b][k] is the feature frame of the k-th ticker column in bin b.
chosen_stocks = []
for stock_bin in bins:
    df_bin = []
    for ticker in stock_bin.columns:
        # ---- Add technical indicators ---- #
        df_ticker = _add_technical_indicators(df[df['symbol'] == ticker].copy())

        # ---- Add macro-economic indicators ---- #

        # Add S&P 500 Index (aligned on the shared date index)
        df_ticker['sp500'] = spy_close

        # Add calendaric features. The index is converted to an unnamed
        # DatetimeIndex so it exposes the calendar accessors and joins with
        # df_macro.
        df_ticker.index = pd.to_datetime(df_ticker.index).rename(None)
        df_ticker['dayofweek'] = df_ticker.index.day_of_week
        df_ticker['dayofyear'] = df_ticker.index.day_of_year
        df_ticker['month'] = df_ticker.index.month
        df_ticker['year'] = df_ticker.index.year
        df_ticker['quarter'] = df_ticker.index.quarter

        # Add macro-economic indicators that were prepared earlier in the pipeline
        df_ticker = df_ticker.join(df_macro, how='left')

        # Add the target variable which is the next row's close price
        df_ticker['target'] = df_ticker['close'].shift(-1)

        df_bin.append(df_ticker)

    chosen_stocks.append(df_bin)

In [8]:
# Count the missing values per column for the 15th frame of the second bin.
nan_counts = chosen_stocks[1][14].isna().sum()
for column, n_missing in nan_counts.items():
    print(f"{column}: {n_missing}")

close: 0
high: 0
low: 0
trade_count: 0
open: 0
volume: 0
vwap: 0
symbol: 0
rsi: 14
roc_9: 9
roc_14: 14
roc_21: 21
macd: 0
macd_signal: 0
macd_hist: 0
stoch_%K: 13
stoch_%D: 15
stoch_diff: 15
cci: 28
ma_15: 4
ma_30: 9
ma_60: 14
ma_100: 19
ma_200: 29
atr: 13
plus_di: 13
minus_di: 13
dx: 13
adx: 26
bb_middle: 19
bb_std: 19
bb_upper: 19
bb_lower: 19
bb_pct: 19
bb_width: 19
bb_d_to_upper: 19
bb_d_to_lower: 19
obv: 0
obv_norm: 0
obv_momentum: 10
vpt: 1
vpt_norm: 1
vpt_momentum: 11
sp500: 0
dayofweek: 0
dayofyear: 0
month: 0
year: 0
quarter: 0
real_GDP: 0
unemployment_rate: 0
CPI: 0
personal_consumption: 0
industrial_production: 0
federal_funds_rate: 0
10Y_treasury_rate: 0
retail_sales: 0
housing_starts: 0
usd_eur: 0
usd_gbp: 0
usd_jpy: 0
target: 1


In [9]:
# Display the feature frame of the 14th ticker in the second bin for a visual check.
chosen_stocks[1][13]

Unnamed: 0,close,high,low,trade_count,open,volume,vwap,symbol,rsi,roc_9,...,personal_consumption,industrial_production,federal_funds_rate,10Y_treasury_rate,retail_sales,housing_starts,usd_eur,usd_gbp,usd_jpy,target
2020-05-08,5.6690,5.669,5.5640,49,5.575,16372,5.614000,EEA,,,...,13298.117647,86.343959,0.051765,0.69,480777.764706,1052.352941,1.0854,1.2436,106.50,5.5380
2020-05-11,5.5380,5.640,5.5090,146,5.597,16710,5.577000,EEA,,,...,13344.535294,86.677118,0.053529,0.73,483106.529412,1065.705882,1.0818,1.2330,107.70,5.4290
2020-05-12,5.4290,5.677,5.1960,243,5.487,281091,5.316000,EEA,,,...,13390.952941,87.010276,0.055294,0.69,485435.294118,1079.058824,1.0864,1.2299,107.33,5.3200
2020-05-13,5.3200,5.455,5.2980,232,5.429,57646,5.421000,EEA,,,...,13437.370588,87.343435,0.057059,0.64,487764.058824,1092.411765,1.0837,1.2225,106.92,5.2910
2020-05-14,5.2910,5.298,5.2840,26,5.298,1844,5.292000,EEA,,,...,13483.788235,87.676594,0.058824,0.63,490092.823529,1105.764706,1.0800,1.2194,107.09,5.2870
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-01,9.5300,9.540,9.4451,42,9.540,5336,9.505202,EEA,66.777698,5.187638,...,20669.500000,103.594800,4.330000,4.25,715417.000000,1256.000000,1.1279,1.3273,145.48,9.6428
2025-05-02,9.6428,9.670,9.4713,55,9.550,8859,9.543689,EEA,69.597917,6.786268,...,20669.500000,103.594800,4.330000,4.33,715417.000000,1256.000000,1.1330,1.3283,144.52,9.7500
2025-05-05,9.7500,9.750,9.6700,125,9.670,28301,9.709490,EEA,72.028133,6.093580,...,20669.500000,103.594800,4.330000,4.36,715417.000000,1256.000000,1.1315,1.3284,143.98,9.6725
2025-05-06,9.6725,9.870,9.5300,43,9.870,6546,9.699875,EEA,67.808110,4.624121,...,20669.500000,103.594800,4.330000,4.30,715417.000000,1256.000000,1.1345,1.3369,142.76,9.6400


In [10]:
# Print date range, shape and total missing values for every frame in every bin.
for bin_no, stock_frames in enumerate(chosen_stocks, start=1):
    print(f"Bin {bin_no} contains {len(stock_frames)} stocks.")
    for stock_no, frame in enumerate(stock_frames, start=1):
        first_day = frame.index.min().date()
        last_day = frame.index.max().date()
        n_missing = frame.isnull().sum().sum()
        print(f"Date range: {first_day} - {last_day}")
        print(f"Stock {stock_no} in Bin {bin_no} contains {len(frame)} rows.")
        print(f"Stock {stock_no} in Bin {bin_no} contains {len(frame.columns)} columns.")
        print(f"Stock {stock_no} in Bin {bin_no} has missing values: {n_missing}")

Bin 1 contains 15 stocks.
Date range: 2020-05-08 - 2025-05-07
Stock 1 in Bin 1 contains 1256 rows.
Stock 1 in Bin 1 contains 62 columns.
Stock 1 in Bin 1 has missing values: 458
Date range: 2020-05-08 - 2025-05-07
Stock 2 in Bin 1 contains 1256 rows.
Stock 2 in Bin 1 contains 62 columns.
Stock 2 in Bin 1 has missing values: 458
Date range: 2020-05-08 - 2025-05-07
Stock 3 in Bin 1 contains 1256 rows.
Stock 3 in Bin 1 contains 62 columns.
Stock 3 in Bin 1 has missing values: 458
Date range: 2020-05-08 - 2025-05-07
Stock 4 in Bin 1 contains 1256 rows.
Stock 4 in Bin 1 contains 62 columns.
Stock 4 in Bin 1 has missing values: 458
Date range: 2020-05-08 - 2025-05-07
Stock 5 in Bin 1 contains 1256 rows.
Stock 5 in Bin 1 contains 62 columns.
Stock 5 in Bin 1 has missing values: 458
Date range: 2020-05-08 - 2025-05-07
Stock 6 in Bin 1 contains 1256 rows.
Stock 6 in Bin 1 contains 62 columns.
Stock 6 in Bin 1 has missing values: 458
Date range: 2020-05-08 - 2025-05-07
Stock 7 in Bin 1 contains 

In [11]:
# Snapshot the frames before rows with NaN values are dropped in place.
# list.copy() is shallow: it would still point at the same DataFrame objects,
# so the in-place dropna would silently change the "backup" too. Copy every
# frame instead.
copy = [[frame.copy() for frame in stock_bin] for stock_bin in chosen_stocks]

In [12]:
# Drop every row containing a NaN from each frame (in place), then print the
# resulting date range, shape and remaining missing-value count.
for bin_no, stock_frames in enumerate(chosen_stocks, start=1):
    print(f"Bin {bin_no} contains {len(stock_frames)} stocks.")
    for stock_no, frame in enumerate(stock_frames, start=1):
        frame.dropna(inplace=True)
        print("After dropping NaN values:")
        first_day = frame.index.min().date()
        last_day = frame.index.max().date()
        n_missing = frame.isnull().sum().sum()
        print(f"Date range: {first_day} - {last_day}")
        print(f"Stock {stock_no} in Bin {bin_no} contains {len(frame)} rows.")
        print(f"Stock {stock_no} in Bin {bin_no} contains {len(frame.columns)} columns.")
        print(f"Stock {stock_no} in Bin {bin_no} has missing values: {n_missing}")

Bin 1 contains 15 stocks.
After dropping NaN values:
Date range: 2020-06-19 - 2025-05-06
Stock 1 in Bin 1 contains 1226 rows.
Stock 1 in Bin 1 contains 62 columns.
Stock 1 in Bin 1 has missing values: 0
After dropping NaN values:
Date range: 2020-06-19 - 2025-05-06
Stock 2 in Bin 1 contains 1226 rows.
Stock 2 in Bin 1 contains 62 columns.
Stock 2 in Bin 1 has missing values: 0
After dropping NaN values:
Date range: 2020-06-19 - 2025-05-06
Stock 3 in Bin 1 contains 1226 rows.
Stock 3 in Bin 1 contains 62 columns.
Stock 3 in Bin 1 has missing values: 0
After dropping NaN values:
Date range: 2020-06-19 - 2025-05-06
Stock 4 in Bin 1 contains 1226 rows.
Stock 4 in Bin 1 contains 62 columns.
Stock 4 in Bin 1 has missing values: 0
After dropping NaN values:
Date range: 2020-06-19 - 2025-05-06
Stock 5 in Bin 1 contains 1226 rows.
Stock 5 in Bin 1 contains 62 columns.
Stock 5 in Bin 1 has missing values: 0
After dropping NaN values:
Date range: 2020-06-19 - 2025-05-06
Stock 6 in Bin 1 contains 

In [13]:
# Display the feature frame of the 15th ticker in the third bin for a visual check.
chosen_stocks[2][14]

Unnamed: 0,close,high,low,trade_count,open,volume,vwap,symbol,rsi,roc_9,...,personal_consumption,industrial_production,federal_funds_rate,10Y_treasury_rate,retail_sales,housing_starts,usd_eur,usd_gbp,usd_jpy,target
2020-06-19,130.350,131.6276,128.04,11098,131.38,1663506,130.115111,LBRDK,50.088867,-3.380031,...,14208.163636,93.793082,0.086364,0.70,523298.181818,1431.454545,1.1189,1.2362,106.89,129.360
2020-06-22,129.360,130.5100,128.21,5582,130.51,782941,129.301022,LBRDK,46.746667,-2.472859,...,14220.118182,93.944409,0.086818,0.71,523673.909091,1443.272727,1.1260,1.2447,106.84,128.100
2020-06-23,128.100,131.3800,127.61,7328,131.38,697805,128.352552,LBRDK,42.829647,-3.189238,...,14232.072727,94.095736,0.087273,0.72,524049.636364,1455.090909,1.1322,1.2531,106.44,123.990
2020-06-24,123.990,127.4300,123.73,8413,127.43,494493,124.401316,LBRDK,33.089747,-2.668969,...,14244.027273,94.247064,0.087727,0.69,524425.363636,1466.909091,1.1272,1.2432,106.83,124.165
2020-06-25,124.165,124.3400,121.89,7311,123.12,537939,123.849859,LBRDK,33.780270,-3.358499,...,14255.981818,94.398391,0.088182,0.68,524801.090909,1478.727273,1.1221,1.2406,107.21,123.300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-04-30,90.390,90.5400,87.05,15390,87.62,796910,89.323446,LBRDK,64.863764,18.002611,...,20669.500000,103.605109,4.330000,4.17,715715.454545,1262.181818,1.1349,1.3344,142.63,88.960
2025-05-01,88.960,90.5600,88.80,16956,90.55,910700,89.316701,LBRDK,61.801403,13.730504,...,20669.500000,103.594800,4.330000,4.25,715417.000000,1256.000000,1.1279,1.3273,145.48,89.550
2025-05-02,89.550,90.1750,89.08,14508,89.89,656744,89.425865,LBRDK,62.586252,19.879518,...,20669.500000,103.594800,4.330000,4.33,715417.000000,1256.000000,1.1330,1.3283,144.52,91.780
2025-05-05,91.780,92.5000,88.59,16446,88.68,928323,91.251146,LBRDK,65.473774,19.551908,...,20669.500000,103.594800,4.330000,4.36,715417.000000,1256.000000,1.1315,1.3284,143.98,93.540


In [14]:
# Persist the nested list of per-bin feature frames for later use.
# NOTE: only unpickle this file from a trusted location; pickle.load can
# execute arbitrary code.
output_path = Path('preprocessed_data/preprocessed_general.pkl')
# open() does not create missing folders, so make sure the target exists.
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open('wb') as f:
    pickle.dump(chosen_stocks, f)