In [1]:
# 02_features.ipynb
# Feature Engineering - Purpose: Construct momentum, volatility, and liquidity features for sandbox tickers.
# some functions are imported from the src folder

In [2]:
import sys, os
sys.path.append(os.path.abspath(".."))

from src.data import load_prices, load_close_prices

aapl = load_prices("AAPL")
print(type(aapl.index))
print(aapl.index.dtype)
print(aapl.index.min(), aapl.index.max())

<class 'pandas.core.indexes.datetimes.DatetimeIndex'>
datetime64[ns]
2015-01-02 00:00:00 2024-12-31 00:00:00


In [3]:
# build (monthly) momentum, volatility, and liquidity features
prices = load_close_prices(["AAPL", "MSFT", "JPM", "XOM", "PG"])
returns = prices.pct_change()
volumes = {t: load_prices(t)["Volume"] for t in prices.columns}

from src.features import make_features
features = make_features(prices, returns, volumes, prices.columns)

print("Features shape:", features.shape)
features.head()


Features shape: (113, 25)


Unnamed: 0_level_0,"(AAPL, mom_1m)","(AAPL, mom_3m)","(AAPL, mom_6m)","(AAPL, vol_3m)","(AAPL, liq_3m)","(MSFT, mom_1m)","(MSFT, mom_3m)","(MSFT, mom_6m)","(MSFT, vol_3m)","(MSFT, liq_3m)",...,"(XOM, mom_1m)","(XOM, mom_3m)","(XOM, mom_6m)","(XOM, vol_3m)","(XOM, liq_3m)","(PG, mom_1m)","(PG, mom_3m)","(PG, mom_6m)","(PG, vol_3m)","(PG, liq_3m)"
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-08-31,-0.032927,-0.026714,0.043764,0.018537,220820700.0,0.057758,-0.033645,0.171733,0.016192,33594540.0,...,-0.047957,-0.085898,-0.079544,0.015928,13999290.0,-0.011656,-0.027442,-0.075216,0.011519,9421786.0
2015-09-30,-0.066196,-0.130562,-0.114561,0.021584,243898900.0,-0.061949,-0.065152,0.005512,0.018253,34074200.0,...,-0.041232,-0.108638,-0.135169,0.017813,15312190.0,-0.078618,-0.0911,-0.156328,0.01272,10163800.0
2015-10-31,-0.021816,-0.116645,-0.105826,0.021746,244819200.0,0.017004,0.009103,0.102839,0.022114,36224620.0,...,-0.011829,-0.098009,-0.109808,0.017499,15975840.0,0.017971,-0.072992,-0.107731,0.012378,10503710.0
2015-11-30,0.083409,-0.01038,-0.036817,0.017336,194901600.0,0.189335,0.134628,0.096453,0.018224,34800370.0,...,0.112845,0.054339,-0.036226,0.015127,14268730.0,0.071261,0.004778,-0.022795,0.010286,9175424.0
2015-12-31,-0.005804,0.053622,-0.083941,0.015747,172770700.0,0.039444,0.257268,0.175355,0.017394,36246130.0,...,-0.00445,0.094788,-0.024148,0.016443,14840580.0,-0.020163,0.068525,-0.028817,0.009754,8695911.0


In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# --- Some data cleaning ---
# Forward/backward fill for rolling NaNs (the strategy we chose here)
# After the forward-fill, there might still be NaNs at the very beginning of the dataset->'bfill'
features_clean = features.fillna(method='ffill').fillna(method='bfill')

# --- Rescale the features for efficient ML ---
scaler = StandardScaler()
features_scaled = pd.DataFrame(
    scaler.fit_transform(features_clean),
    index=features_clean.index,
    columns=features_clean.columns
)

# --- Create target: quantile-based future returns ---
# Compute next month's return per ticker
future_returns = prices.pct_change().shift(-1)  # shift -1 month ahead: forward shift creates the "future returns"
# Define quantiles for classification (top 20%, bottom 20%, middle 60%)
def classify_quantiles(series, top=0.8, bottom=0.2):
    return pd.qcut(series, q=[0, bottom, top, 1], labels=[0, 1, 2])
# cuts a Series into a specified number of quantiles (20%, 60%, 20%)-> converts a continuous variable (returns) into a categorical one

target = future_returns.apply(classify_quantiles)

# Align features & target by removing the last row from both the features and the target DataFrames
features_scaled = features_scaled.iloc[:-1]
target = target.iloc[:-1]

print("Features shape:", features_scaled.shape)
print("Target shape:", target.shape)
target.head()


Features shape: (112, 25)
Target shape: (2515, 5)


  features_clean = features.fillna(method='ffill').fillna(method='bfill')


Unnamed: 0_level_0,AAPL,MSFT,JPM,XOM,PG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-01-02,0,1,0,0,1
2015-01-05,1,0,0,1,1
2015-01-06,2,2,1,1,1
2015-01-07,2,2,2,2,2
2015-01-08,1,1,0,1,0


In [5]:
# Convert target to integer (if using pd.qcut)
target_int = target.apply(lambda col: col.astype(int))

# Align target index with features index (day versus month)
target_aligned = target_int.reindex(features.index, method='ffill')  
# Save preprocessed features and target
os.makedirs("../data/processed", exist_ok=True)

# Save files using relative path from notebooks/
features_scaled.to_pickle("../data/processed/features_scaled.pkl")
target_aligned.to_pickle("../data/processed/target.pkl")
prices.to_pickle("../data/processed/prices.pkl")

NameError: name 'y_train' is not defined