In [19]:
from google.colab import drive

# Attach Google Drive to this Colab runtime; the project folder used by the
# following cells lives underneath this mount point.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
import os
from pathlib import Path

# Every relative path used later in the notebook ("data/...") is resolved
# against this folder, so make it the working directory up front.
PROJECT_ROOT = Path("/content/drive/MyDrive/Stock Project").resolve()
os.chdir(PROJECT_ROOT)

# Echo where we ended up and what is there, as a quick sanity check.
cwd = Path.cwd()
print("CWD =", cwd)
print("Files here:", [entry.name for entry in cwd.iterdir()])

CWD = /content/drive/MyDrive/Stock Project
Files here: ['02_feature_engineering.ipynb', '01_price_data_baseline.ipynb', 'data']


In [21]:
import os
import warnings
import numpy as np
import pandas as pd

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas deprecation warnings - confirm that is intended.
warnings.simplefilter("ignore")

# Let wide feature frames display up to 100 columns before being truncated.
pd.options.display.max_columns = 100

In [22]:
import os
from pathlib import Path

# Diagnostic cell: report the working directory and (at most 50 of) its entries.
print("CWD:", os.getcwd())
print("Files in CWD:", os.listdir(".")[:50])

# Report whether the relative ./data folder is present and, if so, list it.
data_dir = Path("data")
data_present = data_dir.exists()
print("data exists?", data_present)
if data_present:
    entry_names = [entry.name for entry in data_dir.iterdir()]
    print("data contents:", entry_names)

CWD: /content/drive/MyDrive/Stock Project
Files in CWD: ['02_feature_engineering.ipynb', '01_price_data_baseline.ipynb', 'data']
data exists? True
data contents: ['clean_prices.parquet', 'returns.parquet']


In [23]:
import os
from pathlib import Path

# Diagnostic cell: same inspection as before, but with absolute paths and
# sorted listings (capped at 80 names) so the output is stable between runs.
cwd = Path.cwd()
print("CWD:", cwd)
print("CWD files:", sorted(entry.name for entry in cwd.iterdir())[:80])

data_dir = cwd / "data"
data_dir_found = data_dir.exists()
print("data_dir:", data_dir)
print("data_dir exists?", data_dir_found)

if data_dir_found:
    print("data contents:", sorted(entry.name for entry in data_dir.iterdir())[:80])

# Confirm the two input files this notebook reads are in place.
expected_inputs = (
    ("clean_prices exists?", "clean_prices.parquet"),
    ("returns exists?", "returns.parquet"),
)
for message, filename in expected_inputs:
    print(message, (data_dir / filename).exists())

CWD: /content/drive/MyDrive/Stock Project
CWD files: ['01_price_data_baseline.ipynb', '02_feature_engineering.ipynb', 'data']
data_dir: /content/drive/MyDrive/Stock Project/data
data_dir exists? True
data contents: ['clean_prices.parquet', 'returns.parquet']
clean_prices exists? True
returns exists? True


In [24]:
# Inputs are read relative to the working directory set earlier in the notebook.
prices = pd.read_parquet("data/clean_prices.parquet")
returns = pd.read_parquet("data/returns.parquet")

# Row-count sanity check: the return table must have either the same number
# of rows as the price table or exactly one fewer (presumably the first
# return row was dropped upstream - verify against the producing notebook).
assert len(prices) - len(returns) in (0, 1)

In [25]:
def rolling_return(prices, window):
    """Simple percentage return over a trailing horizon of ``window`` rows.

    Parameters
    ----------
    prices : pandas.Series or pandas.DataFrame
        Price levels, one row per observation.
    window : int
        Number of rows to look back; row ``t`` is compared with row ``t - window``.

    Returns
    -------
    Same type and shape as ``prices``, holding ``prices[t] / prices[t - window] - 1``.
    The first ``window`` rows are NaN, and so is any row whose current or
    look-back price is missing.
    """
    # fill_method=None disables the legacy pandas default of forward-filling
    # gaps before differencing. That default would silently fabricate returns
    # across missing prices, contrary to this notebook's "no fill" policy.
    return prices.pct_change(window, fill_method=None)

def rolling_vol(returns, window, periods_per_year=252):
    """Annualised rolling volatility of a return series.

    Parameters
    ----------
    returns : pandas.Series or pandas.DataFrame
        Per-period returns, one row per observation.
    window : int
        Number of rows in each rolling window.
    periods_per_year : int or float, default 252
        Number of return observations per year, used to scale the per-period
        standard deviation. The default matches the previously hard-coded
        value; pass e.g. 52 or 12 for weekly or monthly data.

    Returns
    -------
    Same type and shape as ``returns``: the rolling sample standard deviation
    multiplied by ``sqrt(periods_per_year)``. Rows without a full window are NaN.
    """
    per_period_std = returns.rolling(window).std()
    return per_period_std * np.sqrt(periods_per_year)

def moving_avg_ratio(prices, short, long):
    """Ratio of a short rolling mean to a long rolling mean of ``prices``.

    Both means use ``prices.rolling(n).mean()`` with ``n`` equal to ``short``
    and ``long`` respectively. Rows where either window is incomplete are NaN.
    """
    fast_ma = prices.rolling(short).mean()
    slow_ma = prices.rolling(long).mean()
    return fast_ma.div(slow_ma)

def max_drawdown(prices):
    """Running drawdown of ``prices`` relative to the highest value seen so far.

    Despite the name, this returns the whole drawdown path (one value per row,
    ``0`` at a new high and negative below it), not a single maximum figure.
    """
    running_peak = prices.cummax()
    return prices.div(running_peak).sub(1)

def zscore(series, window):
    """Rolling z-score: distance of each value from its trailing mean, in
    units of its trailing standard deviation.

    Parameters
    ----------
    series : pandas.Series or pandas.DataFrame
        Values to standardise.
    window : int
        Number of rows in the trailing window (the current row is included).

    Returns
    -------
    Same type and shape as ``series``. Rows without a full window are NaN, and
    so are rows whose window has zero standard deviation, where the z-score is
    undefined.
    """
    # Build the rolling view once and reuse it for both statistics.
    trailing = series.rolling(window)
    mean = trailing.mean()
    std = trailing.std()
    # A flat window has std == 0. Dividing by it would give +/-inf whenever the
    # numerator is not exactly zero, so mask it and let the result be NaN.
    std = std.mask(std == 0)
    return (series - mean) / std

In [26]:
# Trailing simple returns over several look-back horizons (in rows).
tech = {f"ret_{n}d": rolling_return(prices, n) for n in (5, 10, 21, 63)}

# Annualised rolling volatility over several horizons.
tech.update({f"vol_{n}d": rolling_vol(returns, n) for n in (10, 21, 63)})

# Short/long moving-average ratios, followed by the running drawdown from
# the highest price seen so far.
tech.update(
    ma_20_100=moving_avg_ratio(prices, 20, 100),
    ma_50_200=moving_avg_ratio(prices, 50, 200),
    drawdown=max_drawdown(prices),
)

# Stack the blocks side by side under a (feature, column) header, then
# flatten that header to "<feature>__<column>" names.
features_tech = pd.concat(list(tech.values()), axis=1, keys=list(tech))
features_tech.columns = ["__".join(parts).strip() for parts in features_tech.columns]

In [27]:
# Remove only the rows in which every feature is missing; rows that are
# partially populated (rolling-window warm-up) are kept.
features_tech = features_tech.dropna(axis="index", how="all")

# No imputation is done in this notebook: any NaN that survives is handed to
# downstream consumers unchanged. Show the columns with the most NaNs.
nan_counts = features_tech.isna().sum()
nan_counts.sort_values(ascending=False).head()

Unnamed: 0,0
ma_50_200__SPY,199
ma_20_100__SPY,99
ret_63d__SPY,63
vol_63d__SPY,63
ret_21d__SPY,21


In [28]:
# Inputs shared by the regime features below.
vol_63 = rolling_vol(returns, 63)      # 63-row annualised volatility
ma_200 = prices.rolling(200).mean()    # 200-row moving average of price

regime = {
    # Volatility regime: 63-row vol standardised against its trailing 252 rows.
    "vol_regime_z": zscore(vol_63, 252),
    # Trend regime: fractional gap between price and its 200-row moving average.
    "trend_regime": prices.div(ma_200).sub(1),
    # Drawdown regime: running drawdown from the highest price seen so far.
    "drawdown_depth": max_drawdown(prices),
}

# Stack side by side under a (feature, column) header, then flatten it to
# "<feature>__<column>" names.
features_regime = pd.concat(list(regime.values()), axis=1, keys=list(regime))
features_regime.columns = ["__".join(parts).strip() for parts in features_regime.columns]

In [29]:
# Restrict both feature tables to the row labels they have in common so they
# can be joined row-for-row later.
common_idx = features_tech.index.intersection(features_regime.index)
features_tech, features_regime = (
    features_tech.loc[common_idx],
    features_regime.loc[common_idx],
)

# After the restriction the two indexes must be identical.
assert features_tech.index.equals(features_regime.index)

In [30]:
# Make sure the output folder exists (a no-op when it is already there).
os.makedirs("data", exist_ok=True)

# One mapping drives both the writes and the confirmation printout, so the
# reported paths cannot drift from the files actually written.
outputs = {
    "data/features_technical.parquet": features_tech,
    "data/features_regime.parquet": features_regime,
}
for out_path, frame in outputs.items():
    frame.to_parquet(out_path)

print("Saved:")
for out_path in outputs:
    print(out_path)

Saved:
data/features_technical.parquet
data/features_regime.parquet


In [31]:
"""
Contract:
- This notebook only produces features.
- No labels, no strategies, no sentiment, no models.
- Downstream notebooks must load features from disk.
"""

'\nContract:\n- This notebook only produces features.\n- No labels, no strategies, no sentiment, no models.\n- Downstream notebooks must load features from disk.\n'