# Feature Engineering & Regime Signals

This notebook focuses on constructing meaningful features from cleaned intraday
data and identifying market regimes to support a quantitative trading strategy.


## Objectives

The objectives of this notebook are:
- Load cleaned spot and futures datasets
- Engineer trend, volatility, and momentum-based features
- Identify market regimes (trending vs ranging)
- Prepare a feature set suitable for ML-based trade filtering


## Feature Engineering Philosophy

Features are selected based on interpretability and relevance to intraday trading.
The focus is on trend strength, volatility expansion, and momentum confirmation
rather than an exhaustive list of technical indicators.


In [1]:
import pandas as pd
import numpy as np
import os

from IPython.display import display

# Paths
PROCESSED_DATA_PATH = os.path.join("..", "data", "processed")

spot_path = os.path.join(PROCESSED_DATA_PATH, "spot_cleaned_1y.csv")
futures_path = os.path.join(PROCESSED_DATA_PATH, "futures_cleaned_1y.csv")

# Load processed data
spot_df = pd.read_csv(spot_path, parse_dates=["date"])
futures_df = pd.read_csv(futures_path, parse_dates=["date"])

print("Spot shape:", spot_df.shape)
print("Futures shape:", futures_df.shape)

display(spot_df.head())

Spot shape: (17488, 6)
Futures shape: (17488, 6)


Unnamed: 0,date,close,high,low,open,volume
0,2021-10-21 15:25:00+05:30,18219.6,18222.35,18198.4,18198.4,0
1,2021-10-22 09:15:00+05:30,18249.9,18251.0,18206.15,18230.7,0
2,2021-10-22 09:20:00+05:30,18248.25,18276.0,18225.95,18251.0,0
3,2021-10-22 09:25:00+05:30,18279.15,18283.15,18248.5,18248.5,0
4,2021-10-22 09:30:00+05:30,18295.4,18295.6,18273.85,18280.05,0


In [2]:
# EMA-based trend features
# Define EMA periods
FAST_EMA = 20
SLOW_EMA = 50

# Calculate EMAs on spot prices
spot_df["ema_fast"] = spot_df["close"].ewm(span=FAST_EMA, adjust=False).mean()
spot_df["ema_slow"] = spot_df["close"].ewm(span=SLOW_EMA, adjust=False).mean()

# EMA spread (trend strength)
spot_df["ema_spread"] = spot_df["ema_fast"] - spot_df["ema_slow"]

# Trend direction: 1 = bullish, -1 = bearish
spot_df["trend_dir"] = np.where(
    spot_df["ema_fast"] > spot_df["ema_slow"], 1, -1
)

print("EMA features added.")
display(
    spot_df[["date", "close", "ema_fast", "ema_slow", "ema_spread", "trend_dir"]].head(10)
)


EMA features added.


Unnamed: 0,date,close,ema_fast,ema_slow,ema_spread,trend_dir
0,2021-10-21 15:25:00+05:30,18219.6,18219.6,18219.6,0.0,-1
1,2021-10-22 09:15:00+05:30,18249.9,18222.485714,18220.788235,1.697479,1
2,2021-10-22 09:20:00+05:30,18248.25,18224.939456,18221.865167,3.074289,1
3,2021-10-22 09:25:00+05:30,18279.15,18230.102365,18224.111631,5.990733,1
4,2021-10-22 09:30:00+05:30,18295.4,18236.321187,18226.907254,9.413934,1
5,2021-10-22 09:35:00+05:30,18299.4,18242.328693,18229.750106,12.578587,1
6,2021-10-22 09:40:00+05:30,18308.1,18248.592627,18232.822651,15.769976,1
7,2021-10-22 09:45:00+05:30,18278.65,18251.455234,18234.619802,16.835432,1
8,2021-10-22 09:50:00+05:30,18262.2,18252.478545,18235.701379,16.777167,1
9,2021-10-22 09:55:00+05:30,18259.9,18253.18535,18236.650344,16.535006,1


In [3]:
# Volatility features
# Log returns
spot_df["log_return"] = np.log(spot_df["close"] / spot_df["close"].shift(1))

# Rolling volatility (20-period)
VOL_WINDOW = 20
spot_df["rolling_vol"] = spot_df["log_return"].rolling(VOL_WINDOW).std()

# Volatility regime: high volatility = 1, low volatility = 0
vol_threshold = spot_df["rolling_vol"].median()
spot_df["high_vol"] = (spot_df["rolling_vol"] > vol_threshold).astype(int)

print("Volatility features added.")
display(
    spot_df[["date", "close", "log_return", "rolling_vol", "high_vol"]].head(25)
)

Volatility features added.


Unnamed: 0,date,close,log_return,rolling_vol,high_vol
0,2021-10-21 15:25:00+05:30,18219.6,,,0
1,2021-10-22 09:15:00+05:30,18249.9,0.001662,,0
2,2021-10-22 09:20:00+05:30,18248.25,-9e-05,,0
3,2021-10-22 09:25:00+05:30,18279.15,0.001692,,0
4,2021-10-22 09:30:00+05:30,18295.4,0.000889,,0
5,2021-10-22 09:35:00+05:30,18299.4,0.000219,,0
6,2021-10-22 09:40:00+05:30,18308.1,0.000475,,0
7,2021-10-22 09:45:00+05:30,18278.65,-0.00161,,0
8,2021-10-22 09:50:00+05:30,18262.2,-0.0009,,0
9,2021-10-22 09:55:00+05:30,18259.9,-0.000126,,0


In [4]:
# Market regime labeling
def label_regime(row):
    if row["trend_dir"] == 1 and row["high_vol"] == 1:
        return "bull_trend_high_vol"
    elif row["trend_dir"] == 1 and row["high_vol"] == 0:
        return "bull_trend_low_vol"
    elif row["trend_dir"] == -1 and row["high_vol"] == 1:
        return "bear_trend_high_vol"
    else:
        return "bear_trend_low_vol"

spot_df["market_regime"] = spot_df.apply(label_regime, axis=1)

print("Market regime labeling completed.")
display(spot_df[["date", "market_regime"]].value_counts().head())


Market regime labeling completed.


date                       market_regime     
2021-10-21 15:25:00+05:30  bear_trend_low_vol    1
2021-10-22 09:15:00+05:30  bull_trend_low_vol    1
2021-10-22 09:20:00+05:30  bull_trend_low_vol    1
2021-10-22 09:25:00+05:30  bull_trend_low_vol    1
2021-10-22 09:30:00+05:30  bull_trend_low_vol    1
Name: count, dtype: int64

In [5]:
# Select final feature set for ML
feature_cols = [
    "date",
    "close",
    "ema_fast",
    "ema_slow",
    "ema_spread",
    "trend_dir",
    "rolling_vol",
    "high_vol",
    "market_regime"
]

features_df = spot_df[feature_cols].copy()

# Drop rows with NaNs (from rolling calculations)
features_df = features_df.dropna().reset_index(drop=True)

print("Final feature dataset shape:", features_df.shape)
display(features_df.head())


Final feature dataset shape: (17468, 9)


Unnamed: 0,date,close,ema_fast,ema_slow,ema_spread,trend_dir,rolling_vol,high_vol,market_regime
0,2021-10-22 10:50:00+05:30,18233.65,18261.1861,18247.471314,13.714786,1,0.001021,1,bull_trend_high_vol
1,2021-10-22 10:55:00+05:30,18253.85,18260.487424,18247.721459,12.765965,1,0.000981,1,bull_trend_high_vol
2,2021-10-22 11:00:00+05:30,18271.7,18261.555288,18248.661794,12.893494,1,0.001004,1,bull_trend_high_vol
3,2021-10-22 11:05:00+05:30,18282.05,18263.507165,18249.971135,13.53603,1,0.000938,1,bull_trend_high_vol
4,2021-10-22 11:10:00+05:30,18262.35,18263.396959,18250.456581,12.940378,1,0.000944,1,bull_trend_high_vol


In [6]:
# Save final feature dataset
FEATURES_OUTPUT_PATH = os.path.join("..", "data", "processed", "features_1y.csv")

features_df.to_csv(FEATURES_OUTPUT_PATH, index=False)

print("Feature dataset saved at:")
print(FEATURES_OUTPUT_PATH)


Feature dataset saved at:
..\data\processed\features_1y.csv
