# Feature‑Engineering Notebook

**How to use**
1. Run the notebook top‑to‑bottom once; it will create `features_full.parquet` (or CSV).
2. Switch feature families on/off by editing `FEATURE_SWITCHES` in the first code cell.
3. Re‑run the relevant cells; everything downstream updates automatically.

In [237]:
# --------------------------- GLOBAL IMPORTS & SWITCHES ---------------------------
import pandas as pd
import numpy as np
from pathlib import Path
import warnings, json, os

FEATURE_SWITCHES = {
    # core data
    "base_ohlcv": True,
    "onchain": True,
    "moment_vol": True,
    "corr_with_majors": True,
    # extras
    "ta_all": True,
    "hi_low_vols": True,
    "simple_regimes": True,
}

RAW_FILE  = Path(r"C:/Users/james/OneDrive/Documents/GitHub/solana-qrf-interval-forecasting/data/06data.parquet")
OUT_FILE  = RAW_FILE.parent / "features_full.parquet"

WINDOW_12H    = 1   # 12‑h bar
WINDOW_36H    = 3   # 36‑h look‑back
WINDOW_14BAR  = 14
FWD_HORIZON   = 6   # 72‑h forward return

MISSING_FLAG_COLS = ["holder_count", "new_token_accounts",
                     "transfer_count", "corr_SOL_36h",
                     "corr_BTC_36h", "corr_ETH_36h"]


## 1. Load raw & benchmark data

In [238]:
raw = pd.read_parquet(RAW_FILE)
raw["timestamp"] = pd.to_datetime(raw["timestamp"])
raw = raw.sort_values(["token", "timestamp"]).reset_index(drop=True)

# we expect btc_close_usd / eth_close_usd / sol_close_usd already present
bench_cols = {c for c in raw.columns if c.endswith("_close_usd") and len(c) < 15}
if not {"btc_close_usd", "eth_close_usd", "sol_close_usd"}.issubset(bench_cols):
    warnings.warn("Benchmark close columns not found – ret_BTC/ETH/SOL will stay NaN!", RuntimeWarning)

## 2. Helper functions & feature blocks

In [239]:
def _add_missing_flags(df: pd.DataFrame, cols=MISSING_FLAG_COLS):
    for c in cols:
        if c in df.columns:
            df[f"{c}_missing"] = df[c].isna().astype("uint8")
    return df


def add_benchmark_returns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Create ret_BTC / ret_ETH / ret_SOL from the close columns already
    embedded in the raw panel.  Works even if the panel contains many tokens.
    """
    bench_cols = ["btc_close_usd", "eth_close_usd", "sol_close_usd"]
    if not set(bench_cols).issubset(df.columns):
        warnings.warn("Benchmark close columns missing – returns will stay NaN.", RuntimeWarning)
        return df

    # 1. dedupe on timestamp so each bar has one BTC/ETH/SOL quote
    bench = (
        df[["timestamp"] + bench_cols]
        .drop_duplicates("timestamp")
        .sort_values("timestamp")
        .reset_index(drop=True)
    )

    # 2️. calc returns in pure time order
    for asset, col in zip(["BTC", "ETH", "SOL"], bench_cols):
        bench[f"ret_{asset}"] = bench[col].pct_change()

    # 3. merge back to the main frame
    return df.merge(
        bench[["timestamp", "ret_BTC", "ret_ETH", "ret_SOL"]],
        on="timestamp",
        how="left",
    )

In [240]:
# ---------------------------- BASE OHLCV ---------------------------------

def base_ohlcv_features(df: pd.DataFrame) -> pd.DataFrame:
    if not FEATURE_SWITCHES["base_ohlcv"]:
        return df
    df["return_12h"] = df.groupby("token")["token_close_usd"].pct_change(WINDOW_12H)
    df["return_72h"] = df.groupby("token")["token_close_usd"].pct_change(FWD_HORIZON)
    df["logret_12h"] = np.log1p(df["return_12h"])
    df["logret_36h"] = np.log1p(df.groupby("token")["token_close_usd"].pct_change(WINDOW_36H))
    df["realized_vol_12h"] = (
        df.groupby("token")["logret_12h"].transform(lambda s: s.rolling(WINDOW_14BAR, min_periods=5).std())
    )
    return df

In [241]:
# ---------------------------- TAIL FEATURES ----------------------------
def tail_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Flag extreme 12-h moves and rolling tail statistics.
    Threshold = |return_12h| > 2.5 × rolling σ14
    """
    g = df.groupby("token")
    ret  = g["token_close_usd"].pct_change()
    sigma14 = ret.groupby(df["token"]).transform(lambda s: s.rolling(14).std())
    extreme = (ret.abs() > 2.5 * sigma14).astype("int")

    df["extreme_move1"]      = extreme
    df["extreme_flag1"]      = extreme
    df["tail_positive"]          = (ret >  2.5 * sigma14).astype("int")
    df["tail_negative"]          = (ret < -2.5 * sigma14).astype("int")
    # Use .astype(int) to ensure subtraction works without dtype issues
    df["tail_asym"] = df["tail_positive"].astype(int) - df["tail_negative"].astype(int)
    df["extreme_count_72h"] = extreme.groupby(df["token"]).transform(lambda s: s.rolling(6).sum())

    return df

In [242]:
# ---------------------------- holder growth & ratio ---------------------------------
def holder_features(df):
    g = df.groupby("token")
    df["holder_growth_1bar"] = g["holder_count"].pct_change()
    df["holder_growth_7d"]   = g["holder_count"].pct_change(14)
    df["tx_per_account"]     = df["transfer_count"] / df["holder_count"]
    return df

In [243]:
# ---------------------------- LIQUIDITY & VOL-OF-VOL EXTRAS ----------------------------
def liquidity_extras(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute Amihud illiquidity, price-volume interaction,
    rolling vol-of-vol std (7 bars) and z-score (14 bars).
    """
    g = df.groupby("token")

    # Amihud: |ΔP| / volume
    abs_ret = g["token_close_usd"].pct_change().abs()
    df["amihud_illiq_12h"] = abs_ret / df["token_volume_usd"].replace(0, np.nan)

    # price × volume dollar turnover
    df["price_volume"] = df["token_close_usd"] * df["token_volume_usd"]

    # rolling vol-of-vol
    logret = np.log1p(g["token_close_usd"].pct_change())
    df["vol_std_7bar"]   = logret.groupby(df["token"]).transform(lambda s: s.rolling(7).std())
    vol14                = logret.groupby(df["token"]).transform(lambda s: s.rolling(14).std())
    df["vol_zscore_14"]  = (vol14 - vol14.groupby(df["token"]).transform("mean")) / \
                            vol14.groupby(df["token"]).transform("std")

    return df

In [244]:
# ---------------------------- SEASONALITY DUMMIES ----------------------------
def seasonality_features(df: pd.DataFrame) -> pd.DataFrame:
    df["day_of_week"] = df["timestamp"].dt.dayofweek
    # sine / cosine hour to keep it continuous for linear models
    hr = df["timestamp"].dt.hour
    df["hour_sin"] = np.sin(2 * np.pi * hr / 24)
    df["hour_cos"] = np.cos(2 * np.pi * hr / 24)
    return df

In [245]:
# ---------------------------- MOMENT & VOL --------------------------------

def moment_vol_features(df: pd.DataFrame) -> pd.DataFrame:
    if not FEATURE_SWITCHES["moment_vol"]:
        return df
    df["skew_36h"] = df.groupby("token")["logret_12h"].transform(lambda s: s.rolling(WINDOW_36H).skew())
    down = df["logret_12h"].where(df["logret_12h"] < 0)
    df["downside_vol_3bar"] = df.groupby("token")[down.name].transform(lambda s: s.rolling(WINDOW_36H).std())
    return df

In [246]:
# ----------------------- HI‑LOW VOLATILITY ---------------------------------

def hi_low_vol_features(df: pd.DataFrame) -> pd.DataFrame:
    if not FEATURE_SWITCHES["hi_low_vols"]:
        return df
    h, l = df["high_usd"], df["low_usd"]
    df["parkinson_vol_36h"] = np.sqrt((1 / (4 * np.log(2))) * (np.log(h / l)) ** 2)
    hlc = np.log(df["token_close_usd"]) - 0.5 * np.log(h * l)
    df["gk_vol_36h"] = (hlc ** 2).rolling(WINDOW_36H).mean()
    return df

In [247]:
# ----------------------- SIMPLE REGIMES & BUCKETS -------------------------

def simple_regime_features(df: pd.DataFrame) -> pd.DataFrame:
    if not FEATURE_SWITCHES["simple_regimes"]:
        return df
    # volatility regime based on 14‑bar vol percentile
    vol14 = df.groupby("token")["realized_vol_12h"].transform(lambda s: s.rolling(WINDOW_14BAR).mean())
    df["vol_regime"] = pd.qcut(vol14, 5, labels=False, duplicates="drop")
    # trend regime: 14‑bar SMA slope sign
    sma14 = df.groupby("token")["token_close_usd"].transform(lambda s: s.rolling(WINDOW_14BAR).mean())
    slope = sma14.groupby(df["token"]).diff()
    df["trend_regime"] = np.sign(slope).fillna(0).astype("int8")
    # momentum bucket (deciles of 36 h log ret)
    df["momentum_bucket"] = pd.qcut(df["logret_36h"], 10, labels=False, duplicates="drop")
    return df

In [248]:
# ----------------------- FULL TA INDICATOR PACK ---------------------------

def ta_indicator_features(df: pd.DataFrame) -> pd.DataFrame:
    """Full classical TA pack — no index-alignment errors."""
    if not FEATURE_SWITCHES["ta_all"]:
        return df

    grp = df.groupby("token")

    # --- ROC 3 -----------------------------------------------------------
    df["roc_3"] = grp["token_close_usd"].pct_change(3)

    # --- Stochastic %K & Williams %R ------------------------------------
    high14 = grp["high_usd"].transform(lambda s: s.rolling(WINDOW_14BAR).max())
    low14  = grp["low_usd"].transform(lambda s: s.rolling(WINDOW_14BAR).min())
    range14 = (high14 - low14).replace(0, 1e-9)  # avoid div/0

    df["stoch_k"]   = 100 * (df["token_close_usd"] - low14) / range14
    df["williams_r"] = -100 * (high14 - df["token_close_usd"]) / range14

    # --- MACD (12/26 EMA) & signal 9 ------------------------------------
    ema12 = grp["token_close_usd"].transform(lambda s: s.ewm(span=12, adjust=False).mean())
    ema26 = grp["token_close_usd"].transform(lambda s: s.ewm(span=26, adjust=False).mean())
    df["macd"]        = ema12 - ema26
    df["macd_signal"] = grp["macd"].transform(lambda s: s.ewm(span=9, adjust=False).mean())

    # --- PROC (Price Rate-of-Change relative to SMA-10) ------------------
    sma10 = grp["token_close_usd"].transform(lambda s: s.rolling(10).mean())
    df["proc"] = (df["token_close_usd"] - sma10) / sma10.replace(0, 1e-9)

    # --- Bollinger band %B & width (20, 2σ) ------------------------------
    ma20  = grp["token_close_usd"].transform(lambda s: s.rolling(20).mean())
    std20 = grp["token_close_usd"].transform(lambda s: s.rolling(20).std())
    upper = ma20 + 2 * std20
    lower = ma20 - 2 * std20
    width = (upper - lower).replace(0, 1e-9)

    df["bollinger_b"]  = (df["token_close_usd"] - lower) / width
    df["bollinger_bw"] = width / ma20.replace(0, 1e-9)

    # --- ADX 14 --------------------------------------------------------
    up  = df.groupby("token")["high_usd"].diff()
    down = -df.groupby("token")["low_usd"].diff()

    plus_dm  = np.where((up > down) & (up > 0),  up,   0.0)
    minus_dm = np.where((down > up) & (down > 0), down, 0.0)

    tr = np.maximum.reduce([
        df["high_usd"] - df["low_usd"],
        (df["high_usd"] - df.groupby("token")["token_close_usd"].shift()).abs(),
        (df["low_usd"]  - df.groupby("token")["token_close_usd"].shift()).abs(),
    ])

# rolling sums within each token
    roll14 = lambda x: x.rolling(WINDOW_14BAR, min_periods=1).sum()

    plus_di  = 100 * pd.Series(plus_dm,  index=df.index).groupby(df["token"]).transform(roll14) / \
                    (pd.Series(tr, index=df.index).groupby(df["token"]).transform(roll14) + 1e-9)

    minus_di = 100 * pd.Series(minus_dm, index=df.index).groupby(df["token"]).transform(roll14) / \
                    (pd.Series(tr, index=df.index).groupby(df["token"]).transform(roll14) + 1e-9)

    dx = 100 * (plus_di - minus_di).abs() / (plus_di + minus_di + 1e-9)
    df["adx"] = dx.groupby(df["token"]).transform(lambda s: s.rolling(WINDOW_14BAR).mean())

    # --- Commodity Channel Index (CCI) -----------------------------------
    tp      = (df["high_usd"] + df["low_usd"] + df["token_close_usd"]) / 3
    sma_tp  = tp.groupby(df["token"]).transform(lambda s: s.rolling(WINDOW_14BAR).mean())
    mad_tp  = tp.groupby(df["token"]).transform(lambda s: s.rolling(WINDOW_14BAR)
                                .apply(lambda x: np.mean(np.abs(x - x.mean())), raw=True))
    df["cci"] = (tp - sma_tp) / (0.015 * mad_tp + 1e-9)

    # --- On-Balance Volume (OBV) -----------------------------------------
    direction = np.sign(df["return_12h"].fillna(0))
    obv_series = grp["token_volume_usd"].transform(
        lambda s: (direction.loc[s.index] * s).cumsum()
    )
    df["obv"] = obv_series

    return df

In [249]:
# ----------------------- CORRELATION WITH MAJORS --------------------------

def corr_with_majors_features(df: pd.DataFrame) -> pd.DataFrame:
    if not FEATURE_SWITCHES["corr_with_majors"]:
        return df
    grp = df.groupby("token")
    for a in ["ret_SOL", "ret_BTC", "ret_ETH"]:
        df[f"corr_{a.split('_')[1]}_36h"] = grp[a].transform(lambda s: s.rolling(WINDOW_36H).corr(df[a]))
    return df

## 3. Build pipeline

In [250]:
def build_feature_matrix(raw_panel: pd.DataFrame) -> pd.DataFrame:
    """
    Orchestrates every feature block in the correct dependency order.
    Always returns a *new* DataFrame; never mutates the input.
    """
    # 0️⃣  copy & basic sort (safety)
    df = (
        raw_panel.copy()
        .sort_values(["token", "timestamp"])
        .reset_index(drop=True)
    )

    # 1️⃣  benchmark returns (needed by corr_with_majors)
    df = add_benchmark_returns(df)

    # 2️⃣  base OHLCV (creates return_12h, logret_12h, etc.—prereq for most blocks)
    if FEATURE_SWITCHES.get("base_ohlcv", True):
        df = base_ohlcv_features(df)

    # 3️⃣  moment & realised-vol stats (needs logret_12h)
    if FEATURE_SWITCHES.get("moment_vol", True):
        df = moment_vol_features(df)

    # 4️⃣  hi/low volatility estimators (needs high/low + close)
    if FEATURE_SWITCHES.get("hi_low_vols", True):
        df = hi_low_vol_features(df)

    # 5️⃣  simple regimes / buckets (needs logret_36h + realised_vol_12h)
    if FEATURE_SWITCHES.get("simple_regimes", True):
        df = simple_regime_features(df)

    # 6️⃣  full TA indicator pack (requires return_12h from step 2)
    if FEATURE_SWITCHES.get("ta_all", True):
        df = ta_indicator_features(df)

    # 7️⃣  correlation with majors (needs ret_BTC/ETH/SOL from step 1)
    if FEATURE_SWITCHES.get("corr_with_majors", True):
        df = corr_with_majors_features(df)

    df = holder_features(df)
    df = tail_features(df)
    df = liquidity_extras(df)
    df = seasonality_features(df)

    # 8️⃣  missing-data indicator columns (always run last)
    df = _add_missing_flags(df, cols=MISSING_FLAG_COLS)

    return df

features = build_feature_matrix(raw)

print(f"Feature matrix built – {features.shape[0]:,} rows × {features.shape[1]} columns")


Feature matrix built – 6,440 rows × 94 columns


  df["holder_growth_1bar"] = g["holder_count"].pct_change()
  df["holder_growth_7d"]   = g["holder_count"].pct_change(14)


## 4. Run & save

In [251]:
KEEP_RAW_COLUMNS = False        # ← flip to True if you want the raw fields too

if KEEP_RAW_COLUMNS:
    features_out = features
    out_name = "features_full_with_raw.parquet"
else:
    orig_raw_cols   = set(raw.columns)
    essential_cols  = ["timestamp", "token", "return_72h"]      # identifiers + target
    engineered_cols = [c for c in features.columns
                       if c not in orig_raw_cols]               # drop raw duplicates
    features_out = features[essential_cols + engineered_cols]
    out_name = "features_full.parquet"

OUT_FILE = RAW_FILE.parent / out_name
features_out.to_parquet(OUT_FILE, index=False)

print(f"✅ Saved {features_out.shape[1]} columns → {OUT_FILE}")
features_out.head()

✅ Saved 51 columns → C:\Users\james\OneDrive\Documents\GitHub\solana-qrf-interval-forecasting\data\features_full.parquet


Unnamed: 0,timestamp,token,return_72h,ret_BTC,ret_ETH,ret_SOL,logret_12h,logret_36h,skew_36h,downside_vol_3bar,...,vol_zscore_14,day_of_week,hour_sin,hour_cos,holder_count_missing,new_token_accounts_missing,transfer_count_missing,corr_SOL_36h_missing,corr_BTC_36h_missing,corr_ETH_36h_missing
0,2024-12-05 12:00:00,$WIF,,,,,,,,,...,,3,1.224647e-16,-1.0,1,0,0,1,1,1
1,2024-12-06 00:00:00,$WIF,,0.018095,0.017378,0.011811,0.041834,,,,...,,4,0.0,1.0,1,0,0,1,1,1
2,2024-12-06 12:00:00,$WIF,,0.024566,0.041006,0.008596,0.027813,,,,...,,4,1.224647e-16,-1.0,1,0,0,1,1,1
3,2024-12-07 00:00:00,$WIF,,-0.010024,-0.011104,0.001533,0.04321,0.112858,-1.681377,0.00852,...,,5,0.0,1.0,1,0,0,0,0,0
4,2024-12-07 12:00:00,$WIF,,0.00075,0.002865,0.003241,0.072218,0.143242,0.868838,0.022548,...,,5,1.224647e-16,-1.0,1,0,0,0,0,0
