In [None]:

import pandas as pd
import numpy as np


def rsi(series, period=14):
    """Relative Strength Index based on simple rolling means.

    Parameters
    ----------
    series : pd.Series
        Price series the index is computed from.
    period : int, default 14
        Number of observations in each rolling window; a full window is
        required before a value is produced.

    Returns
    -------
    pd.Series
        ``100 - 100 / (1 + RS)`` where RS is the rolling mean of upward moves
        divided by the rolling mean of downward moves. Rows without a full
        window are NaN.
    """
    def _window_mean(values):
        # Plain (unweighted) mean over exactly `period` observations.
        return values.rolling(period, min_periods=period).mean()

    change = series.diff()
    mean_up = _window_mean(change.clip(lower=0))
    # Downward moves are negated so that losses are expressed as positive sizes.
    mean_down = _window_mean(-change.clip(upper=0))
    relative_strength = mean_up / mean_down
    return 100 - (100 / (1 + relative_strength))


def atr(high, low, close, period=14):
    """Average True Range as a simple rolling mean of the true range.

    Parameters
    ----------
    high, low, close : pd.Series
        Bar highs, lows and closes.
    period : int, default 14
        Number of bars in each rolling window; a full window is required
        before a value is produced.

    Returns
    -------
    pd.Series
        Rolling mean of the true range. The true range of a bar is the largest
        of: high - low, |high - previous close| and |low - previous close|.
    """
    prev_close = close.shift()
    candidates = [
        high - low,
        (high - prev_close).abs(),
        (low - prev_close).abs(),
    ]
    # Row-wise max skips NaN, so the first bar (no previous close) falls back
    # to its high-low range.
    true_range = pd.concat(candidates, axis=1).max(axis=1)
    return true_range.rolling(period, min_periods=period).mean()


In [None]:

# Load the raw bars and index them by timestamp on a regular 12-hour grid.
# `asfreq` reindexes onto that grid, so any missing bar becomes an all-NaN row.
# The lowercase 'h' alias is used because the uppercase 'H' offset alias is
# deprecated as of pandas 2.2.
df = pd.read_parquet('data/06data.parquet')
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.set_index('timestamp').asfreq('12h')


In [None]:

def compute_base_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the base feature columns added.

    If ``df`` still has a ``timestamp`` column it is parsed, set as the index
    and the frame is put on a regular 12-hour grid first.

    Parameters
    ----------
    df : pd.DataFrame
        Bar data. ``token_close_usd`` is required. Optional inputs (looked up
        in this order): ``high_usd``/``high``, ``low_usd``/``low``,
        ``token_volume_usd``/``volume``, ``best_ask`` + ``best_bid``,
        ``bid_size`` + ``ask_size``, ``unique_wallets``/``holder_count``,
        ``tx_count``/``network_tx_count``, ``sol_close_usd``,
        ``btc_close_usd``, ``eth_close_usd``, ``tvl_usd``.

    Returns
    -------
    pd.DataFrame
        Copy of the input with ``logret_12h``, ``logret_36h``, ``rsi_14``,
        ``roc_3``, ``realized_vol_36h``, ``atr_14``, ``spread``, ``depth``,
        ``vol_spike``, ``delta_wallets`` and ``tx_count_12h`` always present
        (NaN when their inputs are missing), plus ``ret_SOL``, ``ret_BTC``,
        ``ret_ETH`` and ``tvl_dev`` when their source columns exist.

    Raises
    ------
    KeyError
        If ``token_close_usd`` is missing.
    """
    df = df.copy()
    if 'timestamp' in df.columns:
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        # Lowercase 'h': the uppercase 'H' alias is deprecated since pandas 2.2.
        df = df.set_index('timestamp').asfreq('12h')

    close = df['token_close_usd']
    high = df.get('high_usd', df.get('high'))
    low = df.get('low_usd', df.get('low'))
    # Prefer the USD volume column, matching the lookup order used by
    # compute_advanced_features and compute_additional_features.
    volume = df.get('token_volume_usd', df.get('volume'))

    # Price-based momentum and volatility (one bar = 12 hours).
    df['logret_12h'] = np.log(close / close.shift(1))
    df['logret_36h'] = np.log(close / close.shift(3))
    df['rsi_14'] = rsi(close, 14)
    df['roc_3'] = (close / close.shift(3) - 1) * 100
    df['realized_vol_36h'] = df['logret_12h'].rolling(3).std()

    # ATR needs both highs and lows; fall back to NaN like the other optional
    # features instead of failing on a missing column.
    if high is not None and low is not None:
        df['atr_14'] = atr(high, low, close, 14)
    else:
        df['atr_14'] = np.nan

    # Order-book features: spread relative to the mid price, and total depth.
    if {'best_ask', 'best_bid'}.issubset(df.columns):
        mid = (df['best_ask'] + df['best_bid']) / 2
        df['spread'] = (df['best_ask'] - df['best_bid']) / mid
    else:
        df['spread'] = np.nan

    if {'bid_size', 'ask_size'}.issubset(df.columns):
        df['depth'] = df['bid_size'] + df['ask_size']
    else:
        df['depth'] = np.nan

    # Volume relative to its own 14-bar rolling mean.
    if volume is not None:
        df['vol_spike'] = volume / volume.rolling(14).mean()
    else:
        df['vol_spike'] = np.nan

    uniq_wallets = df.get('unique_wallets', df.get('holder_count'))
    if uniq_wallets is not None:
        df['delta_wallets'] = uniq_wallets.diff()
    else:
        df['delta_wallets'] = np.nan

    # Use NaN (not None) when no transaction-count column exists so the
    # feature stays a float column rather than an object column.
    tx_count = df.get('tx_count', df.get('network_tx_count'))
    df['tx_count_12h'] = tx_count if tx_count is not None else np.nan

    # Percentage returns of the reference assets, when their prices are present.
    market_price_cols = {
        'ret_SOL': 'sol_close_usd',
        'ret_BTC': 'btc_close_usd',
        'ret_ETH': 'eth_close_usd',
    }
    for ret_col, price_col in market_price_cols.items():
        if price_col in df.columns:
            df[ret_col] = df[price_col].pct_change() * 100

    # Percentage deviation of TVL from its 14-bar rolling mean.
    if 'tvl_usd' in df.columns:
        df['tvl_dev'] = (df['tvl_usd'] / df['tvl_usd'].rolling(14).mean() - 1) * 100

    return df


In [None]:

# Attach the base feature columns, then preview the first rows.
df = df.pipe(compute_base_features)
df.head()


## Base Feature Overview
The `compute_base_features` function adds the following fields:
- **logret_12h** – log return of the close over the last 12 hours.
- **logret_36h** – log return over the previous three 12‑hour bars.
- **rsi_14** – 14‑period Relative Strength Index using closing prices.
- **roc_3** – 3‑period Rate of Change of the close (percent).
- **realized_vol_36h** – rolling 36‑hour standard deviation of `logret_12h`.
- **atr_14** – 14‑period Average True Range.
- **spread** – relative bid/ask spread.
- **depth** – combined bid and ask size.
- **vol_spike** – ratio of volume to its 14‑period average.
- **delta_wallets** – change in unique wallet count.
- **tx_count_12h** – transaction count for the bar.
- **ret_SOL** – SOL percentage return.
- **ret_BTC** – BTC percentage return.
- **ret_ETH** – ETH percentage return.
- **tvl_dev** – percentage deviation of DeFi TVL from its 14‑period rolling mean.


## Advanced Features

In [None]:

def compute_advanced_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with range-based volatility, liquidity,
    on-chain ratio, correlation and calendar features added.

    Parameters
    ----------
    df : pd.DataFrame
        Bar data indexed by a ``DatetimeIndex`` (the calendar features read
        ``df.index.dayofweek`` / ``df.index.hour``). ``token_close_usd`` is
        required. Optional inputs: ``high_usd``/``high``, ``low_usd``/``low``,
        ``open_usd``/``open``, ``token_volume_usd``/``volume``,
        ``logret_12h``, ``ret_SOL``/``ret_BTC``/``ret_ETH``,
        ``delta_wallets``, ``holder_count``, ``new_token_accounts``,
        ``network_tx_count``.

    Returns
    -------
    pd.DataFrame
        Copy of the input. ``parkinson_vol_36h``, ``gk_vol_36h``,
        ``amihud_illiq_12h``, ``day_of_week`` and ``hour`` are always present
        (the first three are NaN when their inputs are missing); the other
        features are added only when their source columns exist.

    Raises
    ------
    KeyError
        If ``token_close_usd`` is missing.
    """
    df = df.copy()
    high = df.get('high_usd', df.get('high'))
    low = df.get('low_usd', df.get('low'))
    open_ = df.get('open_usd', df.get('open'))
    close = df['token_close_usd']
    volume = df.get('token_volume_usd', df.get('volume'))
    has_logret = 'logret_12h' in df.columns

    if high is not None and low is not None:
        hl_log = np.log(high / low)

        # Parkinson volatility over a 3-bar (36h) window: the estimator is the
        # *mean* of ln(H/L)^2 divided by 4*ln(2). Averaging keeps it on the
        # same per-bar scale as the Garman-Klass estimate below.
        df['parkinson_vol_36h'] = (
            (hl_log ** 2).rolling(3).mean().div(4 * np.log(2)).pow(0.5)
        )

        # Garman-Klass volatility over the same window (also needs the open).
        if open_ is not None:
            oc_log = np.log(close / open_)
            gk_var = 0.5 * hl_log ** 2 - (2 * np.log(2) - 1) * oc_log ** 2
            df['gk_vol_36h'] = gk_var.rolling(3).mean().pow(0.5)
        else:
            df['gk_vol_36h'] = np.nan
    else:
        # Without highs/lows neither range-based estimator can be computed.
        df['parkinson_vol_36h'] = np.nan
        df['gk_vol_36h'] = np.nan

    # Amihud illiquidity: |return| / volume, averaged over 3 bars (36h).
    # The column keeps its historical `_12h` name.
    if volume is not None and has_logret:
        # Zero-volume bars have no defined ratio; mask them so they yield NaN
        # rather than inf.
        traded_volume = volume.where(volume != 0)
        df['amihud_illiq_12h'] = (
            (df['logret_12h'].abs() / traded_volume).rolling(3).mean()
        )
    else:
        df['amihud_illiq_12h'] = np.nan

    # On-chain activity scaled by the number of holders.
    if {'new_token_accounts', 'holder_count'}.issubset(df.columns):
        df['new_accounts_ratio'] = df['new_token_accounts'] / df['holder_count']

    if {'network_tx_count', 'holder_count'}.issubset(df.columns):
        df['tx_per_account'] = df['network_tx_count'] / df['holder_count']

    if 'delta_wallets' in df.columns and 'holder_count' in df.columns:
        df['wallet_growth_rate'] = df['delta_wallets'] / df['holder_count']

    # 3-bar rolling correlation of the token's log return with each
    # reference-asset return.
    if has_logret:
        for asset in ('SOL', 'BTC', 'ETH'):
            ret_col = f'ret_{asset}'
            if ret_col in df.columns:
                df[f'corr_{asset}_36h'] = (
                    df['logret_12h'].rolling(3).corr(df[ret_col])
                )

    # Volume z-score against a 14-bar rolling mean and standard deviation.
    if volume is not None:
        df['vol_zscore'] = (volume - volume.rolling(14).mean()) / volume.rolling(14).std()

    # Calendar features taken from the index.
    df['day_of_week'] = df.index.dayofweek
    df['hour'] = df.index.hour

    return df


## Advanced Feature Overview
The `compute_advanced_features` function augments the dataset with:
- **parkinson_vol_36h** – 36‑hour Parkinson volatility using high and low prices.
- **gk_vol_36h** – 36‑hour Garman‑Klass volatility from OHLC bars.
- **amihud_illiq_12h** – Amihud illiquidity (|`logret_12h`| divided by volume), averaged over the last three 12‑hour bars (36 hours) despite the `_12h` suffix.
- **new_accounts_ratio** – new token accounts relative to current holders.
- **tx_per_account** – network transactions per holder.
- **wallet_growth_rate** – change in wallet count scaled by total holders.
- **corr_SOL_36h** – rolling correlation of token and SOL returns.
- **corr_BTC_36h** – rolling correlation of token and BTC returns.
- **corr_ETH_36h** – rolling correlation of token and ETH returns.
- **vol_zscore** – volume z‑score versus a 14‑period mean and std.
- **day_of_week** – day of week (0=Monday).
- **hour** – bar hour of day (0 or 12).


## Additional Feature Overview
The `compute_additional_features` function adds:
- **skew_36h** – rolling 36‑hour skewness of `logret_12h`.
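- **kurt_36h** – rolling 36‑hour (3‑bar) kurtosis of `logret_12h`. Note: pandas' rolling kurtosis needs at least four observations, so with a 3‑bar window this column is always NaN.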
- **vol_regime** – 1 if short volatility exceeds its 14‑period mean.
- **trend_regime** – 1 if close is above its 50‑period average.
- **price_volume** – close multiplied by volume.
- **spread_vol** – bid/ask spread times `vol_spike`.
- **market_pc1** – score of the current bar on the first principal component of SOL/BTC/ETH returns, with the PCA refit on a rolling 14‑bar window.
- **momentum_bucket** – quintile bin (0–4) of the 3‑period ROC, ranked over the whole sample.
- **volume_missing** – flag for missing volume.


In [None]:

def compute_additional_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with tail-risk, regime, interaction, market-PCA,
    momentum-bucket and missing-data features added.

    Parameters
    ----------
    df : pd.DataFrame
        Frame that already contains ``logret_12h``, ``realized_vol_36h`` and
        ``token_close_usd``. Optional inputs: ``token_volume_usd``,
        ``spread`` + ``vol_spike``, ``ret_SOL`` + ``ret_BTC`` + ``ret_ETH``
        (requires scikit-learn), ``roc_3``.

    Returns
    -------
    pd.DataFrame
        Copy of the input with ``skew_36h``, ``kurt_36h``, ``vol_regime`` and
        ``trend_regime`` always added, and ``price_volume``, ``spread_vol``,
        ``market_pc1``, ``momentum_bucket`` and ``volume_missing`` added when
        their source columns exist.

    Raises
    ------
    KeyError
        If ``logret_12h``, ``realized_vol_36h`` or ``token_close_usd`` is
        missing.
    """
    df = df.copy()

    # Rolling tail-risk metrics over three bars (36h).
    df['skew_36h'] = df['logret_12h'].rolling(3).skew()
    # NOTE(review): pandas' rolling kurtosis requires at least four
    # observations, so a 3-bar window yields NaN on every row. The window is
    # left unchanged because widening it would contradict the `_36h` name.
    df['kurt_36h'] = df['logret_12h'].rolling(3).kurt()

    # Regime flags. Comparisons against NaN are False, so warm-up rows are 0.
    df['vol_regime'] = (df['realized_vol_36h'] > df['realized_vol_36h'].rolling(14).mean()).astype(int)
    df['trend_regime'] = (df['token_close_usd'] > df['token_close_usd'].rolling(50).mean()).astype(int)

    # Interaction terms.
    if 'token_volume_usd' in df.columns:
        df['price_volume'] = df['token_close_usd'] * df['token_volume_usd']
    if {'spread', 'vol_spike'}.issubset(df.columns):
        df['spread_vol'] = df['spread'] * df['vol_spike']

    # Rolling PCA: score of each bar on the first principal component of the
    # SOL/BTC/ETH returns, refit on the trailing 14 bars.
    if {'ret_SOL', 'ret_BTC', 'ret_ETH'}.issubset(df.columns):
        # Imported lazily so scikit-learn is only needed when these columns exist.
        from sklearn.decomposition import PCA
        pca_window = 14
        rets = df[['ret_SOL', 'ret_BTC', 'ret_ETH']]
        pc1 = np.full(len(rets), np.nan)
        pca = PCA(n_components=1)
        for i in range(pca_window - 1, len(rets)):
            window = rets.iloc[i - pca_window + 1:i + 1].dropna()
            # Only fit on complete windows; any NaN row leaves the score NaN.
            if len(window) == pca_window:
                pca.fit(window)
                score = pca.transform(rets.iloc[[i]])[0, 0]
                # The sign of a principal component is arbitrary and can
                # differ from one fit to the next. Orient it so the loadings
                # sum to a non-negative value, keeping scores comparable
                # across windows (positive = assets moving up together).
                if pca.components_[0].sum() < 0:
                    score = -score
                pc1[i] = score
        df['market_pc1'] = pc1

    # Momentum quintile bins (0-4). Ranking first guarantees unique values;
    # the quantile edges are computed over the whole sample.
    if 'roc_3' in df.columns:
        ranks = df['roc_3'].rank(method='first')
        # qcut cannot form distinct bin edges from fewer than two ranked
        # values, so fall back to NaN instead of raising.
        if ranks.count() >= 2:
            df['momentum_bucket'] = pd.qcut(ranks, q=5, labels=False)
        else:
            df['momentum_bucket'] = np.nan

    # Missing-data flag.
    if 'token_volume_usd' in df.columns:
        df['volume_missing'] = df['token_volume_usd'].isna().astype(int)
    return df


In [None]:

# Run the full feature pipeline (base -> advanced -> additional) and preview
# the first rows.
df = (
    df
    .pipe(compute_base_features)
    .pipe(compute_advanced_features)
    .pipe(compute_additional_features)
)
df.head()
