In [None]:

import pandas as pd
import numpy as np


def rsi(series, period=14):
    """Relative Strength Index built from simple rolling averages.

    Bar-to-bar changes in ``series`` are split into gains (positive moves)
    and losses (negative moves, sign-flipped). Each side is averaged over a
    rolling window of ``period`` observations and the ratio of the two
    averages is mapped through ``100 - 100 / (1 + ratio)``.

    Parameters
    ----------
    series : pandas Series
        Values whose momentum is measured (anything exposing ``diff``,
        ``clip`` and ``rolling`` works).
    period : int, default 14
        Rolling window length. A window produces NaN unless it contains
        ``period`` non-missing changes.

    Returns
    -------
    pandas Series
        RSI values on the same index as ``series``.
    """
    def _window_mean(values):
        # Full windows only: partially filled windows stay NaN.
        return values.rolling(period, min_periods=period).mean()

    change = series.diff()
    ups = change.clip(lower=0)
    downs = -change.clip(upper=0)
    strength = _window_mean(ups) / _window_mean(downs)
    # A zero average loss with a positive average gain makes the ratio
    # infinite, which maps to 100; if both averages are zero the result is NaN.
    return 100 - (100 / (1 + strength))


def atr(high, low, close, period=14):
    """Average True Range as a simple rolling mean of the true range.

    The true range of a bar is the largest of three distances: high minus
    low, ``|high - previous close|`` and ``|low - previous close|``.

    Parameters
    ----------
    high, low, close : pandas Series
        Bar highs, lows and closes sharing one index.
    period : int, default 14
        Rolling window length. A window produces NaN unless it contains
        ``period`` non-missing true-range values.

    Returns
    -------
    pandas Series
        ATR values on the inputs' index.
    """
    prev_close = close.shift()
    candidates = [
        high - low,
        (high - prev_close).abs(),
        (low - prev_close).abs(),
    ]
    # The row-wise max skips NaN, so the first bar (which has no previous
    # close) falls back to its high-low range.
    true_range = pd.concat(candidates, axis=1).max(axis=1)
    return true_range.rolling(period, min_periods=period).mean()


In [None]:

# Load the raw bars and index them by their parsed timestamp.
df = pd.read_parquet('data/06data.parquet')
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.set_index('timestamp')
# Put the bars on a regular 12-hour grid; timestamps absent from the file
# become all-NaN rows. Lower-case 'h' is used because the upper-case 'H'
# alias is deprecated in recent pandas releases.
df = df.asfreq('12h')


In [None]:

def compute_base_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with the base feature columns.

    If ``df`` still has a ``timestamp`` column, it is parsed, promoted to the
    index and the frame is placed on a regular 12-hour grid with ``asfreq``
    (timestamps absent from the input become all-NaN rows). A frame without
    that column is used with whatever index it already has.

    Parameters
    ----------
    df : pd.DataFrame
        Bar data. ``token_close_usd`` is required. Optional inputs, each
        looked up under the listed names:

        * ``high_usd`` / ``high`` and ``low_usd`` / ``low`` -> ``atr_14``
        * ``best_ask`` and ``best_bid`` -> ``spread``
        * ``bid_size`` and ``ask_size`` -> ``depth``
        * ``volume`` / ``token_volume_usd`` -> ``vol_spike``
        * ``unique_wallets`` / ``holder_count`` -> ``delta_wallets``
        * ``tx_count`` / ``network_tx_count`` -> ``tx_count_12h``
        * ``sol_close_usd``, ``btc_close_usd``, ``eth_close_usd`` ->
          ``ret_SOL``, ``ret_BTC``, ``ret_ETH``
        * ``tvl_usd`` -> ``tvl_dev``

    Returns
    -------
    pd.DataFrame
        A new frame (the input is not modified) with ``logret_12h``,
        ``logret_36h``, ``rsi_14``, ``roc_3``, ``realized_vol_36h``,
        ``atr_14``, ``spread``, ``depth``, ``vol_spike``, ``delta_wallets``
        and ``tx_count_12h`` always present (NaN where the optional inputs
        are missing), plus ``ret_*`` and ``tvl_dev`` only when their source
        columns exist.

    Raises
    ------
    KeyError
        If ``token_close_usd`` is not a column of ``df``.
    """
    df = df.copy()
    if 'timestamp' in df.columns:
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        df = df.set_index('timestamp')
        # Lower-case 'h' is the current hourly alias; the upper-case 'H'
        # spelling is deprecated in recent pandas releases.
        df = df.asfreq('12h')

    close = df['token_close_usd']
    # Optional inputs: each lookup yields None when neither name is present.
    high = df.get('high_usd', df.get('high'))
    low = df.get('low_usd', df.get('low'))
    volume = df.get('volume', df.get('token_volume_usd'))

    df['logret_12h'] = np.log(close / close.shift(1))
    df['logret_36h'] = np.log(close / close.shift(3))
    df['rsi_14'] = rsi(close, 14)
    df['roc_3'] = (close / close.shift(3) - 1) * 100
    # Three 12-hour bars span the 36-hour window.
    df['realized_vol_36h'] = df['logret_12h'].rolling(3).std()

    # High/low are optional like the other inputs: without both of them the
    # true range is undefined, so emit NaN instead of failing on None.
    if high is not None and low is not None:
        df['atr_14'] = atr(high, low, close, 14)
    else:
        df['atr_14'] = np.nan

    if {'best_ask', 'best_bid'}.issubset(df.columns):
        mid = (df['best_ask'] + df['best_bid']) / 2
        df['spread'] = (df['best_ask'] - df['best_bid']) / mid
    else:
        df['spread'] = np.nan

    if {'bid_size', 'ask_size'}.issubset(df.columns):
        df['depth'] = df['bid_size'] + df['ask_size']
    else:
        df['depth'] = np.nan

    if volume is not None:
        df['vol_spike'] = volume / volume.rolling(14).mean()
    else:
        df['vol_spike'] = np.nan

    uniq_wallets = df.get('unique_wallets', df.get('holder_count'))
    if uniq_wallets is not None:
        df['delta_wallets'] = uniq_wallets.diff()
    else:
        df['delta_wallets'] = np.nan

    # Assigning None would create an object-dtype column; use NaN so the
    # missing case matches the other optional features.
    tx_count = df.get('tx_count', df.get('network_tx_count'))
    df['tx_count_12h'] = tx_count if tx_count is not None else np.nan

    benchmark_columns = (
        ('sol_close_usd', 'ret_SOL'),
        ('btc_close_usd', 'ret_BTC'),
        ('eth_close_usd', 'ret_ETH'),
    )
    for source_col, feature_col in benchmark_columns:
        if source_col in df.columns:
            # pct_change used to forward-fill gaps implicitly; that default is
            # deprecated, so the fill is spelled out to keep the same values.
            filled = df[source_col].ffill()
            df[feature_col] = filled.pct_change(fill_method=None) * 100

    if 'tvl_usd' in df.columns:
        df['tvl_dev'] = (df['tvl_usd'] / df['tvl_usd'].rolling(14).mean() - 1) * 100

    return df


In [None]:

# Add the base feature columns to the loaded bars, then preview the first rows.
df = df.pipe(compute_base_features)
df.head()


## Base Feature Overview
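The `compute_base_features` function adds the following fields (`ret_SOL`, `ret_BTC`, `ret_ETH` and `tvl_dev` are added only when their source columns are present):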
- **logret_12h** – log return of the close over the last 12 hours.
- **logret_36h** – log return over the previous three 12‑hour bars.
- **rsi_14** – 14‑period Relative Strength Index of the close, computed from simple rolling means of gains and losses.
- **roc_3** – 3‑period Rate of Change of the close (percent).
- **realized_vol_36h** – rolling 36‑hour standard deviation of `logret_12h`.
- **atr_14** – 14‑period Average True Range (simple rolling mean of the true range).
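- **spread** – relative bid/ask spread, `(best_ask - best_bid) / mid` with `mid` the average of the two quotes; NaN when the quote columns are absent.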
- **depth** – combined bid and ask size.
- **vol_spike** – ratio of volume to its 14‑period average.
- **delta_wallets** – change in unique wallet count.
- **tx_count_12h** – transaction count for the bar.
- **ret_SOL** – SOL percentage return.
- **ret_BTC** – BTC percentage return.
- **ret_ETH** – ETH percentage return.
- **tvl_dev** – percentage deviation of DeFi TVL from its 14‑period rolling mean.



###
# PLACEHOLDER: ADDITIONAL FEATURE ENGINEERING
###
# For example, you might compute:
#  - Rolling skewness/kurtosis of logret_12h (tail-risk indicators)
#  - Regime flags (volatility regime, trend regime)
#  - Interaction terms (price × volume, spread × vol_spike)
#  - Composite indices (PCA on SOL/BTC/ETH returns)
#  - Sentiment metrics (social_mentions rolling averages or z-scores)
#  - Price momentum buckets (quantile-based bins)
#  - Missing-data flags and imputation indicators
#  - Any other domain-inspired signals you see fit.
