# V2 - Feature Engineering notebook

In [13]:
import pandas as pd
import numpy as np

df = pd.read_parquet("C:/Users/james/OneDrive/Documents/GitHub/solana-qrf-interval-forecasting/data/06data.parquet")
df['timestamp'] = pd.to_datetime(df['timestamp'])
df.sort_values(['token', 'timestamp'], inplace=True)

Base indicators

In [14]:
def rsi(series, period=14):
    delta = series.diff()
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)
    avg_gain = gain.ewm(alpha=1/period, min_periods=period, adjust=False).mean()
    avg_loss = loss.ewm(alpha=1/period, min_periods=period, adjust=False).mean()
    rs = avg_gain / avg_loss
    return 100 - 100 / (1 + rs)
def atr(high, low, close, period=14):
    high_low = high - low
    high_close = (high - close.shift()).abs()
    low_close = (low - close.shift()).abs()
    tr = pd.concat([high_low, high_close, low_close], axis=1).max(axis=1)
    return tr.rolling(period, min_periods=period).mean()

In [15]:
def compute_base_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    new_cols = [
        'logret_12h', 'logret_36h', 'rsi_14', 'roc_3', 'realized_vol_36h',
        'atr_14', 'spread', 'depth', 'vol_spike', 'delta_wallets',
        'tx_count_12h', 'ret_SOL', 'ret_BTC', 'ret_ETH', 'tvl_dev'
    ]
    df.drop(columns=[c for c in new_cols if c in df.columns], inplace=True, errors='ignore')

    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df.sort_values(['token', 'timestamp'], inplace=True)
    g = df.groupby('token')
    volume = df.get('token_volume_usd', df.get('volume'))

    df['logret_12h'] = g['token_close_usd'].transform(lambda x: np.log(x / x.shift(1)))
    df['logret_36h'] = g['token_close_usd'].transform(lambda x: np.log(x / x.shift(3)))
    df['rsi_14'] = g['token_close_usd'].transform(lambda x: rsi(x, 14))
    df['roc_3'] = g['token_close_usd'].transform(lambda x: (x / x.shift(3) - 1) * 100)
    
    df['realized_vol_36h'] = df.groupby('token')['logret_12h'].transform(lambda x: x.rolling(window=3).std())

    df['atr_14'] = df.groupby('token', group_keys=False).apply(lambda grp: atr(grp.get('high_usd', grp.get('high')), grp.get('low_usd', grp.get('low')), grp['token_close_usd'], 14))

    if {'best_ask', 'best_bid'}.issubset(df.columns):
        mid = (df['best_ask'] + df['best_bid']) / 2
        df['spread'] = (df['best_ask'] - df['best_bid']) / mid
    else:
        df['spread'] = np.nan

    if {'bid_size', 'ask_size'}.issubset(df.columns):
        df['depth'] = df['bid_size'] + df['ask_size']
    else:
        df['depth'] = np.nan

    if volume is not None:
        df['vol_spike'] = g[volume.name].transform(lambda x: x / x.rolling(14).mean())
    else:
        df['vol_spike'] = np.nan

    uniq_wallets = df.get('unique_wallets', df.get('holder_count'))
    if uniq_wallets is not None:
        df['delta_wallets'] = g[uniq_wallets.name].transform(lambda x: x.diff())
    else:
        df['delta_wallets'] = np.nan

    df['tx_count_12h'] = df.get('tx_count', df.get('network_tx_count'))

    if 'sol_close_usd' in df.columns:
        df['ret_SOL'] = df['sol_close_usd'].pct_change() * 100
    if 'btc_close_usd' in df.columns:
        df['ret_BTC'] = df['btc_close_usd'].pct_change() * 100
    if 'eth_close_usd' in df.columns:
        df['ret_ETH'] = df['eth_close_usd'].pct_change() * 100
    if 'tvl_usd' in df.columns:
        df['tvl_dev'] = (df['tvl_usd'] / df['tvl_usd'].rolling(14).mean() - 1) * 100

    return df

In [16]:
df = compute_base_features(df)
df.head()

  df['atr_14'] = df.groupby('token', group_keys=False).apply(lambda grp: atr(grp.get('high_usd', grp.get('high')), grp.get('low_usd', grp.get('low')), grp['token_close_usd'], 14))


Unnamed: 0,timestamp,token_mint,token,open_usd,high_usd,low_usd,token_close_usd,token_volume_usd,holder_count,new_token_accounts,...,atr_14,spread,depth,vol_spike,delta_wallets,tx_count_12h,ret_SOL,ret_BTC,ret_ETH,tvl_dev
0,2024-12-05 12:00:00,ekpqgsjtjmfqkz9kqansqyxrcf8fbopzlhyxdm65zcjm,$WIF,3.323218,3.337522,3.241962,3.269634,36.234117,,2610.0,...,,,,,,182550905.0,,,,
1,2024-12-06 00:00:00,ekpqgsjtjmfqkz9kqansqyxrcf8fbopzlhyxdm65zcjm,$WIF,3.354151,3.551303,3.354151,3.409319,41.386747,,1534.0,...,,,,,,183188078.0,1.181076,1.809513,1.737793,
2,2024-12-06 12:00:00,ekpqgsjtjmfqkz9kqansqyxrcf8fbopzlhyxdm65zcjm,$WIF,3.363693,3.52693,3.33191,3.505474,41.318232,,1701.0,...,,,,,,183446143.0,0.859585,2.456562,4.100628,
3,2024-12-07 00:00:00,ekpqgsjtjmfqkz9kqansqyxrcf8fbopzlhyxdm65zcjm,$WIF,3.490288,3.660266,3.423454,3.660266,41.89819,,1206.0,...,,,,,,183675936.0,0.153338,-1.002396,-1.110447,
4,2024-12-07 12:00:00,ekpqgsjtjmfqkz9kqansqyxrcf8fbopzlhyxdm65zcjm,$WIF,3.672881,3.99508,3.625573,3.934384,45.857745,,2605.0,...,,,,,,182528288.0,0.324093,0.074959,0.286522,


## Base Feature Overview
### The `compute_base_features` function adds the following fields:
- **logret_12h** – log return of the close over the last 12 hours.
- **logret_36h** – log return over the previous three 12‑hour bars.
- **rsi_14** – 14‑period Relative Strength Index using closing prices.
- **roc_3** – 3‑period Rate of Change of the close (percent).
- **realized_vol_36h** – rolling 36‑hour standard deviation of `logret_12h`.
- **atr_14** – 14‑period Average True Range.
- **spread** – relative bid/ask spread.
- **depth** – combined bid and ask size.
- **vol_spike** – ratio of volume to its 14‑period average.
- **delta_wallets** – change in unique wallet count.
- **tx_count_12h** – transaction count for the bar.
- **ret_SOL** – SOL percentage return.
- **ret_BTC** – BTC percentage return.
- **ret_ETH** – ETH percentage return.
- **tvl_dev** – deviation of DeFi TVL from its 14‑period mean.

# Advanced Features

In [17]:
def compute_advanced_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    new_cols = [
        'parkinson_vol_36h', 'gk_vol_36h', 'amihud_illiq_12h',
        'new_accounts_ratio', 'tx_per_account', 'wallet_growth_rate',
        'corr_SOL_36h', 'corr_BTC_36h', 'corr_ETH_36h', 'vol_zscore',
        'day_of_week', 'hour'
    ]
    df.drop(columns=[c for c in new_cols if c in df.columns], inplace=True, errors='ignore')

    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df.sort_values(['token', 'timestamp'], inplace=True)
    g = df.groupby('token')

    high = df.get('high_usd', df.get('high'))
    low = df.get('low_usd', df.get('low'))
    open_ = df.get('open_usd', df.get('open'))
    volume = df.get('token_volume_usd', df.get('volume'))

    df['parkinson_vol_36h'] = g.apply(lambda grp: (np.log(grp[high.name] / grp[low.name]) ** 2).rolling(3).mean().div(4 * np.log(2)).pow(0.5)).reset_index(level=0, drop=True)

    df['gk_vol_36h'] = g.apply(lambda grp: (0.5 * np.log(grp[high.name] / grp[low.name]) ** 2 - (2 * np.log(2) - 1) * np.log(grp['token_close_usd'] / grp[open_.name]) ** 2).rolling(3).mean().pow(0.5)).reset_index(level=0, drop=True)

    if volume is not None:
        df['amihud_illiq_12h'] = g.apply(lambda grp: (grp['logret_12h'].abs() / grp[volume.name]).rolling(3).mean()).reset_index(level=0, drop=True)
    else:
        df['amihud_illiq_12h'] = np.nan

    if {'new_token_accounts', 'holder_count'}.issubset(df.columns):
        df['new_accounts_ratio'] = df['new_token_accounts'] / df['holder_count']

    if {'network_tx_count', 'holder_count'}.issubset(df.columns):
        df['tx_per_account'] = df['network_tx_count'] / df['holder_count']

    if 'delta_wallets' in df.columns and 'holder_count' in df.columns:
        df['wallet_growth_rate'] = df['delta_wallets'] / df['holder_count']

    if {'ret_SOL', 'logret_12h'}.issubset(df.columns):
        df['corr_SOL_36h'] = g.apply(lambda grp: grp['logret_12h'].rolling(3).corr(grp['ret_SOL'])).reset_index(level=0, drop=True)
    if {'ret_BTC', 'logret_12h'}.issubset(df.columns):
        df['corr_BTC_36h'] = g.apply(lambda grp: grp['logret_12h'].rolling(3).corr(grp['ret_BTC'])).reset_index(level=0, drop=True)
    if {'ret_ETH', 'logret_12h'}.issubset(df.columns):
        df['corr_ETH_36h'] = g.apply(lambda grp: grp['logret_12h'].rolling(3).corr(grp['ret_ETH'])).reset_index(level=0, drop=True)

    if volume is not None:
        df['vol_zscore'] = g[volume.name].transform(lambda x: (x - x.rolling(14).mean()) / x.rolling(14).std())

    df['day_of_week'] = df['timestamp'].dt.dayofweek
    df['hour'] = df['timestamp'].dt.hour

    return df

## Advanced Feature Overview
#### The `compute_advanced_features` function augments the dataset with:
- **parkinson_vol_36h** – 36‑hour Parkinson volatility using high and low prices.
- **gk_vol_36h** – 36‑hour Garman‑Klass volatility from OHLC bars.
- **amihud_illiq_12h** – Amihud illiquidity over the last 36 hours.
- **new_accounts_ratio** – new token accounts relative to current holders.
- **tx_per_account** – network transactions per holder.
- **wallet_growth_rate** – change in wallet count scaled by total holders.
- **corr_SOL_36h** – rolling correlation of token and SOL returns.
- **corr_BTC_36h** – rolling correlation of token and BTC returns.
- **corr_ETH_36h** – rolling correlation of token and ETH returns.
- **vol_zscore** – volume z‑score versus a 14‑period mean and std.
- **day_of_week** – day of week (0=Monday).
- **hour** – bar hour of day (0 or 12).

## Additional Feature Overview
#### The `compute_additional_features` function adds:
- **skew_36h** – rolling 36‑hour skewness of `logret_12h`.
- **kurt_36h** – rolling 36‑hour kurtosis of `logret_12h`.
- **vol_regime** – 1 if short volatility exceeds its 14‑period mean.
- **trend_regime** – 1 if close is above its 50‑period average.
- **price_volume** – close multiplied by volume.
- **market_pc1** – first principal component of SOL/BTC/ETH returns.
- **momentum_bucket** – quantile bin of 3‑period ROC.
- **sigma_14** – 14‑period rolling standard deviation of returns.
- **extreme_flag** – flag for |return| > 2×`sigma_14`.
- **extreme_count_72h** – count of extreme returns over 72 hours.
- **downside_vol_3bar** and **downside_vol_6bar** – downside volatility.
- **hmm_regime** – hidden Markov model state.

In [18]:
def compute_additional_features(df: pd.DataFrame) -> pd.DataFrame:
    """Compute engineered features used across the project."""
    df = df.copy()
    new_cols = [
        'skew_36h', 'kurt_36h', 'vol_regime', 'trend_regime', 'price_volume',
        'market_pc1', 'momentum_bucket', 'sigma_14', 'extreme_flag',
        'extreme_count_72h', 'downside_vol_3bar', 'downside_vol_6bar',
        'hmm_regime'
    ]
    df.drop(columns=[c for c in new_cols if c in df.columns], inplace=True, errors='ignore')

    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df.sort_values(['token', 'timestamp'], inplace=True)
    g = df.groupby('token')

    df['skew_36h'] = g['logret_12h'].transform(lambda x: x.rolling(3).skew())
    df['kurt_36h'] = g['logret_12h'].transform(lambda x: x.rolling(3).kurt())
    df['vol_regime'] = g['realized_vol_36h'].transform(
        lambda x: (x > x.rolling(14).mean()).astype(int)
    )
    df['trend_regime'] = g['token_close_usd'].transform(
        lambda x: (x > x.rolling(50).mean()).astype(int)
    )

    if 'token_volume_usd' in df.columns:
        df['price_volume'] = df['token_close_usd'] * df['token_volume_usd']

    if {'ret_SOL', 'ret_BTC', 'ret_ETH'}.issubset(df.columns):
        from sklearn.decomposition import PCA

        rets = df[['ret_SOL', 'ret_BTC', 'ret_ETH']]
        pc1 = [np.nan] * len(rets)
        pca = PCA(n_components=1)
        for i in range(13, len(rets)):
            window = rets.iloc[i - 13 : i + 1].dropna()
            if len(window) == 14:
                pca.fit(window)
                pc1[i] = pca.transform(rets.iloc[[i]])[0, 0]
        df['market_pc1'] = pc1

    if 'roc_3' in df.columns:
        df['momentum_bucket'] = pd.qcut(
            df['roc_3'].rank(method='first'), q=5, labels=False
        )

    df['sigma_14'] = g['logret_12h'].transform(lambda x: x.rolling(14).std())
    df['extreme_flag'] = (
        df['logret_12h'].abs() > 2 * df['sigma_14']
    ).astype(int)
    df['extreme_count_72h'] = g['extreme_flag'].transform(
        lambda x: x.rolling(6).sum()
    )

    def downside_std(x: pd.Series, window: int) -> pd.Series:
        return x.where(x < 0).rolling(window).std()

    for w in (3, 6):
        col = f'downside_vol_{w}bar'
        df[col] = g['logret_12h'].transform(lambda x, w=w: downside_std(x, w))

    from hmmlearn.hmm import GaussianHMM

    def fit_hmm_states(series: pd.Series, n_states: int = 2) -> pd.Series:
        series = series.dropna()
        if series.empty:
            return pd.Series(index=series.index, dtype='int64')
        X = series.values.reshape(-1, 1)
        model = GaussianHMM(n_components=n_states, covariance_type='diag', n_iter=100)
        model.fit(X)
        states = pd.Series(model.predict(X), index=series.index)
        return states

    df['hmm_regime'] = (
        g['logret_12h']
        .apply(lambda s: fit_hmm_states(s))
        .reset_index(level=0, drop=True)
        .astype('Int64')
    )

    return df



In [19]:
def compute_technical_indicators(df: pd.DataFrame) -> pd.DataFrame:

    df = df.copy()

    cols = [
    'stoch_k','williams_r','macd','macd_signal','proc',
    'bollinger_b','bollinger_bw','adx','cci','obv',
    'vol_zscore_14','momentum_3bar','momentum_6bar',
    'vol_std_3bar','vol_std_7bar','holder_growth_7d',
    'new_addr_growth_7d','tvl_change_7d'
    ]
    df.drop(columns=[c for c in cols if c in df.columns], inplace=True, errors='ignore')

    # Ensure timestamp sorted within each token
    df = df.sort_values(["token", "timestamp"])

    high_col = "high_usd" if "high_usd" in df.columns else "token_high_usd"
    low_col = "low_usd" if "low_usd" in df.columns else "token_low_usd"
    close_col = "close_usd" if "close_usd" in df.columns else "token_close_usd"
    vol_col = "volume_usd" if "volume_usd" in df.columns else "token_volume_usd"

    high = df.groupby("token")[high_col]
    low = df.groupby("token")[low_col]
    close = df.groupby("token")[close_col]

    # 14-period highs/lows for Stochastic Oscillator and Williams %R
    highest_14 = high.transform(lambda x: x.rolling(window=14).max())
    lowest_14 = low.transform(lambda x: x.rolling(window=14).min())

    stoch_k = 100 * (df[close_col] - lowest_14) / (highest_14 - lowest_14)
    df['stoch_k'] = stoch_k

    williams_r = -100 * (highest_14 - df[close_col]) / (highest_14 - lowest_14)
    df['williams_r'] = williams_r

    # MACD using closing price
    ema12 = close.transform(lambda x: x.ewm(span=12, adjust=False).mean())
    ema26 = close.transform(lambda x: x.ewm(span=26, adjust=False).mean())
    macd = ema12 - ema26
    df['macd'] = macd
    df['macd_signal'] = macd.groupby(df['token']).transform(lambda x: x.ewm(span=9, adjust=False).mean())

    # Price Rate of Change (12 periods ~ 6 days)
    proc = close.transform(lambda x: x.pct_change(periods=12))
    df['proc'] = proc

    # Bollinger Bands (20-period)
    ma20 = close.transform(lambda x: x.rolling(window=20).mean())
    std20 = close.transform(lambda x: x.rolling(window=20).std())
    upper = ma20 + 2 * std20
    lower = ma20 - 2 * std20
    df['bollinger_b'] = (df[close_col] - lower) / (upper - lower)
    df['bollinger_bw'] = (upper - lower) / ma20

    # Average Directional Index (14-period)
    up_move = high.diff()
    down_move = (-low.diff())
    plus_dm = np.where((up_move > down_move) & (up_move > 0), up_move, 0.0)
    minus_dm = np.where((down_move > up_move) & (down_move > 0), down_move, 0.0)
    prev_close = close.shift()
    tr = pd.concat([
        df[high_col] - df[low_col],
        (df[high_col] - prev_close).abs(),
        (df[low_col] - prev_close).abs(),
    ], axis=1).max(axis=1)
    atr = tr.groupby(df['token']).transform(lambda x: x.ewm(alpha=1/14, adjust=False).mean())
    plus_di = 100 * pd.Series(plus_dm, index=df.index).groupby(df['token']).transform(lambda x: x.ewm(alpha=1/14, adjust=False).mean()) / atr
    minus_di = 100 * pd.Series(minus_dm, index=df.index).groupby(df['token']).transform(lambda x: x.ewm(alpha=1/14, adjust=False).mean()) / atr
    dx = (abs(plus_di - minus_di) / (plus_di + minus_di)) * 100
    df['adx'] = dx.groupby(df['token']).transform(lambda x: x.ewm(alpha=1/14, adjust=False).mean())

    # Commodity Channel Index (20-period)
    tp = (df[high_col] + df[low_col] + df[close_col]) / 3
    tp_ma = tp.groupby(df['token']).transform(lambda x: x.rolling(window=20).mean())
    mad = tp.groupby(df['token']).transform(lambda x: x.rolling(window=20).apply(lambda y: np.mean(np.abs(y - y.mean())), raw=False))
    df['cci'] = (tp - tp_ma) / (0.015 * mad)

    # On-Balance Volume
    price_change_sign = np.sign(df[close_col] - prev_close).fillna(0)
    df['obv'] = (price_change_sign * df[vol_col]).groupby(df['token']).cumsum()

    # Volume Z-score over 14 bars
    vol = df.groupby('token')[vol_col]
    vol_mean = vol.transform(lambda x: x.rolling(window=14).mean())
    vol_std = vol.transform(lambda x: x.rolling(window=14).std())
    df['vol_zscore_14'] = (df[vol_col] - vol_mean) / vol_std

    # Short-term momentum (returns)
    df['momentum_3bar'] = close.transform(lambda x: x.pct_change(periods=3))
    df['momentum_6bar'] = close.transform(lambda x: x.pct_change(periods=6))

    # Lagged volatility (standard deviation of log returns)
    logret = close.transform(lambda x: np.log(x) - np.log(x.shift(1)))
    df['vol_std_3bar'] = logret.groupby(df['token']).transform(lambda x: x.rolling(window=3).std())
    df['vol_std_7bar'] = logret.groupby(df['token']).transform(lambda x: x.rolling(window=7).std())

    # Network activity growth metrics
    holders = df.groupby('token')['holder_count']
    df['holder_growth_7d'] = holders.transform(lambda x: x.pct_change(periods=14))

    new_addr = df.groupby('token')['new_token_accounts']
    df['new_addr_growth_7d'] = new_addr.transform(lambda x: x.pct_change(periods=14))

    tvl_col = "tvl_usd" if "tvl_usd" in df.columns else "tvl_tvl_usd"
    tvl = df.groupby("token")[tvl_col]
    df["tvl_change_7d"] = tvl.transform(lambda x: x.pct_change(periods=14))

    return df

# Compute a suite of technical indicators.

This helper generates a range of indicators used in the forecasting
notebooks.  The following columns are added (when the required inputs are
present):

* ``stoch_k`` – Stochastic Oscillator %K calculated from 14‑period highs/lows.
* ``williams_r`` – Williams %R using the same 14‑period window.
* ``macd`` / ``macd_signal`` – 12–26 EMA difference and its 9‑period signal
  line.
* ``proc`` – price rate of change.
* ``bollinger_b`` / ``bollinger_bw`` – Bollinger %b and band width from a
  20‑period moving average.
* ``adx`` – Average Directional Index measuring trend strength.
* ``cci`` – Commodity Channel Index.
* ``obv`` – On‑Balance Volume.
* ``vol_zscore_14`` – volume Z‑score over a 14‑bar lookback.
* ``momentum_3bar`` / ``momentum_6bar`` – short‑term returns.
* ``vol_std_3bar`` / ``vol_std_7bar`` – realised volatility of log returns.
* ``holder_growth_7d`` / ``new_addr_growth_7d`` – one‑week growth in holders
  and new addresses.
* ``tvl_change_7d`` – weekly percentage change in TVL.

The function is tolerant of slightly different input column names (e.g.
``token_close_usd`` vs ``close_usd``) and operates independently on each token
in the DataFrame.

## Compute base and advanced features

In [20]:
df = compute_base_features(df)
df = compute_advanced_features(df)
df = compute_additional_features(df)
df = compute_technical_indicators(df)
df.head()

  df['atr_14'] = df.groupby('token', group_keys=False).apply(lambda grp: atr(grp.get('high_usd', grp.get('high')), grp.get('low_usd', grp.get('low')), grp['token_close_usd'], 14))
  df['parkinson_vol_36h'] = g.apply(lambda grp: (np.log(grp[high.name] / grp[low.name]) ** 2).rolling(3).mean().div(4 * np.log(2)).pow(0.5)).reset_index(level=0, drop=True)
  df['gk_vol_36h'] = g.apply(lambda grp: (0.5 * np.log(grp[high.name] / grp[low.name]) ** 2 - (2 * np.log(2) - 1) * np.log(grp['token_close_usd'] / grp[open_.name]) ** 2).rolling(3).mean().pow(0.5)).reset_index(level=0, drop=True)
  df['amihud_illiq_12h'] = g.apply(lambda grp: (grp['logret_12h'].abs() / grp[volume.name]).rolling(3).mean()).reset_index(level=0, drop=True)
  df['corr_SOL_36h'] = g.apply(lambda grp: grp['logret_12h'].rolling(3).corr(grp['ret_SOL'])).reset_index(level=0, drop=True)
  df['corr_BTC_36h'] = g.apply(lambda grp: grp['logret_12h'].rolling(3).corr(grp['ret_BTC'])).reset_index(level=0, drop=True)
  df['corr_ETH_36h'] 

Unnamed: 0,timestamp,token_mint,token,open_usd,high_usd,low_usd,token_close_usd,token_volume_usd,holder_count,new_token_accounts,...,cci,obv,vol_zscore_14,momentum_3bar,momentum_6bar,vol_std_3bar,vol_std_7bar,holder_growth_7d,new_addr_growth_7d,tvl_change_7d
0,2024-12-05 12:00:00,ekpqgsjtjmfqkz9kqansqyxrcf8fbopzlhyxdm65zcjm,$WIF,3.323218,3.337522,3.241962,3.269634,36.234117,,2610.0,...,,0.0,,,,,,,,
1,2024-12-06 00:00:00,ekpqgsjtjmfqkz9kqansqyxrcf8fbopzlhyxdm65zcjm,$WIF,3.354151,3.551303,3.354151,3.409319,41.386747,,1534.0,...,,41.386747,,,,,,,,
2,2024-12-06 12:00:00,ekpqgsjtjmfqkz9kqansqyxrcf8fbopzlhyxdm65zcjm,$WIF,3.363693,3.52693,3.33191,3.505474,41.318232,,1701.0,...,,82.704979,,,,,,,,
3,2024-12-07 00:00:00,ekpqgsjtjmfqkz9kqansqyxrcf8fbopzlhyxdm65zcjm,$WIF,3.490288,3.660266,3.423454,3.660266,41.89819,,1206.0,...,,124.603169,,0.119473,,0.00852,,,,
4,2024-12-07 12:00:00,ekpqgsjtjmfqkz9kqansqyxrcf8fbopzlhyxdm65zcjm,$WIF,3.672881,3.99508,3.625573,3.934384,45.857745,,2605.0,...,,170.460914,,0.154009,,0.022548,,,,


In [21]:
cols_to_drop = [
    'token_symbol', 'token_name', 'post_launch', 'spread', 'depth',
    'delta_wallets', 'new_accounts_ratio', 'wallet_growth_rate', 'kurt_36h'
]
df = df.drop(columns=cols_to_drop, errors='ignore')