In [2]:
# 📦 Import Required Libraries
import pandas as pd
import numpy as np
import os

# 📂 Read the cleaned dataset produced by the earlier cleaning step
data_path = os.path.join("..", "data", "cleaned", "cleaned_data.csv")
df = pd.read_csv(data_path)

# 👁️ Quick look at what was loaded
print("Initial shape:", df.shape)
print("Columns:", list(df.columns))

# 📆 The time-based features below cannot be built without a 'date' column,
# so fail early when it is absent.
if 'date' not in df.columns:
    raise ValueError("Missing 'date' column required for time-based feature engineering.")

# Parse dates, then order rows chronologically within each coin so that
# per-coin rolling windows and lagged differences see rows in time order.
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(['coin', 'date'])

# 🧮 Feature Engineering (every feature is computed separately per coin)
#
# Rolling windows use min_periods=1 so the moving averages are defined from
# each coin's first row. Some NaNs remain by construction:
#   * volatility_7d is NaN on a coin's first row, because the sample standard
#     deviation (ddof=1) of a single observation is undefined;
#   * the *_change_pct columns are NaN on a coin's first row, because there is
#     no previous value to compare against.
# Windows count rows, not calendar days — this assumes one row per coin per
# day, already sorted by date within each coin (TODO confirm against input).


def _rolling_by_coin(frame, column, window, stat):
    """Return a per-coin rolling statistic of ``frame[column]``.

    ``stat`` is the name of a pandas rolling method (e.g. ``"mean"``, ``"std"``).
    The result is aligned to ``frame``'s index, so it can be assigned directly
    as a new column.
    """
    return frame.groupby('coin')[column].transform(
        lambda s: getattr(s.rolling(window=window, min_periods=1), stat)()
    )


def _pct_change_by_coin(frame, column):
    """Return the per-coin row-over-row percentage change of ``frame[column]``.

    A previous value of exactly zero makes the change +/-inf; such entries are
    returned as NaN so they are treated as missing rather than as real numbers.
    """
    pct = frame.groupby('coin')[column].transform(lambda s: s.pct_change() * 100)
    return pct.replace([np.inf, -np.inf], np.nan)


# 1. 7-row & 30-row Moving Average of Price
df['price_ma7'] = _rolling_by_coin(df, 'price', 7, 'mean')
df['price_ma30'] = _rolling_by_coin(df, 'price', 30, 'mean')

# 2. 7-row Volatility (Rolling Std Dev)
df['volatility_7d'] = _rolling_by_coin(df, 'price', 7, 'std')

# 3. Volume Change % (relative to the coin's previous row)
df['volume_change_pct'] = _pct_change_by_coin(df, '24h_volume')

# 4. Price Change % (relative to the coin's previous row)
df['price_change_pct'] = _pct_change_by_coin(df, 'price')

# 5. Rolling Liquidity Ratio: 24h_volume / mkt_cap (7-row mean)
# A zero market cap turns the ratio into +/-inf. dropna() does not treat inf as
# missing, so convert it to NaN here; otherwise it would propagate into the
# rolling mean and into any file written from this frame.
df['liquidity_ratio'] = (df['24h_volume'] / df['mkt_cap']).replace([np.inf, -np.inf], np.nan)
df['liquidity_ratio_7d'] = _rolling_by_coin(df, 'liquidity_ratio', 7, 'mean')

# 🚨 Keep only rows in which every critical column is populated. Other
# engineered columns may still hold NaNs; filtering on all of them would
# discard far more rows.
critical_cols = ['price', '24h_volume', 'mkt_cap', 'liquidity_ratio_7d']
has_all_critical = df[critical_cols].notna().all(axis=1)
df_fe = df.loc[has_all_critical].reset_index(drop=True)

# 💾 Write the engineered dataset into the cleaned-data folder
output_parts = ("..", "data", "cleaned", "feature_engineered_data.csv")
output_path = os.path.join(*output_parts)
df_fe.to_csv(output_path, index=False)

saved_message = f"\n✅ Feature engineered data saved to {output_path}"
print(saved_message)
print("Final shape after feature engineering:", df_fe.shape)

# 👀 Show the first five rows of the result
print(df_fe.head(5))


Initial shape: (1000, 10)
Columns: ['coin', 'symbol', 'price', '1h', '24h', '7d', '24h_volume', 'mkt_cap', 'date', 'SourceFile']

✅ Feature engineered data saved to ..\data\cleaned\feature_engineered_data.csv
Final shape after feature engineering: (999, 17)
           coin symbol     price        1h       24h        7d  24h_volume  \
0            0x    ZRX  0.000012  0.912390  0.556827  0.113434    0.000506   
1            0x    ZRX  0.000013  0.884856  0.541292  0.106078    0.000418   
2         1inch  1INCH  0.000036  0.907384  0.618970  0.139566    0.002079   
3         1inch  1INCH  0.000036  0.891114  0.525756  0.124855    0.001107   
4  AIOZ Network   AIOZ  0.000006  0.939925  0.762878  0.154665    0.000237   

    mkt_cap       date                 SourceFile  price_ma7  price_ma30  \
0  0.000467 2022-03-16  coin_gecko_2022-03-16.csv   0.000012    0.000012   
1  0.000481 2022-03-17  coin_gecko_2022-03-17.csv   0.000012    0.000012   
2  0.000709 2022-03-16  coin_gecko_2022-03-16