### Focus: The "Kitchen Sink" (Aggressive).

### 1: Imports & Timezone Logic

In [None]:
# Cell 1: Imports & Timezone Logic
import pandas as pd
import numpy as np
import os

# Input (labeled bars) and output (engineered features) parquet paths.
LABELED_FILE = "../data/EURUSD_H1_Labeled.parquet"
FEATURES_FILE = "../data/EURUSD_H1_Features.parquet"

df = pd.read_parquet(LABELED_FILE)

# The session features built later are expressed in New York wall-clock hours,
# so the index must be timezone-aware before it can be converted.
# NOTE(review): a naive index is assumed to be broker time at a fixed UTC+2
# offset ('Etc/GMT-2' follows the POSIX sign convention, i.e. UTC+2, no DST).
# Confirm against the Notebook 01 output and adjust if the broker differs.
index_is_naive = df.index.tz is None
if index_is_naive:
    df.index = df.index.tz_localize('Etc/GMT-2')

# New York view of the index; only its hour-of-day is stored on the frame.
df_ny = df.index.tz_convert('America/New_York')
df['hour_ny'] = df_ny.hour

### 2: The Kitchen Sink (Numpy/Pandas Only)

In [155]:
# Cell 2: The Kitchen Sink (Numpy/Pandas Only)
# Adds momentum, volatility, trend-distance, time/session and spread columns
# to `df`, then drops every row that still contains a NaN in any column.
# Reads df['open'], df['high'], df['low'], df['close'], df['spread'] and
# df['hour_ny'].
print("🧪 Engineering All Features...")

# 1. PRICE ACTION & MOMENTUM
# --------------------------
# Log Returns over 1, 3, 5, 12 and 24 bars
for lag in [1, 3, 5, 12, 24]:
    df[f'log_ret_{lag}'] = np.log(df['close'] / df['close'].shift(lag))

# RSI (Manual: simple 14-bar rolling means of up-moves and down-moves)
delta = df['close'].diff()
gain = (delta.where(delta > 0, 0)).rolling(14).mean()
loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
# 100 * gain / (gain + loss) is algebraically identical to
# 100 - 100 / (1 + gain / loss) but never divides by a zero average loss.
total_move = gain + loss
rsi = 100 * gain / total_move
# A completely flat window gives 0/0 = NaN, which the dropna() at the end of
# this cell would silently delete. Report a neutral 50 instead. Warm-up rows
# (total_move is NaN) are left as NaN because NaN == 0 is False.
df['rsi_14'] = rsi.mask(total_move == 0, 50.0)

# MACD (Manual: 12, 26, 9)
ema12 = df['close'].ewm(span=12, adjust=False).mean()
ema26 = df['close'].ewm(span=26, adjust=False).mean()
df['macd'] = ema12 - ema26
df['macd_signal'] = df['macd'].ewm(span=9, adjust=False).mean()

# 2. VOLATILITY
# -------------
# Garman-Klass Volatility (per-bar estimate derived from OHLC)
log_hl = np.log(df['high'] / df['low'])
log_co = np.log(df['close'] / df['open'])
df['vol_gk'] = np.sqrt(0.5 * log_hl**2 - (2 * np.log(2) - 1) * log_co**2)

# Bollinger Bands (20, 2); width is normalised by the 20-bar mean
sma20 = df['close'].rolling(20).mean()
std20 = df['close'].rolling(20).std()
df['bb_upper'] = sma20 + (2 * std20)
df['bb_lower'] = sma20 - (2 * std20)
df['bb_width'] = (df['bb_upper'] - df['bb_lower']) / sma20

# 3. STATIC / DRIFTING
# --------------------
# Distance from SMA as a fraction of the current close (Trend Mean Reversion)
for p in [20, 50, 200]:
    df[f'dist_sma{p}'] = (df['close'] - df['close'].rolling(p).mean()) / df['close']

# 4. TIME & SESSIONS (The "Timely" Features)
# ------------------------------------------
# Cyclical encoding of the hour so that 23 and 0 end up adjacent
df['hour_sin'] = np.sin(2 * np.pi * df['hour_ny'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour_ny'] / 24)

# Session Flags (NY Time Based); each flag is 1 inside the window, else 0
h = df['hour_ny']
# Sydney: 17:00 - 02:00 NY (wraps past midnight, hence the OR)
df['is_sydney'] = ((h >= 17) | (h < 2)).astype(int)
# Tokyo: 19:00 - 04:00 NY (wraps past midnight, hence the OR)
df['is_tokyo']  = ((h >= 19) | (h < 4)).astype(int)
# London: 03:00 - 12:00 NY
df['is_london'] = ((h >= 3) & (h < 12)).astype(int)
# NY: 08:00 - 17:00 NY
df['is_ny']     = ((h >= 8) & (h < 17)).astype(int)

# Overlaps: 1 only when both session flags are 1
df['overlap_london_ny'] = (df['is_london'] & df['is_ny']).astype(int)
df['overlap_sydney_tokyo'] = (df['is_sydney'] & df['is_tokyo']).astype(int)

# 5. SPREAD VETO FEATURE
# Keep spread as a feature so model learns "High Spread = Bad"
df['spread_val'] = df['spread']

# Drop NaNs from rolling windows. dropna() with no subset removes a row when
# ANY column is NaN, including columns that were already in the loaded frame.
df = df.dropna()

### 3. SAVE THE FINAL DATASET

In [None]:
# Cell 3: Save
# Report the final (rows, columns) shape, then write the engineered frame to
# FEATURES_FILE as parquet.
print(f"✅ Features Engineered. Final Shape: {df.shape}")
df.to_parquet(FEATURES_FILE)