### Focus: The "Kitchen Sink" (Aggressive).

### 1. Setup & Data Loading

In [2]:
import pandas as pd
import numpy as np
import os

# --- Configuration -----------------------------------------------------
# The 16h triple-barrier file is used because it gave the most balanced
# label distribution of the horizons tried.
INPUT_FILENAME = "triple_barrier_16h.parquet"
OUTPUT_FILENAME = "engineered_features_16h.parquet"
INPUT_PATH = os.path.join('../data', INPUT_FILENAME)
OUTPUT_PATH = os.path.join('../data', OUTPUT_FILENAME)

# --- Load --------------------------------------------------------------
print(f"Loading data from {INPUT_PATH}...")
df = pd.read_parquet(INPUT_PATH)

# Later cells read .hour / .dayofweek from the index, so it has to be a
# DatetimeIndex; coerce it when the parquet file did not store it as one.
index_is_datetime = isinstance(df.index, pd.DatetimeIndex)
if not index_is_datetime:
    df.index = pd.to_datetime(df.index)

print(f"Initial Shape: {df.shape}")
print("Data loaded successfully.")

Loading data from ../data\triple_barrier_16h.parquet...
Initial Shape: (9301, 12)
Data loaded successfully.


### 2. Time Standardization (UTC+2 -> UTC)

In [3]:
# Re-base the index from broker time (UTC+2) to UTC by subtracting a fixed
# two-hour offset. The index is only shifted, not timezone-localized.
# NOTE(review): this writes back onto `df`, so running the cell a second time
#   shifts by another 2 hours - re-run the load cell before re-running this one.
# NOTE(review): a constant offset assumes the broker clock never switches to
#   UTC+3 for daylight saving - confirm against the data source.
print("Shifting time from UTC+2 to UTC...")

broker_offset = pd.Timedelta(hours=2)
df.index = df.index - broker_offset

# Sanity check: 10:00 broker time should now read 08:00 UTC.
first_ts, last_ts = df.index[0], df.index[-1]
print(f"New start time (UTC): {first_ts}")
print(f"New end time (UTC): {last_ts}")

Shifting time from UTC+2 to UTC...
New start time (UTC): 2020-01-06 06:00:00
New end time (UTC): 2025-12-24 02:00:00


### 3. Aggressive Technical & Statistical Features

In [4]:
# Build every engineered feature on a copy so the loaded frame `df` is left
# untouched and this cell can be re-run safely.
df_feat = df.copy()

print("Generating features...")

# ==========================================
# 1. TIME & SESSION CONTEXT
# ==========================================
# Cyclical encoding: hour-of-day and day-of-week are mapped onto a circle with
# sine/cosine so that 23:00 and 00:00 (or Sunday and Monday) end up close
# together instead of at opposite ends of a linear scale.
hour_angle = 2 * np.pi * df_feat.index.hour / 24
day_angle = 2 * np.pi * df_feat.index.dayofweek / 7
df_feat['hour_sin'] = np.sin(hour_angle)
df_feat['hour_cos'] = np.cos(hour_angle)
df_feat['day_sin'] = np.sin(day_angle)
df_feat['day_cos'] = np.cos(day_angle)

# Session flags (0/1) derived from the index hour, which is expected to be UTC
# at this point. Both bounds are inclusive.
h = df_feat.index.hour
df_feat['sess_london'] = ((h >= 7) & (h <= 16)).astype(int)
df_feat['sess_ny'] = ((h >= 12) & (h <= 21)).astype(int)
df_feat['sess_tokyo'] = ((h >= 0) & (h <= 9)).astype(int)
df_feat['sess_sydney'] = ((h >= 21) | (h <= 6)).astype(int)  # wraps midnight

# Session overlaps: 1 only when both underlying session flags are 1.
df_feat['sess_london_ny'] = (df_feat['sess_london'] & df_feat['sess_ny']).astype(int)
df_feat['sess_tokyo_london'] = (df_feat['sess_tokyo'] & df_feat['sess_london']).astype(int)

# Late NY / early Sydney: intended to flag the thin period after the US close
# and before Tokyo is fully open.
df_feat['sess_dead_zone'] = ((h >= 21) | (h == 0)).astype(int)

# Asian lunch break (03:00-04:59): intended to flag a low-activity window.
df_feat['sess_asian_lunch'] = ((h >= 3) & (h <= 4)).astype(int)

# ==========================================
# 2. MARKET MICROSTRUCTURE (Spread & Vol)
# ==========================================
# Spread features are only built when the input carries a 'spread' column.
if 'spread' in df_feat.columns:
    # Spread as a fraction of price (e.g. 0.0001 / 1.1000).
    df_feat['spread_pct'] = df_feat['spread'] / df_feat['close']
    # Spread relative to its own 20-bar mean; values of 2-3 mean the spread is
    # 2-3x its recent average. Becomes inf/NaN when that mean is zero; see cleanup.
    df_feat['spread_shock'] = df_feat['spread'] / df_feat['spread'].rolling(20).mean()
    print("Spread features added.")

# Volume features: prefer 'tick_volume', fall back to 'volume', else skip.
vol_col = next((c for c in ('tick_volume', 'volume') if c in df_feat.columns), None)

if vol_col:
    # Volume relative to its 20-bar mean (a move on below-average volume is
    # treated as less trustworthy).
    df_feat['vol_rel'] = df_feat[vol_col] / df_feat[vol_col].rolling(20).mean()

    # Bar-over-bar percentage change of volume. A zero-volume previous bar
    # yields +/-inf here, which the cleanup step below converts to NaN.
    df_feat['vol_roc'] = df_feat[vol_col].pct_change()
    print(f"Volume features added using column: {vol_col}")

# ==========================================
# 3. VOLATILITY & STATS
# ==========================================
# Bollinger Bands: 20-bar SMA +/- 2 rolling standard deviations of close.
sma_20 = df_feat['close'].rolling(20).mean()
std_20 = df_feat['close'].rolling(20).std()
bb_upper = sma_20 + (std_20 * 2)
bb_lower = sma_20 - (std_20 * 2)

# bb_width: band width relative to the mean (squeeze vs. expansion).
# bb_pos:   where close sits inside the band (0 = lower band, 1 = upper band).
df_feat['bb_width'] = (bb_upper - bb_lower) / sma_20
df_feat['bb_pos'] = (df_feat['close'] - bb_lower) / (bb_upper - bb_lower)

# Rolling distribution shape. Note that skew and kurtosis are computed on the
# close *price level* inside each window, not on returns.
for window in [20, 50]:
    df_feat[f'roll_skew_{window}'] = df_feat['close'].rolling(window).skew()
    df_feat[f'roll_kurt_{window}'] = df_feat['close'].rolling(window).kurt()

# ATR as a fraction of price, only when an 'atr' column came with the labels.
if 'atr' in df_feat.columns:
    df_feat['atr_pct'] = df_feat['atr'] / df_feat['close']

# ==========================================
# 4. MOMENTUM & TREND
# ==========================================
# RSI using simple rolling means of gains and losses (not Wilder smoothing).
# The per-bar gain/loss series do not depend on the window, so they are
# computed once outside the loop.
delta = df_feat['close'].diff()
gains = delta.where(delta > 0, 0)
losses = -delta.where(delta < 0, 0)
for window in [7, 14, 21]:
    avg_gain = gains.rolling(window=window).mean()
    avg_loss = losses.rolling(window=window).mean()
    # avg_loss == 0 makes rs infinite, which maps to RSI = 100 (finite);
    # avg_gain == avg_loss == 0 gives NaN and the row is dropped in cleanup.
    rs = avg_gain / avg_loss
    df_feat[f'rsi_{window}'] = 100 - (100 / (1 + rs))

# MACD histogram: (EMA12 - EMA26) minus its own 9-span EMA signal line.
ema_12 = df_feat['close'].ewm(span=12, adjust=False).mean()
ema_26 = df_feat['close'].ewm(span=26, adjust=False).mean()
macd_line = ema_12 - ema_26
df_feat['macd_hist'] = macd_line - macd_line.ewm(span=9, adjust=False).mean()

# Distance from, and slope of, simple moving averages.
# dist_sma_*  : relative distance of close from the SMA.
# slope_sma_* : one-bar change of the SMA in raw price units (not normalised).
for period in [20, 50, 100, 200]:
    ma = df_feat['close'].rolling(period).mean()
    df_feat[f'dist_sma_{period}'] = (df_feat['close'] - ma) / ma
    df_feat[f'slope_sma_{period}'] = ma.diff()

# ==========================================
# 5. LAGS (Memory)
# ==========================================
# log_ret_lag_k = log(close_t / close_{t-k}): the cumulative log return over
# the last k bars ending at the current close (k = 1 is the current bar's
# return). It is not a single-bar return shifted back by k bars.
for lag in [1, 2, 3, 5, 8, 13]:
    df_feat[f'log_ret_lag_{lag}'] = np.log(df_feat['close'] / df_feat['close'].shift(lag))

# ==========================================
# CLEANUP
# ==========================================
original_len = len(df_feat)

# Ratio features divide by rolling means / previous values that can be zero,
# producing +/-inf. dropna() ignores inf, so convert them to NaN first; only
# the columns created in this cell are touched.
engineered_cols = [c for c in df_feat.columns if c not in df.columns]
df_feat[engineered_cols] = df_feat[engineered_cols].replace([np.inf, -np.inf], np.nan)

# Drop every row that has a NaN in any column. The longest look-back is
# slope_sma_200 (200-bar SMA plus one diff), which sets the warmup length; a
# NaN in any of the loaded columns removes the row as well.
df_feat = df_feat.dropna()

print("Feature Engineering Complete.")
print(f"Dropped {original_len - len(df_feat)} rows (warmup).")
print(f"Final Feature Count: {df_feat.shape[1]}")

Generating features...
Spread features added.
Volume features added using column: tick_volume
Feature Engineering Complete.
Dropped 200 rows (warmup).
Final Feature Count: 53


### Save Final Dataset

In [None]:
# Persist the engineered frame (index included) to the configured output path.
print(f"Saving processed data to {OUTPUT_PATH}...")
df_feat.to_parquet(OUTPUT_PATH, index=True)

# Sanity check: list every column that was written.
feature_names = df_feat.columns.tolist()
print("\n--- Feature List ---")
print(feature_names)

# Leave the tail as the cell's last expression so it renders as a rich table.
print("\n--- Sample Data (Last 5 rows) ---")
df_feat.tail()

### to be tried later

In [None]:
import pandas as pd
import pandas_ta as ta
import numpy as np
import os

# 1. SETUP
# Expects `df` from the earlier cells (OHLC columns plus a volume column).
# If reloading: df = pd.read_parquet('../data/engineered_features_16h.parquet')
# NOTE(review): this cell adds columns to `df` and drops rows from it in place,
#   so re-running the earlier feature cell afterwards will see the modified frame.


def _find_column(frame, prefix):
    """Return the name of the first column in ``frame`` starting with ``prefix``.

    pandas_ta builds column names from indicator parameters (for example the
    Keltner scalar may be rendered as ``2`` or ``2.0``), so indicator columns
    are looked up by prefix rather than by a hard-coded full name.

    Raises:
        KeyError: if no column starts with ``prefix``.
    """
    matches = [c for c in frame.columns if str(c).startswith(prefix)]
    if not matches:
        raise KeyError(f"No column starting with {prefix!r}; available: {list(frame.columns)}")
    return matches[0]


# Make sure a lower-case 'volume' column exists for the volume indicators.
if 'tick_volume' in df.columns:
    df['volume'] = df['tick_volume']
elif 'Volume' in df.columns:
    df['volume'] = df['Volume']

# ==========================================
# 2. DEFINING THE "GOLD STANDARD" STRATEGY
# ==========================================
# We only calculate what adds unique information.

RefinedStrategy = ta.Strategy(
    name="Refined_ML_Strategy",
    description="Non-redundant, stationary-focused indicators",
    ta=[
        # --- TREND STRENGTH (Is the market moving?) ---
        {"kind": "adx", "length": 14},   # Trend Strength
        {"kind": "aroon", "length": 25}, # Trend Start/End

        # --- BASELINES (To measure distance from) ---
        {"kind": "ema", "length": 50},
        {"kind": "ema", "length": 200},
        {"kind": "psar"},                # Parabolic SAR (Trailing Stop proxy)

        # --- MOMENTUM (Cyclical & Velocity) ---
        {"kind": "rsi", "length": 14},
        {"kind": "cci", "length": 20},   # Cyclical nature
        {"kind": "macd"},                # Trend Velocity
        {"kind": "stoch"},               # Overbought/Oversold
        {"kind": "willr"},               # Williams %R (Fast momentum)

        # --- VOLATILITY (Regime) ---
        {"kind": "bbands", "length": 20}, # Volatility Squeeze
        {"kind": "atr", "length": 14},    # Normalizer
        {"kind": "kc"},                   # Keltner Channels (Trend pullbacks)

        # --- VOLUME (Confirmation) ---
        {"kind": "obv"},      # Cumulative Flow
        {"kind": "cmf"},      # Institutional Flow
        {"kind": "mfi"},      # Momentum of Volume
    ]
)

print("Running Refined Strategy...")
df.ta.strategy(RefinedStrategy, cores=0)

# ==========================================
# 3. CRITICAL: STATIONARIZING FEATURES
# ==========================================
# Converting "Price Levels" to "Relative Percentages"

print("Post-processing features for Stationarity...")

# A. Distances to Averages (instead of raw EMA values)
# "Price is 2% above the 200 EMA" is better than "EMA is 1.0500"
df['dist_ema_50'] = (df['close'] - df['EMA_50']) / df['EMA_50']
df['dist_ema_200'] = (df['close'] - df['EMA_200']) / df['EMA_200']

# B. Distance to Parabolic SAR (stop-loss proxy)
# pandas_ta emits separate long (PSARl_*) and short (PSARs_*) columns; the
# long value is used where present and the short value fills the gaps.
psar_long = df[_find_column(df, 'PSARl_')]
psar_short = df[_find_column(df, 'PSARs_')]
psar_col = psar_long.fillna(psar_short)
df['dist_psar'] = (df['close'] - psar_col) / df['close']

# C. Bollinger Band features
# Bandwidth (BBB_*) and %B (BBP_*) are already relative measures and are kept
# under their pandas_ta names; the raw band levels are removed in section 4.

# D. Keltner Channel position: (Close - Lower) / (Upper - Lower)
# The channel columns are named KCL* (lower), KCB* (basis) and KCU* (upper).
kc_upper = df[_find_column(df, 'KCU')]
kc_lower = df[_find_column(df, 'KCL')]
df['kc_pos'] = (df['close'] - kc_lower) / (kc_upper - kc_lower)

# ==========================================
# 4. DROPPING RAW PRICE COLUMNS
# ==========================================
# The raw levels were only needed to derive the relative features above.
# Remove them so the model does not fit to absolute price levels.
# NOTE(review): ATR, OBV and the MACD columns are left in and are still in
#   price/volume units - decide whether they should be normalised as well.
raw_level_names = ('EMA_50', 'EMA_200')
raw_level_prefixes = (
    'PSARl_', 'PSARs_', 'PSARaf_', 'PSARr_',  # all Parabolic SAR outputs
    'KCL', 'KCB', 'KCU',                      # raw Keltner levels
    'BBL_', 'BBM_', 'BBU_',                   # raw Bollinger levels
)
cols_to_drop = [
    c for c in df.columns
    if c in raw_level_names or str(c).startswith(raw_level_prefixes)
]
df.drop(columns=cols_to_drop, inplace=True)

# Drop warmup rows (NaNs). The long/short PSAR columns must already be gone
# here, otherwise their gaps would remove many more rows.
df.dropna(inplace=True)

print(f"Final Feature Engineering Complete.")
print(f"Final Shape: {df.shape}")

# Save under a dedicated name so the OUTPUT_PATH used by the earlier save cell
# is not silently redirected to this file.
FINAL_OUTPUT_PATH = os.path.join('../data', "engineered_features_final.parquet")
df.to_parquet(FINAL_OUTPUT_PATH)
print(f"Saved to: {FINAL_OUTPUT_PATH}")

# Preview
print("\nFinal Feature List:")
print(df.columns.tolist())

In [None]:
# 1. Trend Indicators (Baselines)
# These indicators identify the general direction of the market and help traders "follow the trend". 
# Simple Moving Average (SMA): The average price over a set number of periods.
# Exponential Moving Average (EMA): Similar to SMA but weights recent prices more heavily.
# Weighted Moving Average (WMA): Assigns heavier weight to the most recent data points.
# Hull Moving Average (HMA): Designed to reduce lag and improve smoothness.
# Kaufman Adaptive Moving Average (KAMA): Adjusts its sensitivity based on market noise.
# Ichimoku Cloud (Kinko Hyo): A comprehensive system showing trend, support, and resistance.
# Parabolic SAR (Stop and Reverse): Uses dots to indicate potential trend reversals.
# Average Directional Index (ADX): Measures the strength of a trend.
# Aroon Indicator: Identifies when a trend is starting or changing.
# Aroon Oscillator: Measures the difference between Aroon Up and Aroon Down.
# Linear Regression Trendline: A straight line that best fits a set of price data.
# Double Exponential Moving Average (DEMA): Further reduces lag from standard EMAs.
# Triple Exponential Moving Average (TEMA): Uses triple smoothing to minimize lag.
# Supertrend: A trend-following indicator based on ATR.
# ZigZag: Filters out smaller price movements to show significant trends.
# Schaff Trend Cycle (STC): Combines MACD with a stochastic for faster trend signals.
# TRIX: A triple-smoothed exponential oscillator for trend identification.
# Detrended Price Oscillator (DPO): Removes trend to highlight short-term cycles.
# Alligator (Bill Williams): Uses three smoothed moving averages to identify trends.
# Gann Fans/Gann Lines: Based on geometric angles of price and time. 

# 2. Momentum Indicators (Oscillators)
# These measure the speed of price movements and identify overbought or oversold conditions.
# 21. Relative Strength Index (RSI): Ranges 0-100; signals overbought (>70) or oversold (<30).
# 22. Stochastic Oscillator: Compares closing price to its range over time.
# 23. MACD (Moving Average Convergence Divergence): Shows the relationship between two EMAs.
# 24. Commodity Channel Index (CCI): Identifies cyclical trends and reversals.
# 25. Williams %R: Shows where the current price is relative to the highest high.
# 26. Awesome Oscillator (AO): Measures market momentum using 34 and 5-period SMAs.
# 27. Momentum Indicator: Measures the rate of change of prices.
# 28. Rate of Change (ROC): Calculates the percentage change in price between periods.
# 29. Money Flow Index (MFI): Volume-weighted version of RSI.
# 30. Relative Vigor Index (RVI): Measures the strength of a trend by comparing closing prices.
# 31. Stochastic RSI: A stochastic applied to RSI values for increased sensitivity.
# 32. Ultimate Oscillator: Uses three different timeframes to reduce false signals.
# 33. Chande Momentum Oscillator (CMO): Calculates momentum based on unsmoothed data.
# 34. Gator Oscillator: Derived from the Alligator indicator to show trend changes.
# 35. DeMarker (DeM): Compares most recent maximum and minimum prices.
# 36. True Strength Index (TSI): A double-smoothed momentum oscillator.
# 37. Vortex Indicator: Two lines (+VI and -VI) that identify the start of a new trend.
# 38. Fisher Transform: Transforms prices into a Gaussian normal distribution.
# 39. Center of Gravity Oscillator: Identifies major turning points without lag.
# 40. Percentage Price Oscillator (PPO): Similar to MACD but shown in percentages. 

# 3. Volatility Indicators
# These measure how far price stretches from its mean, helping with risk management and breakout detection.
# 41. Bollinger Bands: A moving average with two standard deviation bands.
# 42. Average True Range (ATR): Measures the average range of price movement.
# 43. Standard Deviation: Measures how spread out price data is from the mean.
# 44. Keltner Channels: Volatility-based envelopes set above/below an EMA.
# 45. Donchian Channels: Shows the highest high and lowest low over a period.
# 46. Envelopes: Two moving averages set at a fixed percentage above and below price.
# 47. Chaikin Volatility: Measures the difference between high and low prices.
# 48. Bollinger Bandwidth: Measures the distance between upper and lower Bollinger Bands.
# 49. STARC Bands: Combines moving averages and ATR to create volatility channels.
# 50. Ulcer Index: Measures "stress" by analyzing the depth and duration of price drops.
# 51. Relative Volatility Index (RVI): Measures the direction of volatility.
# 52. Choppiness Index: Determines if the market is trending or "choppy".
# 53. Mass Index: Predicts reversals by measuring the narrowing/widening of price ranges.
# 54. Historical Volatility (HV): Measures the past standard deviation of an asset. 
    
# 4. Volume & Support/Resistance Indicators
# These measure market participation and key price levels.
# 55. On-Balance Volume (OBV): Relates volume to price change to confirm trends.
# 56. Accumulation/Distribution (A/D): Measures the cumulative flow of money.
# 57. Chaikin Money Flow (CMF): Measures the amount of Money Flow Volume over a period.
# 58. Volume Weighted Average Price (VWAP): Average price weighted by total volume.
# 59. Pivot Points (Standard): Key levels calculated from the previous day's H/L/C.
# 60. Fibonacci Retracements: Horizontal lines based on Fibonacci ratios to find support.
# 61. Volume Profile: Shows volume traded at specific price levels.
# 62. Ease of Movement (EOM): Relates price change to volume.
# 63. Money Flow Ratio: Compares positive and negative money flow.
# 64. Negative Volume Index (NVI): Focuses on days where volume decreased.
# 65. Positive Volume Index (PVI): Focuses on days where volume increased.
# 66. Price Volume Trend (PVT): Cumulative volume that adds a percentage of the day's volume.
# 67. Trade Volume Index (TVI): Used to determine whether an asset is being accumulated.
# 68. Market Profile: Visualizes price and volume over time (TPO).
# 69. Balance of Power (BOP): Measures the strength of buyers vs. sellers.
# 70. Camarilla Pivot Points: A variation of pivots providing tighter support/resistance.
# 71. Woodie's Pivot Points: Weighted differently to emphasize recent price action.
# 72. Fibonacci Extensions: Projects potential future profit-taking levels.
# 73. Elder Force Index: Combines price movement and volume to measure trend power.
# 74. Vortex Indicator (VI): Identifies the start of a trend based on high/low distance.
# 75. Chaikin Oscillator: Applies MACD to the Accumulation/Distribution line.
# 76. Force Index: Uses price, volume, and time to identify trend strength.
# 77. Klinger Oscillator: Compares volume flowing through an asset with price

# 5. Specialized & Hybrid Indicators
# Fractals (Bill Williams): Arrows that highlight local high and low points.
# Heiken Ashi: Specialized candlesticks that filter market noise.
# TD Sequential: Identifies trend exhaustion and potential price flips.
# Murrey Math Lines: Support/resistance levels based on Gann's theory.
# Psychological Line: Measures the ratio of rising days to total days.
# Volume Rate of Change (VROC): Percentage change in volume.
# Typical Price: (High + Low + Close) / 3.
# Median Price: (High + Low) / 2.
# Adaptive Lagging Line: Follows price but smooths out data without excessive lag.
# Trend Intensity Index (TII): Measures the strength of a trend.
# Rainbow Moving Average: Multiple MAs used together to show trend maturity.
# Linear Regression Slope: Measures the rate of change of a linear regression line.
# CVI (Chart Volatility Index): A custom measure of intraday volatility.
# Gpivot Ressup: Dynamic support and resistance pivot variation.
# DMI (Directional Movement Index): Consists of +DI and -DI lines.
# TMA (Triangular Moving Average): Double-smoothed version of the SMA.
# QQE (Quantitative Qualitative Estimation): Smoother RSI-based indicator.
# RSX (Relative Strength Index smoothed): A noise-free version of RSI.
# ATR Trailing Stop: A stop-loss level that moves with the price.
# Connors RSI: Combines RSI, Up/Down Length, and Rate of Change.
# Forex Sentiment: Measures the percentage of long vs. short retail positions.
# Currency Strength Meter: Compares one currency's performance against all others.
# Market Sessions Indicator: Highlights Asian, London, and New York trading hours. 



