### Imports & Load Labeled Data

In [2]:
import pandas as pd
import numpy as np
# import pandas_ta as ta
import matplotlib.pyplot as plt

# Read the LABELED dataset from parquet (ensure you saved the 0.8 version).
df = pd.read_parquet(
    path="../data/EURUSD_D1_Labeled.parquet",
)

# Report the row count and the column names that were loaded.
print(f"Loaded {len(df)} rows with columns: {df.columns.tolist()}")

Loaded 1489 rows with columns: ['open', 'high', 'low', 'close', 'tick_volume', 'spread', 'real_volume', 'atr', 'label']


### Add Technical Indicators (The "Evidence")

In [3]:
# --- REPLACEMENT CODE: NO PANDAS_TA REQUIRED ---

def calculate_rsi(series, period=14):
    """Relative Strength Index based on simple rolling means of gains and losses.

    Parameters
    ----------
    series : pandas.Series
        Price series; one-step differences are taken internally.
    period : int, default 14
        Number of price changes averaged in each rolling window.

    Returns
    -------
    pandas.Series
        Values between 0 and 100, aligned to ``series``. The first ``period``
        entries are NaN because ``period`` real price changes are required.
        A window with no losses yields 100; a window with no price movement
        at all yields NaN (0 / 0).

    Notes
    -----
    Gains and losses are averaged with a plain rolling mean, not Wilder's
    exponential smoothing, so values differ from libraries that use the
    latter.
    """
    delta = series.diff(1)

    # clip() preserves NaN, so the undefined first difference (and any gap in
    # the data) keeps the window incomplete instead of being counted as a
    # flat bar with zero gain and zero loss.
    gain = delta.clip(lower=0).rolling(window=period).mean()
    loss = (-delta).clip(lower=0).rolling(window=period).mean()

    # loss == 0 with gain > 0 gives rs = inf, which resolves to RSI = 100.
    rs = gain / loss
    return 100 - (100 / (1 + rs))

def add_features(df):
    """Return a copy of ``df`` extended with indicator, calendar and return columns.

    The input frame is not modified. The function reads the 'close' column,
    the 'atr' column when present (otherwise 'high' and 'low'), and the
    ``dayofweek`` / ``month`` attributes of the index.

    Columns appended, in order: rsi, rsi_slope, rsi_accel, sma_50, sma_200,
    dist_sma50, dist_sma200, bb_width, bb_position, atr_rel, day_of_week,
    month, return_t-1, return_t-2, return_t-3, return_t-5.

    Rolling indicators are NaN until their window is filled; the caller
    decides how to treat those rows.
    """
    enriched = df.copy()
    price = enriched['close']

    # --- A. MOMENTUM (RSI) ---
    # RSI level, its first difference and the difference of that difference.
    rsi = calculate_rsi(price, period=14)
    rsi_slope = rsi.diff(1)
    enriched['rsi'] = rsi
    enriched['rsi_slope'] = rsi_slope
    enriched['rsi_accel'] = rsi_slope.diff(1)

    # --- B. TREND (SMA) ---
    sma_fast = price.rolling(window=50).mean()
    sma_slow = price.rolling(window=200).mean()
    enriched['sma_50'] = sma_fast
    enriched['sma_200'] = sma_slow

    # Relative distance of the close from each moving average.
    enriched['dist_sma50'] = (price - sma_fast) / sma_fast
    enriched['dist_sma200'] = (price - sma_slow) / sma_slow

    # --- C. VOLATILITY (Bollinger Bands) ---
    bb_window = 20
    bb_mult = 2.0
    close_window = price.rolling(window=bb_window)
    middle = close_window.mean()
    offset = close_window.std() * bb_mult
    upper = middle + offset
    lower = middle - offset
    band_span = upper - lower

    # Width is the band span relative to the middle band; position is where
    # the close sits between the lower and the upper band.
    enriched['bb_width'] = band_span / middle
    enriched['bb_position'] = (price - lower) / band_span

    # ATR ratio (relative volatility): current value over its 30-bar mean.
    # Without an 'atr' column, the high-low range stands in for it.
    if 'atr' not in enriched.columns:
        bar_range = enriched['high'] - enriched['low']
        enriched['atr_rel'] = bar_range.rolling(14).mean() / bar_range.rolling(30).mean()
    else:
        atr = enriched['atr']
        enriched['atr_rel'] = atr / atr.rolling(30).mean()

    # --- D. TIME / SEASONALITY ---
    calendar = enriched.index
    enriched['day_of_week'] = calendar.dayofweek
    enriched['month'] = calendar.month

    # --- E. PRICE ACTION (Lagged Returns) ---
    # Percentage change of the close over the previous `lag` rows.
    for lag in (1, 2, 3, 5):
        enriched[f'return_t-{lag}'] = price.pct_change(lag)

    return enriched

print("🛠️ Engineering Features (Manual Mode)...")

# Build the features and drop every row that contains a NaN in one
# expression (indicators need history to warm up). Chaining avoids mutating
# the frame in place.
df_features = add_features(df).dropna()

# If the input is shorter than the longest look-back window, every row is
# dropped; fail here with a clear message rather than downstream.
assert not df_features.empty, "No rows left after dropping NaN values - not enough history for the indicators?"

print(f"✅ Features added. Final dataset shape: {df_features.shape}")
display(df_features.tail(3))

🛠️ Engineering Features (Manual Mode)...
✅ Features added. Final dataset shape: (1290, 25)


Unnamed: 0_level_0,open,high,low,close,tick_volume,spread,real_volume,atr,label,rsi,...,dist_sma200,bb_width,bb_position,atr_rel,day_of_week,month,return_t-1,return_t-2,return_t-3,return_t-5
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-10-09,1.16256,1.16482,1.1542,1.15634,96127,2,0,0.007423,1.0,32.18232,...,0.031027,0.023588,-0.106287,0.96176,3,10,-0.005641,-0.008021,-0.012663,-0.012991
2025-10-10,1.15578,1.16306,1.15569,1.16203,85604,8,0,0.007419,-1.0,32.007913,...,0.035494,0.025069,0.141704,0.964428,4,10,0.004921,-0.000748,-0.00314,-0.01028
2025-10-13,1.1609,1.16297,1.15575,1.15696,79199,8,0,0.007405,1.0,27.566575,...,0.030434,0.027575,0.046554,0.964894,0,10,-0.004363,0.000536,-0.005108,-0.012133


### The "Golden Rule" Check (No Peeking!)

In [4]:
# Sanity Check
# Ensure that 'label' is NOT derived from any feature we just created.
# (Labels look forward, Features look backward. They must not cross.)

print("Columns available for training:")
print(df_features.columns.tolist())

# Correlation Check
# If any feature has 0.99 correlation with the label, we messed up (leakage).
correlations = df_features.corr()['label'].sort_values()
print("\nTop Correlations with Target (Label):")
print(correlations)

# Enforce the rule instead of relying on eyeballing the printout.
# The label's own entry (always 1.0) is excluded, and the absolute value is
# used because a near -1 correlation is just as suspicious as a near +1 one.
# Constant columns produce NaN correlations, which never trip the check.
LEAK_THRESHOLD = 0.99
feature_corr = correlations.drop('label')
suspects = feature_corr[feature_corr.abs() >= LEAK_THRESHOLD]
assert suspects.empty, f"Possible label leakage in: {suspects.index.tolist()}"

Columns available for training:
['open', 'high', 'low', 'close', 'tick_volume', 'spread', 'real_volume', 'atr', 'label', 'rsi', 'rsi_slope', 'rsi_accel', 'sma_50', 'sma_200', 'dist_sma50', 'dist_sma200', 'bb_width', 'bb_position', 'atr_rel', 'day_of_week', 'month', 'return_t-1', 'return_t-2', 'return_t-3', 'return_t-5']

Top Correlations with Target (Label):
close         -0.063758
low           -0.062362
high          -0.061950
open          -0.061514
sma_50        -0.057361
sma_200       -0.047809
return_t-2    -0.038270
return_t-5    -0.037785
return_t-3    -0.037757
dist_sma200   -0.031393
return_t-1    -0.026050
dist_sma50    -0.020727
bb_position   -0.013961
spread        -0.010844
rsi           -0.009561
day_of_week   -0.007386
rsi_slope     -0.004175
atr_rel       -0.001375
month          0.008920
rsi_accel      0.037901
bb_width       0.039599
atr            0.051537
tick_volume    0.055094
label          1.000000
real_volume         NaN
Name: label, dtype: float64
