# XAUUSD AI Trading Model - Kaggle Training

Train an LSTM model on 55 years of XAUUSD historical data with multi-timeframe features.

**Kaggle Advantages:**
- Free GPU (P100 or T4)
- 30GB RAM (vs Colab's 12GB)
- 9-hour session limit
- Persistent datasets

**Before running:** Upload your `XAUUSD_HISTORICAL_DATA` folder as a Kaggle Dataset

## Step 1: Setup Environment

In [None]:
# Check GPU availability: an empty list means TensorFlow cannot see any GPU.
import tensorflow as tf
print("GPU Available:", tf.config.list_physical_devices('GPU'))
print("TensorFlow version:", tf.__version__)

# Check system resources via notebook shell escapes:
# `free -h` reports host memory, `nvidia-smi` reports GPU status.
!free -h
!nvidia-smi

In [None]:
# Install required packages
!pip install -q pandas-ta joblib
print("✓ Packages installed")

## Step 2: Import Dataset

**IMPORTANT:** Before running this cell:
1. Go to "Add Data" → "Upload Dataset"
2. Upload your `XAUUSD_HISTORICAL_DATA` folder
3. Name it: `xauusd-historical-data`
4. Make it public or private
5. Then run this cell

In [None]:
# List available datasets (notebook shell escape listing /kaggle/input)
!ls /kaggle/input/

# Set data path (update if your dataset name is different).
# DATA_DIR is read by the Step 4 cell to locate the raw CSV exports.
DATA_DIR = '/kaggle/input/xauusd-historical-data'

# Verify data exists: IPython substitutes the Python variable for $DATA_DIR;
# `head -10` limits the listing to the first 10 entries.
!ls -lh "$DATA_DIR"/*.csv | head -10

## Step 3: Data Processing Functions

In [None]:
import pandas as pd
import numpy as np
import pandas_ta as ta
import os
import glob

# Configuration
# Directory where the Step 4 cell writes one processed CSV per timeframe.
OUTPUT_DIR = '/kaggle/working/processed_data'
# exist_ok=True keeps this cell safe to re-run when the directory already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def load_mt_csv(filepath):
    """Load a MetaTrader tab-separated history export into an OHLCV frame.

    The file is read with fixed column names. If its first row is the export's
    own ``<DATE> <TIME> ...`` header, that row is discarded. DATE and TIME are
    combined into a ``DateTime`` index; only Open, High, Low, Close and Volume
    (taken from the TICKVOL column) are kept and coerced to numeric. Rows with
    any value that could not be parsed are dropped.

    Args:
        filepath: Path to the tab-separated CSV file.

    Returns:
        A DataFrame indexed by ``DateTime`` with columns Open, High, Low,
        Close, Volume, or ``None`` when the file cannot be read or parsed
        (including when no valid rows remain). The error is printed rather
        than raised so the calling cell can continue with other timeframes.
    """
    print(f"\nLoading: {os.path.basename(filepath)}")

    try:
        raw = pd.read_csv(
            filepath,
            sep='\t',
            skiprows=0,
            names=['DATE', 'TIME', 'OPEN', 'HIGH', 'LOW', 'CLOSE', 'TICKVOL', 'VOL', 'SPREAD'],
            skipinitialspace=True
        )

        # Column names are supplied above, so a header line in the file shows
        # up as the first data row and has to be removed.
        if raw['DATE'].iloc[0] == '<DATE>':
            raw = raw.iloc[1:]

        timestamps = pd.to_datetime(
            raw['DATE'].astype(str) + ' ' + raw['TIME'].astype(str),
            format='%Y.%m.%d %H:%M:%S'
        )

        # Build a fresh frame instead of assigning into slices of `raw`; this
        # avoids chained-assignment (SettingWithCopy) problems.
        df = raw[['OPEN', 'HIGH', 'LOW', 'CLOSE', 'TICKVOL']].apply(
            pd.to_numeric, errors='coerce'
        )
        df.columns = ['Open', 'High', 'Low', 'Close', 'Volume']
        df.index = pd.DatetimeIndex(timestamps, name='DateTime')
        df = df.dropna()

        print(f"  ✓ Loaded {len(df):,} bars")
        # Indexing an empty frame raises IndexError, which is reported below
        # and turns "no usable rows" into a None result.
        print(f"  ✓ Range: {df.index[0]} to {df.index[-1]}")

        return df

    except Exception as e:
        # Deliberately broad: this is a best-effort loader for a notebook loop.
        print(f"  ✗ Error: {e}")
        return None

print("✓ Data processing functions loaded")

## Step 4: Process Historical Data

In [None]:
# Priority timeframes: label -> glob pattern of the raw export inside DATA_DIR
timeframes = {
    'M5': 'XAUUSD_M5_*.csv',
    'M15': 'XAUUSD_M15_*.csv',
    'H1': 'XAUUSD_H1_*.csv',
    'H4': 'XAUUSD_H4_*.csv'
}

# Successfully loaded frames, keyed by timeframe label
processed_data = {}

for tf_name, pattern in timeframes.items():
    print(f"\n{'='*60}")
    print(f"Processing: {tf_name}")
    print(f"{'='*60}")

    # glob returns files in arbitrary order; sort so the file that gets picked
    # is the same on every run.
    matching_files = sorted(glob.glob(os.path.join(DATA_DIR, pattern)))

    if not matching_files:
        print(f"  ✗ No file found for {tf_name}")
        continue

    if len(matching_files) > 1:
        print(f"  ! {len(matching_files)} files match; using {os.path.basename(matching_files[0])}")

    df = load_mt_csv(matching_files[0])
    if df is None:
        # load_mt_csv already printed the reason
        continue

    # Save processed data
    output_file = os.path.join(OUTPUT_DIR, f"XAUUSD_{tf_name}_processed.csv")
    df.to_csv(output_file)
    processed_data[tf_name] = df
    print(f"  ✓ Saved: {output_file}")

print(f"\n{'='*60}")
print("✓ Data Processing Complete")
print(f"{'='*60}")

# The loop variable must not be called `tf`: that name is the TensorFlow
# module alias and rebinding it breaks every later `tf.` call.
for tf_name, frame in processed_data.items():
    print(f"{tf_name}: {len(frame):,} bars")

## Step 5: Model Training Code

In [None]:
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from tensorflow.keras import mixed_precision
import joblib

# Enable mixed precision for faster training on GPU.
# The 'mixed_float16' policy is installed globally, so it applies to every
# Keras layer created after this point.
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)
print("✓ Mixed precision enabled")

# Configuration
# Number of consecutive rows in each input window built by preprocess_data.
SEQ_LEN = 120
# How many rows ahead the Log_Return target is taken from.
FUTURE_TARGET = 1
# Where the processed per-timeframe CSVs are read from (same path as OUTPUT_DIR).
PROCESSED_DATA_DIR = '/kaggle/working/processed_data'

def load_data(csv_path):
    """Read a processed CSV into a frame indexed by its ``DateTime`` column.

    Returns ``None`` (without printing anything) when *csv_path* does not
    exist; otherwise prints the file name and row count and returns the frame.
    """
    if not os.path.exists(csv_path):
        return None

    print(f"Loading: {os.path.basename(csv_path)}...")
    frame = pd.read_csv(csv_path, index_col='DateTime', parse_dates=['DateTime'])
    print(f"  ✓ {len(frame):,} rows")
    return frame

def add_smc_indicators(df):
    """Add swing-high / swing-low structure columns to *df* in place.

    A bar is a swing high (low) when its High (Low) is the extreme of the
    5-bar window centred on it. That window contains the two bars *after* the
    candidate, so a swing only becomes known two bars later. Every column
    written here is therefore delayed by that confirmation lag, so that a row
    never contains information from bars that follow it.

    Columns added:
        Swing_High / Swing_Low: True on the bar where a swing is confirmed
            (the swing itself happened ``window // 2`` bars earlier).
        Last_Swing_High / Last_Swing_Low: price of the most recently
            confirmed swing; 0 before the first one.
        Dist_to_High / Dist_to_Low: ``Last_Swing_High - Close`` and
            ``Close - Last_Swing_Low``; 0 before the first confirmed swing.

    Only the columns created here are NaN-filled. Other columns of *df* are
    left untouched so that indicator warm-up NaNs are not turned into zeros.

    Args:
        df: DataFrame with 'High', 'Low' and 'Close' columns.

    Returns:
        The same DataFrame object with the columns above added.
    """
    window = 5
    confirm_lag = window // 2  # bars needed after a pivot before it is known

    pivot_high = df['High'].rolling(window=window, center=True).max() == df['High']
    pivot_low = df['Low'].rolling(window=window, center=True).min() == df['Low']

    # Move each flag to the bar on which the pivot can actually be observed.
    df['Swing_High'] = pivot_high.shift(confirm_lag, fill_value=False)
    df['Swing_Low'] = pivot_low.shift(confirm_lag, fill_value=False)

    # On a confirmation bar the pivot price is the High/Low from
    # `confirm_lag` bars earlier; carry it forward until the next swing.
    df['Last_Swing_High'] = df['High'].shift(confirm_lag).where(df['Swing_High']).ffill()
    df['Last_Swing_Low'] = df['Low'].shift(confirm_lag).where(df['Swing_Low']).ffill()
    df['Dist_to_High'] = df['Last_Swing_High'] - df['Close']
    df['Dist_to_Low'] = df['Close'] - df['Last_Swing_Low']

    # Fill only what this function produced (DataFrame.ffill replaces the
    # deprecated fillna(method='ffill')).
    level_cols = ['Last_Swing_High', 'Last_Swing_Low', 'Dist_to_High', 'Dist_to_Low']
    df[level_cols] = df[level_cols].ffill().fillna(0)
    return df

def add_technical_indicators(df, prefix=''):
    """Add trend, momentum, volatility and volume features to an OHLCV frame.

    Columns added (each name is prefixed with *prefix*): EMA_50, EMA_200, RSI,
    MACD, ATR, BB_UPPER, BB_LOWER, Volume_MA, Volume_Ratio, Log_Return. When
    *prefix* is empty the swing columns from ``add_smc_indicators`` are added
    as well. Rows that contain NaN in any column (indicator warm-up) are
    dropped in place before returning.

    Args:
        df: DataFrame with 'High', 'Low', 'Close' and optionally 'Volume'.
        prefix: String prepended to every indicator column name.

    Returns:
        The DataFrame with indicator columns added and NaN rows removed.
    """
    # Run the swing detection first, while the frame holds only price data:
    # any frame-wide NaN filling done there must not reach the indicator
    # columns below, whose warm-up NaNs are meant to be dropped, not zeroed.
    if prefix == '':
        df = add_smc_indicators(df)

    close = df['Close']
    df[f'{prefix}EMA_50'] = ta.ema(close, length=50)
    df[f'{prefix}EMA_200'] = ta.ema(close, length=200)
    df[f'{prefix}RSI'] = ta.rsi(close, length=14)

    macd = ta.macd(close)
    if isinstance(macd, pd.DataFrame):
        # First column of the MACD frame is the MACD line itself
        df[f'{prefix}MACD'] = macd.iloc[:, 0]
    else:
        df[f'{prefix}MACD'] = macd

    df[f'{prefix}ATR'] = ta.atr(df['High'], df['Low'], close, length=14)

    bb = ta.bbands(close, length=20)
    if bb is not None:
        # pandas-ta orders the bands lower, mid, upper (BBL_*, BBM_*, BBU_*),
        # so pick them by name; the positional fallback keeps that order.
        lower_names = [c for c in bb.columns if str(c).startswith('BBL_')]
        upper_names = [c for c in bb.columns if str(c).startswith('BBU_')]
        lower_band = bb[lower_names[0]] if lower_names else bb.iloc[:, 0]
        upper_band = bb[upper_names[0]] if upper_names else bb.iloc[:, 2]
        df[f'{prefix}BB_UPPER'] = upper_band
        df[f'{prefix}BB_LOWER'] = lower_band

    if 'Volume' in df.columns:
        df[f'{prefix}Volume_MA'] = df['Volume'].rolling(window=20).mean()
        ratio = df['Volume'] / df[f'{prefix}Volume_MA']
        # A zero moving average yields +/-inf, which dropna() would keep and
        # which later breaks scaling; mark those rows as missing instead.
        df[f'{prefix}Volume_Ratio'] = ratio.replace([np.inf, -np.inf], np.nan)

    df[f'{prefix}Log_Return'] = np.log(close / close.shift(1))
    df.dropna(inplace=True)
    return df

def align_higher_timeframe(df_main, df_htf, prefix, htf_lag=1):
    """Attach higher-timeframe indicator columns to the main-timeframe frame.

    All columns of *df_htf* except Open/High/Low/Close/Volume are renamed with
    *prefix*, left-joined onto *df_main* by timestamp and forward-filled so
    each main row carries the most recent higher-timeframe values.

    Args:
        df_main: Lower-timeframe frame; its index defines the output rows.
        df_htf: Higher-timeframe frame with indicator columns.
        prefix: String prepended to each higher-timeframe column name.
        htf_lag: Number of higher-timeframe bars to delay the features by
            before joining. With the default of 1, a main row only sees a
            higher-timeframe bar once the next one has started. Pass 0 to
            join on the raw timestamps.
            NOTE(review): this assumes bar timestamps mark the bar's open, as
            in MetaTrader exports - confirm for other data sources.

    Returns:
        A new DataFrame: *df_main* plus the prefixed higher-timeframe columns.

    Raises:
        ValueError: If *htf_lag* is negative (that would pull future bars in).
    """
    if htf_lag < 0:
        raise ValueError("htf_lag must be >= 0")

    print(f"  Aligning {prefix}...")
    base_cols = ('Open', 'High', 'Low', 'Close', 'Volume')
    htf_cols = [col for col in df_htf.columns if col not in base_cols]
    htf = df_htf[htf_cols].copy()

    # Boolean flags become object dtype once NaN is introduced by the shift or
    # the join; store them as 0.0/1.0 so the column stays numeric.
    bool_cols = htf.select_dtypes(include='bool').columns
    htf = htf.astype({col: 'float64' for col in bool_cols})

    if htf_lag:
        htf = htf.shift(htf_lag)

    htf.columns = [f'{prefix}{col}' for col in htf.columns]
    merged = df_main.join(htf, how='left')
    # DataFrame.ffill replaces the deprecated fillna(method='ffill')
    return merged.ffill()

def preprocess_data(df, use_sampling=True):
    """Turn the indicator frame into scaled, windowed training arrays.

    Steps:
      1. Add ``Target``: ``Log_Return`` taken ``FUTURE_TARGET`` rows ahead,
         computed on the full series so it always refers to the real next bar.
      2. If *use_sampling*, keep every row from the last 730 days and a
         seeded random 20% of the older rows.
      3. Drop rows containing NaN and min-max scale features and target.
      4. Build windows of ``SEQ_LEN`` consecutive rows; each window is paired
         with the target of its last row.

    Args:
        df: Frame with a 'Log_Return' column plus the feature columns. It is
            copied, so the caller's frame is not modified.
        use_sampling: Whether to thin out rows older than 730 days.

    Returns:
        Tuple ``(X, y, scaler, target_scaler, feature_cols)`` where ``X`` has
        shape (samples, SEQ_LEN, n_features), ``y`` has shape (samples, 1),
        the scalers are the fitted ``MinMaxScaler`` objects and
        ``feature_cols`` lists the feature column names in the order used.

    Raises:
        ValueError: If fewer than ``SEQ_LEN`` usable rows remain.
    """
    # Copy so neither the caller's frame nor a slice of it is written to.
    df = df.copy()

    # The target must be derived before any rows are removed: after sampling,
    # "the next row" is no longer the next bar.
    df['Target'] = df['Log_Return'].shift(-FUTURE_TARGET)

    if use_sampling:
        print(f"\nApplying sampling...")
        print(f"  Original: {len(df):,} rows")

        cutoff_date = df.index[-1] - pd.Timedelta(days=730)
        df_recent = df[df.index >= cutoff_date]
        df_old = df[df.index < cutoff_date]

        if len(df_old) > 0:
            sample_size = int(len(df_old) * 0.2)
            df_old_sampled = df_old.sample(n=sample_size, random_state=42).sort_index()
            df = pd.concat([df_old_sampled, df_recent])
        else:
            df = df_recent

        print(f"  Sampled: {len(df):,} rows")
        # NOTE(review): windows built over the sampled (older) region span
        # non-adjacent bars; consider sampling contiguous blocks instead.

    df = df.dropna()

    feature_cols = [
        'Close', 'RSI', 'MACD', 'ATR', 'EMA_50', 'EMA_200', 'BB_UPPER', 'BB_LOWER',
        'Dist_to_High', 'Dist_to_Low', 'Volume_MA', 'Volume_Ratio'
    ]

    # Every higher-timeframe column (names starting with 'HTF') is a feature
    htf_features = [col for col in df.columns if col.startswith('HTF')]
    feature_cols.extend(htf_features)
    feature_cols = [col for col in feature_cols if col in df.columns]

    print(f"\nUsing {len(feature_cols)} features")

    if len(df) < SEQ_LEN:
        raise ValueError(
            f"Need at least {SEQ_LEN} rows to build a sequence, got {len(df)}"
        )

    # NOTE(review): both scalers are fitted on all rows, including those the
    # caller later holds out for testing; fit on the training portion only if
    # an unbiased test score is required.
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(df[feature_cols])

    target_scaler = MinMaxScaler()
    scaled_target = target_scaler.fit_transform(df[['Target']])

    X, y = [], []
    print(f"Creating sequences...")
    n_rows = len(scaled_data)
    for end in range(SEQ_LEN, n_rows + 1):
        X.append(scaled_data[end - SEQ_LEN:end])
        # Target of the window's last row, i.e. the return of the bar that
        # immediately follows the window (previously one bar further ahead).
        y.append(scaled_target[end - 1])
        if end % 50000 == 0:
            print(f"  {end:,} / {n_rows:,}")

    return np.array(X), np.array(y), scaler, target_scaler, feature_cols

def build_model(input_shape):
    """Build and compile the stacked-LSTM regression model.

    Architecture: three LSTM layers (256, 256, 128 units), each followed by
    Dropout(0.4) and BatchNormalization, then Dense(64, relu), Dropout(0.3)
    and a single linear output. All LSTM/Dense(64) kernels use L2(0.001)
    regularisation. Compiled with Adam, MSE loss and MAE as a metric.

    Args:
        input_shape: ``(timesteps, n_features)`` of one input sequence.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    model.add(LSTM(units=256, return_sequences=True, input_shape=input_shape, kernel_regularizer=l2(0.001)))
    model.add(Dropout(0.4))
    model.add(BatchNormalization())
    model.add(LSTM(units=256, return_sequences=True, kernel_regularizer=l2(0.001)))
    model.add(Dropout(0.4))
    model.add(BatchNormalization())
    model.add(LSTM(units=128, return_sequences=False, kernel_regularizer=l2(0.001)))
    model.add(Dropout(0.4))
    model.add(BatchNormalization())
    model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.001)))
    model.add(Dropout(0.3))
    # Keep the output in float32 even under a mixed_float16 global policy:
    # float16 outputs lose precision for small regression targets and in the
    # loss computation.
    model.add(Dense(1, activation='linear', dtype='float32'))
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])
    return model

print("✓ Training functions loaded")

## Step 6: Train Model

In [None]:
print("\n" + "="*70)
print("TRAINING XAUUSD MODEL")
print("="*70)

# Load data
print("\n--- Loading Data ---")
df_5m = load_data(os.path.join(PROCESSED_DATA_DIR, 'XAUUSD_M5_processed.csv'))
df_15m = load_data(os.path.join(PROCESSED_DATA_DIR, 'XAUUSD_M15_processed.csv'))
df_1h = load_data(os.path.join(PROCESSED_DATA_DIR, 'XAUUSD_H1_processed.csv'))
df_4h = load_data(os.path.join(PROCESSED_DATA_DIR, 'XAUUSD_H4_processed.csv'))

# M5 is the base timeframe; without it nothing below can run, so fail with a
# clear message instead of an AttributeError on None.
if df_5m is None:
    raise FileNotFoundError(
        f"XAUUSD_M5_processed.csv not found in {PROCESSED_DATA_DIR}; run Step 4 first"
    )

# Add indicators
print("\n--- Adding Indicators ---")
df_5m = add_technical_indicators(df_5m, prefix='')

# Higher timeframes are optional: each one that was loaded gets its own
# indicators and is merged onto the M5 frame under a distinct prefix.
if df_15m is not None:
    df_15m = add_technical_indicators(df_15m, prefix='')
    df_5m = align_higher_timeframe(df_5m, df_15m, 'HTF15_')

if df_1h is not None:
    df_1h = add_technical_indicators(df_1h, prefix='')
    df_5m = align_higher_timeframe(df_5m, df_1h, 'HTF1H_')

if df_4h is not None:
    df_4h = add_technical_indicators(df_4h, prefix='')
    df_5m = align_higher_timeframe(df_5m, df_4h, 'HTF4H_')

# Preprocess
print("\n--- Preprocessing ---")
X, y, scaler, target_scaler, features = preprocess_data(df_5m, use_sampling=True)

# Split chronologically: the last 10% of sequences is the held-out test set
split = int(len(X) * 0.9)
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

print(f"\n--- Data Split ---")
print(f"Training: {X_train.shape[0]:,} sequences")
print(f"Testing: {X_test.shape[0]:,} sequences")
print(f"Features: {X_train.shape[2]}")

# Save scalers
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(target_scaler, 'target_scaler.pkl')
print("\n✓ Scalers saved")

# Build model
print("\n--- Building Model ---")
model = build_model((X_train.shape[1], X_train.shape[2]))
print(f"Parameters: {model.count_params():,}")

# Callbacks
checkpoint = ModelCheckpoint("best_xauusd_model.keras", save_best_only=True,
                            monitor='val_loss', mode='min', verbose=1)
early_stop = EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3,
                              min_lr=0.00001, verbose=1)

# Train
print("\n--- Training ---")
history = model.fit(
    X_train, y_train,
    epochs=30,
    batch_size=64,
    # Validate on the last 10% of the training sequences rather than on the
    # test set: checkpointing and early stopping select the model on this
    # data, so using X_test here would bias the Step 7 evaluation.
    validation_split=0.1,
    callbacks=[checkpoint, early_stop, reduce_lr],
    verbose=1
)

print("\n" + "="*70)
print("✓ TRAINING COMPLETE")
print("="*70)

## Step 7: Evaluate Model

In [None]:
from sklearn.metrics import mean_absolute_error
# Imported directly so this cell does not rely on the `tf` alias, which is
# easy to rebind by accident in a notebook (for example as a loop variable).
from tensorflow.keras.models import load_model

# Load the best checkpoint and the scaler needed to undo target scaling
model = load_model('best_xauusd_model.keras')
target_scaler = joblib.load('target_scaler.pkl')

# Predict on the held-out sequences and map both sides back to log returns
predictions = model.predict(X_test)
predictions_actual = target_scaler.inverse_transform(predictions)
y_test_actual = target_scaler.inverse_transform(y_test)

# Metrics
mae = mean_absolute_error(y_test_actual, predictions_actual)
# Convert the log-return error to dollars using the latest close in the data
# instead of a hard-coded price level.
reference_price = float(df_5m['Close'].iloc[-1])
print(f"\n{'='*60}")
print("EVALUATION RESULTS")
print(f"{'='*60}")
print(f"\nMAE on Log Returns: {mae:.6f}")
print(f"Approximate Price Error: ${mae * reference_price:.2f}")

# Directional accuracy: a hit is a prediction with the same non-zero sign as
# the realised return (a zero on either side counts as a miss).
actual_sign = np.sign(y_test_actual[:, 0])
predicted_sign = np.sign(predictions_actual[:, 0])
hits = (actual_sign == predicted_sign) & (actual_sign != 0)
accuracy = float(hits.mean()) * 100 if len(hits) else float('nan')
print(f"\nDirectional Accuracy: {accuracy:.2f}%")
print(f"\n{'='*60}")

## Step 8: Download Model Files

In [None]:
# Verify files exist
!ls -lh best_xauusd_model.keras scaler.pkl target_scaler.pkl

print("\n✓ Model files ready!")
print("\nTo download:")
print("1. Click 'Output' tab on the right")
print("2. Download all 3 files")
print("3. Copy to your local project folder")