In [1]:
import numpy as np
import keras
import os
import pandas as pd

## Loading in Data

In [2]:
input_size = 192   # rows of history that form one model input
output_size = 12   # rows of future closes that form one target
sample_size = input_size + output_size  # rows spanned by one (input, target) window

x_data = []  # per-sample input arrays; kept as a list so every file can extend it cheaply
y_data = []  # per-sample target arrays
min_max = []  # never filled in this cell; left in place in case another cell reads it

# read_csv below keeps CSV fields 1-5 and labels the columns with those integers.
OPEN_COL, HIGH_COL, LOW_COL, CLOSE_COL, VOLUME_COL = 1, 2, 3, 4, 5

# Indicator column labels. Labels 10, 12-17, 21 and 22 are intentionally unused:
# they were set aside for indicators that are currently disabled
# (200 EMA -> 10, Bollinger bands -> 12-14, MACD -> 15-17, +DI / -DI -> 21-22).
EMA_values = [10, 20, 50, 100]  # EMA spans; stored in columns 6, 7, 8, 9
EMA_FIRST_COL = 6
RSI_COL = 11
STOCH_K_COL, STOCH_D_COL = 18, 19
ADX_COL = 20

LOOKBACK = 14       # window shared by RSI, the stochastic oscillator and ADX
WARMUP_ROWS = 100   # leading rows dropped per file; presumably the indicator warm-up (longest EMA span is 100)


def add_indicators(data):
    """Return a copy of ``data`` with the technical-indicator columns appended.

    Parameters
    ----------
    data : pandas.DataFrame
        Price frame whose columns are labelled 1-5 (open, high, low, close,
        volume). The argument itself is not modified.

    Returns
    -------
    pandas.DataFrame
        The input columns followed by, in this order: one EMA of the close per
        entry of ``EMA_values`` (labels 6..9), RSI (11), stochastic %K (18) and
        %D (19), and ADX (20). Leading rows of the rolling / ``min_periods``
        indicators are NaN.
    """
    data = data.copy()
    high, low, close = data[HIGH_COL], data[LOW_COL], data[CLOSE_COL]

    # EMA of the close. The span is the nominal period itself; the previous code
    # passed period - 1, so every average was one bar shorter than its name.
    for offset, span in enumerate(EMA_values):
        data[EMA_FIRST_COL + offset] = close.ewm(span=span, adjust=False).mean()

    # RSI: exponentially smoothed average gain over average loss.
    delta = close.diff()
    gain = delta.where(delta > 0, 0)
    loss = -delta.where(delta < 0, 0)  # negated so losses are positive
    avg_gain = gain.ewm(com=LOOKBACK - 1, adjust=False).mean()
    avg_loss = loss.ewm(com=LOOKBACK - 1, adjust=False).mean()
    rs = avg_gain / avg_loss
    data[RSI_COL] = 100 - (100 / (1 + rs))

    # Stochastic oscillator: position of the close inside the recent low/high
    # range. The range must come from the low and high columns; taking it from
    # the close (as before) understates it.
    low_min = low.rolling(window=LOOKBACK).min()
    high_max = high.rolling(window=LOOKBACK).max()
    k_percent = ((close - low_min) / (high_max - low_min)) * 100
    data[STOCH_K_COL] = k_percent
    data[STOCH_D_COL] = k_percent.rolling(window=3).mean()  # %D = 3-bar mean of %K

    # ADX: smoothed directional movement relative to the smoothed true range.
    prev_close = close.shift(1)
    true_range = pd.concat(
        [high - low, (high - prev_close).abs(), (low - prev_close).abs()],
        axis=1,
    ).max(axis=1, skipna=False)  # skipna=False keeps row 0 NaN (no previous close)
    plus_dm = (high - high.shift(1)).clip(lower=0)
    minus_dm = (low.shift(1) - low).clip(lower=0)

    smoothing = dict(alpha=1 / LOOKBACK, min_periods=LOOKBACK, adjust=False)
    atr = true_range.ewm(**smoothing).mean()
    plus_di = 100 * (plus_dm.ewm(**smoothing).mean() / atr)
    minus_di = 100 * (minus_dm.ewm(**smoothing).mean() / atr)
    dx = 100 * np.abs(plus_di - minus_di) / (plus_di + minus_di)
    data[ADX_COL] = dx.ewm(**smoothing).mean()

    return data


def build_samples(data, input_size, output_size):
    """Cut ``data`` into stride-1 sliding windows of inputs and targets.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame that contains the close column (label 4).
    input_size : int
        Number of consecutive rows in each input.
    output_size : int
        Number of rows, directly after the input rows, whose close forms the target.

    Returns
    -------
    (list of numpy.ndarray, list of numpy.ndarray)
        Inputs of shape ``(input_size, n_columns)``, min-max scaled per column,
        and targets of shape ``(output_size,)`` holding the un-normalised closes.
        Both lists are empty when ``data`` has fewer than
        ``input_size + output_size`` rows.
    """
    values = data.to_numpy(dtype=float)
    closes = data[CLOSE_COL].to_numpy(dtype=float)
    window = input_size + output_size

    x_samples, y_samples = [], []
    for start in range(len(values) - window + 1):
        inputs = values[start:start + input_size]

        # Scale with statistics of the input rows only: including the target
        # rows (as before) lets future prices leak into the features.
        # nanmin / nanmax skip NaN the same way DataFrame.min / max do.
        lowest = np.nanmin(inputs, axis=0)
        span = np.nanmax(inputs, axis=0) - lowest
        span[span == 0] = 1.0  # constant column -> all zeros instead of 0/0 = NaN

        x_samples.append((inputs - lowest) / span)
        y_samples.append(closes[start + input_size:start + window].copy())
    return x_samples, y_samples


# Walk the data folder; directories and files are visited in sorted order so the
# sample order does not depend on the file system.
for root, dirs, files in os.walk("./data/Forex/1H"):
    dirs.sort()
    for file in sorted(files):
        filepath = os.path.join(root, file)
        print(f"Processing - {filepath}")
        data = pd.read_csv(filepath, header=None, usecols=[1,2,3,4,5], dtype=float, delimiter=",")

        data = add_indicators(data)
        data = data[WARMUP_ROWS:]

        file_x, file_y = build_samples(data, input_size, output_size)
        x_data.extend(file_x)
        y_data.extend(file_y)

# Conversion to NumPy arrays happens in the next cell, once all files are processed
print("Done...")
# COLS = Open, High, Low, Close, Volume, 10 EMA, 20 EMA, 50 EMA, 100 EMA, RSI, k%, d%, adx

Processing - ./data/Forex/1H\GBPUSD_H1.csv
Done...


In [3]:
# Stack the per-sample lists into dense arrays in one pass
# (one conversion here instead of growing an array inside the loading loop).
x_data, y_data = (np.array(samples) for samples in (x_data, y_data))

In [4]:
# Sanity check on the stacked arrays: both should report the same number of samples
# in their first dimension.
print(f"x_data: {x_data.shape} y_data: {y_data.shape}")

x_data: (99697, 192, 13) y_data: (99697, 12)


## Splitting Data

In [5]:
train_split = 0.8  # fraction of samples assigned to the training set
split_seed = 42    # fixed seed so the partition is identical after Restart & Run All
split_index = int(len(x_data) * train_split)

# NOTE(review): the samples are stride-1 sliding windows, so neighbouring samples
# share almost all of their rows. A random split therefore puts near-duplicates of
# every test sample into the training set; consider a chronological split instead.

# Draw a reproducible random ordering of the sample indices
rng = np.random.default_rng(split_seed)
indices = rng.permutation(len(x_data))

# Split the indices
train_indices = indices[:split_index]
test_indices = indices[split_index:]

# Use the shuffled indices to split the data (x and y stay aligned)
x_train = x_data[train_indices]
y_train = y_data[train_indices]
x_test = x_data[test_indices]
y_test = y_data[test_indices]

In [6]:
# Report the partition sizes; train and test sample counts should add up to len(x_data).
# (The second message used to contain a stray ")" after the x_test shape.)
print(f"x_train: {x_train.shape} y_train:{y_train.shape}")
print(f"x_test: {x_test.shape} y_test: {y_test.shape}")

x_train: (79757, 192, 13) y_train:(79757, 12)
x_test: (19940, 192, 13)) y_test: (19940, 12)


In [7]:
# The four arrays are written back-to-back into one file; read them back with four
# successive np.load(f) calls on the same open handle, in this order:
# x_train, y_train, x_test, y_test.
#
# Inputs are min-max scaled, so half precision is kept for them to limit file size.
# Targets are un-normalised prices: float16 keeps only about three significant
# decimal digits, which is too coarse for them, so they are stored as float32.
with open('Expanded_Normalised_Data.npy', 'wb') as f:
    np.save(f, x_train.astype(np.float16))
    np.save(f, y_train.astype(np.float32))
    np.save(f, x_test.astype(np.float16))
    np.save(f, y_test.astype(np.float32))
