In [1]:
# import required libraries 
import os
import warnings 
import numpy as np
import pandas as pd 
import yfinance as yf
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [2]:
def data_wrangle(path, dropped_columns=None):
    """Load a header-less OHLCV CSV file into a datetime-indexed dataframe.

    The file is read with the fixed column layout
    ``Date, Time, Open, High, Low, Close, Volume``. The ``Date`` and ``Time``
    text columns are joined with a space, parsed into timestamps, and used as
    the index. No missing-value handling is performed here.

    Parameters
    ----------
    path : str
        Location of the CSV file to read.
    dropped_columns : list, optional
        Extra columns to remove after the timestamp has been built
        (default is None, meaning nothing extra is removed).

    Returns
    -------
    pd.DataFrame
        Dataframe indexed by ``Date`` holding the remaining columns.
    """
    column_names = ["Date", "Time", "Open", "High", "Low", "Close", "Volume"]
    raw = pd.read_csv(path, header=None, names=column_names)

    # Build one timestamp per row from the separate date and time strings
    timestamps = pd.to_datetime(raw['Date'] + ' ' + raw['Time'])

    # Overwrite Date with the parsed timestamps; Time is no longer needed
    frame = raw.assign(Date=timestamps).drop(columns=["Time"])

    # Remove any additional columns requested by the caller
    if dropped_columns:
        frame = frame.drop(columns=dropped_columns)

    # Index the result by timestamp
    return frame.set_index('Date')

In [3]:
# Read the 1-minute XAUUSD CSV into a datetime-indexed dataframe
df_1min = data_wrangle('data/XAUUSD_1min.csv')
# Preview the first five rows
df_1min.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-04-25 18:20:00,2328.16,2328.16,2323.53,2323.53,90
2024-04-25 18:21:00,2323.57,2325.51,2323.15,2325.13,91
2024-04-25 18:22:00,2325.11,2326.01,2324.65,2325.65,83
2024-04-25 18:23:00,2325.65,2326.51,2325.57,2325.92,67
2024-04-25 18:24:00,2325.93,2326.3,2325.4,2325.59,85


In [4]:
# Roll the 1-minute bars up into hourly OHLC candles (Volume is not aggregated),
# then drop any hourly rows that contain missing values
df_1h = (
    df_1min
    .resample('1h')
    .agg({'Open': 'first', 'High': 'max', 'Low': 'min', 'Close': 'last'})
    .dropna()
)
# Sanity check: report the number of missing values left in each column
print(df_1h.isna().sum())
# Save the hourly candles to disk
df_1h.to_csv('data/test.csv')

Open     0
High     0
Low      0
Close    0
dtype: int64


In [5]:
# Preview the first five hourly candles
df_1h.head()

Unnamed: 0_level_0,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-04-25 18:00:00,2328.16,2330.82,2323.15,2328.72
2024-04-25 19:00:00,2328.77,2332.25,2326.25,2332.15
2024-04-25 20:00:00,2332.22,2334.97,2330.66,2333.03
2024-04-25 21:00:00,2332.96,2336.4,2331.62,2332.93
2024-04-25 22:00:00,2332.9,2333.62,2330.75,2331.24


In [None]:
def stochastic(df: pd.DataFrame, k_period: int = 9, d_period: int = 3) -> pd.DataFrame:
    """Add stochastic-oscillator columns (Fast %K, Fast %D, Slow %D) to ``df``.

    Fast %K is the position of ``Close`` inside the rolling
    ``[lowest Low, highest High]`` range of the last ``k_period`` rows, scaled
    to 0-100. Fast %D is a plain copy of Fast %K, and Slow %D is an
    exponential moving average of Fast %D with span ``d_period``.

    The rolling extremes are kept in local variables rather than being written
    to (and then dropped from) ``df``, so any columns already named
    ``HighestHigh`` or ``LowestLow`` in the caller's frame are left untouched.

    Parameters
    ----------
    df: pd.DataFrame
        Input dataframe; must contain ``High``, ``Low`` and ``Close`` columns.
        It is modified in place: ``FastK``, ``FastD`` and ``SlowD`` columns
        are added (or overwritten if they already exist).
    k_period: int, optional
        Rolling window length used for Fast %K (default is 9).
    d_period: int, optional
        EMA span used for Slow %D (default is 3).

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, now carrying ``FastK``, ``FastD`` and
        ``SlowD``. The first ``k_period - 1`` rows of these columns are NaN
        because the rolling window is not yet full.
    """

    # Highest high and lowest low over the trailing k_period rows
    highest_high = df['High'].rolling(window=k_period).max()
    lowest_low = df['Low'].rolling(window=k_period).min()

    # Fast %K: where Close sits inside the window's range, as a percentage.
    # A completely flat window (highest high == lowest low) divides by zero;
    # pandas does not raise here and the affected rows come out as NaN/inf.
    df['FastK'] = ((df['Close'] - lowest_low) / (highest_high - lowest_low)) * 100

    # Fast %D: a period-1 EMA is the series itself, so copy Fast %K
    df['FastD'] = df['FastK']

    # Slow %D: recursive (adjust=False) EMA of Fast %D with span d_period
    df['SlowD'] = df['FastD'].ewm(span=d_period, adjust=False).mean()

    # Return the same frame so both in-place and assignment styles work
    return df