## Collecting Historical Gold Price Patterns 

In [1]:
# import required libraries
import pandas as pd
import numpy as np
import yfinance as yf 
import seaborn as sns
import mplfinance as mpf
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.stattools import adfuller, acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.metrics import mean_absolute_error, mean_squared_error
from statsmodels.tsa.seasonal import seasonal_decompose

In [2]:
# download the gold data from yfinance module 

# one day - 1 min interval
gold_one_day = yf.download('GC=F', interval="1m")
# one month - 5 min interval
gold_one_month = yf.download('GC=F', interval="5m", period="1mo")
# one year - 1 day interval
gold_one_year = yf.download('GC=F', period="1y")
# ten years - 1 day interval 
gold_ten_year = yf.download('GC=F', period="10y")

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


In [3]:
# save the data as csv file
gold_one_day.to_csv('data/gold_one_day.csv')
gold_one_month.to_csv('data/gold_one_month.csv')
gold_one_year.to_csv('data/gold_one_year.csv')
gold_ten_year.to_csv('data/gold_ten_year.csv')

In [4]:
def data_wrangle(path, droped_columns):
    """ A method that will clean the original dataset, 
        restructure the dataset and fill the missing values.
        
        input
        -----
        path: data path 
        dropped_columns: columns to be dropped"""
    
    # read the dataset through the path
    df=pd.read_csv(path, index_col=0, parse_dates=True)
    # drop the unnecessary columns that are already specified 
    df = df.drop(columns=droped_columns)
    
    # return the dataframe
    return df

In [5]:
df_one_year = data_wrangle(path = "data/gold_one_year.csv", droped_columns="Adj Close")
print(df_one_year.head(5))
print('-'*100)
# check the information of the dataframe 
df_one_year.info()
print('-'*100)
# display the statics of the data frame 
df_one_year.describe()


                   Open         High          Low        Close  Volume
Date                                                                  
2023-07-20  1973.699951  1973.699951  1965.599976  1968.300049     638
2023-07-21  1961.800049  1964.300049  1961.800049  1964.300049      50
2023-07-24  1965.300049  1965.500000  1960.300049  1960.300049       2
2023-07-25  1953.000000  1962.500000  1953.000000  1962.099976      19
2023-07-26  1966.199951  1972.000000  1966.199951  1968.900024      51
----------------------------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 253 entries, 2023-07-20 to 2024-07-19
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    253 non-null    float64
 1   High    253 non-null    float64
 2   Low     253 non-null    float64
 3   Close   253 non-null    float64
 4   Volume  253 non-null    int64  
dtypes: float64(4), 

Unnamed: 0,Open,High,Low,Close,Volume
count,253.0,253.0,253.0,253.0,253.0
mean,2100.727665,2110.180236,2092.189724,2100.989322,4266.948617
std,177.76535,181.104805,175.288813,178.822354,24019.964577
min,1819.0,1826.300049,1809.400024,1816.599976,1.0
25%,1961.800049,1965.400024,1950.0,1961.800049,46.0
50%,2032.5,2037.099976,2024.800049,2030.199951,178.0
75%,2310.699951,2324.699951,2296.199951,2312.399902,538.0
max,2472.899902,2473.100098,2454.800049,2462.399902,202373.0


In [6]:
df_ten_year = data_wrangle(path = "data/gold_ten_year.csv", droped_columns="Adj Close")
print(df_ten_year.head(5))
print('-'*100)
# check the information of the dataframe 
df_ten_year.info()
print('-'*100)
# display the statics of the data frame 
df_ten_year.describe()

                   Open         High          Low        Close  Volume
Date                                                                  
2014-07-21  1311.000000  1315.500000  1311.000000  1313.699951      18
2014-07-22  1306.099976  1306.099976  1306.099976  1306.099976       2
2014-07-23  1304.500000  1304.500000  1304.500000  1304.500000       0
2014-07-24  1297.500000  1297.500000  1290.599976  1290.599976       1
2014-07-25  1294.800049  1303.099976  1294.800049  1303.099976       8
----------------------------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2515 entries, 2014-07-21 to 2024-07-19
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    2515 non-null   float64
 1   High    2515 non-null   float64
 2   Low     2515 non-null   float64
 3   Close   2515 non-null   float64
 4   Volume  2515 non-null   int64  
dtypes: float64(4),

Unnamed: 0,Open,High,Low,Close,Volume
count,2515.0,2515.0,2515.0,2515.0,2515.0
mean,1541.811093,1549.116064,1534.676462,1541.897812,5274.320875
std,335.108422,337.239507,333.302675,335.319568,29294.692087
min,1053.699951,1062.0,1046.199951,1050.800049,0.0
25%,1249.049988,1253.75,1243.649963,1248.950012,46.0
50%,1422.800049,1425.300049,1416.300049,1420.900024,173.0
75%,1830.049988,1841.049988,1821.849976,1831.850037,536.5
max,2472.899902,2473.100098,2454.800049,2462.399902,386334.0


In [7]:
# checkpoints 
print(df_one_year.iloc[[0]])
print('-'*100)
print(df_one_year.iloc[[0]].index)
print('-'*100)
print(df_one_year.iloc[[0]]["High"])

                   Open         High          Low        Close  Volume
Date                                                                  
2023-07-20  1973.699951  1973.699951  1965.599976  1968.300049     638
----------------------------------------------------------------------------------------------------
DatetimeIndex(['2023-07-20'], dtype='datetime64[ns]', name='Date', freq=None)
----------------------------------------------------------------------------------------------------
Date
2023-07-20    1973.699951
Name: High, dtype: float64


In [8]:
# checkpoints 
print(df_ten_year.iloc[[0]])
print('-'*100)
print(df_ten_year.iloc[[0]].index)
print('-'*100)
print(df_one_year.iloc[[0]]["High"])

              Open    High     Low        Close  Volume
Date                                                   
2014-07-21  1311.0  1315.5  1311.0  1313.699951      18
----------------------------------------------------------------------------------------------------
DatetimeIndex(['2014-07-21'], dtype='datetime64[ns]', name='Date', freq=None)
----------------------------------------------------------------------------------------------------
Date
2023-07-20    1973.699951
Name: High, dtype: float64


In [9]:
print(len(df_one_year))

253


In [10]:
print(len(df_ten_year))

2515


In [11]:
""""
interval = 7
for start in range(0, len(df_ten_year), interval):
    end = start + 7
    segment = df_ten_year[start:end]
#print(segment)
#print(segment.index.day)
#print(segment[:end])
for i in segment[:end]:
    print(len(segment))
    for key, value in zip(segment.index, segment[:end][i]):
        print(key)
        #print(key.year, key.month, key.day)
        print(value)
"""

'"\ninterval = 7\nfor start in range(0, len(df_ten_year), interval):\n    end = start + 7\n    segment = df_ten_year[start:end]\n#print(segment)\n#print(segment.index.day)\n#print(segment[:end])\nfor i in segment[:end]:\n    print(len(segment))\n    for key, value in zip(segment.index, segment[:end][i]):\n        print(key)\n        #print(key.year, key.month, key.day)\n        print(value)\n'

In [12]:
# function to plot and save images <line plot> 
# don't use it anymore
'''
def plot_images(data, interval, output_dir):
    """
    A function that segements out the date inteval and plot
    on a figure.
    
    input
    -----
    data : dataframe 
    intreval : date interval
    
    output
    ------
    plotted figure
    """
    # loop the whole dataframe with interval (days) steps
    for start in range(0, len(data) - interval, interval):
        # define the end 
        end = start + interval
        # segment the dataframe with specifed start and end index
        segment = data[start:end]
        # loop inside each segment 
        for label in segment[:end]:
            #print(label)
            # zip the segment index and label value 
            #for index, value in zip(segment.index, segment[:end][label]):
                #print(f"{label}: {index} {value}")
            plt.figure(figsize=(20, 10), dpi=300)
            plt.plot(segment.index, segment[label], linewidth=3)
            plt.title(f"{label} from {segment.index[0]} to {segment.index[-1]}")
            plt.xlabel("Date")
            # rotating X-axis labels
            plt.xticks(rotation = 90)
            plt.ylabel(label)
            # add grid line
            plt.grid(True)
            plt.savefig(f"{output_dir}/{label}_{segment.index[0]}_{segment.index[-1]}.png")
            plt.close()
'''

'\ndef plot_images(data, interval, output_dir):\n    """\n    A function that segements out the date inteval and plot\n    on a figure.\n    \n    input\n    -----\n    data : dataframe \n    intreval : date interval\n    \n    output\n    ------\n    plotted figure\n    """\n    # loop the whole dataframe with interval (days) steps\n    for start in range(0, len(data) - interval, interval):\n        # define the end \n        end = start + interval\n        # segment the dataframe with specifed start and end index\n        segment = data[start:end]\n        # loop inside each segment \n        for label in segment[:end]:\n            #print(label)\n            # zip the segment index and label value \n            #for index, value in zip(segment.index, segment[:end][label]):\n                #print(f"{label}: {index} {value}")\n            plt.figure(figsize=(20, 10), dpi=300)\n            plt.plot(segment.index, segment[label], linewidth=3)\n            plt.title(f"{label} from {segm

In [13]:
#plot_images(data = df_one_year, interval = 30, output_dir = "images")

In [14]:
# Function to plot and save images - v 0.0.0
"""
def plot_and_save_images(data, interval, output_dir):
    for start in range(0, len(data) - interval, interval):
        end = start + interval
        segment = data[start:end]
        plt.figure(figsize=(10, 5), dpi=300)
        plt.plot(segment.index, segment['Price'], linewidth=3)
        plt.title(f"Gold Price from {segment.index[0]} to {segment.index[-1]}")
        plt.xlabel("Date")
        plt.ylabel("Price")
        #plt.savefig(f"{output_dir}/gold_price_{start}_{end}.png")
        #plt.close()
"""

'\ndef plot_and_save_images(data, interval, output_dir):\n    for start in range(0, len(data) - interval, interval):\n        end = start + interval\n        segment = data[start:end]\n        plt.figure(figsize=(10, 5), dpi=300)\n        plt.plot(segment.index, segment[\'Price\'], linewidth=3)\n        plt.title(f"Gold Price from {segment.index[0]} to {segment.index[-1]}")\n        plt.xlabel("Date")\n        plt.ylabel("Price")\n        #plt.savefig(f"{output_dir}/gold_price_{start}_{end}.png")\n        #plt.close()\n'

In [15]:
# Function to plot and save images - v 0.0.1
"""
def plot_images_test(data, interval, output_dir):
    for start in range(0, len(data) - interval, interval):
        end = start + interval
        segment = data[start:end]
        
        for label in segment.columns:
            plt.figure(figsize=(10, 5), dpi=300)
            plt.plot(segment.index, segment[label], linewidth=3)
            plt.title(f"{label} from {segment.index[0]} to {segment.index[-1]}")
            plt.xlabel("Date")
            plt.ylabel(label)
            
            # Extract the day part from the date index
            day_labels = segment.index.to_series().apply(lambda x: pd.to_datetime(x).day)
            
            # Set x-ticks to show only the day part
            plt.xticks(ticks=segment.index, labels=day_labels, rotation=90)
            # add grid line
            plt.grid(True)
            plt.savefig(f"{output_dir}/{label}_{segment.index[0]}_{segment.index[-1]}.png")
            plt.close()
"""

'\ndef plot_images_test(data, interval, output_dir):\n    for start in range(0, len(data) - interval, interval):\n        end = start + interval\n        segment = data[start:end]\n        \n        for label in segment.columns:\n            plt.figure(figsize=(10, 5), dpi=300)\n            plt.plot(segment.index, segment[label], linewidth=3)\n            plt.title(f"{label} from {segment.index[0]} to {segment.index[-1]}")\n            plt.xlabel("Date")\n            plt.ylabel(label)\n            \n            # Extract the day part from the date index\n            day_labels = segment.index.to_series().apply(lambda x: pd.to_datetime(x).day)\n            \n            # Set x-ticks to show only the day part\n            plt.xticks(ticks=segment.index, labels=day_labels, rotation=90)\n            # add grid line\n            plt.grid(True)\n            plt.savefig(f"{output_dir}/{label}_{segment.index[0]}_{segment.index[-1]}.png")\n            plt.close()\n'

In [16]:
data_test = df_one_year.head(7)
data_test

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-07-20,1973.699951,1973.699951,1965.599976,1968.300049,638
2023-07-21,1961.800049,1964.300049,1961.800049,1964.300049,50
2023-07-24,1965.300049,1965.5,1960.300049,1960.300049,2
2023-07-25,1953.0,1962.5,1953.0,1962.099976,19
2023-07-26,1966.199951,1972.0,1966.199951,1968.900024,51
2023-07-27,1945.400024,1945.400024,1945.400024,1945.400024,194253
2023-07-28,1945.5,1962.199951,1944.199951,1960.400024,23155


In [17]:
def plot_and_save(data: pd.DataFrame, interval: int = 10, output_dir: str = "/", plot_trend: str = "", label_on: bool = False) -> None:
    """ A method that plots and saves the targeted segment. 
        
        parameters
        ----------
        data : pd.DataFrame
            DataFrame containing the data to be plotted.
        interval : int, default 10
            Specified date interval (days) to segment the data.
        output_dir : str, default "/"
            Directory to save the images.
        label_on : bool, default False
            Flag to determine if labels should be included in the plot.
        
        returns
        -------
        None

        ***Cautions: all candlesticks figures require open, high, low and close 
        to form a candlestick that's why we use mplfinance module. 
        For adding trend line, we customize with addplot option in the mplfinance module.***
        """
    # initialize starting point index
    start = 0
    # initialize data length
    data_length = len(data)
    # while start is less than the data length
    while start < data_length:
        # create end point <this is for to remove the excessive days>
        end = min(start + interval, data_length)
        # segment the dataframe with the specifed start and end index
        segment = data[start:end]

        # a list to add additional plots for OHLC <Open, High, Low, Close> trend line
        trend_plots = [
            # open - gold color
            mpf.make_addplot(segment['Open'], color='#FFD700'), # linestyle='--'
            # close - teal color
            mpf.make_addplot(segment['Close'], color='#008080'),
            # high - black color
            mpf.make_addplot(segment['High'], color='#000000'),
            # low - blue color
            mpf.make_addplot(segment['Low'], color='#0000FF')
        ]
        # loop through all columns in the segment
        for label in segment.columns:
            # create a candlestick plot
            # image title 
            title = f"Gold Market Prices from {segment.index[0]} to {segment.index[-1]}"
            # initialize and format start date
            start_date = segment.index[0].strftime("%Y-%m-%d_%H-%M-%S")
            # initialize and format end date
            end_date = segment.index[-1].strftime("%Y-%m-%d_%H-%M-%S")
            # create file name <e.g. start_date_end_date_"OHLC".png>
            file_name = f"{output_dir}/{start_date}_{end_date}_{plot_trend}.png" 
            # arguments for plotting
            plot_args = {
                # style
                'style': 'charles',
                # x-ticks format in year month<in text> day
                'datetime_format': '%y %b %d',
                # y label
                'ylabel': 'Price in $',
                # x label
                'xlabel': 'Date',
                # saving images removing excessive white space around the candle_sticks and trend lines
                'savefig': dict(fname=file_name, dpi=300, bbox_inches='tight', pad_inches=0.1),
            }


            # for label on savings
            if label_on:
                # add title
                plot_args['title'] = title
                # for open trend, including candlestick  
                if plot_trend.lower() == "open":
                    mpf.plot(segment, type='candle', addplot=trend_plots[0], **plot_args)
                # for close trend, including candlestick  
                elif plot_trend.lower() == "close":
                    mpf.plot(segment, type='candle', addplot=trend_plots[1], **plot_args)
                # for high trend, including candlestick  
                elif plot_trend.lower() == "high":
                    mpf.plot(segment, type='candle', addplot=trend_plots[2], **plot_args)
                # for low trend, including candlestick  
                elif plot_trend.lower() == "low":
                    mpf.plot(segment, type='candle', addplot=trend_plots[3], **plot_args)
                # for all OHLC trends, including candlestick  
                elif plot_trend.lower() == "all":
                    mpf.plot(segment, type='candle', addplot=trend_plots, **plot_args)
                # no trend line, just the candle sticks
                else:
                    mpf.plot(segment, type='candle', **plot_args)

            # for label off savings
            else:
                # off the x-axis and y-axis
                plot_args['axisoff'] = True
                # for open trend, including candlestick  
                if plot_trend.lower() == "open":
                    mpf.plot(segment, type='candle', addplot=trend_plots[0], **plot_args)
                # for close trend, including candlestick 
                elif plot_trend.lower() == "close":
                    mpf.plot(segment, type='candle', addplot=trend_plots[1], **plot_args)
                # for high trend, including candlestick  
                elif plot_trend.lower() == "high":
                    mpf.plot(segment, type='candle', addplot=trend_plots[2], **plot_args)
                # for low trend, including candlestick 
                elif plot_trend.lower() == "low":
                    mpf.plot(segment, type='candle', addplot=trend_plots[3], **plot_args)
                # for all OHLC trends, including candlestick
                elif plot_trend.lower() == "all":
                    mpf.plot(segment, type='candle', addplot=trend_plots, **plot_args)
                # no trend line, just the candle sticks
                else:
                    mpf.plot(segment, type='candle', **plot_args)

            # tight layout for removing white spaces
            plt.tight_layout()
            # close the image
            plt.close()
        # increment the start index by the interval
        start += interval

In [18]:
def plot_data_and_save_label_on_1_yr():
    # open trend, label off
    plot_and_save(df_one_year, interval=15, output_dir='images//one_year_data//patterns_labels//open', plot_trend = "open", label_on=True)
    # close trend, label off
    plot_and_save(df_one_year, interval=15, output_dir='images//one_year_data//patterns_labels//close', plot_trend = "close", label_on=True)
    # high trend, label off
    plot_and_save(df_one_year, interval=15, output_dir='images//one_year_data//patterns_labels//high', plot_trend = "high", label_on=True)
    # low trend, label off
    plot_and_save(df_one_year, interval=15, output_dir='images//one_year_data//patterns_labels//low', plot_trend = "low", label_on=True)
    # OHLC trends, label off
    plot_and_save(df_one_year, interval=15, output_dir='images//one_year_data//patterns_labels//all', plot_trend = "all", label_on=True)
    # no trend line, label off
    plot_and_save(df_one_year, interval=15, output_dir='images//one_year_data//patterns_labels//no_trend_line', label_on=True)

In [19]:
plot_data_and_save_label_on_1_yr()

In [20]:
def plot_data_and_save_label_off_1_yr():
    # open trend, label off
    plot_and_save(df_one_year, interval=15, output_dir='images//one_year_data//patterns_no_labels//open', plot_trend = "open")
    # close trend, label off
    plot_and_save(df_one_year, interval=15, output_dir='images//one_year_data//patterns_no_labels//close', plot_trend = "close")
    # high trend, label off
    plot_and_save(df_one_year, interval=15, output_dir='images//one_year_data//patterns_no_labels//high', plot_trend = "high")
    # low trend, label off
    plot_and_save(df_one_year, interval=15, output_dir='images//one_year_data//patterns_no_labels//low', plot_trend = "low")
    # OHLC trends, label off
    plot_and_save(df_one_year, interval=15, output_dir='images//one_year_data//patterns_no_labels//all', plot_trend = "all")
    # no trend line, label off
    plot_and_save(df_one_year, interval=15, output_dir='images//one_year_data//patterns_no_labels//no_trend_line')

In [21]:
plot_data_and_save_label_off_1_yr()

### Run separately, use a lot of memory to collect ten years data.

In [22]:
# open trend, label on
plot_and_save(df_ten_year, interval=15, output_dir='images//ten_year_data//patterns_labels//open', plot_trend = "open", label_on=True)

In [23]:
# close trend, label on
plot_and_save(df_ten_year, interval=15, output_dir='images//ten_year_data//patterns_labels//close', plot_trend = "close", label_on=True)

In [24]:
# high trend, label on
plot_and_save(df_ten_year, interval=15, output_dir='images//ten_year_data//patterns_labels//high', plot_trend = "high", label_on=True)

In [25]:
# low trend, label on
plot_and_save(df_ten_year, interval=15, output_dir='images//ten_year_data//patterns_labels//low', plot_trend = "low", label_on=True)

In [26]:
# OHLC trends, label on
plot_and_save(df_ten_year, interval=15, output_dir='images//ten_year_data//patterns_labels//all', plot_trend = "all", label_on=True)

In [27]:
# no trend line, label on
plot_and_save(df_ten_year, interval=15, output_dir='images//ten_year_data//patterns_labels//no_trend_line', label_on=True)

In [28]:
# open trend, label off
plot_and_save(df_ten_year, interval=15, output_dir='images//ten_year_data//patterns_no_labels//open', plot_trend = "open")

In [29]:
# close trend, label off
plot_and_save(df_ten_year, interval=15, output_dir='images//ten_year_data//patterns_no_labels//close', plot_trend = "close")

In [30]:
# high trend, label off
plot_and_save(df_ten_year, interval=15, output_dir='images//ten_year_data//patterns_no_labels//high', plot_trend = "high")

In [31]:
# low trend, label off
plot_and_save(df_ten_year, interval=15, output_dir='images//ten_year_data//patterns_no_labels//low', plot_trend = "low")

In [32]:
# OHLC trends, label off
plot_and_save(df_ten_year, interval=15, output_dir='images//ten_year_data//patterns_no_labels//all', plot_trend = "all")

In [33]:
# no trend line, label off
plot_and_save(df_ten_year, interval=15, output_dir='images//ten_year_data//patterns_no_labels//no_trend_line')