## Collecting Historical Gold Price Patterns 

In [1]:
# import required libraries
import os

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import mplfinance as mpf
import numpy as np
import pandas as pd
import seaborn as sns
import yfinance as yf
from pandas.plotting import autocorrelation_plot
from sklearn.metrics import mean_absolute_error, mean_squared_error
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller, acf, pacf

In [2]:
# download the gold futures data ('GC=F') from the yfinance module
# every call states its period explicitly so the downloaded range does not
# depend on the library's default for the installed yfinance version

# one day - 1 min interval
gold_one_day = yf.download('GC=F', interval="1m", period="1d")
# one month - 5 min interval
gold_one_month = yf.download('GC=F', interval="5m", period="1mo")
# one year - 1 day interval
gold_one_year = yf.download('GC=F', interval="1d", period="1y")
# ten years - 1 day interval
gold_ten_year = yf.download('GC=F', interval="1d", period="10y")

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


In [3]:
# save the data as csv files
# create the target folder first so the cell also works on a fresh checkout
os.makedirs('data', exist_ok=True)
gold_one_day.to_csv('data/gold_one_day.csv')
gold_one_month.to_csv('data/gold_one_month.csv')
gold_one_year.to_csv('data/gold_one_year.csv')
gold_ten_year.to_csv('data/gold_ten_year.csv')

In [4]:
def data_wrangle(path, droped_columns=None):
    """Load a CSV into a date-indexed DataFrame and drop unwanted columns.

    The first CSV column becomes the index and is parsed as dates. No other
    cleaning is performed: missing values are left exactly as they are read.

    parameters
    ----------
    path: file path (or file-like object) accepted by ``pd.read_csv``
    droped_columns: column label or list of labels to remove; ``None``
        (the default) keeps every column. The parameter spelling is kept
        as-is so existing keyword calls continue to work.

    return
    ------
    pandas DataFrame indexed by the parsed first column

    raises
    ------
    KeyError: if a requested column is not present in the file
    """
    # read the dataset; index_col=0 + parse_dates=True turns the first
    # column into the (date) index
    df = pd.read_csv(path, index_col=0, parse_dates=True)

    # nothing to remove: hand back the frame untouched
    if droped_columns is None:
        return df

    # drop the unnecessary columns that were specified
    return df.drop(columns=droped_columns)

In [5]:
# load the one-year daily prices without the adjusted-close column
df_one_year = data_wrangle(
    droped_columns="Adj Close",
    path="data/gold_one_year.csv",
)
# preview the first five rows
print(df_one_year.head())
print(100 * "-")
# column dtypes and non-null counts
df_one_year.info()
print(100 * "-")
# summary statistics, shown as the cell's output
df_one_year.describe()


                   Open         High          Low        Close  Volume
Date                                                                  
2023-07-17  1954.000000  1956.500000  1946.599976  1952.400024      29
2023-07-18  1968.800049  1978.400024  1963.400024  1977.199951      83
2023-07-19  1977.000000  1977.500000  1973.000000  1977.500000      35
2023-07-20  1973.699951  1973.699951  1965.599976  1968.300049     638
2023-07-21  1961.800049  1964.300049  1961.800049  1964.300049      50
----------------------------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 254 entries, 2023-07-17 to 2024-07-17
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    254 non-null    float64
 1   High    254 non-null    float64
 2   Low     254 non-null    float64
 3   Close   254 non-null    float64
 4   Volume  254 non-null    int64  
dtypes: float64(4), 

Unnamed: 0,Open,High,Low,Close,Volume
count,254.0,254.0,254.0,254.0,254.0
mean,2096.454327,2105.972833,2088.076772,2096.958262,4469.893701
std,175.343418,179.016966,173.068872,176.813861,24184.421124
min,1819.0,1826.300049,1809.400024,1816.599976,1.0
25%,1960.149994,1964.575043,1948.274963,1960.750031,47.5
50%,2031.399963,2036.849976,2024.0,2028.0,179.5
75%,2307.674927,2321.924988,2293.724976,2306.924988,540.0
max,2473.199951,2487.399902,2466.600098,2472.100098,202373.0


In [6]:
# confirm the index type produced by read_csv(index_col=0, parse_dates=True)
type(df_one_year.index)

pandas.core.indexes.datetimes.DatetimeIndex

In [7]:
# load the ten-year daily prices without the adjusted-close column
df_ten_year = data_wrangle(
    droped_columns="Adj Close",
    path="data/gold_ten_year.csv",
)
# preview the first five rows
print(df_ten_year.head())
print(100 * "-")
# column dtypes and non-null counts
df_ten_year.info()
print(100 * "-")
# summary statistics, shown as the cell's output
df_ten_year.describe()

                   Open         High          Low        Close  Volume
Date                                                                  
2014-07-17  1303.099976  1323.800049  1303.099976  1316.699951      16
2014-07-18  1322.699951  1322.699951  1306.199951  1309.199951      27
2014-07-21  1311.000000  1315.500000  1311.000000  1313.699951      18
2014-07-22  1306.099976  1306.099976  1306.099976  1306.099976       2
2014-07-23  1304.500000  1304.500000  1304.500000  1304.500000       0
----------------------------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2515 entries, 2014-07-17 to 2024-07-17
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    2515 non-null   float64
 1   High    2515 non-null   float64
 2   Low     2515 non-null   float64
 3   Close   2515 non-null   float64
 4   Volume  2515 non-null   int64  
dtypes: float64(4),

Unnamed: 0,Open,High,Low,Close,Volume
count,2515.0,2515.0,2515.0,2515.0,2515.0
mean,1540.913002,1548.231611,1533.791293,1541.02143,5296.472366
std,334.205259,336.370852,332.432474,334.475235,29311.40412
min,1053.699951,1062.0,1046.199951,1050.800049,0.0
25%,1249.049988,1253.75,1243.649963,1248.950012,46.0
50%,1419.599976,1423.0,1413.900024,1419.599976,175.0
75%,1829.700012,1839.799988,1821.5,1831.149963,539.0
max,2473.199951,2487.399902,2466.600098,2472.100098,386334.0


In [8]:
# checkpoints: look at the first row, its index and one of its columns
# list-style iloc ([[0]]) keeps the result a one-row DataFrame
print(df_one_year.iloc[[0]])
print('-'*100)
# index of that one-row frame
print(df_one_year.iloc[[0]].index)
print('-'*100)
# the "High" column of that one-row frame
print(df_one_year.iloc[[0]]["High"])

              Open    High          Low        Close  Volume
Date                                                        
2023-07-17  1954.0  1956.5  1946.599976  1952.400024      29
----------------------------------------------------------------------------------------------------
DatetimeIndex(['2023-07-17'], dtype='datetime64[ns]', name='Date', freq=None)
----------------------------------------------------------------------------------------------------
Date
2023-07-17    1956.5
Name: High, dtype: float64


In [9]:
# NOTE(review): disabled scratch code kept as a bare string literal; nothing
# in it executes, the cell only evaluates to (and displays) the string itself
"""interval = 30
for start in range(0, len(df_one_year), interval):
    end = start + 30
    segment = df_one_year[start:end]
#print(segment.index)
#print(segment.index.day)
#print(segment[:end])
for i in segment[:end]:
    for key, value in zip(segment.index, segment[:end][i]):
        print(f"{i}")
        #print(key.year, key.month, key.day)
        print(value)
"""

'interval = 30\nfor start in range(0, len(df_one_year), interval):\n    end = start + 30\n    segment = df_one_year[start:end]\n#print(segment.index)\n#print(segment.index.day)\n#print(segment[:end])\nfor i in segment[:end]:\n    for key, value in zip(segment.index, segment[:end][i]):\n        print(f"{i}")\n        #print(key.year, key.month, key.day)\n        print(value)\n'

In [10]:
# Function to plot and save images
def plot_images(data, interval, output_dir):
    """Plot each column of ``data`` as a line chart, one PNG per window.

    The frame is cut into consecutive, non-overlapping windows of
    ``interval`` rows. Only full windows are plotted; a shorter remainder at
    the end is skipped.

    input
    -----
    data : dataframe whose index entries support ``strftime``
        (e.g. a DatetimeIndex)
    interval : number of rows per window
    output_dir : directory that receives the images (created if missing)

    output
    ------
    None; files are written as
    ``<output_dir>/<column>_<first>_<last>.png``
    """
    # make sure the destination exists before the first save
    os.makedirs(output_dir, exist_ok=True)
    # filesystem-safe timestamp (no spaces or colons), same format as the
    # candlestick helpers in this notebook
    stamp = "%Y-%m-%d_%H-%M-%S"

    # "+ 1" keeps the last full window when len(data) is an exact multiple
    # of interval
    for start in range(0, len(data) - interval + 1, interval):
        # segment the dataframe with the current start and end positions
        segment = data[start:start + interval]
        first, last = segment.index[0], segment.index[-1]

        # one figure per column of the segment
        for label in segment.columns:
            fig, ax = plt.subplots(figsize=(20, 10), dpi=300)
            try:
                ax.plot(segment.index, segment[label], linewidth=3)
                ax.set_title(f"{label} from {first} to {last}")
                ax.set_xlabel("Date")
                ax.set_ylabel(label)
                # rotating X-axis labels
                ax.tick_params(axis="x", labelrotation=90)
                # add grid line
                ax.grid(True)
                file_name = f"{label}_{first.strftime(stamp)}_{last.strftime(stamp)}.png"
                fig.savefig(os.path.join(output_dir, file_name))
            finally:
                # always release the figure, even if saving fails
                plt.close(fig)
        

In [11]:
#plot_images(data = df_one_year, interval = 30, output_dir = "images")

In [12]:
# NOTE(review): disabled draft of a plot-and-save function, kept as a bare
# string literal; nothing in it executes, the cell only evaluates to the string
"""
def plot_and_save_images(data, interval, output_dir):
    for start in range(0, len(data) - interval, interval):
        end = start + interval
        segment = data[start:end]
        plt.figure(figsize=(10, 5), dpi=300)
        plt.plot(segment.index, segment['Price'], linewidth=3)
        plt.title(f"Gold Price from {segment.index[0]} to {segment.index[-1]}")
        plt.xlabel("Date")
        plt.ylabel("Price")
        #plt.savefig(f"{output_dir}/gold_price_{start}_{end}.png")
        #plt.close()
"""

'\ndef plot_and_save_images(data, interval, output_dir):\n    for start in range(0, len(data) - interval, interval):\n        end = start + interval\n        segment = data[start:end]\n        plt.figure(figsize=(10, 5), dpi=300)\n        plt.plot(segment.index, segment[\'Price\'], linewidth=3)\n        plt.title(f"Gold Price from {segment.index[0]} to {segment.index[-1]}")\n        plt.xlabel("Date")\n        plt.ylabel("Price")\n        #plt.savefig(f"{output_dir}/gold_price_{start}_{end}.png")\n        #plt.close()\n'

In [13]:
# function to plot and save images
def plot_images_test(data, interval, output_dir):
    """Plot each column per window, labelling the x-axis with the day of month.

    The frame is cut into consecutive, non-overlapping windows of
    ``interval`` rows. Only full windows are plotted; a shorter remainder at
    the end is skipped.

    parameters
    -----
    data: dataframe with a DatetimeIndex
    interval: number of rows per window
    output_dir: directory that receives the images (created if missing)

    return
    ------
    None; files are written as
    ``<output_dir>/<column>_<first>_<last>.png``
    """
    # make sure the destination exists before the first save
    os.makedirs(output_dir, exist_ok=True)
    # filesystem-safe timestamp (no spaces or colons), same format as the
    # candlestick helpers in this notebook
    stamp = "%Y-%m-%d_%H-%M-%S"

    # "+ 1" keeps the last full window when len(data) is an exact multiple
    # of interval
    for start in range(0, len(data) - interval + 1, interval):
        segment = data[start:start + interval]
        first, last = segment.index[0], segment.index[-1]
        # day-of-month for every row, taken directly from the index
        day_labels = segment.index.day.tolist()

        for label in segment.columns:
            fig, ax = plt.subplots(figsize=(10, 5), dpi=300)
            try:
                ax.plot(segment.index, segment[label], linewidth=3)
                ax.set_title(f"{label} from {first} to {last}")
                ax.set_xlabel("Date")
                ax.set_ylabel(label)
                # one tick per row, showing only the day part
                ax.set_xticks(segment.index)
                ax.set_xticklabels(day_labels, rotation=90)
                # add grid line
                ax.grid(True)
                file_name = f"{label}_{first.strftime(stamp)}_{last.strftime(stamp)}.png"
                fig.savefig(os.path.join(output_dir, file_name))
            finally:
                # always release the figure, even if saving fails
                plt.close(fig)

In [17]:
def plot_and_save_labels(data, interval, output_dir):

    """ Plot and save a labelled candlestick chart for every window.

        The frame is cut into consecutive, non-overlapping windows of
        ``interval`` rows. Only full windows are plotted; a shorter
        remainder at the end is skipped. For each window one image is
        written per column; the column name is used for the title, the
        y-axis label and the file name, while the candles themselves are
        drawn by mplfinance from the whole segment.

        parameters
        -----
        data: dataframe with a DatetimeIndex, passed as-is to mpf.plot
        interval: number of rows per window
        output_dir: directory to save the images (created if missing)

        return
        ------
        None; files are written as
        <output_dir>/<column>_<start>_<end>.png
        """
    # make sure the destination exists before the first save
    os.makedirs(output_dir, exist_ok=True)

    # "+ 1" keeps the last full window when len(data) is an exact multiple
    # of interval
    for start in range(0, len(data) - interval + 1, interval):

        # segment out the interval from the dataframe
        segment = data[start:start + interval]
        # start / end stamps are the same for every column of the window
        start_date = segment.index[0].strftime("%Y-%m-%d_%H-%M-%S")
        end_date = segment.index[-1].strftime("%Y-%m-%d_%H-%M-%S")

        # one image per column label
        for label in segment.columns:

            # set title
            title = f"{label} from {segment.index[0]} to {segment.index[-1]}"
            # file name to output
            file_name = os.path.join(output_dir, f"{label}_{start_date}_{end_date}.png")
            # mpf.plot builds its own figure, so no pyplot figure is
            # created up front
            mpf.plot(segment, type='candle',
                     title=title, style='charles', datetime_format='%y %b %d',
                     ylabel=label, xlabel="Date", savefig=dict(fname=file_name, dpi=300))
            # release the figure if mplfinance left it open
            plt.close()

In [18]:
# labelled candlestick charts for every 30-row window of the one-year data
plot_and_save_labels(
    output_dir="candle_sticks",
    interval=30,
    data=df_one_year,
)

In [19]:
def plot_candle_sticks(data, interval, output_dir):

    """ Plot and save an axis-free candlestick chart for every window.

        The frame is cut into consecutive, non-overlapping windows of
        ``interval`` rows. Only full windows are plotted; a shorter
        remainder at the end is skipped. For each window one image is
        written per column; here the column name only affects the file
        name, since the chart is drawn from the whole segment with the
        axes switched off.

        parameters
        -----
        data: dataframe with a DatetimeIndex, passed as-is to mpf.plot
        interval: number of rows per window
        output_dir: directory to save the images (created if missing)

        return
        ------
        None; files are written as
        <output_dir>/<column>_<start>_<end>.png
        """
    # make sure the destination exists before the first save
    os.makedirs(output_dir, exist_ok=True)

    # "+ 1" keeps the last full window when len(data) is an exact multiple
    # of interval
    for start in range(0, len(data) - interval + 1, interval):

        # segment out the interval from the dataframe
        segment = data[start:start + interval]
        # start / end stamps are the same for every column of the window
        start_date = segment.index[0].strftime("%Y-%m-%d_%H-%M-%S")
        end_date = segment.index[-1].strftime("%Y-%m-%d_%H-%M-%S")

        # one image per column label
        for label in segment.columns:

            # file name to output
            file_name = os.path.join(output_dir, f"{label}_{start_date}_{end_date}.png")
            # mpf.plot builds its own figure, so no pyplot figure is
            # created up front
            mpf.plot(segment, type='candle',
                     style='charles', datetime_format='%y %b %d',
                     savefig=dict(fname=file_name, dpi=300), axisoff=True)
            # release the figure if mplfinance left it open
            plt.close()

In [20]:
# axis-free candlestick charts for every 30-row window of the one-year data
plot_candle_sticks(
    output_dir="candle_sticks_no_label",
    interval=30,
    data=df_one_year,
)

In [33]:
def plot_and_save(data: pd.DataFrame, interval: int = 10, output_dir: str = ".", label_on: bool = False) -> None :

    """ Plot and save a candlestick chart for every window of the data.

        The frame is cut into consecutive, non-overlapping windows of
        ``interval`` rows. Only full windows are plotted; a shorter
        remainder at the end is skipped. For each window one image is
        written per column; the column name is used for the title, the
        y-axis label and the file name, while the candles themselves are
        drawn by mplfinance from the whole segment.

        parameters
        -----
        data: dataframe with a DatetimeIndex, passed as-is to mpf.plot
        interval: number of rows per window <default: 10>
        output_dir: directory to save the images, created if missing
            <default: the current directory>
        label_on: boolean <True / False> <default: False>;
            True draws the axes, False switches them off (axisoff=True)

        return
        ------
        None; files are written as
        <output_dir>/<column>_<start>_<end>.png
        """
    # make sure the destination exists before the first save
    os.makedirs(output_dir, exist_ok=True)

    # "+ 1" keeps the last full window when len(data) is an exact multiple
    # of interval
    for start in range(0, len(data) - interval + 1, interval):

        # segment out the interval from the dataframe
        segment = data[start:start + interval]
        # start / end stamps are the same for every column of the window
        start_date = segment.index[0].strftime("%Y-%m-%d_%H-%M-%S")
        end_date = segment.index[-1].strftime("%Y-%m-%d_%H-%M-%S")

        # one image per column label
        for label in segment.columns:

            # set title
            title = f"{label} from {segment.index[0]} to {segment.index[-1]}"
            # file name to output
            file_name = os.path.join(output_dir, f"{label}_{start_date}_{end_date}.png")
            # a single call covers both modes: the axes are hidden exactly
            # when labels are off. mpf.plot builds its own figure, so no
            # pyplot figure is created up front
            mpf.plot(segment, type='candle',
                     title=title, style='charles', datetime_format='%y %b %d',
                     ylabel=label, xlabel="Date",
                     savefig=dict(fname=file_name, dpi=300),
                     axisoff=not label_on)
            # release the figure if mplfinance left it open
            plt.close()

In [31]:
# 30-row windows with label_on left at its default
plot_and_save(
    output_dir="test_1",
    interval=30,
    data=df_one_year,
)

In [29]:
# 30-row windows with labels switched on
plot_and_save(
    label_on=True,
    output_dir="test_2",
    interval=30,
    data=df_one_year,
)

In [34]:
# only the dataframe is supplied; every other argument uses its default
plot_and_save(
    data=df_one_year,
)