In [1]:
#### Preamble ####
# Purpose: Validate all of the variables found in the dataset
# Author: Jiazhou(Justin) Bi
# Date: 16 Nov 2024
# Contact: justin.bi@mail.utoronto.ca
# License: MIT
# Pre-requisites: see requirements.txt
# Any other information needed? None

# Loading the Dataset

In [2]:
import pandas as pd
# Read the cleaned 1-minute, 1-hour and 1-day parquet files (in that order)
# into df_1m, df_1h and df_1d.
df_1m, df_1h, df_1d = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../data/02-analysis_data/cleaned_data_1m.parquet',
        '../data/02-analysis_data/cleaned_data_1h.parquet',
        '../data/02-analysis_data/cleaned_data_1d.parquet',
    )
)
# Preview the first rows of the hourly frame as the cell output.
df_1h.head()

Unnamed: 0,open,high,low,close,volume,index,was_missing,direction_t-1,direction_t+1
0,4261.48,4313.62,4261.32,4308.83,47.181009,2017-08-17 04:00:00,0,0,1
1,4308.83,4328.69,4291.37,4315.32,23.234916,2017-08-17 05:00:00,0,1,1
2,4330.29,4345.45,4309.37,4324.35,7.229691,2017-08-17 06:00:00,0,1,1
3,4316.62,4349.99,4287.41,4349.99,4.443249,2017-08-17 07:00:00,0,1,1
4,4333.32,4377.85,4333.32,4360.69,0.972807,2017-08-17 08:00:00,0,1,1


# Open

The open price should always be lower than or equal to the high price, and always higher than or equal to the low price. This subsection will check if this logic is correct.

In [3]:
# Open-price sanity check for the 1-minute data: every open must lie within
# that row's [low, high] range (both bounds inclusive).
open_in_range_1m = df_1m['open'].between(df_1m['low'], df_1m['high'])
# Series.all() reduces in vectorised code; the builtin all() would walk the
# Series one element at a time in Python.
print('PASS' if open_in_range_1m.all() else 'FAIL')

PASS


In [4]:
# Open-price sanity check for the 1-hour data: every open must lie within
# that row's [low, high] range (both bounds inclusive).
open_in_range_1h = df_1h['open'].between(df_1h['low'], df_1h['high'])
# Series.all() reduces in vectorised code; the builtin all() would walk the
# Series one element at a time in Python.
print('PASS' if open_in_range_1h.all() else 'FAIL')

PASS


In [5]:
# Open-price sanity check for the 1-day data: every open must lie within
# that row's [low, high] range (both bounds inclusive).
open_in_range_1d = df_1d['open'].between(df_1d['low'], df_1d['high'])
# Series.all() reduces in vectorised code; the builtin all() would walk the
# Series one element at a time in Python.
print('PASS' if open_in_range_1d.all() else 'FAIL')

PASS


Originally, missing values were found here. They have since been fixed in 03-clean_data.ipynb.

# High

The high price should always be higher than or equal to the open, low, and close prices. This subsection will check if this logic is correct.

In [6]:
# High-price sanity check for the 1-minute data: on every row the high must be
# at or above the open, the low and the close.
high_is_max_1m = (
    (df_1m['high'] >= df_1m['open'])
    & (df_1m['high'] >= df_1m['low'])
    & (df_1m['high'] >= df_1m['close'])
)
# Series.all() reduces in vectorised code; the builtin all() would walk the
# Series one element at a time in Python.
print('PASS' if high_is_max_1m.all() else 'FAIL')

PASS


In [7]:
# High-price sanity check for the 1-hour data: on every row the high must be
# at or above the open, the low and the close.
high_is_max_1h = (
    (df_1h['high'] >= df_1h['open'])
    & (df_1h['high'] >= df_1h['low'])
    & (df_1h['high'] >= df_1h['close'])
)
# Series.all() reduces in vectorised code; the builtin all() would walk the
# Series one element at a time in Python.
print('PASS' if high_is_max_1h.all() else 'FAIL')

PASS


In [8]:
# High-price sanity check for the 1-day data: on every row the high must be
# at or above the open, the low and the close.
high_is_max_1d = (
    (df_1d['high'] >= df_1d['open'])
    & (df_1d['high'] >= df_1d['low'])
    & (df_1d['high'] >= df_1d['close'])
)
# Series.all() reduces in vectorised code; the builtin all() would walk the
# Series one element at a time in Python.
print('PASS' if high_is_max_1d.all() else 'FAIL')

PASS


# Low

The low price should always be lower than or equal to the open, high, and close prices. This subsection will check if this logic is correct.


In [9]:
# Low-price sanity check for the 1-minute data: on every row the low must be
# at or below the open, the high and the close.
low_is_min_1m = (
    (df_1m['low'] <= df_1m['open'])
    & (df_1m['low'] <= df_1m['high'])
    & (df_1m['low'] <= df_1m['close'])
)
# Series.all() reduces in vectorised code; the builtin all() would walk the
# Series one element at a time in Python.
print('PASS' if low_is_min_1m.all() else 'FAIL')

PASS


In [10]:
# Low-price sanity check for the 1-hour data: on every row the low must be
# at or below the open, the high and the close.
low_is_min_1h = (
    (df_1h['low'] <= df_1h['open'])
    & (df_1h['low'] <= df_1h['high'])
    & (df_1h['low'] <= df_1h['close'])
)
# Series.all() reduces in vectorised code; the builtin all() would walk the
# Series one element at a time in Python.
print('PASS' if low_is_min_1h.all() else 'FAIL')

PASS


In [11]:
# Low-price sanity check for the 1-day data: on every row the low must be
# at or below the open, the high and the close.
low_is_min_1d = (
    (df_1d['low'] <= df_1d['open'])
    & (df_1d['low'] <= df_1d['high'])
    & (df_1d['low'] <= df_1d['close'])
)
# Series.all() reduces in vectorised code; the builtin all() would walk the
# Series one element at a time in Python.
print('PASS' if low_is_min_1d.all() else 'FAIL')

PASS


# Close

The close price should always be lower than or equal to the high price, and always higher than or equal to the low price. Although this has already been established by the subsections above, we will double-check the data. This subsection will check if this logic is correct.

In [12]:
# Close-price sanity check for the 1-minute data: every close must lie within
# that row's [low, high] range (both bounds inclusive).
close_in_range_1m = df_1m['close'].between(df_1m['low'], df_1m['high'])
# Series.all() reduces in vectorised code; the builtin all() would walk the
# Series one element at a time in Python.
print('PASS' if close_in_range_1m.all() else 'FAIL')

PASS


In [13]:
# Close-price sanity check for the 1-hour data: every close must lie within
# that row's [low, high] range (both bounds inclusive).
close_in_range_1h = df_1h['close'].between(df_1h['low'], df_1h['high'])
# Series.all() reduces in vectorised code; the builtin all() would walk the
# Series one element at a time in Python.
print('PASS' if close_in_range_1h.all() else 'FAIL')

PASS


In [14]:
# Close-price sanity check for the 1-day data: every close must lie within
# that row's [low, high] range (both bounds inclusive).
close_in_range_1d = df_1d['close'].between(df_1d['low'], df_1d['high'])
# Series.all() reduces in vectorised code; the builtin all() would walk the
# Series one element at a time in Python.
print('PASS' if close_in_range_1d.all() else 'FAIL')

PASS


# Volume

Volume can be any number that is non-negative. This subsection will verify this logic.

In [15]:
# Volume must be non-negative. Check all three datasets rather than only the
# daily one, so the 1-minute and 1-hour frames do not go unvalidated.
# NOTE(review): assumes df_1m has the same 'volume' column shown for df_1h.
volume_ok = all(
    # Series.all() is the vectorised reduction over each frame's rows.
    (frame['volume'] >= 0).all()
    for frame in (df_1m, df_1h, df_1d)
)
# A single PASS means every dataset passed; FAIL means at least one did not.
print('PASS' if volume_ok else 'FAIL')

PASS


# Was_missing

This column is generated in 03-clean_data.ipynb to serve as a flag indicating whether the row existed in the raw dataset or was missing and interpolated. It should be either 0 (originally existed) or 1 (originally missing). This subsection will verify this. Note that there were no missing values in the 1-day dataset, so this column does not exist in that dataset.

In [16]:
# Show the distinct flag values in the 1-minute data, then the number of
# null entries in the column.
was_missing_1m = df_1m['was_missing']
unique_values = was_missing_1m.unique()
print(unique_values, was_missing_1m.isna().sum(), sep='\n')

[0 1]
0


In [17]:
# Show the distinct flag values in the 1-hour data, then the number of
# null entries in the column.
was_missing_1h = df_1h['was_missing']
unique_values = was_missing_1h.unique()
print(unique_values, was_missing_1h.isna().sum(), sep='\n')

[0 1]
0


# Direction

This column is generated in 03-clean_data.ipynb to indicate whether the closing price has increased, decreased, or remained the same compared with the previous timestamp. It should be one of -1 (decreased), 0 (remained the same), or 1 (increased). This subsection will verify this.

In [18]:
# Show the distinct 'direction_t-1' values in the 1-minute data, then the
# number of null entries in the column.
direction_prev_1m = df_1m['direction_t-1']
unique_values = direction_prev_1m.unique()
print(unique_values, direction_prev_1m.isna().sum(), sep='\n')

[ 0  1 -1]
0


In [19]:
# Inspect the 'direction_t+1' column of the 1-minute data.
direction_next_1m = df_1m['direction_t+1']
unique_values = direction_next_1m.unique()
print(unique_values)
# Count nulls BEFORE the integer cast: astype(int) raises on missing values,
# so a count taken after the cast could never report anything but 0.
print(direction_next_1m.isna().sum())
# Keep the original cast so the column is stored as plain integers.
df_1m['direction_t+1'] = direction_next_1m.astype(int)

[ 0  1 -1]
0


In [20]:
# Show the distinct 'direction_t-1' values in the 1-hour data, then the
# number of null entries in the column.
direction_prev_1h = df_1h['direction_t-1']
unique_values = direction_prev_1h.unique()
print(unique_values, direction_prev_1h.isna().sum(), sep='\n')

[ 0  1 -1]
0


In [21]:
# Inspect the 'direction_t+1' column of the 1-hour data.
direction_next_1h = df_1h['direction_t+1']
unique_values = direction_next_1h.unique()
print(unique_values)
# Count nulls BEFORE the integer cast: astype(int) raises on missing values,
# so a count taken after the cast could never report anything but 0.
print(direction_next_1h.isna().sum())
# Keep the original cast so the column is stored as plain integers.
df_1h['direction_t+1'] = direction_next_1h.astype(int)

[ 1 -1  0]
0


In [22]:
# Show the distinct 'direction_t-1' values in the 1-day data, then the
# number of null entries in the column.
direction_prev_1d = df_1d['direction_t-1']
unique_values = direction_prev_1d.unique()
print(unique_values, direction_prev_1d.isna().sum(), sep='\n')

[ 0 -1  1]
0


In [23]:
# Inspect the 'direction_t+1' column of the 1-day data.
direction_next_1d = df_1d['direction_t+1']
unique_values = direction_next_1d.unique()
print(unique_values)
# Count nulls BEFORE the integer cast: astype(int) raises on missing values,
# so a count taken after the cast could never report anything but 0.
print(direction_next_1d.isna().sum())
# Keep the original cast so the column is stored as plain integers.
df_1d['direction_t+1'] = direction_next_1d.astype(int)

[-1  1  0]
0
