In [53]:
#### Preamble ####
# Purpose: Cleans, engineers, and saves the cleaned data 
# Author: Jiazhou(Justin) Bi
# Date: 15 Nov 2024
# Contact: justin.bi@mail.utoronto.ca
# License: None
# Pre-requisites: see requirements.txt
# Any other information needed? None

# Loading the data

In [54]:
import pandas as pd

In [55]:
# Read the raw 1-minute, 1-hour and 1-day datasets into DataFrames
df_1m, df_1h, df_1d = (
    pd.read_parquet(raw_path)
    for raw_path in (
        '../data/01-raw_data/raw_data_1m.parquet',
        '../data/01-raw_data/raw_data_1h.parquet',
        '../data/01-raw_data/raw_data_1d.parquet',
    )
)

# Checking for Missing Values

In [56]:
# Count the NaN entries in every column of the 1-minute data and display the tally
nan_counts_1m = df_1m.isna().sum()
nan_counts_1m

timestamp    0
open         0
high         0
low          0
close        0
volume       0
dtype: int64

In [57]:
# Count the NaN entries in every column of the 1-hour data and display the tally
nan_counts_1h = df_1h.isna().sum()
nan_counts_1h

timestamp    0
open         0
high         0
low          0
close        0
volume       0
dtype: int64

In [58]:
# Count the NaN entries in every column of the 1-day data and display the tally
nan_counts_1d = df_1d.isna().sum()
nan_counts_1d

timestamp    0
open         0
high         0
low          0
close        0
volume       0
dtype: int64

There are no missing values found in these datasets.

# Data Types

This section examines the datatypes of each column and ensures they are appropriate for the analysis.

In [59]:
# Display the dtype of each column in the 1-minute data
column_types_1m = df_1m.dtypes
column_types_1m

timestamp    datetime64[ns]
open                float64
high                float64
low                 float64
close               float64
volume              float64
dtype: object

In [60]:
# Display the dtype of each column in the 1-hour data
column_types_1h = df_1h.dtypes
column_types_1h

timestamp    datetime64[ns]
open                float64
high                float64
low                 float64
close               float64
volume              float64
dtype: object

In [61]:
# Display the dtype of each column in the 1-day data
column_types_1d = df_1d.dtypes
column_types_1d

timestamp    datetime64[ns]
open                float64
high                float64
low                 float64
close               float64
volume              float64
dtype: object

# Data Validation

This section validates whether the datasets are consecutive. That is, for the 1-minute timestamp dataset, no minutes should be skipped. The same logic applies to the 1-hour and 1-day timestamp datasets.

## Validating if any minute is missing

In [62]:
# Build an unbroken minute-by-minute index running from the earliest to the
# latest timestamp present in the 1-minute dataset
minute_stamps = df_1m['timestamp']
full_minute_range = pd.date_range(
    minute_stamps.min(),
    minute_stamps.max(),
    freq='1min',
)

In [63]:
# Index the frame by timestamp, then align it to the full minute range;
# any minute absent from the raw data becomes a row filled with NaN
df_1m = df_1m.set_index('timestamp').reindex(full_minute_range)

In [64]:
# Collect every row that holds a NaN in at least one column
has_nan = df_1m.isna().any(axis=1)
missing_rows = df_1m.loc[has_nan]

# Report the gaps, if there are any
if not missing_rows.empty:
    print("Rows with missing values:")
    print(missing_rows)
else:
    print("No missing values.")

Rows with missing values:
                     open  high  low  close  volume
2017-09-06 16:01:00   NaN   NaN  NaN    NaN     NaN
2017-09-06 16:02:00   NaN   NaN  NaN    NaN     NaN
2017-09-06 16:03:00   NaN   NaN  NaN    NaN     NaN
2017-09-06 16:04:00   NaN   NaN  NaN    NaN     NaN
2017-09-06 16:05:00   NaN   NaN  NaN    NaN     NaN
...                   ...   ...  ...    ...     ...
2023-03-24 13:55:00   NaN   NaN  NaN    NaN     NaN
2023-03-24 13:56:00   NaN   NaN  NaN    NaN     NaN
2023-03-24 13:57:00   NaN   NaN  NaN    NaN     NaN
2023-03-24 13:58:00   NaN   NaN  NaN    NaN     NaN
2023-03-24 13:59:00   NaN   NaN  NaN    NaN     NaN

[8632 rows x 5 columns]


There are missing minutes. We will first mark the missing rows in a new column called 'was_missing', where 1 indicates the row was missing and 0 means it was not. Then the missing values are filled with linear interpolation, except for the column "volume", which is set to 0 for those rows.

In [65]:
# Flag rows that contain a NaN in any column (1 = row was missing, 0 = row was present)
gap_mask_1m = df_1m.isna().any(axis=1)
df_1m['was_missing'] = gap_mask_1m.astype(int)

# Fill the gaps by linear interpolation ...
df_1m = df_1m.interpolate(method='linear')
# ... except for volume, which is set to 0 on the rows that were missing
df_1m.loc[gap_mask_1m, 'volume'] = 0

In [66]:
# Check again for missing values across the entire DataFrame
still_nan = df_1m.isna().any(axis=1)
missing_rows = df_1m.loc[still_nan]

if not missing_rows.empty:
    print("Rows with missing values:")
    print(missing_rows)
else:
    print("No missing values.")

No missing values.


## Validating if any hour is missing from the 1-hour dataset

In [67]:
# Same continuity check as for the minutes: build an unbroken hourly index from
# the earliest to the latest timestamp and align the frame to it
hour_stamps = df_1h['timestamp']
full_hour_range = pd.date_range(hour_stamps.min(), hour_stamps.max(), freq='1h')
df_1h = df_1h.set_index('timestamp').reindex(full_hour_range)

# Rows with a NaN in any column are hours absent from the raw data
missing_rows_1h = df_1h.loc[df_1h.isna().any(axis=1)]

if not missing_rows_1h.empty:
    print("Rows with missing values:")
    print(missing_rows_1h)
else:
    print("No missing values.")

Rows with missing values:
                     open  high  low  close  volume
2017-09-06 17:00:00   NaN   NaN  NaN    NaN     NaN
2017-09-06 18:00:00   NaN   NaN  NaN    NaN     NaN
2017-09-06 19:00:00   NaN   NaN  NaN    NaN     NaN
2017-09-06 20:00:00   NaN   NaN  NaN    NaN     NaN
2017-09-06 21:00:00   NaN   NaN  NaN    NaN     NaN
...                   ...   ...  ...    ...     ...
2021-08-13 04:00:00   NaN   NaN  NaN    NaN     NaN
2021-08-13 05:00:00   NaN   NaN  NaN    NaN     NaN
2021-09-29 07:00:00   NaN   NaN  NaN    NaN     NaN
2021-09-29 08:00:00   NaN   NaN  NaN    NaN     NaN
2023-03-24 13:00:00   NaN   NaN  NaN    NaN     NaN

[128 rows x 5 columns]


In [68]:
# Same treatment as the 1-minute data: flag rows that contain a NaN in any column
# (1 = row was missing, 0 = row was present)
gap_mask_1h = df_1h.isna().any(axis=1)
df_1h['was_missing'] = gap_mask_1h.astype(int)

# Fill the gaps by linear interpolation, then set volume to 0 on the flagged rows
df_1h = df_1h.interpolate(method='linear')
df_1h.loc[gap_mask_1h, 'volume'] = 0

# Check again for missing values across the entire DataFrame
missing_rows_1h = df_1h.loc[df_1h.isna().any(axis=1)]

if not missing_rows_1h.empty:
    print("Rows with missing values:")
    print(missing_rows_1h)
else:
    print("No missing values.")

No missing values.


## Validating if any day is missing from the 1-day dataset

In [69]:
# Same continuity check as for the minutes and hours: build an unbroken daily
# index from the earliest to the latest timestamp and align the frame to it.
# The day alias is written in upper case ('1D'): the lowercase 'd' spelling is
# deprecated in recent pandas releases, while 'D' is accepted by every version.
full_day_range = pd.date_range(start=df_1d['timestamp'].min(), end=df_1d['timestamp'].max(), freq='1D')
df_1d = df_1d.set_index('timestamp').reindex(full_day_range)

# Rows with a NaN in any column are days absent from the raw data
missing_rows_1d = df_1d[df_1d.isna().any(axis=1)]

if missing_rows_1d.empty:
    print("No missing values.")
else:
    print("Rows with missing values:")
    print(missing_rows_1d)

No missing values.


For the 1-day dataset, no days were missing.

# Adding A Column For Price Change Direction

This subsection creates a new column for each dataset called `direction`. If the closing price is higher than the previous closing price, it is considered that the price has gone up and thus marked as 1 for appreciation. If the closing price is lower than the previous closing price, it is considered that the price has gone down and hence is marked as -1 for depreciation. If the price remains the same, it is marked as 0 for no movement.

In [70]:
# Calculating the direction of each close relative to the previous close:
# 1 = higher, -1 = lower, 0 = unchanged.
# The two boolean comparisons are vectorised, so no Python lambda is called
# once per row; the resulting values and int64 dtype match the lambda version.
close_change_1m = df_1m['close'].diff()
df_1m['direction'] = close_change_1m.gt(0).astype('int64') - close_change_1m.lt(0).astype('int64')

# Dropping the first row as it does not have a direction (its diff is NaN, which
# both comparisons evaluate as False, so it would otherwise be recorded as 0).
# NOTE(review): the index was replaced by an unnamed date_range during reindexing,
# so reset_index presumably restores it as a column called 'index' rather than
# 'timestamp' — confirm this is what downstream readers expect.
df_1m = df_1m.reset_index().iloc[1:].reset_index(drop=True)

In [71]:
# applying the same logic to the 1-hour and 1-day datasets
def _with_direction(frame):
    """Return a copy of ``frame`` with a ``direction`` column and without its first row.

    ``direction`` compares each ``close`` with the previous one:
    1 = higher, -1 = lower, 0 = unchanged. The comparison is vectorised instead
    of calling a Python lambda per row. The first row is dropped because it has
    no previous close to compare against. The index is moved back into a regular
    column and replaced by a fresh 0-based integer index.

    NOTE(review): the index was replaced by an unnamed date_range during
    reindexing, so the restored column is presumably called 'index' rather than
    'timestamp' — confirm this is what downstream readers expect.
    """
    close_change = frame['close'].diff()
    direction = close_change.gt(0).astype('int64') - close_change.lt(0).astype('int64')
    return (
        frame.assign(direction=direction)
        .reset_index()
        .iloc[1:]
        .reset_index(drop=True)
    )


df_1h = _with_direction(df_1h)
df_1d = _with_direction(df_1d)

# Saving the DataFrames as parquet files

In [72]:
# Write each cleaned frame to its own parquet file, without the integer index
for cleaned_frame, output_path in (
    (df_1m, '../data/02-analysis_data/cleaned_data_1m.parquet'),
    (df_1h, '../data/02-analysis_data/cleaned_data_1h.parquet'),
    (df_1d, '../data/02-analysis_data/cleaned_data_1d.parquet'),
):
    cleaned_frame.to_parquet(output_path, index=False)