In [38]:
#### Preamble ####
# Purpose: Simulates some data for BTC/USDT
# Author: Jiazhou(Justin) Bi
# Date: Nov 15, 2024
# Contact: justin.bi@mail.utoronto.ca
# License: None
# Pre-requisites: see requirements.txt
# Any other information needed? None

In the following section, I will simulate 1000 observations for my desired dataset, based on the sketch in the "other" folder as well as the logic of the data columns. The simulated data will be saved to "../data/00-simulated_data/simulated_data_1h.parquet" and tested in "./01-test_simulated_data.ipynb", which is in the same folder as this notebook.

In [39]:
import pandas as pd
import numpy as np


In [40]:
# Start from an empty DataFrame that only declares the columns of the simulation
column_names = [
    'Time',
    'Open-Diff',
    'Low-Diff',
    'High-Diff',
    'Close-Diff',
    'Volume',
    'D_t-1',
    'D_t+1',
]
simulated_df = pd.DataFrame(columns=column_names)
simulated_df

Unnamed: 0,Time,Open-Diff,Low-Diff,High-Diff,Close-Diff,Volume,D_t-1,D_t+1


In [41]:
# Build an hourly (1h) timestamp grid running from 2020-01-01 16:00 to 2022-01-01 16:00.
range_start = pd.Timestamp("2020-01-01 16:00:00")
range_end = pd.Timestamp("2022-01-01 16:00:00")
full_time_range = pd.date_range(start=range_start, end=range_end, freq='1h')

# Only the first 1000 timestamps are kept; they populate the 'Time' column.
time_values = full_time_range[:1000]
simulated_df['Time'] = time_values

# Show the frame's dimensions and its first rows.
print(simulated_df.shape)
print(simulated_df.head())

(1000, 8)
                 Time Open-Diff Low-Diff High-Diff Close-Diff Volume D_t-1  \
0 2020-01-01 16:00:00       NaN      NaN       NaN        NaN    NaN   NaN   
1 2020-01-01 17:00:00       NaN      NaN       NaN        NaN    NaN   NaN   
2 2020-01-01 18:00:00       NaN      NaN       NaN        NaN    NaN   NaN   
3 2020-01-01 19:00:00       NaN      NaN       NaN        NaN    NaN   NaN   
4 2020-01-01 20:00:00       NaN      NaN       NaN        NaN    NaN   NaN   

  D_t+1  
0   NaN  
1   NaN  
2   NaN  
3   NaN  
4   NaN  


Simulating the values for the column "Close-Diff". Each value is drawn independently from a normal distribution with mean 0 and standard deviation 20, so the series fluctuates randomly around zero; this randomness introduces noise into the data.

In [42]:
# Number of observations to simulate, and a fixed seed so the draw is reproducible
num_rows = 1000
np.random.seed(210)

# Draw Close-Diff as normal noise with mean 0 and standard deviation 20
close_diff = np.random.normal(0, 20, num_rows)

# Store the draws in the DataFrame and show the first rows
simulated_df['Close-Diff'] = close_diff

print(simulated_df.head())

                 Time Open-Diff Low-Diff High-Diff  Close-Diff Volume D_t-1  \
0 2020-01-01 16:00:00       NaN      NaN       NaN   -5.976533    NaN   NaN   
1 2020-01-01 17:00:00       NaN      NaN       NaN  -12.128040    NaN   NaN   
2 2020-01-01 18:00:00       NaN      NaN       NaN    0.466860    NaN   NaN   
3 2020-01-01 19:00:00       NaN      NaN       NaN   30.164719    NaN   NaN   
4 2020-01-01 20:00:00       NaN      NaN       NaN   29.239115    NaN   NaN   

  D_t+1  
0   NaN  
1   NaN  
2   NaN  
3   NaN  
4   NaN  


Applying the same logic to simulate "Open-Diff", "Low-Diff", "High-Diff", and Volume.

In [43]:
# Number of observations shared by every simulated column
num_rows = 1000


def _draw_centered_normal(seed, scale):
    """Reseed NumPy's global RNG with `seed`, then return `num_rows` draws from a
    normal distribution with mean 0 and standard deviation `scale`."""
    np.random.seed(seed)
    return np.random.normal(loc=0, scale=scale, size=num_rows)


# Every column uses its own seed; the draws are made in ascending seed order
open_diff = _draw_centered_normal(211, 20)
low_diff = _draw_centered_normal(212, 30)
high_diff = _draw_centered_normal(213, 30)
# np.abs keeps the simulated volume non-negative
volume = np.abs(_draw_centered_normal(214, 2))

# Feed the simulated values to the DataFrame
for column_name, column_values in [
    ('Open-Diff', open_diff),
    ('Low-Diff', low_diff),
    ('High-Diff', high_diff),
    ('Volume', volume),
]:
    simulated_df[column_name] = column_values

print(simulated_df.head())

                 Time  Open-Diff   Low-Diff  High-Diff  Close-Diff    Volume  \
0 2020-01-01 16:00:00  13.983636   6.880407   4.045364   -5.976533  0.279903   
1 2020-01-01 17:00:00  20.404356  -4.463911  17.813336  -12.128040  0.876434   
2 2020-01-01 18:00:00  -1.931295 -34.033355  23.718640    0.466860  0.375277   
3 2020-01-01 19:00:00  36.151979 -32.252611  28.071660   30.164719  1.620702   
4 2020-01-01 20:00:00  -9.982300 -10.985767 -29.305975   29.239115  0.949814   

  D_t-1 D_t+1  
0   NaN   NaN  
1   NaN   NaN  
2   NaN   NaN  
3   NaN   NaN  
4   NaN   NaN  


Lastly, I will calculate the values of D_t-1 and D_t+1. D_t-1 will be 1 if the Close-Diff is positive, and -1 otherwise. For D_t+1, if the next Close-Diff is positive, it is 1, otherwise it will be -1, as per the definition of our dataset. The last row has no next observation, so its D_t+1 is left missing.

In [44]:

# D_t-1: direction of the current row's Close-Diff.
# +1 when Close-Diff is strictly positive, -1 otherwise (so an exact zero maps to -1).
# Computed with a vectorised comparison instead of a per-element lambda.
close_diff_col = simulated_df['Close-Diff']
simulated_df['D_t-1'] = np.where(close_diff_col > 0, 1, -1).astype('int64')

# D_t+1: direction of the NEXT row's Close-Diff (shift(-1) pulls the next value up).
# The last row has no successor, so its value is left missing (NaN); because of that
# missing value the column is stored as float (1.0 / -1.0) rather than int.
next_close_diff = close_diff_col.shift(-1)
simulated_df['D_t+1'] = pd.Series(
    np.where(next_close_diff > 0, 1.0, -1.0),
    index=simulated_df.index,
).where(next_close_diff.notna())

In [45]:
# Preview the first 20 rows of the simulated DataFrame
simulated_df.head(20)

Unnamed: 0,Time,Open-Diff,Low-Diff,High-Diff,Close-Diff,Volume,D_t-1,D_t+1
0,2020-01-01 16:00:00,13.983636,6.880407,4.045364,-5.976533,0.279903,-1,-1.0
1,2020-01-01 17:00:00,20.404356,-4.463911,17.813336,-12.12804,0.876434,-1,1.0
2,2020-01-01 18:00:00,-1.931295,-34.033355,23.71864,0.46686,0.375277,1,1.0
3,2020-01-01 19:00:00,36.151979,-32.252611,28.07166,30.164719,1.620702,1,1.0
4,2020-01-01 20:00:00,-9.9823,-10.985767,-29.305975,29.239115,0.949814,1,-1.0
5,2020-01-01 21:00:00,18.664427,-28.557518,-10.487868,-11.983489,0.573252,-1,1.0
6,2020-01-01 22:00:00,14.575335,-25.758442,-18.305657,20.504349,0.525941,1,1.0
7,2020-01-01 23:00:00,-13.589053,-20.392439,-7.905701,3.31677,0.102138,1,-1.0
8,2020-01-02 00:00:00,10.375314,20.653518,-13.21922,-15.557331,2.395682,-1,-1.0
9,2020-01-02 01:00:00,-25.655323,5.844354,29.572112,-12.775636,0.12845,-1,1.0


In [46]:
# Write the simulated DataFrame to a parquet file (index excluded) so it can be tested
output_path = '../data/00-simulated_data/simulated_data_1h.parquet'
simulated_df.to_parquet(output_path, index=False)