In [1]:
import pandas as pd
import numpy as np

In [2]:
# Names of processed dfs:

'''

bnb_chain_processed, bnb_price_processed
arb1_chain_processed, arb2_chain_processed, arb_price_processed 
tron1_chain_processed, tron2_chain_processed, tron_price_processed
avax1_chain_processed, avax2_chain_processed, avax_price_processed
matic1_chain_processed, matic2_chain_processed, matic_price_processed
sol_chain_processed, sol_price_processed
eth1_chain_processed, eth2_chain_processed, eth_price_processed
btc_chain_processed, btc_price_processed

'''

'\n\nbnb_chain_processed, bnb_price_processed\narb1_chain_processed, arb2_chain_processed, arb_price_processed \ntron1_chain_processed, tron2_chain_processed, tron_price_processed\navax1_chain_processed, avax2_chain_processed, avax_price_processed\nmatic1_chain_processed, matic2_chain_processed, matic_price_processed\nsol_chain_processed, sol_price_processed\neth1_chain_processed, eth2_chain_processed, eth_price_processed\nbtc_chain_processed, btc_price_processed\n\n'

In [3]:
def clean_data(data, price_data=None, on_chain_data=None, set_index=True):
    if price_data and on_chain_data:
        raise ValueError('Invalid Parameter Values: Both price_data or on_chain Data cannot be True')
    elif price_data:
        prices = data.copy()
        if prices.isna().any().any():
            print('you have nans here')
            return prices 
        
        if set_index:
            prices['time_period_end'] = pd.to_datetime(prices['time_period_end'])
            prices = prices.set_index('time_period_end') 
        else:
            prices.index = pd.to_datetime(prices.index)

        prices['time_open'] = pd.to_datetime(prices['time_open'])
        prices['time_close'] = pd.to_datetime(prices['time_close'])
        return prices
    
    elif on_chain_data:
        metrics = data.copy()
        if metrics.isna().any().any():
            print('you have nans here')
            return metrics 
        
        if set_index:
            metrics['hour'] = pd.to_datetime(metrics['hour'])
            metrics = metrics.set_index('hour')
        else:
            metrics.index = pd.to_datetime(metrics.index)
        return metrics
    else:
        raise ValueError('Invalid Parameter Values: price_data or on_chain Data must be True')
    

In [4]:
def check_missing_hours(df):
    """
    Check if the DataFrame's datetime index skips any hourly datapoints.
    
    Parameters:
        df (pd.DataFrame): DataFrame with a DatetimeIndex.
        
    Returns:
        missing (pd.DatetimeIndex): The missing hourly timestamps.
    """
    # Ensure the index is a DatetimeIndex
    if not isinstance(df.index, pd.DatetimeIndex):
        raise ValueError("DataFrame index must be a DatetimeIndex.")
    
    # Create an expected date_range from the minimum to the maximum timestamp at hourly frequency
    expected_range = pd.date_range(start=df.index.min(), end=df.index.max(), freq='H')
    
    # Determine which timestamps are missing
    missing = expected_range.difference(df.index)
    return missing

In [5]:
def preprocess_data(data, price_data=None, on_chain_data=None, set_index=True):
    if price_data and on_chain_data:
        raise ValueError('Invalid Parameter Values. Both price_data and on_chain_data cannot both be True')

    elif (not price_data) and (not on_chain_data):
        raise ValueError('Invalid Parameter Values. Both price_data and on_chain_data cannot both be False')
    
    else:
        # Create the target df and merge on correct dates. Then forward fill the na values
        if price_data:
            df = clean_data(data.copy(), price_data=True, set_index=True) if set_index else clean_data(data.copy(), price_data=True, set_index=False)
            target_df = pd.DataFrame(0, columns=[0], index=pd.date_range(start=df.index.min(), end=df.index.max(), freq='H'))
        
        elif on_chain_data:
            df = clean_data(data.copy(), on_chain_data=True, set_index=True) if set_index else clean_data(data.copy(), on_chain_data=True, set_index=False)
            target_df = pd.DataFrame(0, columns=[0], index=pd.date_range(start=df.index.min(), end=df.index.max(), freq='H'))

        target_df = target_df.join(df, how='left').drop(0, axis=1)
        target_df = target_df.fillna(method='ffill')


        return target_df

BITCOIN CLEANING

In [6]:
btc = pd.read_parquet('BTC_Hourly_On_Chain_Data_sorted.parquet')
btc.head()

Unnamed: 0,hour,average_height,average_difficulty,average_total_fees,average_total_reward,average_mint_reward,average_transaction_count,average_nonce,average_size,average_stripped_size,average_version,average_weight
3417,2015-01-01 00:00:00.000 UTC,336868.0,40640960000.0,0.023508,25.023508,25.0,177.733333,1982945000.0,103153.666667,103153.666667,2.0,412614.7
32629,2015-01-01 01:00:00.000 UTC,336879.0,40640960000.0,0.043085,25.043085,25.0,309.571429,2230371000.0,168577.857143,168577.857143,2.0,674311.4
49123,2015-01-01 02:00:00.000 UTC,336886.5,40640960000.0,0.029944,25.029944,25.0,228.75,2312130000.0,155578.5,155578.5,2.0,622314.0
45019,2015-01-01 03:00:00.000 UTC,336892.5,40640960000.0,0.074818,25.074818,25.0,497.0,1882359000.0,257693.0,257693.0,2.0,1030772.0
18681,2015-01-01 04:00:00.000 UTC,336898.0,40640960000.0,0.032928,25.032928,25.0,246.571429,2118612000.0,130498.571429,130498.571429,2.0,521994.3


In [7]:
# We must shift the BTC Data by 1, since we don't know that information until then

btc['hour'] = pd.to_datetime(btc['hour'])
btc = btc.set_index('hour')
btc = btc.shift(1)
btc = btc.dropna()
display(btc.head())
init_num_missing_hours = len(check_missing_hours(btc))
num_nils_btc = np.where(btc == '<nil>', 1, 0).sum()
print(f'Number of Nils in raw btc on-chain Data: {num_nils_btc}')
print(f'Number of missing hours in the raw BTC on-chain Data: {init_num_missing_hours}')

Unnamed: 0_level_0,average_height,average_difficulty,average_total_fees,average_total_reward,average_mint_reward,average_transaction_count,average_nonce,average_size,average_stripped_size,average_version,average_weight
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2015-01-01 01:00:00+00:00,336868.0,40640960000.0,0.023508,25.023508,25.0,177.733333,1982945000.0,103153.666667,103153.666667,2.0,412614.7
2015-01-01 02:00:00+00:00,336879.0,40640960000.0,0.043085,25.043085,25.0,309.571429,2230371000.0,168577.857143,168577.857143,2.0,674311.4
2015-01-01 03:00:00+00:00,336886.5,40640960000.0,0.029944,25.029944,25.0,228.75,2312130000.0,155578.5,155578.5,2.0,622314.0
2015-01-01 04:00:00+00:00,336892.5,40640960000.0,0.074818,25.074818,25.0,497.0,1882359000.0,257693.0,257693.0,2.0,1030772.0
2015-01-01 05:00:00+00:00,336898.0,40640960000.0,0.032928,25.032928,25.0,246.571429,2118612000.0,130498.571429,130498.571429,2.0,521994.3


Number of Nils in raw btc on-chain data: 0
Number of missing hours in the raw BTC on-chain data: 224


In [8]:
btc_chain_processed = preprocess_data(data=btc, on_chain_data=True, set_index=False)
display(btc_chain_processed.head())
num_missing_hours = len(check_missing_hours(btc_chain_processed))
print(f'Number of missing hours in the processed BTC on-chain Data: {num_missing_hours}')

Unnamed: 0,average_height,average_difficulty,average_total_fees,average_total_reward,average_mint_reward,average_transaction_count,average_nonce,average_size,average_stripped_size,average_version,average_weight
2015-01-01 01:00:00+00:00,336868.0,40640960000.0,0.023508,25.023508,25.0,177.733333,1982945000.0,103153.666667,103153.666667,2.0,412614.7
2015-01-01 02:00:00+00:00,336879.0,40640960000.0,0.043085,25.043085,25.0,309.571429,2230371000.0,168577.857143,168577.857143,2.0,674311.4
2015-01-01 03:00:00+00:00,336886.5,40640960000.0,0.029944,25.029944,25.0,228.75,2312130000.0,155578.5,155578.5,2.0,622314.0
2015-01-01 04:00:00+00:00,336892.5,40640960000.0,0.074818,25.074818,25.0,497.0,1882359000.0,257693.0,257693.0,2.0,1030772.0
2015-01-01 05:00:00+00:00,336898.0,40640960000.0,0.032928,25.032928,25.0,246.571429,2118612000.0,130498.571429,130498.571429,2.0,521994.3


Number of missing hours in the processed BTC on-chain data: 0


In [9]:
btc_price = pd.read_csv('btc_hourly.csv').sort_values('time_period_end').set_index('time_period_end')
btc_price.index = pd.to_datetime(btc_price.index)
display(btc_price.head())
num_nans_btc_price = btc.isna().sum().sum()
print(f'Number of NaNs in raw btc price Data: {num_nans_btc_price}')
init_num_missing_hours_price = len(check_missing_hours(btc_price))
print(f'Number of missing hours in the raw BTC price Data: {init_num_missing_hours_price}')

Unnamed: 0_level_0,time_open,time_close,price_open,price_high,price_low,price_close,volume_traded,trades_count
time_period_end,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-01-14 17:00:00+00:00,2015-01-14T16:07:05.0000000Z,2015-01-14T16:55:16.0000000Z,173.93,197.97,110.2,185.91,1.05,105
2015-01-14 19:00:00+00:00,2015-01-14T18:50:59.0000000Z,2015-01-14T18:50:59.0000000Z,186.0,186.0,186.0,186.0,0.015362,1
2015-01-14 20:00:00+00:00,2015-01-14T19:05:36.0000000Z,2015-01-14T19:40:43.0000000Z,186.0,188.0,120.0,120.0,0.264638,14
2015-01-15 02:00:00+00:00,2015-01-15T01:35:08.0000000Z,2015-01-15T01:35:20.0000000Z,191.99,192.0,191.99,192.0,1.13,3
2015-01-15 05:00:00+00:00,2015-01-15T04:09:35.0000000Z,2015-01-15T04:16:31.0000000Z,150.0,150.0,150.0,150.0,0.0722,5


Number of NaNs in raw btc price data: 0
Number of missing hours in the raw BTC price data: 677


In [10]:
btc_price_processed = preprocess_data(data=btc_price, price_data=True, set_index=False)
display(btc_price_processed.head())
num_missing_hours_price = len(check_missing_hours(btc_price_processed))
print(f'Number of missing hours in the processed BTC price Data: {num_missing_hours_price}')

Unnamed: 0,time_open,time_close,price_open,price_high,price_low,price_close,volume_traded,trades_count
2015-01-14 17:00:00+00:00,2015-01-14 16:07:05+00:00,2015-01-14 16:55:16+00:00,173.93,197.97,110.2,185.91,1.05,105.0
2015-01-14 18:00:00+00:00,2015-01-14 16:07:05+00:00,2015-01-14 16:55:16+00:00,173.93,197.97,110.2,185.91,1.05,105.0
2015-01-14 19:00:00+00:00,2015-01-14 18:50:59+00:00,2015-01-14 18:50:59+00:00,186.0,186.0,186.0,186.0,0.015362,1.0
2015-01-14 20:00:00+00:00,2015-01-14 19:05:36+00:00,2015-01-14 19:40:43+00:00,186.0,188.0,120.0,120.0,0.264638,14.0
2015-01-14 21:00:00+00:00,2015-01-14 19:05:36+00:00,2015-01-14 19:40:43+00:00,186.0,188.0,120.0,120.0,0.264638,14.0


Number of missing hours in the processed BTC price data: 0


ETHEREUM CLEANING

In [11]:
eth = pd.read_parquet('ETH_Hourly_On_Chain_Data_sorted.parquet').reset_index().drop('index', axis=1).shift(1).iloc[1:]
eth.head()

Unnamed: 0,hour,average_number,average_gas_limit,average_gas_used,average_difficulty,average_total_difficulty,average_size,average_base_fee_per_gas,average_blob_gas_used,average_excess_blob_gas
1,2015-07-30 15:00:00.000 UTC,465.0,5000.0,0.0,21595370000.0,9303716647839,700.364909,<nil>,<nil>,<nil>
2,2015-07-30 16:00:00.000 UTC,1631.0,5000.0,0.0,38527130000.0,44081278443891,672.244476,<nil>,<nil>,<nil>
3,2015-07-30 17:00:00.000 UTC,2862.0,5000.0,0.0,69321760000.0,107777664952241,644.849858,<nil>,<nil>,<nil>
4,2015-07-30 18:00:00.000 UTC,3824.5,5000.0,0.0,108278800000.0,191398209046303,642.116628,<nil>,<nil>,<nil>
5,2015-07-30 19:00:00.000 UTC,4588.5,5000.0,0.0,151159400000.0,289165740480663,625.376133,<nil>,<nil>,<nil>


In [12]:
print(np.where(eth['average_total_difficulty'] == '<nil>', 1, 0).sum())
print(np.where(eth['average_total_difficulty'] == '<nil>'))
print(np.where(eth['average_base_fee_per_gas'] == '<nil>', 1, 0).sum())
print(np.where(eth['average_base_fee_per_gas'] == '<nil>'))
print(np.where(eth['average_blob_gas_used'] == '<nil>', 1, 0).sum())
print(np.where(eth['average_excess_blob_gas'] == '<nil>', 1, 0).sum())

2596
(array([81313, 81314, 81315, ..., 83906, 83907, 83908]),)
52746
(array([    0,     1,     2, ..., 52743, 52744, 52745]),)
75571
75571


In [13]:
# Cutoff ETH at the end because we have NIL values for difficulty
# Get rid of blob metrics because these are only recorded after ethereum moved to proof of stake

eth_cutoff = eth.set_index('hour').loc[:'2022-09-15 07'][['average_number', 'average_gas_limit', 'average_gas_used', 'average_difficulty', 'average_size']]
# Cutoff so we can include average_base_fee_per_gas in our feature set
eth_cutoff.index = pd.to_datetime(eth_cutoff.index)
eth_cutoff.iloc[52740:].head(10)

Unnamed: 0_level_0,average_number,average_gas_limit,average_gas_used,average_difficulty,average_size
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-08-05 06:00:00+00:00,12963407.5,14989228.0,14867784.0,7710852000000000.0,76254.156716
2021-08-05 07:00:00+00:00,12963655.0,14988204.0,14647698.0,7608569000000000.0,72463.585903
2021-08-05 08:00:00+00:00,12963897.0,14987078.0,14685168.0,7511196000000000.0,72078.284047
2021-08-05 09:00:00+00:00,12964170.0,14988947.0,14719515.0,7590093000000000.0,69635.636678
2021-08-05 10:00:00+00:00,12964456.5,14987432.0,14872388.0,7727006000000000.0,68005.704225
2021-08-05 11:00:00+00:00,12964717.0,14988133.0,14851808.0,7690777000000000.0,68376.075949
2021-08-05 12:00:00+00:00,12964974.0,20917874.0,16654806.0,7696264000000000.0,70026.66065
2021-08-05 13:00:00+00:00,12965236.0,28436542.0,15070133.0,7660835000000000.0,72965.773279
2021-08-05 14:00:00+00:00,12965499.5,27790972.0,14223123.0,7600428000000000.0,70116.403571
2021-08-05 15:00:00+00:00,12965766.5,27855298.0,14429474.0,7577809000000000.0,73928.653543


In [14]:
eth1 = eth_cutoff.loc[:'2021-08-05 11']
eth2 = eth_cutoff.loc['2021-08-05 12':]

In [15]:
num_missing_hours_chain_eth1 = len(check_missing_hours(eth1))
num_missing_hours_chain_eth2 = len(check_missing_hours(eth2))
print(f'Number of missing hours in raw eth1 on-chain Data: {num_missing_hours_chain_eth1}')
print(f'Number of missing hours in raw eth2 on-chain Data: {num_missing_hours_chain_eth2}')

Number of missing hours in raw eth1 on-chain data: 3
Number of missing hours in raw eth2 on-chain data: 0


In [16]:
eth1_chain_processed = preprocess_data(data=eth1, on_chain_data=True, set_index=False)
eth2_chain_processed = preprocess_data(data=eth2, on_chain_data=True, set_index=False)
num_missing_hours_chain_eth1_processed = len(check_missing_hours(eth1_chain_processed))
print(f'Number of missing hours in raw eth2 on-chain Data: {num_missing_hours_chain_eth1_processed}')

Number of missing hours in raw eth2 on-chain data: 0


In [17]:
eth_price = pd.read_csv('eth_hourly.csv').set_index('time_period_end')
eth_price.index = pd.to_datetime(eth_price.index)
num_nans = eth_price.isna().sum().sum()
num_missing_hours_eth_price = len(check_missing_hours(eth_price))
print(f'Number of NaNs in raw eth price Data: {num_nans}')
print(f'Number of missing hours in raw eth price Data: {num_missing_hours_eth_price}')
eth_price_processed = preprocess_data(data=eth_price, price_data=True, set_index=False)
eth_price_processed.head()

Number of NaNs in raw eth price data: 0
Number of missing hours in raw eth price data: 448


Unnamed: 0,time_open,time_close,price_open,price_high,price_low,price_close,volume_traded,trades_count
2017-08-17 05:00:00+00:00,2017-08-17 04:00:29.884000+00:00,2017-08-17 04:58:43.046000+00:00,301.13,302.57,298.0,301.61,125.66877,129.0
2017-08-17 06:00:00+00:00,2017-08-17 05:00:33.336000+00:00,2017-08-17 05:59:24.063000+00:00,301.61,303.28,300.0,303.1,377.67246,202.0
2017-08-17 07:00:00+00:00,2017-08-17 06:00:26.088000+00:00,2017-08-17 06:59:53.820000+00:00,302.4,304.44,301.9,302.68,303.86672,182.0
2017-08-17 08:00:00+00:00,2017-08-17 07:00:48.042000+00:00,2017-08-17 07:59:42.019000+00:00,302.68,307.96,302.6,307.96,754.7451,198.0
2017-08-17 09:00:00+00:00,2017-08-17 08:00:04.773000+00:00,2017-08-17 08:59:24.054000+00:00,307.95,309.97,307.0,308.62,150.75029,182.0


SOLANA CLEANING

In [18]:
# Solana On-Chain

raw_sol_chain = pd.read_parquet('query_result_SOL.parquet').set_index('hour').shift(1).iloc[1:]
raw_sol_chain.index = pd.to_datetime(raw_sol_chain.index)
raw_sol_chain

Unnamed: 0_level_0,average_height,average_total_transactions,average_successful_transactions,average_failed_transactions,average_total_vote_transactions,average_total_non_vote_transactions,average_successful_vote_transactions,average_successful_non_vote_transactions,average_failed_vote_transactions,average_failed_non_vote_transactions,average_num_reward_partitions
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-10-03 15:00:00+00:00,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,<nil>
2020-10-03 16:00:00+00:00,<nil>,190.038944,180.470872,9.568072,173.176376,16.862568,165.337625,15.133248,7.838751,1.729321,<nil>
2020-10-03 17:00:00+00:00,<nil>,197.403345,184.241584,13.161762,171.625556,25.777790,161.705060,22.536523,9.920495,3.241266,<nil>
2020-10-03 18:00:00+00:00,<nil>,190.733246,180.610083,10.123164,173.079493,17.653753,164.667941,15.942141,8.411552,1.711612,<nil>
2020-10-03 19:00:00+00:00,<nil>,192.742412,182.575677,10.166735,173.145040,19.597372,164.879100,17.696578,8.265940,1.900794,<nil>
...,...,...,...,...,...,...,...,...,...,...,...
2025-02-23 19:00:00+00:00,3.00883215e+08,1777.877906,1553.820496,224.057410,1313.753719,464.124187,1310.789862,243.030634,2.963857,221.093554,<nil>
2025-02-23 20:00:00+00:00,3.008922995e+08,1758.940510,1546.387618,212.552892,1317.871234,441.069276,1315.451397,230.936222,2.419837,210.133055,<nil>
2025-02-23 21:00:00+00:00,3.009013755e+08,1746.571429,1550.289247,196.282181,1316.694303,429.877125,1314.379664,235.909583,2.314639,193.967543,<nil>
2025-02-23 22:00:00+00:00,3.009104485e+08,1730.526629,1550.616967,179.909661,1318.189261,412.337368,1316.245268,234.371699,1.943992,177.965669,<nil>


In [19]:
raw_sol_chain = raw_sol_chain[['average_total_transactions', 'average_successful_transactions', 'average_failed_transactions', 'average_total_vote_transactions', 'average_total_non_vote_transactions', 'average_successful_vote_transactions', 'average_successful_non_vote_transactions', 'average_failed_vote_transactions', 'average_failed_non_vote_transactions']]
num_nils_sol_chain = np.where(raw_sol_chain == '<nil>', 1, 0).sum()
print(f'Number of Nils in raw SOL on-chain Data: {num_nils_sol_chain}')
init_num_missing_hours_sol_chain = len(check_missing_hours(raw_sol_chain))
print(f'Number of missing hours in the raw SOL on-chain Data: {init_num_missing_hours_sol_chain}')

Number of Nils in raw SOL on-chain data: 0
Number of missing hours in the raw SOL on-chain data: 103


In [20]:
sol_chain_processed = preprocess_data(data=raw_sol_chain, on_chain_data=True, set_index=False).iloc[1:]
processed_num_missing_hours_sol_chain = len(check_missing_hours(sol_chain_processed))
print(f'Number of missing hours in the processed SOL on-chain Data: {processed_num_missing_hours_sol_chain}')
sol_chain_processed.head()

Number of missing hours in the processed SOL on-chain data: 0


Unnamed: 0,average_total_transactions,average_successful_transactions,average_failed_transactions,average_total_vote_transactions,average_total_non_vote_transactions,average_successful_vote_transactions,average_successful_non_vote_transactions,average_failed_vote_transactions,average_failed_non_vote_transactions
2020-10-03 16:00:00+00:00,190.038944,180.470872,9.568072,173.176376,16.862568,165.337625,15.133248,7.838751,1.729321
2020-10-03 17:00:00+00:00,197.403345,184.241584,13.161762,171.625556,25.77779,161.70506,22.536523,9.920495,3.241266
2020-10-03 18:00:00+00:00,190.733246,180.610083,10.123164,173.079493,17.653753,164.667941,15.942141,8.411552,1.711612
2020-10-03 19:00:00+00:00,192.742412,182.575677,10.166735,173.14504,19.597372,164.8791,17.696578,8.26594,1.900794
2020-10-03 20:00:00+00:00,190.043383,180.030288,10.013095,173.069072,16.97431,164.780588,15.2497,8.288485,1.72461


In [21]:
# SOL Price Data

raw_sol_price = pd.read_csv('sol_hourly.csv').set_index('time_period_end')
raw_sol_price.index = pd.to_datetime(raw_sol_price.index)
init_num_nans_sol_price = raw_sol_price.isna().sum().sum()
print(f'Number of NaNs in the raw SOL price Data: {init_num_nans_sol_price}')
init_num_missing_hours_sol_price = len(check_missing_hours(raw_sol_price))
print(f'Number of missing hours in the raw SOL price Data: {init_num_missing_hours_sol_price}')
raw_sol_price.head()

Number of NaNs in the raw SOL price data: 0
Number of missing hours in the raw SOL price data: 388


Unnamed: 0_level_0,time_open,time_close,price_open,price_high,price_low,price_close,volume_traded,trades_count
time_period_end,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-08-11 07:00:00+00:00,2020-08-11T06:03:26.1400000Z,2020-08-11T06:59:15.4480000Z,3.4699,3.47,2.9054,2.9515,20028.66,266
2020-08-11 08:00:00+00:00,2020-08-11T07:00:03.0470000Z,2020-08-11T07:56:42.0020000Z,2.9515,3.1355,2.88,2.9224,42069.37,472
2020-08-11 09:00:00+00:00,2020-08-11T08:01:49.5620000Z,2020-08-11T08:58:12.1350000Z,2.9626,3.0,2.9144,2.96,24280.76,209
2020-08-11 10:00:00+00:00,2020-08-11T09:00:13.5900000Z,2020-08-11T09:59:37.3670000Z,2.96,2.9736,2.85,2.8543,26371.23,230
2020-08-11 11:00:00+00:00,2020-08-11T10:00:04.1750000Z,2020-08-11T10:54:17.0210000Z,2.8566,2.9329,2.8433,2.8976,26685.94,277


In [22]:
sol_price_processed = preprocess_data(data=raw_sol_price, price_data=True, set_index=False)
processed_num_missing_hours_sol_price = len(check_missing_hours(sol_price_processed))
print(f'Number of missing hours in the processed SOL price Data: {processed_num_missing_hours_sol_price}')
sol_price_processed.head()

Number of missing hours in the processed SOL price data: 0


Unnamed: 0,time_open,time_close,price_open,price_high,price_low,price_close,volume_traded,trades_count
2020-08-11 07:00:00+00:00,2020-08-11 06:03:26.140000+00:00,2020-08-11 06:59:15.448000+00:00,3.4699,3.47,2.9054,2.9515,20028.66,266.0
2020-08-11 08:00:00+00:00,2020-08-11 07:00:03.047000+00:00,2020-08-11 07:56:42.002000+00:00,2.9515,3.1355,2.88,2.9224,42069.37,472.0
2020-08-11 09:00:00+00:00,2020-08-11 08:01:49.562000+00:00,2020-08-11 08:58:12.135000+00:00,2.9626,3.0,2.9144,2.96,24280.76,209.0
2020-08-11 10:00:00+00:00,2020-08-11 09:00:13.590000+00:00,2020-08-11 09:59:37.367000+00:00,2.96,2.9736,2.85,2.8543,26371.23,230.0
2020-08-11 11:00:00+00:00,2020-08-11 10:00:04.175000+00:00,2020-08-11 10:54:17.021000+00:00,2.8566,2.9329,2.8433,2.8976,26685.94,277.0


MATIC (POLYGON) CLEANING

In [23]:
# matic on-chain Data

raw_matic_chain = pd.read_parquet('query_result_MATIC.parquet').set_index('hour').shift(1).iloc[1:]
raw_matic_chain.index = pd.to_datetime(raw_matic_chain.index)
display(raw_matic_chain)
total_matic_nils = np.where(raw_matic_chain['average_base_fee_per_gas'] == '<nil>', 1, 0).sum()
print(f'Total number of Nils in Raw MATIC on-chain Data: {total_matic_nils}')
other_nils_matic = np.where(raw_matic_chain[raw_matic_chain.columns[:-1]] == '<nil>', 1, 0).sum()
print(f'Number of Nils other than average_base_fee_per_gas {other_nils_matic}')
num_missing_hours_chain_matic_raw = len(check_missing_hours(raw_matic_chain))
print(f'Number of missing hours for raw MATIC on-chain Data {num_missing_hours_chain_matic_raw}')

Unnamed: 0_level_0,average_number,average_gas_limit,average_gas_used,average_difficulty,average_total_difficulty,average_size,average_base_fee_per_gas
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-05-30 17:00:00+00:00,437.5,15477045.0,6451.0,7.000000,3.064000e+03,636.300915,<nil>
2020-05-30 18:00:00+00:00,1747.5,20000000.0,2775.0,7.000000,1.223400e+04,614.375143,<nil>
2020-05-30 19:00:00+00:00,3492.5,20000000.0,63.0,7.000000,2.444900e+04,613.760894,<nil>
2020-05-30 20:00:00+00:00,5237.5,20000000.0,0.0,7.000000,3.666400e+04,613.345361,<nil>
2020-05-30 21:00:00+00:00,6983.5,20000000.0,0.0,7.000000,4.888600e+04,613.345361,<nil>
...,...,...,...,...,...,...,...
2025-02-23 19:00:00+00:00,68284509.5,29982424.0,12926826.0,22.000000,1.303319e+09,55411.429752,435457.3028335301
2025-02-23 20:00:00+00:00,68286203.5,29982424.0,10544614.0,22.000000,1.303357e+09,49007.274498,29.060212514757968
2025-02-23 21:00:00+00:00,68287865.5,29996395.0,10100943.0,21.959509,1.303393e+09,46427.977301,26.47791411042945
2025-02-23 22:00:00+00:00,68289491.5,30027939.0,9899159.0,21.954377,1.303429e+09,44820.897657,25.76202219482121


Total number of Nils in Raw MATIC on-chain data: 14338
Number of Nils other than average_base_fee_per_gas 0
Number of missing hours for raw MATIC on-chain data 4


In [24]:
# Split MATIC into matic1 and matic2 then process them
matic1_chain = raw_matic_chain.iloc[:14338][raw_matic_chain.columns[1:-1]]
matic2_chain = raw_matic_chain.iloc[14338:, 1:]

matic1_chain_processed = preprocess_data(data=matic1_chain, on_chain_data=True, set_index=False)
matic2_chain_processed = preprocess_data(data=matic2_chain, on_chain_data=True, set_index=False) #matic2 includes the 'average_base_fee_per_gas' field

num_missing_hours_chain_matic1_processed = len(check_missing_hours(matic1_chain_processed))
num_missing_hours_chain_matic2_processed = len(check_missing_hours(matic2_chain_processed))
print(f'Number of missing hours for processed MATIC1 on-chain Data {num_missing_hours_chain_matic1_processed}')
print(f'Number of missing hours for processed MATIC2 on-chain Data {num_missing_hours_chain_matic2_processed}')

Number of missing hours for processed MATIC1 on-chain data 0
Number of missing hours for processed MATIC2 on-chain data 0


In [25]:
# matic price Data

raw_matic_price = pd.read_csv('matic_hourly.csv').set_index('time_period_end')
raw_matic_price.index = pd.to_datetime(raw_matic_price.index)
init_num_nans_matic_price = raw_matic_price.isna().sum().sum()
print(f'Number of NaNs in the raw MATIC price Data: {init_num_nans_matic_price}')
init_num_missing_hours_matic_price = len(check_missing_hours(raw_matic_price))
print(f'Number of missing hours in the raw MATIC price Data: {init_num_missing_hours_matic_price}')
matic_price_processed = preprocess_data(data=raw_matic_price, price_data=True, set_index=False)
processed_num_missing_hours_matic_price = len(check_missing_hours(matic_price_processed))
print(f'Number of missing hours in the processed MATIC price Data: {processed_num_missing_hours_matic_price}')
matic_price_processed.head()

Number of NaNs in the raw MATIC price data: 0
Number of missing hours in the raw MATIC price data: 440
Number of missing hours in the processed MATIC price data: 0


Unnamed: 0,time_open,time_close,price_open,price_high,price_low,price_close,volume_traded,trades_count
2019-04-26 16:00:00+00:00,2019-04-26 15:00:00.149000+00:00,2019-04-26 15:59:59.951000+00:00,0.0099,0.01052,0.00526,0.0063,1429347000.0,17898.0
2019-04-26 17:00:00+00:00,2019-04-26 16:00:00.065000+00:00,2019-04-26 16:59:58.870000+00:00,0.0063,0.0063,0.0056,0.00579,219023500.0,3049.0
2019-04-26 18:00:00+00:00,2019-04-26 17:00:04.794000+00:00,2019-04-26 17:59:58.318000+00:00,0.0058,0.0058,0.00513,0.00531,173916300.0,1961.0
2019-04-26 19:00:00+00:00,2019-04-26 18:00:01.917000+00:00,2019-04-26 18:59:45.711000+00:00,0.00531,0.00553,0.00497,0.005,119505100.0,1588.0
2019-04-26 20:00:00+00:00,2019-04-26 19:00:00.067000+00:00,2019-04-26 19:59:55.958000+00:00,0.005,0.00514,0.00481,0.00495,82822730.0,1391.0


AVALANCHE CLEANING

In [26]:
# AVAX on-chain Data
raw_avax_chain = pd.read_parquet('query_result_AVALANCHE.parquet').set_index('hour').shift(1).iloc[1:]
raw_avax_chain.index = pd.to_datetime(raw_avax_chain.index)
raw_avax_chain = raw_avax_chain[raw_avax_chain.columns[1:]]
display(raw_avax_chain.head())
total_avax_nils = np.where(raw_avax_chain == '<nil>', 1, 0).sum()
print(f'Total number of Nils in raw AVAX on-chain Data: {total_avax_nils}')
num_missing_hours_chain_avax_raw = len(check_missing_hours(raw_avax_chain))
print(f'Number of missing hours for raw AVAX on-chain Data {num_missing_hours_chain_avax_raw}')

Unnamed: 0_level_0,average_gas_limit,average_gas_used,average_difficulty,average_total_difficulty,average_size,average_base_fee_per_gas
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-09-23 12:00:00+00:00,99902345.0,0.0,1.0,1.0,1059.0,<nil>
2020-09-23 13:00:00+00:00,99756054.0,21000.0,1.0,3.0,704.5,<nil>
2020-09-24 04:00:00+00:00,99561316.0,10500.0,1.0,5.0,801.5,<nil>
2020-09-24 14:00:00+00:00,99415500.0,21000.0,1.0,6.0,705.0,<nil>
2020-09-25 14:00:00+00:00,99318416.0,0.0,1.0,7.0,898.0,<nil>


Total number of Nils in raw AVAX on-chain data: 5593
Number of missing hours for raw AVAX on-chain data 2452


In [27]:
avax1 = raw_avax_chain.iloc[:5593][raw_avax_chain.columns[:-1]]
avax2 = raw_avax_chain.iloc[5593:]
avax1.head()

Unnamed: 0_level_0,average_gas_limit,average_gas_used,average_difficulty,average_total_difficulty,average_size
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-09-23 12:00:00+00:00,99902345.0,0.0,1.0,1.0,1059.0
2020-09-23 13:00:00+00:00,99756054.0,21000.0,1.0,3.0,704.5
2020-09-24 04:00:00+00:00,99561316.0,10500.0,1.0,5.0,801.5
2020-09-24 14:00:00+00:00,99415500.0,21000.0,1.0,6.0,705.0
2020-09-25 14:00:00+00:00,99318416.0,0.0,1.0,7.0,898.0


In [28]:
# Most of the datapoints stop skipping hours by 2021. But I will forward fill since 2020 and make that discretionary decision later.

avax1_chain_processed = preprocess_data(data=avax1, on_chain_data=True, set_index=False)
avax2_chain_processed = preprocess_data(data=avax2, on_chain_data=True, set_index=False)
total_avax1_nils = np.where(avax1_chain_processed == '<nil>', 1, 0).sum()
total_avax2_nils = np.where(avax2_chain_processed == '<nil>', 1, 0).sum()
print(f'Total number of Nils in processed AVAX1 on-chain Data: {total_avax1_nils}')
print(f'Total number of Nils in processed AVAX2 on-chain Data: {total_avax2_nils}')
num_missing_hours_chain_avax1_pro = len(check_missing_hours(avax1_chain_processed))
print(f'Number of missing hours for processed AVAX1 on-chain Data {num_missing_hours_chain_avax1_pro}')
num_missing_hours_chain_avax2_pro = len(check_missing_hours(avax2_chain_processed))
print(f'Number of missing hours for processed AVAX2 on-chain Data {num_missing_hours_chain_avax2_pro}')

Total number of Nils in processed AVAX1 on-chain data: 0
Total number of Nils in processed AVAX2 on-chain data: 0
Number of missing hours for processed AVAX1 on-chain data 0
Number of missing hours for processed AVAX2 on-chain data 0


In [29]:
# AVAX Price Data

raw_avax_price = pd.read_csv('avax_hourly.csv').set_index('time_period_end')
raw_avax_price.index = pd.to_datetime(raw_avax_price.index)
init_num_nans_avax_price = raw_avax_price.isna().sum().sum()
print(f'Number of NaNs in the raw AVAX price Data: {init_num_nans_avax_price}')
init_num_missing_hours_avax_price = len(check_missing_hours(raw_avax_price))
print(f'Number of missing hours in the raw AVAX price Data: {init_num_missing_hours_avax_price}')
avax_price_processed = preprocess_data(data=raw_avax_price, price_data=True, set_index=False)
processed_num_missing_hours_avax_price = len(check_missing_hours(avax_price_processed))
print(f'Number of missing hours in the processed AVAX price Data: {processed_num_missing_hours_avax_price}')
avax_price_processed.head()

Number of NaNs in the raw AVAX price data: 0
Number of missing hours in the raw AVAX price data: 327
Number of missing hours in the processed AVAX price data: 0


Unnamed: 0,time_open,time_close,price_open,price_high,price_low,price_close,volume_traded,trades_count
2020-09-22 07:00:00+00:00,2020-09-22 06:30:00+00:00,2020-09-22 06:59:59.772000+00:00,0.85,6.0,0.85,4.8811,3198372.67,29704.0
2020-09-22 08:00:00+00:00,2020-09-22 07:00:00.379000+00:00,2020-09-22 07:59:59.371000+00:00,4.88,5.35,4.245,4.9096,3171013.6,35863.0
2020-09-22 09:00:00+00:00,2020-09-22 08:00:00.608000+00:00,2020-09-22 08:59:59.821000+00:00,4.9096,6.9289,4.82,6.8219,3883981.43,43746.0
2020-09-22 10:00:00+00:00,2020-09-22 09:00:00.514000+00:00,2020-09-22 09:59:59.787000+00:00,6.8299,7.0,5.55,6.2108,3681832.89,37870.0
2020-09-22 11:00:00+00:00,2020-09-22 10:00:00.836000+00:00,2020-09-22 10:59:59.702000+00:00,6.2109,6.3,4.975,5.4901,2729116.18,33779.0


TRON CLEANING

In [30]:
# TRON on-chain Data
raw_tron_chain = pd.read_parquet('TRON_Hourly_On_Chain_Data_sorted.parquet').set_index('hour').shift(1).iloc[1:]
raw_tron_chain.index = pd.to_datetime(raw_tron_chain.index)
raw_tron_chain = raw_tron_chain[raw_tron_chain.columns[1:]]
display(raw_tron_chain)
total_tron_nils = np.where(raw_tron_chain == '<nil>', 1, 0).sum()
print(f'Total number of Nils in raw TRON on-chain Data: {total_tron_nils}')
num_missing_hours_chain_tron_raw = len(check_missing_hours(raw_tron_chain))
print(f'Number of missing hours for raw TRON on-chain Data {num_missing_hours_chain_tron_raw}')

Unnamed: 0_level_0,average_gas_limit,average_gas_used,average_difficulty,average_total_difficulty,average_size,average_base_fee_per_gas,unique_number_of_miners,average_nonce
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-06-25 02:00:00+00:00,0.000000e+00,0.0,0.0,0.0,172.191083,0.0,25.0,0.0
2018-06-25 03:00:00+00:00,0.000000e+00,0.0,0.0,0.0,173.351171,0.0,27.0,0.0
2018-06-25 04:00:00+00:00,0.000000e+00,0.0,0.0,0.0,359.212500,0.0,27.0,0.0
2018-06-25 05:00:00+00:00,0.000000e+00,0.0,0.0,0.0,189.769167,0.0,27.0,0.0
2018-06-25 06:00:00+00:00,0.000000e+00,0.0,0.0,0.0,194.056667,0.0,27.0,0.0
...,...,...,...,...,...,...,...,...
2025-02-23 19:00:00+00:00,5.790075e+09,4575697.0,0.0,0.0,52250.703673,0.0,27.0,0.0
2025-02-23 20:00:00+00:00,5.272376e+09,4022397.0,0.0,0.0,42961.015833,0.0,27.0,0.0
2025-02-23 21:00:00+00:00,5.124580e+09,3383249.0,0.0,0.0,37625.993333,0.0,27.0,0.0
2025-02-23 22:00:00+00:00,3.375096e+09,2917112.0,0.0,0.0,31082.340833,0.0,27.0,0.0


Total number of Nils in raw TRON on-chain data: 0
Number of missing hours for raw TRON on-chain data 4


In [31]:
raw_tron_chain = raw_tron_chain[['average_gas_limit', 'average_gas_used', 'average_size', 'unique_number_of_miners']]
tron_chain_processed = preprocess_data(data=raw_tron_chain, on_chain_data=True, set_index=False)
total_tron_nils = np.where(tron_chain_processed == '<nil>', 1, 0).sum()
print(f'Total number of Nils in processed TRON on-chain Data: {total_tron_nils}')
num_missing_hours_chain_tron_pro = len(check_missing_hours(tron_chain_processed))
print(f'Number of missing hours for processed TRON on-chain Data {num_missing_hours_chain_tron_pro}')
tron_chain_processed.head()

Total number of Nils in processed TRON on-chain data: 0
Number of missing hours for processed TRON on-chain data 0


Unnamed: 0,average_gas_limit,average_gas_used,average_size,unique_number_of_miners
2018-06-25 02:00:00+00:00,0.0,0.0,172.191083,25.0
2018-06-25 03:00:00+00:00,0.0,0.0,173.351171,27.0
2018-06-25 04:00:00+00:00,0.0,0.0,359.2125,27.0
2018-06-25 05:00:00+00:00,0.0,0.0,189.769167,27.0
2018-06-25 06:00:00+00:00,0.0,0.0,194.056667,27.0


In [32]:
print(np.where(tron_chain_processed['average_gas_limit'] == 0.0, 1, 0).sum())
print(np.where(tron_chain_processed['average_gas_used'] == 0.0, 1, 0).sum())

2623
2623


In [33]:
# Split the tron Data
tron1_chain_processed = tron_chain_processed.iloc[:2623][['average_size', 'unique_number_of_miners']]
tron2_chain_processed = tron_chain_processed.iloc[2623:]

In [34]:
# TRON Price Data

raw_tron_price = pd.read_csv('tron_hourly.csv').set_index('time_period_end')
raw_tron_price.index = pd.to_datetime(raw_tron_price.index)
init_num_nans_tron_price = raw_tron_price.isna().sum().sum()
print(f'Number of NaNs in the raw TRON price Data: {init_num_nans_tron_price}')
init_num_missing_hours_tron_price = len(check_missing_hours(raw_tron_price))
print(f'Number of missing hours in the raw TRON price Data: {init_num_missing_hours_tron_price}')
tron_price_processed = preprocess_data(data=raw_tron_price, price_data=True, set_index=False)
processed_num_missing_hours_tron_price = len(check_missing_hours(tron_price_processed))
print(f'Number of missing hours in the processed TRON price Data: {processed_num_missing_hours_tron_price}')
tron_price_processed.head()

Number of NaNs in the raw TRON price data: 0
Number of missing hours in the raw TRON price data: 384
Number of missing hours in the processed TRON price data: 0


Unnamed: 0,time_period_start,time_open,time_close,price_open,price_high,price_low,price_close,volume_traded,trades_count
2018-06-11 12:00:00+00:00,2018-06-11 11:00:00+00:00,2018-06-11 11:30:05.215000+00:00,2018-06-11 11:59:32.597000+00:00,0.05,0.05,0.04522,0.04787,2947872.2,308.0
2018-06-11 13:00:00+00:00,2018-06-11 12:00:00+00:00,2018-06-11 12:00:11.017000+00:00,2018-06-11 12:59:59.904000+00:00,0.04787,0.0483,0.0465,0.04737,3581174.3,378.0
2018-06-11 14:00:00+00:00,2018-06-11 13:00:00+00:00,2018-06-11 13:00:24.532000+00:00,2018-06-11 13:59:41.345000+00:00,0.04723,0.04726,0.04649,0.04662,2663472.8,289.0
2018-06-11 15:00:00+00:00,2018-06-11 14:00:00+00:00,2018-06-11 14:00:40.169000+00:00,2018-06-11 14:59:54.151000+00:00,0.04662,0.04662,0.0452,0.04591,6530794.1,528.0
2018-06-11 16:00:00+00:00,2018-06-11 15:00:00+00:00,2018-06-11 15:00:01.313000+00:00,2018-06-11 15:59:45.642000+00:00,0.04591,0.04618,0.04258,0.04493,12761075.5,942.0


ARBITRUM CLEANING

In [35]:
# ARBITRUM on-chain Data
raw_arb_chain = pd.read_parquet('ARB_Hourly_On_Chain_Data_sorted.parquet').set_index('hour').shift(1).iloc[1:]
raw_arb_chain.index = pd.to_datetime(raw_arb_chain.index)
raw_arb_chain = raw_arb_chain[raw_arb_chain.columns[1:]]
display(raw_arb_chain)
total_arb_nils = np.where(raw_arb_chain == '<nil>', 1, 0).sum()
print(f'Total number of Nils in raw ARB on-chain Data: {total_arb_nils}')
num_missing_hours_chain_arb_raw = len(check_missing_hours(raw_arb_chain))
print(f'Number of missing hours for raw ARB on-chain Data {num_missing_hours_chain_arb_raw}')

Unnamed: 0_level_0,average_gas_limit,average_gas_used,average_difficulty,average_total_difficulty,average_size,average_base_fee_per_gas,unique_number_of_miners,average_nonce
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-05-29 01:00:00+00:00,2.879850e+08,121.0,0.0,0.0,689.000000,<nil>,1.0,0.000000e+00
2021-05-29 02:00:00+00:00,2.879804e+08,5473.0,0.0,0.0,833.227273,<nil>,1.0,0.000000e+00
2021-05-29 03:00:00+00:00,2.879965e+08,69.0,0.0,0.0,790.000000,<nil>,1.0,0.000000e+00
2021-05-29 05:00:00+00:00,2.879965e+08,0.0,0.0,0.0,889.000000,<nil>,1.0,0.000000e+00
2021-05-29 06:00:00+00:00,2.879965e+08,0.0,0.0,0.0,890.000000,<nil>,1.0,0.000000e+00
...,...,...,...,...,...,...,...,...
2025-02-23 19:00:00+00:00,1.125900e+15,786692.0,1.0,286940865.0,2323.591546,1.00004919337795e+07,13.0,1.880706e+06
2025-02-23 20:00:00+00:00,1.125900e+15,612232.0,1.0,286954955.0,2048.388917,1e+07,8.0,1.880735e+06
2025-02-23 21:00:00+00:00,1.125900e+15,660480.0,1.0,286968966.0,2168.462492,1.0082271852960033e+07,7.0,1.880757e+06
2025-02-23 22:00:00+00:00,1.125900e+15,534926.0,1.0,286982844.0,1953.726491,1.0001191646012539e+07,8.0,1.880779e+06


Total number of Nils in raw ARB on-chain data: 10945
Number of missing hours for raw ARB on-chain data 85


In [36]:
print(np.where(raw_arb_chain['average_total_difficulty'] == 0.0, 1, 0).sum())
print(np.where(raw_arb_chain['average_nonce'] == 0.0, 1, 0).sum())
print(np.where(raw_arb_chain['average_base_fee_per_gas'] == '<nil>', 1, 0).sum())

10946
10945
10945


In [37]:
# Split the ARB On-Chain Data
raw_arb1_chain = raw_arb_chain[['average_gas_limit', 'average_gas_used', 'average_size', 'unique_number_of_miners']].iloc[:10945]
raw_arb2_chain = raw_arb_chain.iloc[10945:]
display(raw_arb1_chain.head())
raw_arb2_chain.head()

Unnamed: 0_level_0,average_gas_limit,average_gas_used,average_size,unique_number_of_miners
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-05-29 01:00:00+00:00,287985032.0,121.0,689.0,1.0
2021-05-29 02:00:00+00:00,287980436.0,5473.0,833.227273,1.0
2021-05-29 03:00:00+00:00,287996529.0,69.0,790.0,1.0
2021-05-29 05:00:00+00:00,287996529.0,0.0,889.0,1.0
2021-05-29 06:00:00+00:00,287996529.0,0.0,890.0,1.0


Unnamed: 0_level_0,average_gas_limit,average_gas_used,average_difficulty,average_total_difficulty,average_size,average_base_fee_per_gas,unique_number_of_miners,average_nonce
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-08-31 15:00:00+00:00,247756500000.0,110631.0,0.00022,0.0,1026.215259,100000000.0,1.0,0.00022
2022-08-31 16:00:00+00:00,1125900000000000.0,909032.0,1.0,3.0,806.0,100000000.0,1.0,1.0
2022-08-31 17:00:00+00:00,1125900000000000.0,2786311.0,1.0,684.0,1195.455147,100000000.0,6.0,3.041176
2022-08-31 18:00:00+00:00,1125900000000000.0,1691414.0,1.0,3423.0,1203.601263,100000000.0,83.0,55.294803
2022-08-31 19:00:00+00:00,1125900000000000.0,724278.0,1.0,8732.0,1188.975231,100000000.0,112.0,164.704462


In [38]:
# Process the ARB Data

arb1_chain_processed = preprocess_data(data=raw_arb1_chain, on_chain_data=True, set_index=False)
arb2_chain_processed = preprocess_data(data=raw_arb2_chain, on_chain_data=True, set_index=False)
total_arb1_nils = np.where(arb1_chain_processed == '<nil>', 1, 0).sum()
print(f'Total number of Nils in processed ARB1 on-chain Data: {total_arb1_nils}')
total_arb2_nils = np.where(arb2_chain_processed == '<nil>', 1, 0).sum()
print(f'Total number of Nils in processed ARB2 on-chain Data: {total_arb1_nils}')
num_missing_hours_chain_arb1_pro = len(check_missing_hours(arb1_chain_processed))
print(f'Number of missing hours for processed ARB1 on-chain Data {num_missing_hours_chain_arb1_pro}')
num_missing_hours_chain_arb2_pro = len(check_missing_hours(arb2_chain_processed))
print(f'Number of missing hours for processed ARB2 on-chain Data {num_missing_hours_chain_arb2_pro}')
display(arb1_chain_processed.head())
arb2_chain_processed.head()

Total number of Nils in processed ARB1 on-chain data: 0
Total number of Nils in processed ARB2 on-chain data: 0
Number of missing hours for processed ARB1 on-chain data 0
Number of missing hours for processed ARB2 on-chain data 0


Unnamed: 0,average_gas_limit,average_gas_used,average_size,unique_number_of_miners
2021-05-29 01:00:00+00:00,287985032.0,121.0,689.0,1.0
2021-05-29 02:00:00+00:00,287980436.0,5473.0,833.227273,1.0
2021-05-29 03:00:00+00:00,287996529.0,69.0,790.0,1.0
2021-05-29 04:00:00+00:00,287996529.0,69.0,790.0,1.0
2021-05-29 05:00:00+00:00,287996529.0,0.0,889.0,1.0


Unnamed: 0,average_gas_limit,average_gas_used,average_difficulty,average_total_difficulty,average_size,average_base_fee_per_gas,unique_number_of_miners,average_nonce
2022-08-31 15:00:00+00:00,247756500000.0,110631.0,0.00022,0.0,1026.215259,100000000.0,1.0,0.00022
2022-08-31 16:00:00+00:00,1125900000000000.0,909032.0,1.0,3.0,806.0,100000000.0,1.0,1.0
2022-08-31 17:00:00+00:00,1125900000000000.0,2786311.0,1.0,684.0,1195.455147,100000000.0,6.0,3.041176
2022-08-31 18:00:00+00:00,1125900000000000.0,1691414.0,1.0,3423.0,1203.601263,100000000.0,83.0,55.294803
2022-08-31 19:00:00+00:00,1125900000000000.0,724278.0,1.0,8732.0,1188.975231,100000000.0,112.0,164.704462


In [39]:
# ARBITRUM Price Data

raw_arb_price = pd.read_csv('arbitrum_hourly.csv').set_index('time_period_end')
raw_arb_price.index = pd.to_datetime(raw_arb_price.index)
init_num_nans_arb_price = raw_arb_price.isna().sum().sum()
print(f'Number of NaNs in the raw ARB price Data: {init_num_nans_arb_price}')
init_num_missing_hours_arb_price = len(check_missing_hours(raw_arb_price))
print(f'Number of missing hours in the raw ARB price Data: {init_num_missing_hours_arb_price}')
arb_price_processed = preprocess_data(data=raw_arb_price, price_data=True, set_index=False)
processed_num_missing_hours_arb_price = len(check_missing_hours(arb_price_processed))
print(f'Number of missing hours in the processed ARB price Data: {processed_num_missing_hours_arb_price}')
arb_price_processed.head()

Number of NaNs in the raw ARB price data: 0
Number of missing hours in the raw ARB price data: 27
Number of missing hours in the processed ARB price data: 0


Unnamed: 0,time_period_start,time_open,time_close,price_open,price_high,price_low,price_close,volume_traded,trades_count
2023-03-23 16:00:00+00:00,2023-03-23 15:00:00+00:00,2023-03-23 15:01:52.981000+00:00,2023-03-23 15:59:59.773000+00:00,1.3977,1.4614,1.24,1.325,134431105.3,427434.0
2023-03-23 17:00:00+00:00,2023-03-23 16:00:00+00:00,2023-03-23 16:00:00.044000+00:00,2023-03-23 16:59:59.990000+00:00,1.3251,1.356,1.2133,1.304,80253201.3,273376.0
2023-03-23 18:00:00+00:00,2023-03-23 17:00:00+00:00,2023-03-23 17:00:00.143000+00:00,2023-03-23 17:59:59.913000+00:00,1.304,1.3313,1.2708,1.289,48500134.1,160982.0
2023-03-23 19:00:00+00:00,2023-03-23 18:00:00+00:00,2023-03-23 18:00:00.142000+00:00,2023-03-23 18:59:59.863000+00:00,1.289,1.346,1.276,1.3273,37495855.6,121988.0
2023-03-23 20:00:00+00:00,2023-03-23 19:00:00+00:00,2023-03-23 19:00:00.094000+00:00,2023-03-23 19:59:59.988000+00:00,1.3274,1.43,1.322,1.43,49720710.2,164302.0


BNB CLEANING

In [40]:
# BNB on-chain Data
raw_bnb_chain = pd.read_parquet('query_result_BNB.parquet').set_index('hour').shift(1).iloc[1:]
raw_bnb_chain.index = pd.to_datetime(raw_bnb_chain.index)
raw_bnb_chain = raw_bnb_chain[raw_bnb_chain.columns[1:-1]]
display(raw_bnb_chain)
total_bnb_nils = np.where(raw_bnb_chain == '<nil>', 1, 0).sum()
print(f'Total number of Nils in raw BNB on-chain Data: {total_bnb_nils}')
num_missing_hours_chain_bnb_raw = len(check_missing_hours(raw_bnb_chain))
print(f'Number of missing hours for raw BNB on-chain Data {num_missing_hours_chain_bnb_raw}')

Unnamed: 0_level_0,average_gas_limit,average_gas_used,average_difficulty,average_total_difficulty,average_size
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-08-29 04:00:00+00:00,30520982.0,6751.0,1.933835,625.0,615.201504
2020-08-29 05:00:00+00:00,30000000.0,1034.0,2.000000,2488.0,626.230833
2020-08-29 06:00:00+00:00,30000000.0,0.0,2.000000,4888.0,611.105000
2020-08-29 07:00:00+00:00,30000000.0,755.0,1.999167,7288.0,621.854167
2020-08-29 08:00:00+00:00,30000000.0,895.0,1.914110,9548.0,622.785276
...,...,...,...,...,...
2025-02-23 19:00:00+00:00,139389081.0,11669641.0,2.000000,93257273.0,42722.832500
2025-02-23 20:00:00+00:00,139493646.0,9669754.0,2.000000,93259673.0,35945.705000
2025-02-23 21:00:00+00:00,139409789.0,9477706.0,2.000000,93262073.0,34168.100000
2025-02-23 22:00:00+00:00,139294693.0,9223073.0,1.989967,93264467.0,32188.251672


Total number of Nils in raw BNB on-chain data: 0
Number of missing hours for raw BNB on-chain data 6


In [41]:
bnb_chain_processed = preprocess_data(data=raw_bnb_chain, on_chain_data=True, set_index=False)
num_missing_hours_chain_bnb_pro = len(check_missing_hours(bnb_chain_processed))
print(f'Number of missing hours for processed BNB on-chain Data {num_missing_hours_chain_bnb_pro}')
bnb_chain_processed.head()

Number of missing hours for processed BNB on-chain data 0


Unnamed: 0,average_gas_limit,average_gas_used,average_difficulty,average_total_difficulty,average_size
2020-08-29 04:00:00+00:00,30520982.0,6751.0,1.933835,625.0,615.201504
2020-08-29 05:00:00+00:00,30000000.0,1034.0,2.0,2488.0,626.230833
2020-08-29 06:00:00+00:00,30000000.0,0.0,2.0,4888.0,611.105
2020-08-29 07:00:00+00:00,30000000.0,755.0,1.999167,7288.0,621.854167
2020-08-29 08:00:00+00:00,30000000.0,895.0,1.91411,9548.0,622.785276


In [42]:
# BNB Price Data

raw_bnb_price = pd.read_csv('bnb_hourly.csv').set_index('time_period_end')
raw_bnb_price.index = pd.to_datetime(raw_bnb_price.index)
init_num_nans_bnb_price = raw_bnb_price.isna().sum().sum()
print(f'Number of NaNs in the raw BNB price Data: {init_num_nans_bnb_price}')
init_num_missing_hours_bnb_price = len(check_missing_hours(raw_bnb_price))
print(f'Number of missing hours in the raw BNB price Data: {init_num_missing_hours_bnb_price}')
bnb_price_processed = preprocess_data(data=raw_bnb_price, price_data=True, set_index=False)
processed_num_missing_hours_bnb_price = len(check_missing_hours(bnb_price_processed))
print(f'Number of missing hours in the processed BNB price Data: {processed_num_missing_hours_bnb_price}')
bnb_price_processed.head()

Number of NaNs in the raw BNB price data: 0
Number of missing hours in the raw BNB price data: 496
Number of missing hours in the processed BNB price data: 0


Unnamed: 0,time_open,time_close,price_open,price_high,price_low,price_close,volume_traded,trades_count
2017-11-06 04:00:00+00:00,2017-11-06 03:54:23.590000+00:00,2017-11-06 03:59:48.833000+00:00,1.5,1.799,0.5,1.7,649.12,33.0
2017-11-06 05:00:00+00:00,2017-11-06 04:00:49.289000+00:00,2017-11-06 04:58:42.044000+00:00,1.3,1.65,1.3,1.6479,8147.72,139.0
2017-11-06 06:00:00+00:00,2017-11-06 05:03:22.478000+00:00,2017-11-06 05:56:54.934000+00:00,1.5457,1.5525,1.5455,1.5458,6628.2,27.0
2017-11-06 07:00:00+00:00,2017-11-06 06:02:48.422000+00:00,2017-11-06 06:59:49.692000+00:00,1.5458,1.681,1.5387,1.681,22767.9,133.0
2017-11-06 08:00:00+00:00,2017-11-06 07:00:17.040000+00:00,2017-11-06 07:57:32.779000+00:00,1.6809,1.6809,1.6,1.625,14938.73,58.0
