# Extract and save Ethereum raw transactions

In [1]:
import os
import json
import requests
import datetime as dt

from web3 import Web3
import pandas as pd

In [2]:
# Directory that the raw JSON extracts and the JSON log files are written to and read from
PATH = '../data/raw/'

### Alchemy as the remote node provider

In [3]:
# Read the Alchemy API key from the `KEY` environment variable and fail early with a
# clear message; otherwise the string concatenation below raises an opaque TypeError.
ALCHEMY_KEY = os.environ.get('KEY')
if not ALCHEMY_KEY:
    raise RuntimeError("Environment variable 'KEY' (Alchemy API key) is not set")

# Build the endpoint once: the Web3 provider and the raw JSON-RPC requests both use it
url = 'https://eth-mainnet.alchemyapi.io/v2/' + ALCHEMY_KEY
w3 = Web3(Web3.HTTPProvider(url))

In [5]:
# latest_block_int = w3.eth.blockNumber
# up_to_block_int = latest_block_int - 5

In [5]:
def extract_first_hist_txns(no_of_blocks, up_to_block_int):
    """Download a run of consecutive blocks, newest first, ending just below a given block.

    Blocks are requested one at a time through the JSON-RPC method
    ``eth_getBlockByNumber`` (with full transaction objects) from the
    module-level ``url``. The walk starts at ``up_to_block_int - 1`` and moves
    down to ``up_to_block_int - no_of_blocks``; block ``up_to_block_int``
    itself is not requested. One block is always requested, even when
    ``no_of_blocks`` is smaller than 1.

    Args:
        no_of_blocks: Number of blocks to download.
        up_to_block_int: Block number (int) that bounds the range from above.

    Returns:
        A ``(block_info, block_details_list)`` tuple. ``block_info`` is a dict
        with the parallel lists ``block_number``, ``block_transactions``
        (transaction count per block) and ``block_timestamp``
        (``'%Y-%m-%d %H:%M:%S'`` strings produced by
        ``datetime.fromtimestamp``). ``block_details_list`` holds the
        transaction objects of all downloaded blocks, in download order.
    """
    numbers = []
    txn_counts = []
    timestamps = []
    txn_details = []

    oldest_wanted = up_to_block_int - no_of_blocks
    current = up_to_block_int - 1
    while True:
        payload = {
            "jsonrpc": "2.0",
            "id": 0,
            "method": "eth_getBlockByNumber",
            "params": [hex(current), True],
        }
        result = requests.post(url, json=payload).json()['result']
        txns = result['transactions']

        # Block number and timestamp come back as hexadecimal strings
        numbers.append(int(result['number'], 16))
        txn_counts.append(len(txns))
        mined_at = dt.datetime.fromtimestamp(int(result['timestamp'], 16))
        timestamps.append(mined_at.strftime('%Y-%m-%d %H:%M:%S'))
        txn_details.extend(txns)

        # Stop once the oldest requested block has been processed
        if current <= oldest_wanted:
            break
        current -= 1

    block_info = {
        'block_number': numbers,
        'block_transactions': txn_counts,
        'block_timestamp': timestamps,
    }
    return block_info, txn_details

In [6]:
def extract_backward_hist_txns(no_of_blocks):
    """Extend the extract backwards in time by ``no_of_blocks`` blocks.

    The starting point is the last entry of the module-level
    ``block_info['block_number']`` list, which this function treats as the
    oldest block extracted so far. Blocks from one below that number down to
    ``no_of_blocks`` below it are requested with ``eth_getBlockByNumber``
    (full transaction objects). One block is always requested, even when
    ``no_of_blocks`` is smaller than 1.

    Side effects:
        * Writes the downloaded transactions to
          ``{PATH}{first}_{last}_eth_transactions.json``, where ``first`` and
          ``last`` are the first and last block numbers downloaded.
        * Appends the new block numbers, transaction counts and timestamps to
          the module-level ``block_info`` in place.
        * Rewrites ``{PATH}block_info_log.json`` with the updated ``block_info``.

    Args:
        no_of_blocks: Number of blocks to download.

    Returns:
        None.
    """
    # Oldest block recorded in the log; the download starts right below it
    up_to_block_int = block_info['block_number'][-1]

    numbers = []
    txn_counts = []
    timestamps = []
    txn_details = []

    oldest_wanted = up_to_block_int - no_of_blocks
    current = up_to_block_int - 1
    while True:
        payload = {
            "jsonrpc": "2.0",
            "id": 0,
            "method": "eth_getBlockByNumber",
            "params": [hex(current), True],
        }
        result = requests.post(url, json=payload).json()['result']
        txns = result['transactions']

        # Block number and timestamp come back as hexadecimal strings
        numbers.append(int(result['number'], 16))
        txn_counts.append(len(txns))
        mined_at = dt.datetime.fromtimestamp(int(result['timestamp'], 16))
        timestamps.append(mined_at.strftime('%Y-%m-%d %H:%M:%S'))
        txn_details.extend(txns)

        # Stop once the oldest requested block has been processed
        if current <= oldest_wanted:
            break
        current -= 1

    # Persist the raw transactions of this batch, named after the block range it covers
    serialized_txns = json.dumps(txn_details)
    batch_file = f'{PATH}{numbers[0]}_{numbers[-1]}_eth_transactions.json'
    with open(batch_file, 'w') as f:
        f.write(serialized_txns)

    # Append the batch to the in-memory log (kept newest-first, so older blocks go last)
    for key, fetched in (
        ('block_number', numbers),
        ('block_transactions', txn_counts),
        ('block_timestamp', timestamps),
    ):
        block_info[key].extend(fetched)

    # Persist the updated log
    serialized_log = json.dumps(block_info)
    with open(f'{PATH}block_info_log.json', 'w') as f:
        f.write(serialized_log)

In [7]:
def extract_forward_hist_txns(no_of_blocks):
    """Extend the extract forwards in time by ``no_of_blocks`` blocks.

    The starting point is the first entry of the module-level
    ``block_info['block_number']`` list, which this function treats as the
    newest block extracted so far. Blocks from one above that number up to
    ``no_of_blocks`` above it are requested with ``eth_getBlockByNumber``
    (full transaction objects). One block is always requested, even when
    ``no_of_blocks`` is smaller than 1.

    Side effects:
        * Writes the downloaded transactions to
          ``{PATH}{newest}_{oldest}_eth_transactions.json``.
        * Rewrites ``{PATH}block_info_log.json`` with the new blocks placed in
          front of the existing entries, newest block first.

    The module-level ``block_info`` is NOT modified; reload it from
    ``block_info_log.json`` before calling this function again.

    Args:
        no_of_blocks: Number of blocks to download.

    Returns:
        None.
    """
    # Newest block recorded in the log; the download starts right above it
    from_block_int = block_info['block_number'][0]

    numbers = []
    txn_counts = []
    timestamps = []
    txn_details = []

    newest_wanted = from_block_int + no_of_blocks
    current = from_block_int + 1
    while True:
        payload = {
            "jsonrpc": "2.0",
            "id": 0,
            "method": "eth_getBlockByNumber",
            "params": [hex(current), True],
        }
        result = requests.post(url, json=payload).json()['result']
        txns = result['transactions']

        # Block number and timestamp come back as hexadecimal strings
        numbers.append(int(result['number'], 16))
        txn_counts.append(len(txns))
        mined_at = dt.datetime.fromtimestamp(int(result['timestamp'], 16))
        timestamps.append(mined_at.strftime('%Y-%m-%d %H:%M:%S'))
        txn_details.extend(txns)

        # Stop once the newest requested block has been processed
        if current >= newest_wanted:
            break
        current += 1

    # Persist the raw transactions of this batch; blocks were downloaded oldest-first,
    # so the newest block number is the last element
    start_block = numbers[-1]
    end_block = numbers[0]
    serialized_txns = json.dumps(txn_details)
    with open(f'{PATH}{start_block}_{end_block}_eth_transactions.json', 'w') as f:
        f.write(serialized_txns)

    # The log is kept newest-first. The three lists are parallel, so each one must be
    # REVERSED, not sorted: sorting the transaction counts by value would detach every
    # count from the block number and timestamp it belongs to.
    new_block_info = {
        'block_number': numbers[::-1] + block_info['block_number'],
        'block_transactions': txn_counts[::-1] + block_info['block_transactions'],
        'block_timestamp': timestamps[::-1] + block_info['block_timestamp'],
    }

    # Persist the combined log
    serialized_log = json.dumps(new_block_info)
    with open(f'{PATH}block_info_log.json', 'w') as f:
        f.write(serialized_log)

In [9]:
def extract_txn_counts(no_of_blocks):
    """Append per-block transaction counts for the next ``no_of_blocks`` blocks.

    The starting point is the last entry of the module-level
    ``block_txn['block_number']`` list. Blocks from one above that number up
    to ``no_of_blocks`` above it are requested with ``eth_getBlockByNumber``
    in hash-only mode (second parameter ``False``). The length of the
    returned ``transactions`` list is the block's transaction count, so a
    separate ``eth_getBlockTransactionCountByNumber`` request per block is
    not needed. One block is always requested, even when ``no_of_blocks`` is
    smaller than 1.

    Side effects:
        * Appends the new block numbers, transaction counts and timestamps
          (``'%Y-%m-%d %H:%M:%S'`` strings) to the module-level ``block_txn``
          in place.
        * Rewrites ``{PATH}block_txn_counts.json`` with the updated ``block_txn``.

    Args:
        no_of_blocks: Number of blocks to process.

    Returns:
        None.
    """
    # Last block already recorded; processing starts right above it
    from_block_int = block_txn['block_number'][-1]

    numbers = []
    txn_counts = []
    timestamps = []

    newest_wanted = from_block_int + no_of_blocks
    current = from_block_int + 1
    while True:
        payload = {
            "jsonrpc": "2.0",
            "id": 0,
            "method": "eth_getBlockByNumber",
            "params": [hex(current), False],
        }
        result = requests.post(url, json=payload).json()['result']

        # With hash-only mode `transactions` is a list of hashes: one entry per transaction
        txn_counts.append(len(result['transactions']))

        # Block number and timestamp come back as hexadecimal strings
        mined_at = dt.datetime.fromtimestamp(int(result['timestamp'], 16))
        timestamps.append(mined_at.strftime('%Y-%m-%d %H:%M:%S'))
        numbers.append(int(result['number'], 16))

        # Stop once the newest requested block has been processed
        if current >= newest_wanted:
            break
        current += 1

    # Append the batch to the in-memory counts
    block_txn['block_number'].extend(numbers)
    block_txn['block_transactions'].extend(txn_counts)
    block_txn['block_timestamp'].extend(timestamps)

    # Persist the updated counts
    serialized_counts = json.dumps(block_txn)
    with open(f'{PATH}block_txn_counts.json', 'w') as f:
        f.write(serialized_counts)

In [10]:
# Load the saved per-block transaction counts; the context manager closes the file handle
with open(f'{PATH}block_txn_counts.json') as f:
    block_txn = json.load(f)

In [11]:
# Tabulate the transaction-count log: one row per block
tmp_df = pd.DataFrame(block_txn)

In [12]:
# Convert the timestamp strings into datetime64[s] values
tmp_df['block_timestamp'] = tmp_df['block_timestamp'].astype('datetime64[s]')

In [13]:
# Earliest and latest block timestamp covered by the transaction-count log
tmp_df['block_timestamp'].min(), tmp_df['block_timestamp'].max()

(Timestamp('2021-06-30 23:59:04'), Timestamp('2021-07-01 19:29:43'))

In [8]:
# Show the last rows of the transaction-count log
tmp_df.tail()

Unnamed: 0,block_number,block_transactions,block_timestamp
5115,12743618,82,2021-07-01 19:29:00
5116,12743619,190,2021-07-01 19:29:06
5117,12743620,236,2021-07-01 19:29:22
5118,12743621,165,2021-07-01 19:29:25
5119,12743622,178,2021-07-01 19:29:43


In [None]:
%%time
n = 5
while n < 10:
    # extract_txn_counts() reads the module-level `block_txn`, so the saved counts are
    # reloaded into that name (not into `block_info`, which belongs to the block info log)
    with open(f'{PATH}block_txn_counts.json') as f:
        block_txn = json.load(f)
    extract_txn_counts(no_of_blocks=8000)
    print('Data extracted and saved, batch =', n)
    n += 1

Data extracted and saved, batch = 5
Data extracted and saved, batch = 6


### Read the block info log (`block_info_log.json`)

In [8]:
# Load the saved block info log; the context manager closes the file handle
with open(f'{PATH}block_info_log.json') as f:
    block_info = json.load(f)

In [9]:
# Tabulate the block info log (one row per block) and convert the timestamp strings
# into datetime64[s] values
tmp_df = pd.DataFrame(block_info)
tmp_df['block_timestamp'] = tmp_df['block_timestamp'].astype('datetime64[s]')

In [10]:
# Total number of transactions per calendar day, grouped on block_timestamp
tmp_df.resample('D', on='block_timestamp').agg({'block_transactions':'sum'})

Unnamed: 0_level_0,block_transactions
block_timestamp,Unnamed: 1_level_1
2022-06-29,698486
2022-06-30,1054433
2022-07-01,1178620
2022-07-02,1189382
2022-07-03,1157052
2022-07-04,1123019
2022-07-05,1157774
2022-07-06,1192050
2022-07-07,1160880
2022-07-08,129959


In [11]:
# Show the rows whose block_timestamp lies between "2022-07-01" and "2022-07-08"
tmp_df.query('block_timestamp.between("2022-07-01","2022-07-08")')

Unnamed: 0,block_number,block_transactions,block_timestamp
442,15098519,172,2022-07-07 23:59:58
443,15098518,171,2022-07-07 23:59:50
444,15098517,171,2022-07-07 23:59:29
445,15098516,171,2022-07-07 23:59:09
446,15098515,171,2022-07-07 23:59:02
...,...,...,...
45719,15053230,287,2022-07-01 00:00:44
45720,15053229,34,2022-07-01 00:00:39
45721,15053228,260,2022-07-01 00:00:19
45722,15053227,406,2022-07-01 00:00:03


### While loop to batch extract the BACKWARD historical raw transactions using the Alchemy API

In [None]:
# n = 0
# while n < 10:
#     extract_backward_hist_txns(no_of_blocks=1000)
#     print('Data extracted and saved, batch =', n)
#     n += 1

Data extracted and saved, batch = 0


### While loop to batch extract the FORWARD historical raw transactions using the Alchemy API

In [None]:
# n = 0
# while n < 10:
#     block_info = json.load(open(f'{PATH}block_info_log.json'))
#     extract_forward_hist_txns(no_of_blocks=1000)
#     print('Data extracted and saved, batch =', n)
#     n += 1

Data extracted and saved, batch = 0
Data extracted and saved, batch = 1
Data extracted and saved, batch = 2
Data extracted and saved, batch = 3
Data extracted and saved, batch = 4
Data extracted and saved, batch = 5
Data extracted and saved, batch = 6


### Read the raw extracts into a dataframe

In [3]:
# Collect the path of every file under PATH whose name ends with 'transactions.json'
file_dir = os.listdir(PATH)
file_path_list = []
for file_name in file_dir:
    if file_name.endswith('transactions.json'):
        file_path_list.append(os.path.join(PATH, file_name))

In [4]:
# Copy of the extract paths ordered by last-modification time, oldest file first
file_list_sorted = sorted(file_path_list, key=os.path.getmtime)

#### Blocks from 1 July 2022 onwards

In [5]:
# Sort the extract paths in place (plain string order)
file_path_list.sort()

In [6]:
# Position, in the sorted path list, of the extract file named for blocks 15053949 and 15052950
idx = file_path_list.index('../data/raw/15053949_15052950_eth_transactions.json')
idx

9

#### Manually read multiple json files in batches to save them as separate parquet files

Make sure the file list is sorted first, so that the slice indices used for each batch always refer to the same files.

In [7]:
# Sort the extract paths in place (plain string order) before slicing them into batches
file_path_list.sort()

In [8]:
# raw_json_list = [json.load(open(file)) for file in file_path_list[9:20]]
# raw_json_list = [json.load(open(file)) for file in file_path_list[20:31]]
# raw_json_list = [json.load(open(file)) for file in file_path_list[31:38]]
# raw_json_list = [json.load(open(file)) for file in file_path_list[38:]]

In [9]:
# Build one DataFrame per loaded extract and stack them into a single frame.
# NOTE(review): `raw_json_list` is only assigned in the commented-out lines of the
# previous cell, so one of those lines has to be enabled before this cell can run.
df_raw = pd.concat([pd.DataFrame(file) for file in raw_json_list])

In [10]:
# (rows, columns) of the combined raw transactions
df_raw.shape

(3088239, 19)

In [11]:
# Work on a copy so that df_raw keeps the values exactly as loaded
df = df_raw.copy()

### Convert all hexadecimal columns to decimal

In [12]:
# Columns that are converted from hexadecimal strings to integers
# (the original list named 'nonce' twice, which only repeated a no-op pass)
col_list = ['blockNumber','chainId','gas','gasPrice','nonce','transactionIndex','type','v','value','maxFeePerGas','maxPriorityFeePerGas']

for col in col_list:
    # Values that are not strings are left untouched
    df[col] = df[col].apply(lambda x: int(x, base=16) if isinstance(x, str) else x)

### Convert `type` into category data type

In [13]:
# Store the transaction type as a pandas categorical column
df['type'] = df['type'].astype('category')

### Convert `value` into float64 data type
This allows the dataframe to be saved as parquet; otherwise we get the following error:  
"Python int too large to convert to C long"

In [14]:
# Cast to float64 so that the column can be written to parquet.
# NOTE(review): float64 cannot represent every integer above 2**53 exactly, so very
# large values are rounded by this cast — confirm that this precision loss is acceptable.
df['value'] = df['value'].astype('float64')

### Add block timestamp to dataframe

In [15]:
# Load the block info log; the context manager closes the file handle
with open(f'{PATH}block_info_log.json') as f:
    block_info = json.load(f)
df_block_info = pd.DataFrame(block_info)

# Convert data type to datetime64[s]
df_block_info['block_timestamp'] = df_block_info['block_timestamp'].astype('datetime64[s]')

In [16]:
# Attach each transaction's block timestamp by matching `blockNumber` against the log's
# `block_number`, then discard the redundant join key
timestamp_lookup = df_block_info[['block_number', 'block_timestamp']]
df = df.merge(
    timestamp_lookup,
    left_on='blockNumber',
    right_on='block_number',
).drop(columns='block_number')

### Drop the `r`, `s`, `v` and `accessList` columns to reduce the size

In [17]:
# Remove the `r`, `s`, `v` and `accessList` columns in place to reduce the size of the frame
df.drop(columns=['r','s','v','accessList'], inplace=True)

### Save the pandas dataframe as parquet file

In [18]:
# Highest and lowest block number in the frame; both are used in the output file name
max_block = df['blockNumber'].max()
min_block = df['blockNumber'].min()

In [19]:
# Directory that the parquet output is written to
OUT_PATH = '../data/'

In [20]:
# Write the cleaned transactions to a parquet file named after the block range they cover
df.to_parquet(f'{OUT_PATH}df_eth_txns_{max_block}_{min_block}.parquet')