# Extract and save Ethereum raw transactions

In [1]:
import os
import json
import requests
import datetime as dt

from web3 import Web3
import pandas as pd

In [2]:
PATH = '../data/raw/'

### Alchemy as the remote node provider

In [3]:
ALCHEMY_KEY = os.environ.get('KEY')
w3 = Web3(Web3.HTTPProvider('https://eth-mainnet.alchemyapi.io/v2/'+ALCHEMY_KEY))

url = 'https://eth-mainnet.alchemyapi.io/v2/'+ALCHEMY_KEY

In [4]:
# latest_block_int = w3.eth.blockNumber
# up_to_block_int = latest_block_int - 5

In [4]:
def extract_first_hist_txns(no_of_blocks, up_to_block_int):
    block_num_request_data = {"jsonrpc": "2.0","id": 0,"method": "eth_getBlockByNumber","params":[hex(up_to_block_int - 1),True]}
    block_json = requests.post(url, json=block_num_request_data).json()
    block_details_list = block_json['result']['transactions']

    # Create lists with the values from the first block
    block_numbers_list = [int(block_json['result']['number'],16)]
    block_transactions_list = [len(block_json['result']['transactions'])]
    block_datetime = dt.datetime.fromtimestamp(int(block_json['result']['timestamp'],16))
    block_datetime_list = [block_datetime.strftime('%Y-%m-%d %H:%M:%S')]

        # While loop
    block_int = up_to_block_int - 1
    while block_int > up_to_block_int - no_of_blocks:
        block_int -= 1
        block_num_request_data = {"jsonrpc": "2.0","id": 0,"method": "eth_getBlockByNumber","params":[hex(block_int),True]}
        block_json = requests.post(url, json=block_num_request_data).json()

        # Get the block number from the first transaction in the block and add to a list
        block_numbers_list.append(int(block_json['result']['number'],16))

        # Get the number of transactions each block and add to a list
        block_transactions_list.append(len(block_json['result']['transactions']))

        # Get the timestamp of each block and add to a list
        block_timestamp = dt.datetime.fromtimestamp(int(block_json['result']['timestamp'],16))
        block_datetime = block_timestamp.strftime('%Y-%m-%d %H:%M:%S')
        block_datetime_list.append(block_datetime)

        # Get transaction details of each block and add to a list
        block_details_list.extend(block_json['result']['transactions'])


    # Create a dictionary with block info
    block_info = {}
    block_info['block_number'] = block_numbers_list
    block_info['block_transactions'] = block_transactions_list
    block_info['block_timestamp'] = block_datetime_list
    
    return block_info, block_details_list

In [5]:
def extract_next_hist_txns(no_of_blocks):
    
    # Identify the oldest block that was last extracted from block_info_log.json
    up_to_block_int = block_info['block_number'][-1]
    
    
    block_num_request_data = {"jsonrpc": "2.0","id": 0,"method": "eth_getBlockByNumber","params":[hex(up_to_block_int - 1),True]}
    block_json = requests.post(url, json=block_num_request_data).json()
    block_details_list = block_json['result']['transactions']

    
    # Create lists with the values from the first block
    block_numbers_list = [int(block_json['result']['number'],16)]
    block_transactions_list = [len(block_json['result']['transactions'])]
    block_datetime = dt.datetime.fromtimestamp(int(block_json['result']['timestamp'],16))
    block_datetime_list = [block_datetime.strftime('%Y-%m-%d %H:%M:%S')]

    
    # While loop to get n number of blocks but excluding the last extracted block
    block_int = up_to_block_int - 1
    while block_int > up_to_block_int - no_of_blocks:
        block_int -= 1
        block_num_request_data = {"jsonrpc": "2.0","id": 0,"method": "eth_getBlockByNumber","params":[hex(block_int),True]}
        block_json = requests.post(url, json=block_num_request_data).json()

        # Get the block number from the first transaction in the block and add to a list
        block_numbers_list.append(int(block_json['result']['number'],16))

        # Get the number of transactions each block and add to a list
        block_transactions_list.append(len(block_json['result']['transactions']))

        # Get the timestamp of each block and add to a list
        block_timestamp = dt.datetime.fromtimestamp(int(block_json['result']['timestamp'],16))
        block_datetime = block_timestamp.strftime('%Y-%m-%d %H:%M:%S')
        block_datetime_list.append(block_datetime)

        # Get transaction details of each block and add to a list
        block_details_list.extend(block_json['result']['transactions'])
        
    
    # Save the block transactions
    start_block = block_numbers_list[0]
    end_block = block_numbers_list[-1]

    block_details_str = json.dumps(block_details_list)
    with open(f'{PATH}{start_block}_{end_block}_eth_transactions.json', 'w') as f:
        f.write(block_details_str)
        

    # Update block info 
    block_info['block_number'].extend(block_numbers_list)
    block_info['block_transactions'].extend(block_transactions_list)
    block_info['block_timestamp'].extend(block_datetime_list)
    
    
    # Save the block info log as json file
    block_info_str = json.dumps(block_info)
    with open(f'{PATH}block_info_log.json', 'w') as f:
        f.write(block_info_str)

### Read the block_extracted_log

In [6]:
block_info = json.load(open(f'{PATH}block_info_log.json'))

### While loop to batch extract the raw transactions using the Alchemy API

In [None]:
n = 0
while n < 10:
    extract_next_hist_txns(no_of_blocks=1000)
    print('Data extracted and saved, batch =', n)
    n += 1

Data extracted and saved, batch = 0


### Read the raw extracts into a dataframe

In [3]:
file_dir = os.listdir(PATH)
file_path_list = [os.path.join(PATH, file) for file in file_dir if file.endswith('transactions.json')]

In [4]:
file_list_sorted = sorted(file_path_list, key=os.path.getmtime)

In [5]:
# raw_json_list = [json.load(open(file)) for file in file_path_list[:15]]
raw_json_list = [json.load(open(file)) for file in file_path_list[15:]]

In [6]:
df_raw = pd.concat([pd.DataFrame(file) for file in raw_json_list])

In [7]:
df_raw.shape

(2791716, 19)

In [8]:
df = df_raw.copy()

### Convert all hexadecimal columns to decimal

In [9]:
%%time
col_list = ['blockNumber','chainId','gas','gasPrice','nonce','nonce','transactionIndex','type','v','value','maxFeePerGas','maxPriorityFeePerGas']

for col in col_list:
    df[col] = df[col].apply(lambda x: int(x, base=16) if type(x) is str else x)

CPU times: user 34 s, sys: 2.97 s, total: 37 s
Wall time: 37 s


### Convert `type` into category data type

In [10]:
df['type'] = df['type'].astype('category')

### Convert `value` into float64 data type
This allows the dataframe to be saved as parquet otherwise we get the following error:  
"Python int too large to convert to C long"

In [11]:
df['value'] = df['value'].astype('float64')

### Add block timestamp to dataframe

In [12]:
block_info = json.load(open(f'{PATH}block_info_log.json'))
df_block_info = pd.DataFrame(block_info)

# Convert data type to datetime64[s]
df_block_info['block_timestamp'] = df_block_info['block_timestamp'].astype('datetime64[s]')

In [13]:
df = df.merge(df_block_info[['block_number','block_timestamp']], left_on='blockNumber', right_on='block_number')
df.drop(columns='block_number', inplace=True)

### Save the pandas dataframe as parquet file

In [14]:
start_block = df['blockNumber'].min()
end_block = df['blockNumber'].max()

In [15]:
OUT_PATH = '../data/'

In [16]:
df.to_parquet(f'{OUT_PATH}eth_transactions_{start_block}_{end_block}.parquet')