# Load blocks

In [1]:
import json
import os
from tqdm import tqdm
import os
import requests
import pickle
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Directory holding the block files; each file is expected to be named
# "<number>.<ext>" (e.g. 1111.json).
SOURCE_PATH = '../data/txn_data'
# SOURCE_PATH = 'testData'

# Raw directory listing, in whatever order os.listdir returns it.
block_list = os.listdir(SOURCE_PATH)

# Pair every file name with the integer in front of its first '.', so the
# files sort numerically (2.json before 10.json) instead of as strings.
numbered_blocks = []
for file_name in block_list:
    try:
        block_number = int(file_name.split('.')[0])
    except ValueError:
        # Entry without a numeric prefix (e.g. .DS_Store, .ipynb_checkpoints):
        # skip it rather than let int() abort the whole cell.
        continue
    numbered_blocks.append((block_number, file_name))

# Ties on the number are broken by file name, because tuples compare in order.
sorted_block_list = [file_name for _, file_name in sorted(numbered_blocks)]
print(f"There are {len(sorted_block_list)} blocks loaded")

There are 4087 blocks loaded


In [3]:
def get_block_txn_list(block_json_file: str, source_path=None) -> list:
    """
    Load one block file and return the transactions stored in it.

    @input: block_json_file - name of a block json file, e.g., 1111.json
    @input: source_path - directory containing the file; when None (the
            default) the module-level SOURCE_PATH is used
    @output: the value stored under the 'tx' key of the file, i.e. the list
             of transactions in that block
    @raises: KeyError if the file has no 'tx' key; errors from open() and
             json.load() propagate unchanged
    """
    base_dir = SOURCE_PATH if source_path is None else source_path
    json_path = os.path.join(base_dir, block_json_file)
    # The context manager closes the file even if parsing fails.
    with open(json_path) as f:
        json_data = json.load(f)
    return json_data['tx']

### Load two dictionaries

In [4]:
# transaction_address_summary: JSON object looked up by transaction hash
# further down (its values are compared against 2).
with open('../heuristic_data/transaction_address_summary.json') as summary_file:
    transaction_address_summary = json.loads(summary_file.read())

# two_output_address_summary: JSON object keyed by transaction hash; later
# cells overwrite its values with the category codes.
with open('../heuristic_data/two_output_address_summary.json') as two_output_file:
    two_output_address_summary = json.loads(two_output_file.read())

# Partition transactions with 2 outputs into 4 categories:
- Address reuse 
- Cluster member 
- Unknown change
- Overlay Application

## 2 Partition Functions
- check_address_reuse
- check_OP_RETURN

In [5]:
# Integer codes for the four categories a two-output transaction can fall
# into: address reuse = 0, cluster member = 1, unknown change = 2,
# overlay application = 3.
(
    ADDRESS_REUSE_CODE,
    CLUSTER_MEMBER_CODE,
    UNKNOWN_CHANGE_CODE,
    OVERLAY_APPLICATION_CODE,
) = range(4)

# Starts out empty; nothing in the visible cells fills it.
CLUSTERING_CANDIDATE = dict()


In [6]:
def check_address_reuse(txn: dict) -> bool:
    """
    Tell whether a transaction sends funds back to one of its own input addresses.

    @input: one single transaction, a dict with an "inputs" list and an "out" list
    @output: True if the "addr" of any output also appears as the
             "prev_out" "addr" of any input, False otherwise
    """
    # A set keeps the membership test below O(1) per output.
    input_addresses = set()
    for input_ in txn["inputs"]:
        prev_out = input_.get("prev_out")
        # Inputs with no "prev_out", or whose "prev_out" carries no "addr",
        # contribute nothing, mirroring how address-less outputs are skipped.
        if prev_out and "addr" in prev_out:
            input_addresses.add(prev_out["addr"])

    return any(
        "addr" in output and output["addr"] in input_addresses
        for output in txn["out"]
    )

def check_OP_RETURN(txn: dict) -> bool:
    """
    @input: one single transaction, a dict with an 'out' list of outputs
    @output: True if at least one output has a 'script' that starts with
             '6a' (treated here as an OP_RETURN output), False otherwise
    """
    # Outputs without a 'script' key fall back to '' and never match.
    return any(
        tx_output.get('script', '').startswith('6a')
        for tx_output in txn['out']
    )

In [7]:
# Per-category tallies. Only the address-reuse and overlay-application
# counters are updated in this cell; the other two stay at 0 here.
ADDRESS_REUSE_COUNTER = 0
CLUSTER_MEMBER_COUNTER = 0
UNKNOWN_CHANGE_COUNTER = 0
OVERLAY_APPLICATION_COUNTER = 0

# Hashes of the transactions that are neither "reuse address" nor "OP_RETURN".
transaction_candidate_list = []

for block_json_file in tqdm(sorted_block_list):
    # get transaction list of that block
    txns_list = get_block_txn_list(block_json_file)

    for txn in txns_list:
        txn_hash = txn['hash']
        # Only transactions recorded as 2 in transaction_address_summary are
        # partitioned (presumably the two-output ones - confirm against the
        # notebook that builds that file).
        if transaction_address_summary[txn_hash] != 2:
            continue

        # Explicit check rather than `assert`, so the consistency test still
        # runs when Python is started with -O. The exception type is unchanged.
        if txn_hash not in two_output_address_summary:
            print(txn_hash)
            raise AssertionError("check the above transaction hash^")

        # case: address reuse
        if check_address_reuse(txn):
            ADDRESS_REUSE_COUNTER += 1
            two_output_address_summary[txn_hash] = ADDRESS_REUSE_CODE
            continue

        # case: OP_RETURN (overlay application)
        if check_OP_RETURN(txn):
            OVERLAY_APPLICATION_COUNTER += 1
            two_output_address_summary[txn_hash] = OVERLAY_APPLICATION_CODE
            continue

        # Neither category matched: keep the hash as a candidate.
        transaction_candidate_list.append(txn_hash)


print(f"There are {ADDRESS_REUSE_COUNTER:,} transactions with address reuse problem")
print(f"There are {OVERLAY_APPLICATION_COUNTER:,} transactions with overlay application problem")
print(f"There are {len(transaction_candidate_list):,} candidate transactions")

100%|██████████| 4087/4087 [04:03<00:00, 16.79it/s]

There are 1,231,860 transactions with address reuse problem
There are 16,165 transactions with overlay application problem
There are 4,143,799 candidate transactions





In [8]:
# Split the keys of two_output_address_summary (transaction hashes) by the
# code stored for each one, and pickle every resulting list to its own file.

# Hashes tagged as address reuse.
address_reuse_transaction_address_list = [
    txn_hash
    for txn_hash, code in two_output_address_summary.items()
    if code == ADDRESS_REUSE_CODE
]
with open('../heuristic_data/address_reuse_transaction_address_list.pkl', 'wb') as out_file:
    pickle.dump(address_reuse_transaction_address_list, out_file)

# Hashes tagged as overlay application.
overlay_application_transaction_address_list = [
    txn_hash
    for txn_hash, code in two_output_address_summary.items()
    if code == OVERLAY_APPLICATION_CODE
]
with open('../heuristic_data/overlay_application_transaction_address_list.pkl', 'wb') as out_file:
    pickle.dump(overlay_application_transaction_address_list, out_file)

# Everything else: hashes whose code is neither of the two above.
transaction_candidate_list = [
    txn_hash
    for txn_hash, code in two_output_address_summary.items()
    if not (code == OVERLAY_APPLICATION_CODE or code == ADDRESS_REUSE_CODE)
]
with open('../heuristic_data/transaction_candidate_list.pkl', 'wb') as out_file:
    pickle.dump(transaction_candidate_list, out_file)

In [9]:
# Write both summary dictionaries back to the JSON files they were loaded
# from, so the category codes assigned above are persisted.
for summary_path, summary in (
    ('../heuristic_data/transaction_address_summary.json', transaction_address_summary),
    ('../heuristic_data/two_output_address_summary.json', two_output_address_summary),
):
    with open(summary_path, 'w') as summary_file:
        json.dump(summary, summary_file)