In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import sklearn
import os
import json
from tqdm import tqdm
from scipy.sparse import dok_matrix
from scipy.sparse.linalg import svds
import pickle
import scipy
from sklearn import preprocessing

## Load Data

In [2]:
# Read the pickled ground-truth collection from the heuristic data folder.
# NOTE(review): pickle.load executes arbitrary code from the file; only use
# it on files produced by this project.
with open('../../heuristic_data/ground_truth_list.pkl', 'rb') as pkl_file:
    ground_truth_list = pickle.load(pkl_file)
print(f"Number of ground truth transaction: {len(ground_truth_list)}")

# Keep the loaded object untouched and work on a shallow copy.
# The entries are later compared against txn['hash'], so despite the name
# they are presumably transaction hashes -- TODO confirm.
txn_addr_list = ground_truth_list.copy()

Number of ground truth transaction: 86882


In [3]:
# Lookup table for O(1) membership checks: every ground-truth entry maps to
# the constant 1 (only the keys matter to the cells that read it).
txn_addr_hash = dict.fromkeys(tqdm(txn_addr_list), 1)

100%|██████████| 86882/86882 [00:00<00:00, 3214054.68it/s]


In [5]:
# Directory that holds the block files (one JSON file per block,
# e.g. "1111.json").
SOURCE_PATH = '../../data/txn_data'

# List the directory and order the files by the integer that precedes the
# first dot in the file name; ties fall back to the file name itself.
# Entries whose prefix is not a plain number (hidden files, checkpoint
# folders, ...) are skipped instead of crashing int().
block_list = os.listdir(SOURCE_PATH)
sorted_block_list = sorted(
    (name for name in block_list if name.split('.')[0].isdecimal()),
    key=lambda name: (int(name.split('.')[0]), name),
)
print(f"There are {len(sorted_block_list)} blocks loaded")


def get_block_txn_list(block_json_file: str, source_path=None) -> list:
    """
    Return the transactions stored in one block file.

    @input: block_json_file - name of a block json file, e.g., 1111.json
    @input: source_path - directory containing the file; defaults to the
            notebook-level SOURCE_PATH when omitted
    @output: the value stored under the 'tx' key of that file, i.e. the
             list of transactions in that block
    @raises: OSError if the file cannot be opened, json.JSONDecodeError if
             it is not valid JSON, KeyError if it has no 'tx' entry
    """
    if source_path is None:
        source_path = SOURCE_PATH
    json_path = os.path.join(source_path, block_json_file)
    # The context manager closes the file even when parsing fails.
    with open(json_path) as f:
        json_data = json.load(f)
    return json_data['tx']

There are 4087 blocks loaded


## Collect input and output addresses

In [6]:
# Accumulators for every sender (input) and receiver (output) address seen
# in the ground-truth transactions; duplicates are kept.
input_list, output_list = [], []

In [7]:
# Start from empty accumulators so that re-running this cell does not append
# every address a second time.
input_list = []
output_list = []

for block_json_file in tqdm(sorted_block_list):
    for txn in get_block_txn_list(block_json_file):
        # Only transactions present in the ground-truth lookup contribute.
        if txn['hash'] not in txn_addr_hash:
            continue

        for input_ in txn['inputs']:
            # An input without a 'prev_out' or without an address is skipped
            # on its own. Previously the resulting KeyError was swallowed by
            # the surrounding try/except, which silently discarded the
            # remaining inputs and all outputs of the transaction.
            prev_out = input_.get('prev_out')
            if prev_out is not None and 'addr' in prev_out:
                input_list.append(prev_out['addr'])

        for output in txn['out']:
            # Outputs without an address are ignored.
            if "addr" in output:
                output_list.append(output['addr'])

100%|██████████| 4087/4087 [03:50<00:00, 17.76it/s]


In [8]:
# Number of collected input addresses; duplicates are counted because the
# list is filled once per transaction input.
len(input_list)

1046340

In [9]:
# Map every distinct address to a row/column index.
#
# The indices follow the order in which addresses were first collected
# (dict.fromkeys de-duplicates while keeping insertion order). Iterating a
# set of strings instead would hand out indices in an order that changes
# between kernel sessions, so a matrix saved in one run could not be matched
# against index files written in another.
input_hash = {addr: idx for idx, addr in enumerate(dict.fromkeys(input_list))}
output_hash = {addr: idx for idx, addr in enumerate(dict.fromkeys(output_list))}

# Union of both sides: all input addresses first, then the output addresses
# that never appeared as an input.
union_list = list(dict.fromkeys(input_list + output_list))
union_hash = {addr: idx for idx, addr in enumerate(union_list)}


print(f"Length of input_hash: {len(input_hash)}")
print(f"Length of output_hash: {len(output_hash)}")
print(f"Length of union_hash: {len(union_hash)}")

Length of input_hash: 479887
Length of output_hash: 135051
Length of union_hash: 591629


In [10]:
# Persist the address -> index maps and the plain address lists (the keys of
# each map, in index order).
# NOTE(review): the three "*_list.pkl" files are written as JSON text, not
# as pickles, despite their extension -- readers must use json.load. Confirm
# with the consumers before changing the format or the file names.
for target_path, payload in (
    ("../../heuristic_data/input_hash.json", input_hash),
    ("../../heuristic_data/output_hash.json", output_hash),
    ("../../heuristic_data/union_hash.json", union_hash),
    ("../../heuristic_data/input_list.pkl", list(input_hash)),
    ("../../heuristic_data/output_list.pkl", list(output_hash)),
    ("../../heuristic_data/union_list.pkl", list(union_hash)),
):
    with open(target_path, "w") as f:
        json.dump(payload, f)
    

## Initialize the data matrix

In [11]:
# Square sparse matrix over the union of all addresses: rows are indexed by
# sender, columns by receiver, both through union_hash. The dictionary-of-keys
# format allows the element-wise updates used when the matrix is filled.
data_matrix = dok_matrix((len(union_hash),) * 2)
print(f"There are {data_matrix.shape[0]:,} senders, there are {data_matrix.shape[1]:,} receivers")

There are 591,629 senders, there are 591,629 receivers


## Build the data matrix

In [13]:
# Fill data_matrix[sender, receiver] with the amount each sender contributed
# to each receiver. Within a transaction, every input's value is split across
# the outputs in proportion to the value each output received.
for block_json_file in tqdm(sorted_block_list):
    for txn in get_block_txn_list(block_json_file):
        # Only ground-truth transactions are considered.
        if txn['hash'] not in txn_addr_hash:
            continue

        # Outputs that carry both an address and a value are the receivers.
        receivers = [
            (output['addr'], output['value'])
            for output in txn['out']
            if "addr" in output and "value" in output
        ]

        # Compute the total once per transaction (it used to be re-summed for
        # every output). A zero total means there is nothing to apportion and
        # would otherwise raise ZeroDivisionError.
        total_received = sum(value for _, value in receivers)
        if not total_received:
            continue

        # (column index, share of the transaction's output value) per receiver.
        receiver_shares = [
            (union_hash[addr], value / total_received)
            for addr, value in receivers
        ]

        for input_ in txn['inputs']:
            # Inputs lacking the spent output, its address or its value cannot
            # be attributed to a sender and are skipped.
            prev_out = input_.get('prev_out')
            if not prev_out or 'addr' not in prev_out or 'value' not in prev_out:
                continue

            sender_idx = union_hash[prev_out['addr']]
            input_value = prev_out['value']

            for receiver_idx, share in receiver_shares:
                data_matrix[sender_idx, receiver_idx] += share * input_value

100%|██████████| 4087/4087 [05:11<00:00, 13.10it/s]


In [15]:
# Summary of the filled matrix: both dimensions, the total number of cells
# (rows * columns) and how many of them hold a non-zero amount.
print(f"Size of the data matrix: {data_matrix.shape[0]:,}, {data_matrix.shape[1]:,}")
print(f"There are {data_matrix.shape[0] * data_matrix.shape[1]:,} entries in the data matrix")
print(f"There are {data_matrix.count_nonzero():,} non-zero entries")

Size of the data matrix: 591,629, 591,629
There are 350,024,873,641 entries in the data matrix
There are 1,302,804 non-zero entries


In [16]:
# Same summary as the previous cell, except that the size is printed as the
# raw shape tuple instead of two thousands-separated numbers.
# NOTE(review): this cell duplicates the one above; one of them can go.
print(f"Size of the data matrix: {data_matrix.shape}")
print(f"There are {data_matrix.shape[0] * data_matrix.shape[1]:,} entries in the data matrix")
print(f"There are {data_matrix.count_nonzero():,} non-zero entries")

Size of the data matrix: (591629, 591629)
There are 350,024,873,641 entries in the data matrix
There are 1,302,804 non-zero entries


In [17]:
# Display the (rows, columns) shape of the matrix as the cell output.
data_matrix.shape

(591629, 591629)

In [18]:
# Convert the dictionary-of-keys matrix to coordinate (COO) format and write
# it to disk as an .npz archive.
data_matrix_coo = data_matrix.tocoo()
scipy.sparse.save_npz(
    file='../../heuristic_data/data_matrix_square_coo.npz',
    matrix=data_matrix_coo,
)