# Linking Heuristics
In this notebook, we present two linking heuristics and we compare the results with our ground truth data. The sections are:
1. Linking Heuristic 1
    - 1.1 Find Reused Coins
    - 1.2 Linking
2. Linking Heuristic 2
    - Linking
3. Validation
    - 3.1 Prepare Ground Truth
    - 3.2 Compare with Ground Truth
    - 3.3 Compare with each other

In [53]:
from api_calls import get_address_txs
from utils import read_json, write_json, on_chain_heuristics_list, set_mapping, get_results, most_common, \
    link_other_nodes, invert_mapping, add_node_to_entity, get_entity_neighbors, df_to_dicts_set

# input files
from utils import funded_address_settlement_txs_file, funding_address_entity_file, settlement_address_entity_file, \
    channels_file, funding_txs_file, settlement_addresses_file, settlement_txs_file, outgoing_channels_file, \
    incoming_channels_file

# outputs files
from utils import funding_entity_channels_nodes_file, heuristics_files, gt_node_entity_file, gt_address_txs_file, \
    entity_nbrs_file, nodes_csv_file

import pandas as pd
from collections import Counter
import time

# 1. Linking heuristic 1
In this section, we first read and prepare some data and then we perform the linking heuristic 1.

In [60]:
# read data
channels_df = pd.read_csv(channels_file)
funding_address_entity = read_json(funding_address_entity_file)
settlement_address_entity = read_json(settlement_address_entity_file)
funding_txs = read_json(funding_txs_file)
settlement_addresses = set(read_json(settlement_addresses_file))
funded_address_settlement_txs = read_json(funded_address_settlement_txs_file)
settlement_txs = read_json(settlement_txs_file)


In [61]:
# given a node, tell me its channels
node_channels = dict()
for channel in channels_df.values:
    c, n1, n2 = channel
    if n1 not in node_channels:
        node_channels[n1] = set()
    node_channels[n1].add(c)
    if n2 not in node_channels:
        node_channels[n2] = set()
    node_channels[n2].add(c)

In [62]:
# Nodes on-chain activity
# for each node, create a list of timestamps of
# openings, closings and first/last_activity
node_openings_closings = dict()
for node, chnls in node_channels.items():
    node_openings_closings[node] = {'openings': [], 'closings': []}
    for chnl in chnls:
        tx_hsh, out_index = chnl.split(':')
        t_open = funding_txs[tx_hsh]['status']['block_time']
        node_openings_closings[node]['openings'].append(t_open)

        t_closed = 0
        funded_address = funding_txs[tx_hsh]['vout'][int(out_index)]['scriptpubkey_address']
        stxs = funded_address_settlement_txs[funded_address]
        if stxs:
            t_closed = stxs[0]['status']['block_time']
        node_openings_closings[node]['closings'].append(t_closed)
    node_openings_closings[node]['first_activity'] = min(
        node_openings_closings[node]['openings'])
    node_openings_closings[node]['last_activity'] = max(
        max(node_openings_closings[node]['openings']),
        max(node_openings_closings[node]['closings']))
    if min(node_openings_closings[node]['closings']) == 0:
        # still open -> now
        node_openings_closings[node]['last_activity'] = int(time.time())


## 1.1 Find channels reusing coins from other channels
Here we look for channels that were funded with settlement coins (outputs of settlement txs of other channels). We also use the on-chain clustering results.

In [63]:
# use all on-chain clustering heuristics to have a wider overlap
# then the linking heuristics will decide which triplets to use
och = {h: (True if h != 'none' else False) for h in on_chain_heuristics_list}
fae, sae, = set_mapping(funding_address_entity, settlement_address_entity, och)

fes = set(fae.values())
ses = set(sae.values())
overlap_entities = fes.intersection(ses)

chpoints_reusing_coins = set()
settlement_entities = sae.values()
for chpoint in channels_df.chan_point.values:
    hsh, out_index = chpoint.split(':')
    ftx = funding_txs[hsh]
    for inp in ftx['vin']:
        e = fae[inp['prevout']['scriptpubkey_address']]
        if e in overlap_entities and e in settlement_entities:
            chpoints_reusing_coins.add(chpoint)
            break


use stars
use snakes
use collectors
use proxies


## 1.2 Linking
Here is the actual linking heuristic that we run using different on-chain patterns separately and then all of them together.

In [69]:
use_entities = True

def heuristic_1(fae, sae, och, files):
    print()
    # create a copy of initial state
    funding_address_entity = {k: v for k, v in fae.items()}
    settlement_address_entity = {k: v for k, v in sae.items()}

    # prepare results
    r = dict()
    r['n_funding_entities'] = len(set(funding_address_entity.values()))
    r['n_settlement_entities'] = len(set(settlement_address_entity.values()))
    r['n_entities'] = len(set(settlement_address_entity.values()).union(set(funding_address_entity.values())))
    r['n_addresses'] = len(set(settlement_address_entity.keys()).union(set(funding_address_entity.keys())))
    r['n_nodes'] = len(node_channels)

    # map entities to components
    funding_address_entity, settlement_address_entity = \
        set_mapping(funding_address_entity, settlement_address_entity, och)

    # print('Start heuristic 1...')

#     print('use_entities', use_entities)

    # # mapping between stx and its ftx
    stx_its_chpoint = dict()
    for channel in channels_df.values:
        funding_tx, out_index = channel[0].split(':')
        funded_address = funding_txs[funding_tx]['vout'][int(out_index)]['scriptpubkey_address']
        settlement_txs_fa = funded_address_settlement_txs[funded_address]
        if len(settlement_txs_fa) == 1:  # it is always zero or one tx
            stx = settlement_txs_fa[0]['txid']
            if stx not in stx_its_chpoint:
                stx_its_chpoint[stx] = channel[0]
            else:
                print('stx already in dict', stx)

    # create links for heuristic 1 (both at address and entity level)
    stx_a_chpoint = []  # list of settlement tx, address, funding tx
    for chpoint in chpoints_reusing_coins:
        hsh, out_index = chpoint.split(':')
        uftx = funding_txs[hsh]
        for i in uftx['vin']:
            a = i['prevout']['scriptpubkey_address']
            prev_tx = i['txid']
            if a in settlement_addresses:
                if prev_tx in settlement_txs:
                    stx_a_chpoint.append([prev_tx, a, chpoint])
    #             else:
    #                 # a is a settlement_address but prev_tx is not a
    #                 settlement_tx in our data

    stx_e_chpoint = []  # list of settlement tx, entity, chpoint
    print('n coins reused', len(chpoints_reusing_coins))
    settlement_entities = set(settlement_address_entity.values())
    for chpoint in chpoints_reusing_coins:
        hsh, out_index = chpoint.split(':')
        uftx = funding_txs[hsh]
        for i in uftx['vin']:
            e = funding_address_entity[i['prevout']['scriptpubkey_address']]
            prev_tx = i['txid']
            if e in settlement_entities:
                if prev_tx in settlement_txs:
                    stx_e_chpoint.append([prev_tx, e, chpoint])

    # I need a mapping between ch_point and nodes
    # and between settlement tx and nodes
    chpoint_nodes = dict()
    for channel in channels_df.values:
        chpoint_nodes[channel[0]] = [channel[1], channel[2]]

    funded_address_chpoint = dict()
    for chpoint in channels_df.chan_point.values:
        hsh, out_index = chpoint.split(':')
        funded_address = funding_txs[hsh]['vout'][int(out_index)]['scriptpubkey_address']
        if funded_address not in funded_address_chpoint:
            funded_address_chpoint[funded_address] = chpoint
        else:
            print(funded_address, ' has multiple channels')

    stx_nodes = dict()
    for fa, chpoint in funded_address_chpoint.items():
        stxs = funded_address_settlement_txs[fa]
        if stxs:
            stx = stxs[0]['txid']
            stx_nodes[stx] = chpoint_nodes[chpoint]
    # print('Initial number of links addresses', len(stx_a_ftx))
    # print('Initial number of links entities', len(stx_e_ftx))

    # decide link level
    triplet = stx_a_chpoint
    if use_entities:
        triplet = stx_e_chpoint

    links = []  # like stx_a_chpoint plus 4 nodes of channels
    for el in triplet:
        # the funding entity controls the node in common between the channel
        # opened with ftx and closed with stx
        stx, a, chpoint = el
        n1, n2 = chpoint_nodes[chpoint]  # happens after the stx
        n3, n4 = stx_nodes[stx]
        links.append([stx, a, chpoint, n1, n2, n3, n4])

    useful_links = []
    for link in links:
        s = set(link[3:])
        if len(s) == 3:
            useful_links.append(link)

    # if closing of other node in ch1 > opening of other node in ch2
    # then we can use the link
    usable_links = []
    for link in useful_links:
        node_in_common = most_common(link[3:])
        other_node_ch1 = ''
        other_node_ch2 = ''
        for node in link[3:][::-1]:
            if node != node_in_common:
                if not other_node_ch1:
                    other_node_ch1 = node
                else:
                    other_node_ch2 = node
        if node_openings_closings[other_node_ch1]['last_activity'] > \
                node_openings_closings[other_node_ch2]['first_activity']:
            usable_links.append(link)

    reliable_links_addresses = []
    for link in usable_links:
        link_address = link[1]
        stx = link[0]
        its_ftx = stx_its_chpoint[stx].split(':')[0]
        if link_address in [el['prevout']['scriptpubkey_address'] for el in
                            funding_txs[its_ftx]['vin']]:
            reliable_links_addresses.append(link)
    print('Number of reliable links at address level:',
          len(reliable_links_addresses))

    reliable_links_entities = []
    entities_reusing = set()
    for link in usable_links:
        if use_entities:
            link_entity = link[1]
        else:
            link_entity = settlement_address_entity[link[1]]
        stx = link[0]
        its_ftx = stx_its_chpoint[stx].split(':')[0]
        if link_entity in [funding_address_entity[el['prevout']['scriptpubkey_address']] for el
                           in funding_txs[its_ftx]['vin']]:
            entities_reusing.add(link_entity)
            reliable_links_entities.append(link)

    print('Number of reliable links at entity level:', len(reliable_links_entities))
    print('Number of entities reusing funding addresses:', len(entities_reusing))

    # step 1: linking nodes to entity using stx and ftx
    # print('Step 1:')
    heuristic_1a_entity_node = dict()
    heuristic_1a_node_entity = dict()
    for link in reliable_links_entities:
        if use_entities:
            e = link[1]
        else:
            e = settlement_address_entity[link[1]]
        n = most_common(link[3:])
        if e not in heuristic_1a_entity_node:
            heuristic_1a_entity_node[e] = set()
        heuristic_1a_entity_node[e].add(n)
        if n not in heuristic_1a_node_entity:
            heuristic_1a_node_entity[n] = set()
        heuristic_1a_node_entity[n].add(e)
    # print('Number of entities linked to nodes:', len(heuristic_1a_entity_node))
    # print('Number of nodes linked to entities:', len(heuristic_1a_node_entity))

    # print('Step 2:')
    # link other node and entity in channel
    heuristic_1b_entity_node = link_other_nodes(heuristic_1a_entity_node, channels_df,
                                                funded_address_settlement_txs,
                                                funding_txs,
                                                settlement_address_entity)
    heuristic_1b_node_entity = invert_mapping(heuristic_1b_entity_node)

    # correct means that the settlement tx has exactly two output entities
    correct_stxs = []  # correct stxs
    correct_settlement_entities = set()  # output entities of correct stxs
    correct_nodes = set()
    for channel in channels_df.values:
        funding_tx, out_index = channel[0].split(':')
        node_1 = channel[1]
        node_2 = channel[2]
        funded_address = \
            funding_txs[funding_tx]['vout'][int(out_index)]['scriptpubkey_address']

        settlement_txs_fa = funded_address_settlement_txs[funded_address]
        # if channel is closed and number of outputs == 2 and
        # one node is mapped to one entity in the outputs
        if settlement_txs_fa:  # it is always only one
            for settlement_tx in settlement_txs_fa:
                # count entities
                entities = set([settlement_address_entity[out['scriptpubkey_address']]
                                for out in settlement_tx['vout']])
                if len(entities) == 2:
                    correct_stxs.append(settlement_tx)
                    correct_settlement_entities = correct_settlement_entities.union(entities)
                    correct_nodes.add(node_1)
                    correct_nodes.add(node_2)

    perc_entities_linked_settled = round(100 * len(heuristic_1b_entity_node) / r['n_settlement_entities'], 2)
    perc_entities_linked_2e = round(100 * len(heuristic_1b_entity_node) / len(correct_settlement_entities), 2)
    perc_nodes_linked_2e = round(100 * len(heuristic_1b_node_entity) / len(correct_nodes), 2)

    r = get_results(r, heuristic_1b_entity_node, heuristic_1b_node_entity)

    print('Number of settlement entities:', r['n_settlement_entities'], '--', perc_entities_linked_settled, '% linked')
    print('Number of settlement entities considering settlement txs with 2 output entities:', len(correct_settlement_entities), '--', perc_entities_linked_2e, '% linked')
    print('Number of nodes considering settlement txs with 2 output entities:', len(correct_nodes), '--', perc_nodes_linked_2e, '% linked')

    addresses_linked = set()
    for address_entity in [funding_address_entity, settlement_address_entity]:
        for address, entity in address_entity.items():
            if entity in heuristic_1b_entity_node:
                addresses_linked.add(address)
    r['perc_addresses_linked'] = round(
        100 * len(addresses_linked) / r['n_addresses'], 2)

    output_file_a, output_file_b = files[1]['all']
    for k in ['stars', 'none', 'snakes', 'collectors', 'proxies', 'all']:
        if och[k]:
            output_file_a, output_file_b = files[1][k]

    # Write to file
    heuristic_1_entity_node = {str(k): [e for e in v]
                               for k, v in heuristic_1b_entity_node.items()}
    heuristic_1_node_entity = {k: [int(e) for e in v]
                               for k, v in heuristic_1b_node_entity.items()}
    print('On-chain clustering', och)
    print('writing to', output_file_a, output_file_b)
    write_json(heuristic_1_entity_node, output_file_a)
    write_json(heuristic_1_node_entity, output_file_b)

    return r


In [70]:
on_chain_heuristics = {och: False for och in on_chain_heuristics_list}

results_1 = dict()
for och in on_chain_heuristics:
    # one by one
    if och != 'all':
        on_chain_heuristics[och] = True
        results_1[och] = heuristic_1(funding_address_entity, settlement_address_entity, on_chain_heuristics, heuristics_files)
        on_chain_heuristics[och] = False

# all
on_chain_heuristics = {och: (True if och != 'none' else False) for och in on_chain_heuristics_list}
results_1['all'] = heuristic_1(funding_address_entity, settlement_address_entity, on_chain_heuristics, heuristics_files)


n coins reused 12149
Number of reliable links at address level: 0
Number of reliable links at entity level: 83
Number of entities reusing funding addresses: 22
Iteration: 1 -- Number of linked entities: 22
Iteration: 2 -- Number of linked entities: 3351
Iteration: 3 -- Number of linked entities: 7403
Iteration: 4 -- Number of linked entities: 8645
Iteration: 5 -- Number of linked entities: 8945
Iteration: 6 -- Number of linked entities: 9027
Iteration: 7 -- Number of linked entities: 9042
Number of settlement entities: 53370 -- 16.94 % linked
Number of settlement entities considering settlement txs with 2 output entities: 32321 -- 27.98 % linked
Number of nodes considering settlement txs with 2 output entities: 4626 -- 46.91 % linked
On-chain clustering {'none': True, 'stars': False, 'snakes': False, 'collectors': False, 'proxies': False, 'all': False}
writing to ../data/results/none_1_entity_node.json ../data/results/none_1_node_entity.json

use stars
n coins reused 12149
Number of r

In [14]:
results['all']

{'n_funding_entities': 96181,
 'n_settlement_entities': 53370,
 'n_entities': 138457,
 'n_addresses': 238070,
 'n_nodes': 10910,
 'n_entities_linked': 11272,
 'n_nodes_linked': 2579,
 'perc_entities_linked': 8.14,
 'perc_nodes_linked': 23.64,
 'perc_addresses_linked': 20.96}

In [None]:
write_json(results, heuristics_files[1]['results'])

## 2. Linking heuristic 2
Here we run the linking heuristic 2 using on-chain clustering separately and then combined together.

In [4]:
def heuristic_2(fae, sae, och, files):
    print()
    min_conf = 2  # min confidence level for results

    funding_address_entity = {k: v for k, v in fae.items()}
    settlement_address_entity = {k: v for k, v in sae.items()}
    r = dict()
    r['n_funding_entities'] = len(set(funding_address_entity.values()))
    r['n_settlement_entities'] = len(set(settlement_address_entity.values()))
    r['n_entities'] = len(set(settlement_address_entity.values()).union(set(funding_address_entity.values())))
    r['n_addresses'] = len(set(settlement_address_entity.keys()).union(set(funding_address_entity.keys())))
    r['n_nodes'] = len(node_channels)

    funding_address_entity, settlement_address_entity, = \
        set_mapping(funding_address_entity, settlement_address_entity, och)

    # print('Start heuristic 2...')
    # print('Step 1:')
    funding_entity_possible_nodes = dict()
    for channel in channels_df.values:
        funding_tx, out_index = channel[0].split(':')
        funding_address = funding_txs[funding_tx]['vin'][0]['prevout']['scriptpubkey_address']
        funding_entity = funding_address_entity[funding_address]
        if funding_entity not in funding_entity_possible_nodes:
            funding_entity_possible_nodes[funding_entity] = []
        funding_entity_possible_nodes[funding_entity].append(channel[1])
        funding_entity_possible_nodes[funding_entity].append(channel[2])

    # each funding entity that has at least n_channels possible nodes
    # (confidence level >= n_channels)
    n_channels = min_conf
    entity_channels_half = []
    fe_confidence = []
    fe_confidence_dict = dict()
    for fe, pns in funding_entity_possible_nodes.items():
        if len(pns) >= n_channels * 2:  # *2 cause we have two nodes per channel
            pn_occur = Counter(pns)
            for pn, occur in pn_occur.items():
                if occur * 2 == len(pns):
                    fe_confidence.append([fe, occur])
                    fe_confidence_dict[fe] = occur
                    entity_channels_half.append(occur)
    entity_channels_half.sort()

    funding_entity_channels_nodes = dict()
    node_possible_entities = dict()
    # populate funding_entity_channels_nodes
    for channel in channels_df.values:
        funding_tx, out_index = channel[0].split(':')
        funding_address = funding_txs[funding_tx]['vin'][0]['prevout']['scriptpubkey_address']
        funding_entity = funding_address_entity[funding_address]
        if funding_entity not in funding_entity_channels_nodes:
            # use chan_point as key
            funding_entity_channels_nodes[funding_entity] = dict()
        # add nodes
        funding_entity_channels_nodes[funding_entity][channel[0]] = [channel[1],
                                                                     channel[2]]
        for i in [1, 2]:
            if channel[i] not in node_possible_entities:
                node_possible_entities[channel[i]] = set()
            node_possible_entities[channel[i]].add(funding_entity)

#     write_json(funding_entity_channels_nodes, funding_entity_channels_nodes_file)

    heuristic_2a_entity_node = dict()
    # create link between entity and a node when
    # the node is the only one present in every channel of the entity
    for fe in funding_entity_channels_nodes:
        # count number of occurrences of each node in channels
        node_occur = dict()

        # compute node_occur
        for channel in funding_entity_channels_nodes[fe]:
            for node in funding_entity_channels_nodes[fe][channel]:
                if node not in node_occur:
                    node_occur[node] = 0
                node_occur[node] += 1

        # get max_occur
        max_occur = max(node_occur.values())
        selected_node = None

        # check if there is a perfect max_occur, i.e.,
        # if max_occur is unique and in every channel
        # (corresponding node is in every channel)
        if list(node_occur.values()).count(max_occur) == 1 \
                and max_occur == len(funding_entity_channels_nodes[fe]) \
                and max_occur >= min_conf:
            # get node present in every channel and add it to its entity
            selected_node = [n for n, occ in node_occur.items()
                             if occ == max_occur][0]
            if fe not in heuristic_2a_entity_node:
                heuristic_2a_entity_node[fe] = set()
            heuristic_2a_entity_node[fe] \
                .add(selected_node)

    # print('Step 2:')
    heuristic_2b_entity_node = link_other_nodes(heuristic_2a_entity_node, channels_df,
                         funded_address_settlement_txs, funding_txs,
                         settlement_address_entity)

    heuristic_2b_node_entity = invert_mapping(heuristic_2b_entity_node)

    r = get_results(r, heuristic_2b_entity_node,
                    heuristic_2b_node_entity)

    addresses_linked = set()
    for address_entity in [funding_address_entity, settlement_address_entity]:
        for address, entity in address_entity.items():
            if entity in heuristic_2b_entity_node:
                addresses_linked.add(address)
    r['perc_addresses_linked'] = round(
        100*len(addresses_linked)/r['n_addresses'], 2)

    output_file_a, output_file_b = files[2]['all']
    for k in ['stars', 'none', 'snakes', 'collectors', 'proxies', 'all']:
        if och[k]:
            output_file_a, output_file_b = files[2][k]

    # Write to file
    heuristic_2_entity_node = \
        {str(k): [e for e in v] for k, v in heuristic_2b_entity_node.items()}
    heuristic_2_node_entity = \
        {k: [int(e) for e in v] for k, v in heuristic_2b_node_entity.items()}
    print('On-chain clustering', och)
    print('writing to', output_file_a, output_file_b)
    write_json(heuristic_2_entity_node, output_file_a)
    write_json(heuristic_2_node_entity, output_file_b)

    return r


In [5]:
on_chain_heuristics = {och: False for och in on_chain_heuristics_list}

results_2 = dict()
for och in on_chain_heuristics:
    # one by one
    if och != 'all':
        on_chain_heuristics[och] = True
        results_2[och] = heuristic_2(funding_address_entity, settlement_address_entity, on_chain_heuristics, heuristics_files)
        on_chain_heuristics[och] = False

# all
on_chain_heuristics = {och: (True if och != 'none' else False) for och in on_chain_heuristics_list}
results_2['all'] = heuristic_2(funding_address_entity, settlement_address_entity, on_chain_heuristics, heuristics_files)


Iteration: 1 -- Number of linked entities: 841
Iteration: 2 -- Number of linked entities: 4825
Iteration: 3 -- Number of linked entities: 8629
Iteration: 4 -- Number of linked entities: 9636
Iteration: 5 -- Number of linked entities: 9855
Iteration: 6 -- Number of linked entities: 9900
Iteration: 7 -- Number of linked entities: 9904
On-chain clustering {'none': True, 'stars': False, 'snakes': False, 'collectors': False, 'proxies': False, 'all': False}
writing to ../data/results/none_2_entity_node.json ../data/results/none_2_node_entity.json

use stars
Iteration: 1 -- Number of linked entities: 862
Iteration: 2 -- Number of linked entities: 4846
Iteration: 3 -- Number of linked entities: 8650
Iteration: 4 -- Number of linked entities: 9657
Iteration: 5 -- Number of linked entities: 9876
Iteration: 6 -- Number of linked entities: 9921
Iteration: 7 -- Number of linked entities: 9925
On-chain clustering {'none': False, 'stars': True, 'snakes': False, 'collectors': False, 'proxies': False,

In [None]:
write_json(results, heuristics_files[2]['results'])

# 3. Validation
In this section we prepare the ground truth (GT) data we collected to validate our linking heuristics and then we compare it with our results. We collected data by opening channels (see section "Outgoing Channels") and by letting other people opening channels to us (see section "Incoming Channels").

## 3.1 Prepare Ground Truth Data

In [4]:
# use all on-chain clustering heuristics
on_chain_heuristics = {och: (True if och != 'none' else False) for och in on_chain_heuristics_list}
funding_address_entity, settlement_address_entity = set_mapping(funding_address_entity, settlement_address_entity, on_chain_heuristics)

use stars
use snakes
use collectors
use proxies


### Outgoing Channels

In [5]:
gt_outgoing_channels = read_json(outgoing_channels_file)['channels']

In [6]:
local_node = '025228840b37ade9aa2f96b3c961a35e76571a7c87a4ee67e2f33c64de64aa822f'
first_block = 999999
last_block = 0
for el in gt_outgoing_channels:
    hsh, _ = el['channel_point'].split(':')
    funding_block = funding_txs[hsh]['status']['block_height']
    settlement_block = el['close_height']
    if funding_block < first_block:
        first_block = funding_block
    if settlement_block > last_block:
        last_block = settlement_block

gt_outgoing_channel_points = [el['channel_point'] for el in gt_outgoing_channels]

closed_channel_nodes = set()
for closed_channel in gt_outgoing_channels:
    closed_channel_nodes.add(closed_channel['remote_pubkey'])
closed_channel_nodes.add(local_node)

print('First block with ground truth data:', first_block)
print('Last block with ground truth data:', last_block)
print('GT channels opened and closed:', len(gt_outgoing_channel_points))
print('GT number of nodes with which we closed a channel:', len(closed_channel_nodes))

First block with ground truth data: 646559
Last block with ground truth data: 647433
GT channels opened and closed: 81
GT number of nodes with which we closed a channel: 73


In [7]:
chpoint_n1_n2 = dict()
for r in channels_df.values:
    channel_point, node1, node2 = r
    if channel_point in gt_outgoing_channel_points:
        chpoint_n1_n2[channel_point] = [node1, node2]

In [8]:
gt_entity_node = dict()
for cp, ns in chpoint_n1_n2.items():
    hsh, out_index = cp.split(':')
    funding_address = funding_txs[hsh]['vin'][0]['prevout']['scriptpubkey_address']
    funding_entity = funding_address_entity[funding_address]
    gt_entity_node = add_node_to_entity(local_node, funding_entity, gt_entity_node)

received_coins_nodes = set()
for closed_channel in gt_outgoing_channels:
    funding_hsh, out_index = closed_channel['channel_point'].split(':')
    funded_address = funding_txs[funding_hsh]['vout'][int(out_index)]['scriptpubkey_address']
    stx = funded_address_settlement_txs[funded_address][0]
    if not stx['txid'] == closed_channel['closing_tx_hash']:
        print(stx['txid'])
    else:
        # if there are two outputs, the first is remote and the second is local
        if len(stx['vout']) == 2:
            received_coins_nodes.add(closed_channel['remote_pubkey'])
            remote_settlement_entity = settlement_address_entity[stx['vout'][0]['scriptpubkey_address']]
            local_settlement_entity = settlement_address_entity[stx['vout'][1]['scriptpubkey_address']]
            gt_entity_node = add_node_to_entity(local_node, local_settlement_entity, gt_entity_node)
            gt_entity_node = add_node_to_entity(closed_channel['remote_pubkey'], remote_settlement_entity, gt_entity_node)
        # if there is one output, it is local
        elif len(stx['vout']) == 1:
            local_settlement_entity = settlement_address_entity[stx['vout'][0]['scriptpubkey_address']]
            gt_entity_node = add_node_to_entity(local_node, local_settlement_entity, gt_entity_node)        

print('GT number of nodes that received coins from us:', len(received_coins_nodes))

GT number of nodes that received coins from us: 52


### Incoming channels

In [9]:
incoming_channels_df = pd.read_csv(incoming_channels_file)

In [10]:
available_funding_txs = set([el.split(':')[0] for el in incoming_channels_df.chan_point.values]).intersection(set(funding_txs.keys()))

In [82]:
# external_node_key is linked to the funding entity
nodes_opened_channels_to_us = set()
for r in incoming_channels_df.values:
    chan_point, remote_node, remote_alias = r
    hsh, out_index = chan_point.split(':')
    if hsh in funding_txs:
        nodes_opened_channels_to_us.add(remote_node)
        funding_entity = funding_address_entity[funding_txs[hsh]['vin'][0]['prevout']['scriptpubkey_address']]
        gt_entity_node = add_node_to_entity(remote_node, funding_entity, gt_entity_node)

print('Number of nodes that opened channels to us:', len(nodes_opened_channels_to_us))

Number of nodes that opened channels to us: 3


### Node-Entity links
Here we create the gt linking between node and entity using the results of the cells above.

In [12]:
gt_node_entity = dict()
for e, ns in gt_entity_node.items():
    for n in ns:
        gt_node_entity = add_node_to_entity(e, n, gt_node_entity) # don't be fooled by the name ;)

In [16]:
# write_json(gt_node_entity, gt_node_entity_file)

In [17]:
# gt_node_entity = read_json(gt_node_entity_file)

## 3.2 Compare with Ground Truth
For each node in the ground truth, compare its `gt_entities` and its `linked_entities`. If there is at least one entity in common in the two sets, the node-entity link is valid. We then extend this also by looking at neighboring entities (entities that directly receive or send coins to a specific entity).

In [13]:
heuristic_2_node_entity = read_json(heuristics_files[2]['all'][1])

In [87]:
validated_nodes = set()
for n in gt_node_entity:
    if n in heuristic_2_node_entity:
        gt_entities = list(set(gt_node_entity[n]))
        linked_entities = list(set(heuristic_2_node_entity[n]))
        gt_entities.sort()
        linked_entities.sort()
        len_intersection = len(set(linked_entities).intersection(set(gt_entities)))
        if len_intersection:
            validated_nodes.add(n)

print('Number of directly validated nodes:', len(validated_nodes))


Number of directly validated nodes: 7


### Find indirect connection between gt entity and linked entity for unvalidated nodes

In [20]:
entity_nbrs = dict()

In [99]:
entity_nbrs = read_json(entity_nbrs_file) # if available

In [88]:
# **WARNING** GraphSense token is needed to run this cell, unless you have entity_nbrs_file
gt_entity_hop_nbrs = dict() # key: gt_entity, value: dict of key: hop, value: neighbors
h = 1
for n, es in gt_node_entity.items():
    if n not in validated_nodes:
        for e in es:
            if e not in gt_entity_hop_nbrs:
                gt_entity_hop_nbrs[e] = dict()
                gt_entity_hop_nbrs[e][h] = set()
            # get neighbors at hop 1
            if not gt_entity_hop_nbrs[e][h] and e > 0:
                if e not in entity_nbrs:
                    entity_nbrs[e] = get_entity_neighbors(e)
                gt_entity_hop_nbrs[e][h] = gt_entity_hop_nbrs[e][h].union(entity_nbrs[e])

indirectly_validated_nodes = set()
for n, es in gt_node_entity.items():
    if n not in validated_nodes:
        for e in es:
            nbrs = gt_entity_hop_nbrs[e][h]
            nbrs_linked_entities_intersection = nbrs.intersection(heuristic_2_node_entity[n])
            if nbrs_linked_entities_intersection:
                print(e, len(nbrs_linked_entities_intersection), n)
                indirectly_validated_nodes.add(n)
                validated_nodes.add(n)

701710153 1 03fce165537aea120bffe8505876b44d5119354f825b3eac329b761fc5636bf334
702110475 1 0311cad0edf4ac67298805cf4407d94358ca60cd44f2e360856f3b1c088bcd4782


In [89]:
print('Total number of validated nodes:', len(validated_nodes))

Total number of validated nodes: 9


In [99]:
write_json(entity_nbrs, entity_nbrs_file, values_to_list=True)

In [22]:
# manually checked, spent before last_block = 647529
settlement_txs_where_remote_spends_coins =[
    'd6d658c4a13c8f2d2927a71e1cdd5ef310d7d9adb9f96774018276a3590c3788',
    'be051cfb727c10c28c3975d8d32a0c29bb244b4f21bd14e2eab584219f496b27',
    '9fa69e68dc5ce6525b3edeb7dba8f2d954adea3d82ca340a55b7157418d384c1',
    '101c492db10266eb1c7cd63e00bcfbb9f60860a0badff6c7573b673f523b45f2',
    'f65aef03e5a93d5acebf8135cd411ccf46013a957620eab2a5a95171327f4e93',
    'e836d71d6cc8b5a79562b46890429a89ecc5e9e3be8cbc0203a1c00bd69c8d2a',
    'fe9f60f930d1a7cad6b17923c8f3f041b5e2ba308447c73d82abc048389c930a',
    '75f27715d27c6629673ddf080cf5267dc85bea40bf702fcb59377f82553b7e08',
    '2894fac92b98402a993b6b57db0877db085d796978c89495b630871e12b2427f',
    '4dd694546be280a08803b9e2eb9e15adfe0f4e4ef0f53567d9a7f183188ebcff',
    '7e25d41fd47d10287e560c3d98cebe041c0f7ae57c1fff270ad662753ec706c0',
    'f4c30c226bb4ce2c16673555768190318dd327bab9d06b51ff35a497483eff70',
    '399cfcd171e69d7a0c150772bba5202850c0186e58d9b818cfaf6a4c74f567fa',
    '05489ca075ff037934e734b893098559c10eabc953edca1cc2faa80fe1042582',
]

In [23]:
node_settlement_address_txs = dict()

In [25]:
gt_address_txs = dict()

In [25]:
gt_address_txs = read_json(gt_address_txs_file) # if available

In [27]:
# **WARNING** GraphSense token is needed to run this cell, unless you have gt_address_txs_file
# check activity of node addresses not spending our coins
nodes_that_spent_our_coins = set()
for closed_channel in gt_outgoing_channels:
    funding_hsh, out_index = closed_channel['channel_point'].split(':')
    funded_address = funding_txs[funding_hsh]['vout'][int(out_index)]['scriptpubkey_address']
    stx = funded_address_settlement_txs[funded_address][0]
    # if there are two outputs, the first is remote and the second is local
    if len(stx['vout']) == 2:
        if stx['txid'] not in settlement_txs_where_remote_spends_coins:
            if closed_channel['remote_pubkey'] not in node_settlement_address_txs:
                node_settlement_address_txs[closed_channel['remote_pubkey']] = dict()
            a = stx['vout'][0]['scriptpubkey_address']
            if a not in node_settlement_address_txs[closed_channel['remote_pubkey']]:
                if a not in gt_address_txs:
                    gt_address_txs[a] = get_address_txs(a)
                node_settlement_address_txs[closed_channel['remote_pubkey']][a] = gt_address_txs[a]
            elif not node_settlement_address_txs[closed_channel['remote_pubkey']][a]:
                if a not in address_txs:
                    gt_address_txs[a] = get_address_txs(a)
                node_settlement_address_txs[closed_channel['remote_pubkey']][a] = gt_address_txs[a]
        else:
            nodes_that_spent_our_coins.add(closed_channel['remote_pubkey'])
for node in nodes_that_spent_our_coins:
    node_settlement_address_txs.pop(node, None)

print('Number of nodes that received our coins and did not spend them:', len(node_settlement_address_txs.keys()))
print('Number of addresses that received our coins and did not spend them', sum([len(d.keys()) for d in node_settlement_address_txs.values()]))

GT number of nodes that received our coins and did not spend them: 41
GT number of addresses that received our coins and did not spend them 43


In [95]:
write_json(gt_address_txs, gt_address_txs_file)

In [37]:
node_address_no_spend = set()
# the node receives coins on addresses that have only 1 incoming tx
for node, d in node_settlement_address_txs.items():
    discard = False # discard if address is not new or spent coins
    for a, txs in d.items():
        # discard if the address has more than 1 incoming tx or spent 
        if txs['no_incoming_txs'] > 1 or txs['no_outgoing_txs']: 
            discard = True
    if not discard:
        node_address_no_spend.add(node)

print('Number of nodes that did not reuse addresses, received our coins and never spent any coins:', len(node_address_no_spend))

GT number of nodes that did not reuse addresses, received our coins and never spent any coins: 41


In [38]:
received_spent_coins_nodes = set()
for closed_channel in gt_outgoing_channels:
    funding_hsh, out_index = closed_channel['channel_point'].split(':')
    funded_address = funding_txs[funding_hsh]['vout'][int(out_index)]['scriptpubkey_address']
    stx = funded_address_settlement_txs[funded_address][0]
    # if there are two outputs, the first is remote and the second is local
    if len(stx['vout']) == 2 and stx['txid'] in settlement_txs_where_remote_spends_coins:
        received_spent_coins_nodes.add(closed_channel['remote_pubkey'])

print('Number of nodes that received and spent our coins:', len(received_spent_coins_nodes))
print('Number of nodes that received and spent our coins and are validated:', len(received_spent_coins_nodes.intersection(validated_nodes)))

GT number of nodes that received and spent our coins: 11
GT number of nodes that received and spent our coins and are validated: 7


In [40]:
received_coins_linked_nodes = received_coins_nodes.intersection(set(heuristic_2_node_entity.keys()))
print('Number of nodes that received our coins and were linked with our heuristic:', len(received_coins_linked_nodes))

GT number of nodes that received our coins and were linked with our heuristic: 52


### List info about non-validated nodes

In [50]:
nodes = pd.read_csv(nodes_csv_file)
node_alias, alias_node = df_to_dicts_set(nodes)

In [51]:
for n in received_spent_coins_nodes:
    if n not in validated_nodes:
        print(n)
        print(node_alias[n])
        print(gt_node_entity[n])
        print(len(heuristic_2_node_entity))
        print()

032d4baebebfdeab7a2ecef2fbe109cbef10de95f05aa54090fdb687789547dbf5
{'CONNECT_WITH_ME'}
{702410217}
4600

0303a518845db99994783f606e6629e705cfaf072e5ce9a4d8bf9e249de4fbd019
{'LNBIG.com [lnd-25]'}
{702145255}
4600

031ce29116eab7edd66148f5169f1fb658fad62bdc5091221ab895fe5d36db00b2
{'LNBIG.com [lnd-05]'}
{701940461}
4600

03864ef025fde8fb587d989186ce6a4a186895ee44a926bfc370e2c366597a3f8f
{'ACINQ'}
{700855070}
4600



## 3.3 Compare with each Other
Here we compare the linking results of heuristic 1 with the ones of heuristic 2.

In [80]:
heuristic_1_entity_node_dict = dict()
heuristic_2_entity_node_dict = dict()
for h in on_chain_heuristics_list:
    heuristic_1_entity_node_dict[h] = read_json(heuristics_files[1][h][0] , True)

for h in on_chain_heuristics_list:
    heuristic_2_entity_node_dict[h] = read_json(heuristics_files[2][h][0], True)

for h in on_chain_heuristics_list:
    print('On-chain heuristic used:', h)
    entities_heuristics_1_2 = set(heuristic_1_entity_node_dict[h]).intersection(set(heuristic_2_entity_node_dict[h]))
    # to see if the two heuristics say the same
    same = 0
    intersect = 0
    for e in entities_heuristics_1_2:
        s1 = set(heuristic_1_entity_node_dict[h][e])
        s2 = set(heuristic_2_entity_node_dict[h][e])
        if s1 == s2:
            same += 1
            intersect += 1
        elif s1.intersection(s2):
            intersect += 1
    print(same, 'entities out of', len(entities_heuristics_1_2),
          'entities in common are linked to the same nodes')
    print(intersect, 'entities out of', len(entities_heuristics_1_2),
          'entities in common are linked to at least on same node')


On-chain heuristic used: none
9042 entities out of 9042 entities in common are linked to the same nodes
9042 entities out of 9042 entities in common are linked to at least on same node
On-chain heuristic used: stars
9042 entities out of 9042 entities in common are linked to the same nodes
9042 entities out of 9042 entities in common are linked to at least on same node
On-chain heuristic used: snakes
9042 entities out of 9042 entities in common are linked to the same nodes
9042 entities out of 9042 entities in common are linked to at least on same node
On-chain heuristic used: collectors
9193 entities out of 9193 entities in common are linked to the same nodes
9193 entities out of 9193 entities in common are linked to at least on same node
On-chain heuristic used: proxies
10584 entities out of 10584 entities in common are linked to the same nodes
10584 entities out of 10584 entities in common are linked to at least on same node
On-chain heuristic used: all
11272 entities out of 11272 en