# Off-Chain Clustering
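In this notebook, we show how we cluster Lightning Network (LN) nodes based on two signals: similar aliases (combined with the autonomous system number, ASN, of their IP addresses) and shared IP/onion addresses.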

In [2]:
from jellyfish import levenshtein_distance, damerau_levenshtein_distance, \
    hamming_distance
from matplotlib import rcParams
import scipy
import pandas as pd
import numpy as np
from tqdm import tqdm  # progress bars
import networkx as nx
import matplotlib.pyplot as plt
from plotnine import theme_bw, theme, element_text, ggplot, geom_bar, aes, \
    geom_text, labs

# input files
from utils import nodes_csv_file, ips_csv_file, whois_csv_file, \
    get_same_asn_clusters, evaluate_single_result, evaluate_measure

# output files
from utils import alias_address_clusters_csv_file

from utils import relative_lcs, cluster, lcs_distance, \
    relative_levenshtein, relative_damerau_levenshtein, relative_hamming, \
    jaro_distance, jaro_winkler_distance, is_reserved_address

# Shared plotnine theme for the figures saved by this notebook: black-and-white
# base theme, black 12pt "cmr10" text, and slightly larger (14pt) axis titles.
theme_publication = theme_bw() + theme(
    text=element_text(family="cmr10", size=12, color="black"),
    axis_title=element_text(size=14),
)

In [3]:
# Load the node list; the `alias` column holds one or more aliases per node,
# separated by "|".
node_aliases = pd.read_csv(nodes_csv_file)
# have each pub_key / alias combination in a separate row
# (`.str.split` keeps a missing alias as NaN instead of raising on the float
# NaN that `read_csv` produces for an empty field)
node_aliases["alias"] = node_aliases["alias"].str.split("|")
node_aliases = node_aliases.explode("alias")
# filter to remove empty aliases; missing (NaN) aliases compare False and are
# dropped as well
node_aliases = node_aliases[node_aliases.alias.str.len() > 0].copy()
print("Got", len(node_aliases), "aliases")

Got 9724 aliases


In [4]:
# Load IP addresses
node_ips = pd.read_csv(ips_csv_file)
# have each pub_key / ip combination in a separate row
node_ips.ip_address = node_ips.ip_address.apply(lambda x: x.split("|"))
node_ips = node_ips.explode("ip_address")
# separate port and ip address for easier querying
node_ips['port'] = node_ips.ip_address.apply(lambda x: x.rsplit(":", 1)[1])
node_ips['ip_address'] = node_ips.ip_address.apply(lambda x: x.rsplit(":", 1)[0].strip("[]"))

# FILTER (remove addresses that don't make sense)
# https://en.wikipedia.org/wiki/Reserved_IP_addresses

reserved_ips = node_ips.ip_address.apply(is_reserved_address)
print("Removing a total of:", sum(reserved_ips), "reserved IPs")
node_ips = node_ips[~reserved_ips]

Removing a total of:  393 reserved IPs


In [7]:
# Load cached whois/RDAP results, or query every unique address once and cache
# the raw answers in `whois_csv_file`.
try:
    whois_data = pd.read_csv(whois_csv_file)
    # rows without `entities` are lookups that failed or were skipped
    whois_data = whois_data[~whois_data.entities.isna()]
except FileNotFoundError:
    # Only a missing cache file triggers the (slow) network lookups; any other
    # problem with the cache is surfaced instead of being silently re-queried.
    print("No existing whois data, querying all IP addresses now")
    from ipwhois import IPWhois  # only needed when the cache is missing

    def lookup(ip_address):
        """Return the RDAP record for `ip_address`, or `{"query": ip_address}`.

        Onion addresses are never queried. A failed lookup is reported on
        stdout and yields the same minimal placeholder record, so one bad
        address does not abort the whole run.
        """
        fail = {"query": ip_address}
        if ".onion" in ip_address:
            return fail
        try:
            return IPWhois(ip_address).lookup_rdap(depth=1)
        except Exception:
            # best effort: `Exception` (not a bare except) keeps Ctrl-C working
            print(ip_address, "couldn't be queried...")
            return fail

    whois_jsons = [lookup(ip_address) for ip_address in tqdm(node_ips.ip_address.unique())]
    whois_data = pd.DataFrame(whois_jsons)
    # cache the raw answers, including placeholders for failed lookups
    whois_data.to_csv(whois_csv_file, index=False)
    # apply the same filter as the cached path so that the first run and later
    # runs work on the same rows
    if "entities" in whois_data.columns:
        whois_data = whois_data[~whois_data.entities.isna()]

# only keep asn and ip_address
whois_data = whois_data[["asn", "query"]].rename(columns={"query": "ip_address"})

In [None]:
# Build a small, reproducible demo sample for the dendrogram:
# all "LNBIG" aliases, 10 random "Lightning" aliases and 10 entirely random ones.
lnbig_aliases = node_aliases[node_aliases.alias.str.contains("LNBIG")]
lightning_aliases = node_aliases[
    node_aliases.alias.str.contains("Lightning")].sample(10, random_state=7)
random_aliases = node_aliases.sample(10, random_state=7)
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the replacement
tmp_aliases = pd.concat([lnbig_aliases, lightning_aliases, random_aliases]) \
    .drop_duplicates().reset_index(drop=True)

# Distance measure and cut-off used for the demo clustering.
# (alternative measure tried here: jellyfish.levenshtein_distance)
distance_measure = relative_lcs
# NOTE(review): an earlier, overwritten setting used 1 - 0.7, i.e. the common
# substring had to cover 70% of the longer string; 0.46 is the value in effect.
max_distance_threshold = 0.46

clusters, Z = cluster(tmp_aliases, distance_measure, max_distance_threshold)


# use the same font as the plotnine figures
rcParams['font.family'] = 'cmr10'

# plot dendrogram (only makes sense for smaller data...)
fig, ax = plt.subplots(figsize=(9, 16))
dn = scipy.cluster.hierarchy.dendrogram(
    Z,
    orientation="right",
    labels=clusters.alias.values,
    color_threshold=max_distance_threshold,
    ax=ax,
)
# mark the cut-off distance used for clustering
ax.axvline(x=max_distance_threshold, color='r', ls="--")
ax.xaxis.set_ticks_position("top")
# `papertype` is a PostScript-backend option; it has no meaning for PDF output
# and current Matplotlib rejects unsupported savefig keywords, so it is dropped.
fig.savefig("alias_dendrogram_example.pdf", bbox_inches="tight")

# Combine the demo alias clusters with the address/whois data.
# NOTE(review): judging by its name, the helper keeps cluster members that
# share an ASN -- confirm the exact rule in utils.get_same_asn_clusters.
same_asn_clusters = get_same_asn_clusters(clusters, node_ips, whois_data)
# one row per (cluster, node) with that node's distinct aliases joined by "|"
(
    same_asn_clusters
    .groupby(["cluster", "pub_key"])["alias"]
    .agg(lambda aliases: '|'.join(set(aliases)))
    .reset_index()
)

evaluate_single_result(clusters)
# Evaluate every string distance measure over its own range of thresholds on
# the full alias set and stack the per-measure result tables.
tmp_aliases = node_aliases.copy()
# (measure, thresholds) pairs: relative measures are scanned over [0, 1) in
# steps of 0.01, absolute edit distances over 0..9, lcs_distance over 1/1..1/14.
measure_grid = [
    (relative_lcs, np.arange(0, 1, 0.01)),
    (lcs_distance, 1/np.arange(1, 15, 1)),
    (levenshtein_distance, np.arange(0, 10, 1)),
    (relative_levenshtein, np.arange(0, 1, 0.01)),
    (damerau_levenshtein_distance, np.arange(0, 10, 1)),
    (relative_damerau_levenshtein, np.arange(0, 1, 0.01)),
    (hamming_distance, np.arange(0, 10, 1)),
    (relative_hamming, np.arange(0, 1, 0.01)),
    (jaro_distance, np.arange(0, 1, 0.01)),
    (jaro_winkler_distance, np.arange(0, 1, 0.01)),
]
results = pd.concat([
    evaluate_measure(tmp_aliases, measure, thresholds)
    for measure, thresholds in measure_grid
])

# configurations with exactly two clusters in the `cluster_count_min26` column,
# largest node coverage first
has_two_min26_clusters = results.cluster_count_min26 == 2
results[has_two_min26_clusters].sort_values("node_count", ascending=False).dropna()

# plot method comparison
# Step 1: per measure, keep the rows that reach that measure's maximum
# `cluster_count_min26`. The string "max" replaces the builtin `max`, whose use
# as a groupby callable is deprecated in pandas.
is_max_min26 = results.groupby("measure")["cluster_count_min26"].transform(
    "max") == results["cluster_count_min26"]
bestMeasure_for_plot = results[is_max_min26].reset_index(drop=True)
# Step 2: among those rows, keep the one with the highest node count per measure
# (the index was just reset, so idxmax labels select the rows directly).
bestMeasure_for_plot = bestMeasure_for_plot.loc[
    bestMeasure_for_plot.groupby("measure")["node_count"].idxmax()]

# order the bars by node count; an ordered categorical fixes the x-axis order
bestMeasure_for_plot = bestMeasure_for_plot.sort_values("node_count", ascending=False)
measure_list = bestMeasure_for_plot['measure'].values.tolist()
measure_cat = pd.Categorical(bestMeasure_for_plot['measure'], categories=measure_list)
bestMeasure_for_plot = bestMeasure_for_plot.assign(measure_cat=measure_cat)
# the label is placed at half the bar height, i.e. inside the bar
bestMeasure_for_plot["label_pos"] = bestMeasure_for_plot["node_count"]/2
bestMeasure_for_plot["label_text"] = bestMeasure_for_plot.apply(
    lambda x: str(x["cluster_count"]) + " clusters at threshold "+str(round(x['threshold'], 2)), axis=1)

plot = (
    ggplot(bestMeasure_for_plot)
    + geom_bar(aes(x="measure_cat", y="node_count"), stat="identity", colour="black", fill="white")
    + geom_text(aes(x="measure_cat", y="label_pos", label="label_text"), angle=90, colour="black")
    + labs(x="String distance measure", y="Total clustered lightning nodes")
    + theme_publication
    + theme(axis_text_x=element_text(rotation=45, vjust=1, hjust=1))
)

plot.save("alias_clustering_white.pdf", width=5, height=5)


# print summary of best method:
# per measure, keep the rows that reach its maximum `cluster_count_min26`
# ("max" as a string: passing the builtin `max` to transform is deprecated) ...
best_candidates = results[results.groupby("measure")["cluster_count_min26"].transform(
    "max") == results["cluster_count_min26"]].reset_index(drop=True)
# ... and of those take the first row with the overall highest node count
best = best_candidates[
    best_candidates["node_count"] == best_candidates["node_count"].max()].iloc[0]
print(best)

print("Running final alias clustering with best configuration")
# `best.measure` holds the measure name with spaces; turning the spaces back
# into underscores gives the function name, which is resolved among the
# functions imported at the top of the notebook.
best_measure_function = globals()[best.measure.replace(" ", "_")]
final_alias_cluster, _ = cluster(node_aliases, best_measure_function, best.threshold)
# combine with address/whois data and keep one row per (node, cluster)
alias_asn_clusters = get_same_asn_clusters(final_alias_cluster, node_ips, whois_data)[
    ["pub_key", "cluster"]].drop_duplicates()
alias_asn_clusters["cluster_origin"] = "alias/asn"

print("Clustering based on same IP/onion address")
same_ip_nodes = node_ips.drop(columns=['port']) \
    .groupby('ip_address') \
    .filter(lambda x: x['pub_key'].nunique() > 1) \
    .sort_values('ip_address')
same_ip_nodes = same_ip_nodes.rename(columns={"ip_address": "cluster"})
same_ip_nodes["cluster_origin"] = "address"
same_ip_nodes.head()

print("Merging both clusterings")
combined = pd.concat([alias_asn_clusters, same_ip_nodes])
G = nx.from_pandas_edgelist(combined, source="pub_key", target="cluster", create_using=nx.DiGraph)
l = list(nx.weakly_connected_components(G))
L = [dict.fromkeys(y, x) for x, y in enumerate(l)]
d = {k: v for d in L for k, v in d.items()}
mapping = pd.DataFrame(list(d.items()), columns=['pub_key', 'newcluster'])
mapping = mapping.merge(combined, how="left", on="pub_key").drop(columns="cluster").dropna()
mapping = mapping.groupby(["newcluster", "pub_key"])["cluster_origin"].agg(
    lambda x: ' & '.join(set(x))).reset_index()
mapping = mapping.merge(node_aliases, how="left")

final_clusters = mapping.groupby(["newcluster", "pub_key", "cluster_origin"])[
    "alias"].agg(lambda x: ' | '.join(set(x.dropna()))).reset_index()
final_clusters = final_clusters.rename(columns={"newcluster": "cluster"})
final_clusters.to_csv(alias_address_clusters_csv_file, index=False)

# Summary of the merged clustering.
# number of clustered nodes per origin ("address", "alias/asn" or both)
cluster_type_distribution = final_clusters.groupby("cluster_origin")["pub_key"].count()
# the previous label ("Best clustering algorithm") did not describe this table
print("Clustered nodes per cluster origin:\n", cluster_type_distribution)
print("Total nodes clustered: ", final_clusters["pub_key"].count())
print("Total cluster count:", final_clusters.cluster.nunique())
print("Largest clusters:")
# per cluster: number of nodes and the aliases of all its nodes, largest first
final_clusters.groupby("cluster").agg(
    {"pub_key": len, "alias": lambda x: "{%s}" % ', '.join(x)}
).sort_values("pub_key", ascending=False).head()