In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import roc_auc_score
import scipy.sparse as sp
import os
from tqdm import tqdm
from scipy.sparse import csc_matrix
from scipy.sparse import save_npz, load_npz, coo_matrix

In [2]:
classes_csv = 'elliptic_txs_classes.csv'
edgelist_csv = 'elliptic_txs_edgelist.csv'
features_csv = 'elliptic_txs_features.csv'
data_dir = 'elliptic_bitcoin_dataset'
classes = pd.read_csv(os.path.join(data_dir, classes_csv), index_col = 'txId') # labels for the transactions i.e. 'unknown', '1', '2'
edgelist = pd.read_csv(os.path.join(data_dir, edgelist_csv), index_col = 'txId1') # directed edges between transactions
features = pd.read_csv(os.path.join(data_dir, features_csv), header = None, index_col = 0) # features of the transactions


In [3]:
num_features = features.shape[1]
num_tx = features.shape[0]
total_tx = list(classes.index)

# select only the transactions which are labelled
labelled_classes = classes[classes['class'] != 'unknown']
labelled_tx = list(labelled_classes.index)

# to calculate a list of adjacency matrices for the different timesteps

adj_mats = []
features_labelled_ts = []
classes_ts = []

In [6]:
edgelist

Unnamed: 0_level_0,txId2
txId1,Unnamed: 1_level_1
230425980,5530458
232022460,232438397
230460314,230459870
230333930,230595899
232013274,232029206
...,...
158365409,157930723
188708874,188708879
157659064,157659046
87414554,106877725


In [43]:
num_tx_labeled = len(labelled_tx)
set_labeled_tx = set(labelled_tx)

In [44]:
num_tx_labeled

46564

In [45]:
row = []
col = []

In [46]:
for i in tqdm(range(len(edgelist))):
#     print(i)
    node_id = edgelist.iloc[i]['txId2']
    if edgelist.index[i] not in set_labeled_tx or node_id not in set_labeled_tx:
        continue
    index = labelled_tx.index(node_id)
    first = labelled_tx.index(edgelist.index[i])
    row.append(first)
    col.append(index)

100%|█████████████████████████████████| 234355/234355 [02:48<00:00, 1394.59it/s]


In [51]:
data = np.ones(len(row))
A = csc_matrix((data, (row, col)), shape=(num_tx_labeled, num_tx_labeled))
# len(row)

In [52]:
save_npz("train_adj_mat.npz", A)
print('saved sparse train adj mat')

features_l_ts = features.loc[labelled_tx]
np.save('train_features.npy', features_l_ts.values) # save
print('saved features for train')
#     new_num_arr = np.load('data.npy') # load

classes_cur = classes.loc[labelled_tx]
np.save('train_classes.npy', classes_cur.values.astype(int).flatten())
print('saved classes for train')

saved sparse train adj mat
saved features for train
saved classes for train


In [53]:
features_l_ts

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,157,158,159,160,161,162,163,164,165,166
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
232438397,1,0.163054,1.963790,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,-0.115831,...,-0.577099,-0.613614,0.241128,0.241406,1.072793,0.085530,-0.131155,0.677799,-0.120613,-0.119792
232029206,1,-0.005027,0.578941,-0.091383,4.380281,-0.063725,4.667146,0.851305,-0.163645,-0.144554,...,-0.577099,-0.613614,0.241128,0.241406,0.604120,0.008632,-0.131155,0.333211,-0.120613,-0.119792
232344069,1,-0.147852,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.137933,-0.144108,...,-0.577099,-0.613614,0.241128,0.241406,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
27553029,1,-0.151357,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.141519,-0.147643,...,-0.539735,-0.582077,-0.979074,-0.978556,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
3881097,1,-0.172306,-0.184668,-1.201369,0.028105,-0.043875,-0.029140,0.242712,-0.163640,-0.169115,...,-0.577099,-0.600999,0.241128,0.241406,0.018279,-0.068266,-0.084674,-0.054450,-1.760926,-1.760984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80329479,49,-0.159293,-0.037276,1.018602,-0.121970,0.035526,-0.113002,-0.061584,-0.149635,-0.155646,...,1.793987,1.408971,0.231244,-0.388216,-0.098889,1.931078,3.168259,3.707301,-1.390548,-1.214035
158406298,49,-0.172962,-0.126566,1.018602,-0.121970,-0.063725,-0.113002,-0.061584,-0.163622,-0.169437,...,-0.577099,0.647874,0.241128,0.241406,10.914916,1.700384,-0.131155,7.914145,-0.120613,-0.119792
158375075,49,-0.170412,-0.078164,1.018602,0.028105,-0.043875,0.054722,-0.061584,-0.163631,-0.167106,...,1.709623,1.606604,1.461330,1.461369,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
147478192,49,-0.093732,-0.116160,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,-0.082559,-0.089510,...,-0.577099,-0.613614,0.241128,0.241406,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792


In [None]:
## Single training graph (not split by ts)

features_ts = features[features[1] <= 1]
# features_ts = features[:]
tx_ts = list(features_ts.index)

labelled_tx_ts = [tx for tx in tx_ts if tx in set(labelled_tx)]

# adjacency matrix for all the transactions
# we will only fill in the transactions of this timestep which have labels and can be used for training
adj_mat = pd.DataFrame(np.zeros((num_tx, num_tx)), index = total_tx, columns = total_tx)

edgelist_labelled_ts = edgelist.loc[edgelist.index.intersection(labelled_tx_ts).unique()]
for i in range(edgelist_labelled_ts.shape[0]):
    adj_mat.loc[edgelist_labelled_ts.index[i], edgelist_labelled_ts.iloc[i]['txId2']] = 1

adj_mat_ts = adj_mat.loc[labelled_tx_ts, labelled_tx_ts]
A = csc_matrix(adj_mat_ts.values)
save_npz("train_adj_mat.npz", A)
print('saved sparse train adj mat')

features_l_ts = features.loc[labelled_tx_ts]
np.save('train_features.npy', features_l_ts.values) # save
print('saved features for train')
#     new_num_arr = np.load('data.npy') # load

classes_cur = classes.loc[labelled_tx_ts]
np.save('train_classes.npy', classes_cur.values.astype(int).flatten())
print('saved classes for train')

In [None]:
adj_mat_ts

In [None]:
for ts in range(34, 49):
    features_ts = features[features[1] == ts+1]
    tx_ts = list(features_ts.index)

    labelled_tx_ts = [tx for tx in tx_ts if tx in set(labelled_tx)]

    # adjacency matrix for all the transactions
    # we will only fill in the transactions of this timestep which have labels and can be used for training
    adj_mat = pd.DataFrame(np.zeros((num_tx, num_tx)), index = total_tx, columns = total_tx)

    edgelist_labelled_ts = edgelist.loc[edgelist.index.intersection(labelled_tx_ts).unique()]
    for i in range(edgelist_labelled_ts.shape[0]):
        adj_mat.loc[edgelist_labelled_ts.index[i], edgelist_labelled_ts.iloc[i]['txId2']] = 1

    adj_mat_ts = adj_mat.loc[labelled_tx_ts, labelled_tx_ts]
    A = csc_matrix(adj_mat_ts.values)
    save_npz("{}_adj_mat.npz".format(ts+1), A)
    print('saved sparse adj mat {}'.format(ts+1))
    
    features_l_ts = features.loc[labelled_tx_ts]
    np.save('{}_features.npy'.format(ts+1), features_l_ts) # save
    print('saved features for ts {}'.format(ts+1))
#     new_num_arr = np.load('data.npy') # load
    
    classes_cur = classes.loc[labelled_tx_ts]
    np.save('{}_classes.npy'.format(ts+1), classes_cur.values.astype(int).flatten())
    print('saved classes for ts {}'.format(ts+1))

In [None]:
from scipy.sparse import csc_matrix
from scipy.sparse import save_npz, load_npz, coo_matrix
A = csc_matrix(adj_mat_ts.values)
save_npz("yourmatrix.npz", A)
# your_matrix_back = sparse.load_npz("yourmatrix.npz")

In [None]:
features_l_ts

In [None]:
classes_cur

In [None]:
classes

In [None]:
len(classes)

In [None]:
csc_matrix((data, (row, col)), shape=(3, 3)).toarray()

In [55]:
your_matrix_back = load_npz("train_adj_mat.npz")
print(your_matrix_back.toarray().shape)

(46564, 46564)


In [56]:
features = np.load('train_features.npy') # load
features.shape

(46564, 166)

In [57]:
classes = np.load('train_classes.npy')
classes.shape

(46564,)

In [61]:
# your_matrix_back = load_npz("train/homo/1_adj_mat.npz")
your_matrix_back = load_npz("train_adj_mat.npz")
cx = coo_matrix(your_matrix_back)

features = np.load('train_features.npy') # load

row = []
col = []

# relation: sim < 25

for start,end,val in zip(cx.row, cx.col, cx.data):
    # iterate over nonzero elems of adj matrix
    row_start = features[start]
    row_end = features[end]
    num = np.count_nonzero(row_start[1:94] == row_end[1:94])
#     if num <= 25:
#     if num > 25 and num <= 40:
    if num > 40:
        row.append(start)
        col.append(end)

data = np.ones(len(row))
print(len(data))
geq_40_relation = csc_matrix((data, (row, col)), shape=cx.shape)
save_npz("geq_40_relation_adjmat.npz", geq_40_relation)

9784


In [None]:
len(row)

In [None]:
# def load_data(data_dir, start_ts, end_ts):
#     classes_csv = 'elliptic_txs_classes.csv'
#     edgelist_csv = 'elliptic_txs_edgelist.csv'
#     features_csv = 'elliptic_txs_features.csv'

#     classes = pd.read_csv(os.path.join(data_dir, classes_csv), index_col = 'txId') # labels for the transactions i.e. 'unknown', '1', '2'
#     edgelist = pd.read_csv(os.path.join(data_dir, edgelist_csv), index_col = 'txId1') # directed edges between transactions
#     features = pd.read_csv(os.path.join(data_dir, features_csv), header = None, index_col = 0) # features of the transactions
    
#     num_features = features.shape[1]
#     num_tx = features.shape[0]  
#     total_tx = list(classes.index)

#     # select only the transactions which are labelled
#     labelled_classes = classes[classes['class'] != 'unknown']
#     labelled_tx = list(labelled_classes.index)

#     # to calculate a list of adjacency matrices for the different timesteps

#     adj_mats = []
#     features_labelled_ts = []
#     classes_ts = []
#     num_ts = 49 # number of timestamps from the paper

#     for ts in range(start_ts, end_ts):
#         features_ts = features[features[1] == ts+1]
#         tx_ts = list(features_ts.index)
        
#         labelled_tx_ts = [tx for tx in tx_ts if tx in set(labelled_tx)]
        
#         # adjacency matrix for all the transactions
#         # we will only fill in the transactions of this timestep which have labels and can be used for training
#         adj_mat = pd.DataFrame(np.zeros((num_tx, num_tx)), index = total_tx, columns = total_tx)
        
#         edgelist_labelled_ts = edgelist.loc[edgelist.index.intersection(labelled_tx_ts).unique()]
#         for i in range(edgelist_labelled_ts.shape[0]):
#             adj_mat.loc[edgelist_labelled_ts.index[i], edgelist_labelled_ts.iloc[i]['txId2']] = 1
        
#         adj_mat_ts = adj_mat.loc[labelled_tx_ts, labelled_tx_ts]
#         features_l_ts = features.loc[labelled_tx_ts]
        
#         adj_mats.append(adj_mat_ts)
#         features_labelled_ts.append(features_l_ts)
#         classes_ts.append(classes.loc[labelled_tx_ts])

#     return adj_mats, features_labelled_ts, classes_ts

In [None]:
# load_data('elliptic_bitcoin_dataset', 0, 30)

In [None]:
df = pd.read_csv(os.path.join('elliptic_bitcoin_dataset', 'elliptic_txs_features.csv'), header = None, index_col = 0) 

In [None]:
df

In [None]:
row = df.iloc[0].iloc[1:94]
row2 = df.iloc[1].iloc[1:94]

In [None]:
np.count_nonzero(row == row3)

In [None]:
sims = []
for i in tqdm(range(len(df))):
    p = np.random.uniform()
    if p > 0.01:
        continue
    row = df.iloc[i].iloc[1:94]
    for j in range(i, len(df)):
        p = np.random.uniform()
        if p > 0.01:
            continue
        row2 = df.iloc[j].iloc[1:94]
        num_sim = np.count_nonzero(row == row2)
        sims.append(num_sim)

In [None]:
import matplotlib.pyplot as plt
plt.hist(sims)

In [None]:
np.quantile(sims, 0.33), np.quantile(sims, 0.66)

In [None]:
# use < 25, 25<= .. <= 40, > 40 as relation types?