### Prepare Data from Raw Data
The Elliptic dataset contains 3 raw files:
1. `elliptic_txs_classes.csv`: Each row contains a single transaction with a unique `txId` and a `class` value (1 = illicit, 2 = licit, unknown = unknown)
1. `elliptic_txs_edgelist.csv`: Each row is a directed edge between two transactions, indicating that Bitcoin output by one transaction was used as an input to another.
1. `elliptic_txs_features.csv`: Each row maps 1-1 to the class list and contains more extensive features (166 total). No headers are provided.

Here, we combine the classes and features datasets into one, split the result into training and test sets, and
construct an adjacency-list representation of the edges for easier parsing.

In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

In [2]:
# Locations of the raw Elliptic download and of the prepared outputs.
# os.path.abspath resolves both against the current working directory.
base_path = os.path.abspath("../elliptic_bitcoin_dataset/")
output_path = os.path.abspath("../02_data")

# Fail fast with a specific, actionable error when the raw data is missing.
# FileNotFoundError subclasses Exception, so existing handlers still match;
# isdir (rather than exists) also rejects a stray file at that path.
if not os.path.isdir(base_path):
    raise FileNotFoundError(
        f"First download elliptic bitcoin dataset (expected at {base_path})"
    )

# exist_ok=True keeps the cell idempotent on re-runs without a separate
# (race-prone) existence check.
os.makedirs(output_path, exist_ok=True)

In [3]:
# Load the three raw CSV files from the dataset directory.
classes_csv = os.path.join(base_path, "elliptic_txs_classes.csv")
features_csv = os.path.join(base_path, "elliptic_txs_features.csv")
edgelist_csv = os.path.join(base_path, "elliptic_txs_edgelist.csv")

# Swap the terse class codes for descriptive labels.
class_labels = {"1": "illicit", "2": "licit", "unknown": "unknown"}
classes = pd.read_csv(classes_csv).set_index("txId").replace(class_labels)

# The features file has no header row. Per the paper it holds a unique
# transaction id, 94 local features (the time step being one of them, hence
# only local1..local93 below) and 72 aggregate features.
local_columns = [f"local{n}" for n in range(1, 94)]
aggregate_columns = [f"aggregate{n}" for n in range(1, 73)]
features = pd.read_csv(
    features_csv,
    names=["txId", "timeStep", *local_columns, *aggregate_columns],
    index_col="txId",
)

# header=0 together with names= replaces the file's own header row with
# the column names given here.
edges = pd.read_csv(edgelist_csv, header=0, names=["fromTxId", "toTxId"])

print("Classes Data:", classes.shape)
print("Features Data:", features.shape)
print("Edges Data:", edges.shape)

Classes Data: (203769, 1)
Features Data: (203769, 166)
Edges Data: (234355, 2)


In [4]:
# Column-wise concat aligns the class labels and the features on their
# shared txId index, giving one row per transaction.
joined = pd.concat([classes, features], axis=1)

# 75/25 train/test split, stratified on the class label, with a fixed seed
# so the split is reproducible.
train, test = train_test_split(
    joined,
    train_size=0.75,
    random_state=42,
    stratify=joined["class"],
)

# These are large tables, so writing the CSVs takes a while.
for csv_name, frame in (
    ("transactions.csv", joined),
    ("transactions_train.csv", train),
    ("transactions_test.csv", test),
):
    frame.to_csv(os.path.join(output_path, csv_name))

# The edge list carries no meaningful index, so it is left out of the file.
edges.to_csv(os.path.join(output_path, "edges.csv"), index=False)

In [5]:
def _join_ids(tx_ids):
    """Render a group of transaction ids as one comma-separated string."""
    return ",".join(str(tx_id) for tx_id in tx_ids)


# Build an adjacency list (possibly useful later for visualization):
# for each transaction, the ids it points to and the ids that point to it.
forward_links = edges.groupby("fromTxId")["toTxId"].apply(_join_ids)
back_links = edges.groupby("toTxId")["fromTxId"].apply(_join_ids)

# Outer join on the index keeps transactions that have links in only one
# direction; the missing side is written as an empty field.
adjacency = forward_links.to_frame().join(back_links, how="outer")
adjacency.to_csv(
    os.path.join(output_path, "adjacency_list.tsv"),
    sep="\t",
    header=["forward", "backward"],
    index_label="txId",
)