In [1]:
import time

import numpy as np
import pandas as pd
import pyarrow.dataset as ds
from tabulate import tabulate

START_SLOT = 370656000  # Start of epoch 858
END_SLOT = 377135999  # End of epoch 872
TX_TYPES = ["frontRun", "backRun", "victim", "transfer"]
ATTACKER_TX_TYPES = ["frontRun", "backRun", "transfer"]
EPS_WIN = 1e-5

In [2]:
def print_rows(df, idx, show_addr=False):
    df_print = df.iloc[idx]
    if not show_addr:
        df_print = df_print.drop(columns=["signer_addresses", "signer_addresses_str"])
    print(tabulate(df_print, headers="keys", tablefmt="psql"))


def print_df(df, n=5, show_addr=False, columns=None):
    df_print = df.head(n)

    if columns is not None:
        df_print = df_print[columns]

    if not show_addr:
        if (
            "signer_addresses" in df_print.columns.tolist()
            and "signer_addresses_str" in df_print.columns.tolist()
        ):
            df_print = df_print.drop(
                columns=["signer_addresses", "signer_addresses_str"]
            )

    print(tabulate(df_print, headers="keys", tablefmt="psql", showindex=True))

In [3]:
TX_PATH = "data/parquet_out"
SANDWICH_STAT_PATH = "data/sandwich_stat.csv"

# 1. load sandwich transaction data
start_time = time.time()

sandwiches_txs = ds.dataset(TX_PATH, format="parquet")
sandwiches_txs = sandwiches_txs.to_table().to_pandas()

end_time = time.time()
elapsed = end_time - start_time
print(f"Query took {elapsed:.2f} seconds")

# 2. load sandwich statistics
sandwich_stat = pd.read_csv(SANDWICH_STAT_PATH)

# 3. print basic information
print(
    f"Block {sandwiches_txs['tx_slot'].min()} - Block {sandwiches_txs['tx_slot'].max()}"
)
print(f" - Number of transactions: {len(sandwiches_txs)}")
print(f" - Number of sandwiches: {sandwiches_txs['sandwichId'].nunique()}")
print(
    f" - Number of victims: {sandwiches_txs[sandwiches_txs['type'] == 'victim']['signer'].nunique()}"
)
print(
    f" - Number of unique victim transactions: {sandwiches_txs[sandwiches_txs['type'] == 'victim']['signature'].nunique()}"
)
print(
    f" - Number of victim transactions: {len(sandwiches_txs[sandwiches_txs['type'] == 'victim']['signature'])}"
)

Query took 16.31 seconds
Block 370656011 - Block 377135999
 - Number of transactions: 19228747
 - Number of sandwiches: 2771401
 - Number of victims: 1104148
 - Number of unique victim transactions: 5344203
 - Number of victim transactions: 13679119


In [4]:
attacker_summary = pd.read_csv("data/attacker_summary.csv")

In [5]:
INBUNDLE_RATIO_THRES = 0.8
HIGH_WIN_RATE_THRES = 0.9
HIGH_WIN_RATE_VOLUME_THRES = 100

In [6]:
print(f"\nNumber of attacker {len(attacker_summary)}")
print(
    f"\nAverage sandwich count of attackers {attacker_summary['sandwich_count'].mean()}"
)

print(
    f"\nNumber of attacker with #sandwich > 100: {len(attacker_summary[attacker_summary['sandwich_count'] > 100])}"
)
print(
    f"Number of attacker with win rate > 0.9: {len(attacker_summary[attacker_summary['win_rate'] > 0.9])}"
)

high_win_rate_attacker = attacker_summary[
    (attacker_summary["win_rate"] >= HIGH_WIN_RATE_THRES)
    & (attacker_summary["sandwich_count"] > HIGH_WIN_RATE_VOLUME_THRES)
]
print(f"Number of high win rate attacker {len(high_win_rate_attacker)}")
print(
    f"Total profit of high win rate attacker {high_win_rate_attacker['total_profit_in_usd'].sum()}"
)
print(
    f"Share of profit of high win rate attacker {high_win_rate_attacker['total_profit_in_usd'].sum() / attacker_summary['total_profit_in_usd'].sum()}"
)
print_df(high_win_rate_attacker.sort_values(by="total_profit_in_usd", ascending=False))


Number of attacker 382240

Average sandwich count of attackers 7.250421201339472

Number of attacker with #sandwich > 100: 2298
Number of attacker with win rate > 0.9: 35834
Number of high win rate attacker 95
Total profit of high win rate attacker 0.0
Share of profit of high win rate attacker nan
+------+--------------------+------------------+--------------------+-----------------------+----------------------+------------------+---------------------+-------------+------------+---------------------+----------------------+------------------------+-------------------------+--------------------+---------------------+------------------+-------------------+------------------+-------------------+----------------------------+-----------------------------+---------------------+----------------------+------------------------+----------------+
|      | attacker_key       |   sandwich_count |   total_profit_SOL |   total_profit_in_usd |   sol_sandwich_count |   avg_profit_SOL |   avg_profit_in_

  f"Share of profit of high win rate attacker {high_win_rate_attacker['total_profit_in_usd'].sum() / attacker_summary['total_profit_in_usd'].sum()}"


In [7]:
# Calculate jito_rate
jito_related_attacker = high_win_rate_attacker[
    high_win_rate_attacker["inbundle_count"] > 0
].copy()
print(f"Number of high win rate jito-related attackers: {len(jito_related_attacker)}")

jito_related_attacker["jito_rate"] = (
    jito_related_attacker["inbundle_count"] / jito_related_attacker["sandwich_count"]
)

print(jito_related_attacker["jito_rate"].describe())

# Assume jito_related_attacker already contains the jito_rate column
bins = np.linspace(0, 1, 11)  # Divide the range 0â€“1 into 10 equal bins
labels = [f"{int(l * 100)}%-{int(r * 100)}%" for l, r in zip(bins[:-1], bins[1:])]

# Bin jito_rate into intervals
jito_related_attacker["jito_bin"] = pd.cut(
    jito_related_attacker["jito_rate"], bins=bins, labels=labels, include_lowest=True
)

# Count number of attackers in each bin
bin_stats = (
    jito_related_attacker["jito_bin"]
    .value_counts()
    .sort_index()
    .reset_index()
    .rename(columns={"index": "jito_rate_bin", "jito_bin": "num_attackers"})
)

print(bin_stats)

Number of high win rate jito-related attackers: 67
count    67.000000
mean      0.109213
std       0.270150
min       0.001980
25%       0.010374
50%       0.015748
75%       0.030383
max       0.994083
Name: jito_rate, dtype: float64
  num_attackers  count
0        0%-10%     59
1       10%-20%      1
2       20%-30%      0
3       30%-40%      0
4       40%-50%      0
5       50%-60%      2
6       60%-70%      0
7       70%-80%      0
8       80%-90%      0
9      90%-100%      5


In [8]:
# Jito-attacker
jito_attacker = attacker_summary[
    attacker_summary["inbundle_count"] / attacker_summary["sandwich_count"]
    > INBUNDLE_RATIO_THRES
]

print(f"\nNumber of jito attacker: {len(jito_attacker)}")
print(f"Average win rate of jito attacker: {jito_attacker['win_rate'].mean()}")
print(f"Average profit of jito attacker: {jito_attacker['total_profit_in_usd'].mean()}")

jito_attacker_high_win_rate = jito_attacker[
    (jito_attacker["win_rate"] >= HIGH_WIN_RATE_THRES)
    & (jito_attacker["sandwich_count"] > HIGH_WIN_RATE_VOLUME_THRES)
]

print(f"\nNumber of high win rate jito attacker: {len(jito_attacker_high_win_rate)}")
print(
    f"\nTotal profit of high win rate jito attacker: {jito_attacker_high_win_rate['total_profit_in_usd'].sum()}"
)
print_df(
    jito_attacker_high_win_rate.sort_values(by="total_profit_in_usd", ascending=False),
    show_addr=True,
    n=len(jito_attacker_high_win_rate),
)


Number of jito attacker: 293142
Average win rate of jito attacker: 0.03862798438565569
Average profit of jito attacker: 0.0

Number of high win rate jito attacker: 5

Total profit of high win rate jito attacker: 0.0
+-----+--------------------+------------------+--------------------+-----------------------+----------------------+------------------+---------------------+-------------+------------+---------------------+----------------------+------------------------+-------------------------+--------------------+---------------------+------------------+-------------------+------------------+-------------------+----------------------------+-----------------------------+---------------------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [9]:
# Non-jito attacker
non_jito_attacker = attacker_summary[
    attacker_summary["inbundle_count"] / attacker_summary["sandwich_count"]
    <= INBUNDLE_RATIO_THRES
]

print(f"\nNumber of non-jito attacker: {len(non_jito_attacker)}")
print(f"Average win rate of non-jito attacker: {non_jito_attacker['win_rate'].mean()}")
print(
    f"Average profit of non-jito attacker: {non_jito_attacker['total_profit_in_usd'].mean()}"
)

non_jito_attacker_high_win_rate = non_jito_attacker[
    (non_jito_attacker["win_rate"] >= HIGH_WIN_RATE_THRES)
    & (non_jito_attacker["sandwich_count"] > HIGH_WIN_RATE_VOLUME_THRES)
]

print(
    f"\nNumber of high win rate jito attacker: {len(non_jito_attacker_high_win_rate)}"
)
print(
    f"\nTotal profit if high win rate non-jito attacker: {non_jito_attacker_high_win_rate['total_profit_in_usd'].sum()}"
)
print_df(
    non_jito_attacker_high_win_rate.sort_values(
        by="total_profit_in_usd", ascending=False
    ),
    show_addr=False,
)


Number of non-jito attacker: 89098
Average win rate of non-jito attacker: 0.40223336266496007
Average profit of non-jito attacker: 0.0

Number of high win rate jito attacker: 90

Total profit if high win rate non-jito attacker: 0.0
+------+--------------------+------------------+--------------------+-----------------------+----------------------+------------------+---------------------+-------------+------------+---------------------+----------------------+------------------------+-------------------------+--------------------+---------------------+------------------+-------------------+------------------+-------------------+----------------------------+-----------------------------+---------------------+----------------------+------------------------+----------------+
|      | attacker_key       |   sandwich_count |   total_profit_SOL |   total_profit_in_usd |   sol_sandwich_count |   avg_profit_SOL |   avg_profit_in_usd |   win_count |   win_rate |   signer_diff_count |   signer_dif

In [10]:
TYPE_RATIO_THRES = 0.8

high_win_rate_consec = non_jito_attacker_high_win_rate[
    (
        non_jito_attacker_high_win_rate["inblock_consec_count"]
        / non_jito_attacker_high_win_rate["sandwich_count"]
        > TYPE_RATIO_THRES
    )
]
print_df(high_win_rate_consec, show_addr=True)
high_win_rate_corss_block = non_jito_attacker_high_win_rate[
    (
        non_jito_attacker_high_win_rate["cross_block_count"]
        / non_jito_attacker_high_win_rate["sandwich_count"]
        > TYPE_RATIO_THRES
    )
]

high_win_rate_inblock_nonconsec = non_jito_attacker_high_win_rate[
    (
        non_jito_attacker_high_win_rate["inblock_non_consec_count"]
        / non_jito_attacker_high_win_rate["sandwich_count"]
        > TYPE_RATIO_THRES
    )
]

high_win_rate_non_consec_both = non_jito_attacker_high_win_rate[
    (
        non_jito_attacker_high_win_rate["cross_block_count"]
        / non_jito_attacker_high_win_rate["sandwich_count"]
        > 0.4
    )
    & (
        non_jito_attacker_high_win_rate["inblock_non_consec_count"]
        / non_jito_attacker_high_win_rate["sandwich_count"]
        > 0.4
    )
]

high_win_rate_bundle = non_jito_attacker_high_win_rate[
    (
        non_jito_attacker_high_win_rate["inbundle_count"]
        / non_jito_attacker_high_win_rate["sandwich_count"]
        > TYPE_RATIO_THRES
    )
]

print(
    f"\nConsec. sandwich attacker: {len(high_win_rate_consec)} ({len(high_win_rate_consec) / len(non_jito_attacker_high_win_rate) * 100:.3f}%)"
)
print(
    f"Consec. sandwich attacker profit: {high_win_rate_consec['inblock_consec_profit'].sum()} / {non_jito_attacker_high_win_rate['total_profit_in_usd'].sum()}"
)

print(
    f"\nCross-Block. sandwich attacker: {len(high_win_rate_corss_block)} ({len(high_win_rate_corss_block) / len(non_jito_attacker_high_win_rate) * 100:.3f}%)"
)
print(
    f"Cross-Block. sandwich attacker profit: {high_win_rate_corss_block['cross_block_profit'].sum()} / {non_jito_attacker_high_win_rate['total_profit_in_usd'].sum()}"
)

print(
    f"\nIn-Block Non-Consec. sandwich attacker: {len(high_win_rate_inblock_nonconsec)} ({len(high_win_rate_inblock_nonconsec) / len(non_jito_attacker_high_win_rate) * 100:.3f}%)"
)
print(
    f"In-Block Non-Consec. sandwich attacker profit: {high_win_rate_inblock_nonconsec['inblock_non_consec_profit'].sum()} / {non_jito_attacker_high_win_rate['total_profit_in_usd'].sum()}"
)

print(
    f"\nCross-Block & InBlock Non-Consec. sandwich attacker: {len(high_win_rate_non_consec_both)} ({len(high_win_rate_non_consec_both) / len(non_jito_attacker_high_win_rate) * 100:.3f}%)"
)
print(
    f"Cross-Block & InBlock Non-Consec. sandwich attacker profit: {high_win_rate_non_consec_both['total_profit_in_usd'].sum()} / {non_jito_attacker_high_win_rate['total_profit_in_usd'].sum()}"
)
high_win_rate_non_consec_both["cross_block_ratio"] = (
    high_win_rate_non_consec_both["cross_block_count"]
    / high_win_rate_non_consec_both["non_consec_count"]
)
print_df(
    high_win_rate_non_consec_both,
    show_addr=True,
    n=len(high_win_rate_non_consec_both),
    columns=[
        "attacker_key",
        "inblock_consec_count",
        "non_consec_count",
        "cross_block_count",
        "inblock_non_consec_count",
        "cross_block_ratio",
    ],
)

+----+--------------------+------------------+--------------------+-----------------------+----------------------+------------------+---------------------+-------------+------------+---------------------+----------------------+------------------------+-------------------------+--------------------+---------------------+------------------+-------------------+------------------+-------------------+----------------------------+-----------------------------+---------------------+----------------------+--------------------------------------------------+------------------------+----------------------------------------------+----------------+
|    | attacker_key       |   sandwich_count |   total_profit_SOL |   total_profit_in_usd |   sol_sandwich_count |   avg_profit_SOL |   avg_profit_in_usd |   win_count |   win_rate |   signer_diff_count |   signer_diff_profit |   inblock_consec_count |   inblock_consec_profit |   non_consec_count |   non_consec_profit |   inbundle_count |   inbundle_prof

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high_win_rate_non_consec_both["cross_block_ratio"] = (


In [11]:
import ast

LEADER_DOM_THRES = 0.8

# slot->leader
slot_leader = pd.read_csv("data/slot_leader.csv")
slot_leader = slot_leader[
    (slot_leader["slot"] >= START_SLOT) & (slot_leader["slot"] <= END_SLOT)
]
slot_leader_map = dict(zip(slot_leader["slot"], slot_leader["leader"]))
print(f"Get {len(slot_leader)} slots")

# signer -> attacker_key
non_jito_attacker_high_win_rate["signer_addresses"] = non_jito_attacker_high_win_rate[
    "signer_addresses"
].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
signer_to_attacker = (
    non_jito_attacker_high_win_rate[["attacker_key", "signer_addresses"]]
    .explode("signer_addresses")
    .rename(columns={"signer_addresses": "signer"})
)

# unique sandwiches txs (by sandwichId + signer)
filtered_txs = sandwiches_txs[sandwiches_txs["type"].isin(ATTACKER_TX_TYPES)]
filtered_txs = filtered_txs.drop_duplicates(subset=["sandwichId"])

# map attacker key info, reserve sandwiches of high win rate attacker
merged = filtered_txs.merge(signer_to_attacker, on="signer", how="inner")

# merge info (attacker_key, leader) to sandwich_stat
sandwich_attacker_map = dict(zip(merged["sandwichId"], merged["attacker_key"]))
sandwich_stat_high_win_rate = sandwich_stat.merge(
    merged[["sandwichId", "attacker_key"]], on="sandwichId", how="inner"
)
sandwich_stat_high_win_rate["leader"] = (
    sandwich_stat_high_win_rate["slot"].map(slot_leader_map).fillna("UNKNOWN")
)
print(
    sandwich_stat_high_win_rate[sandwich_stat_high_win_rate["leader"] == "UNKNOWN"][
        "slot"
    ].value_counts()
)

# count volume and profit of each attacker under each leader
leader_stats = (
    sandwich_stat_high_win_rate.groupby(["attacker_key", "leader"])
    .agg(
        total_sandwiches=("sandwichId", "count"),
        win_sandwiches=("profitA", lambda x: (x > 0).sum()),
        total_profit=("profit_in_usd", "sum"),
    )
    .reset_index()
)
leader_stats["win_rate"] = (
    leader_stats["win_sandwiches"] / leader_stats["total_sandwiches"]
)
# print_df(leader_stats)

# count related-leader of each attacker
leader_count = (
    leader_stats.groupby("attacker_key")["leader"]
    .nunique()
    .reset_index(name="leader_count")
)

# get leader list of each attacker
leader_list = (
    leader_stats.sort_values(
        ["attacker_key", "total_sandwiches"], ascending=[True, False]
    )
    .groupby("attacker_key")["leader"]
    .apply(list)
    .reset_index(name="leader_list")
)

high_win_rate_attacker_summary = leader_count.merge(leader_list, on="attacker_key")

leader_detail = (
    leader_stats.sort_values(
        ["attacker_key", "total_sandwiches"], ascending=[True, False]
    )
    .groupby("attacker_key")
    .apply(
        lambda x: x[["leader", "total_sandwiches", "win_rate", "total_profit"]].to_dict(
            "records"
        )
    )
    .reset_index()
)
leader_detail = leader_detail.rename(columns={0: "leader_detail"})

high_win_rate_attacker_summary = high_win_rate_attacker_summary.merge(
    leader_detail, on="attacker_key"
)

# count sandwiches of each attacker
total_sandwiches_per_attacker = (
    sandwich_stat_high_win_rate.groupby("attacker_key")["sandwichId"].count().to_dict()
)


# Define a function to check whether a dominant leader exists
def check_exist_dominate(leader_detail, attacker_key):
    total = total_sandwiches_per_attacker[attacker_key]
    # Check if any leader contributes more than 20% of total sandwiches
    for ld in leader_detail:
        if ld["total_sandwiches"] / total > 0.2:
            return True
    return False


high_win_rate_attacker_summary["exist_dominate"] = high_win_rate_attacker_summary.apply(
    lambda row: check_exist_dominate(row["leader_detail"], row["attacker_key"]), axis=1
)

high_win_rate_attacker_summary = high_win_rate_attacker_summary.merge(
    non_jito_attacker_high_win_rate[
        ["attacker_key", "sandwich_count", "total_profit_in_usd", "win_rate"]
    ],
    on="attacker_key",
    how="left",
)

# sort by leader_count
high_win_rate_attacker_summary = high_win_rate_attacker_summary.sort_values(
    by="total_profit_in_usd", ascending=False
).reset_index(drop=True)
high_win_rate_attacker_summary = high_win_rate_attacker_summary[
    [
        "attacker_key",
        "leader_count",
        "exist_dominate",
        "sandwich_count",
        "total_profit_in_usd",
        "win_rate",
        "leader_detail",
        "leader_list",
    ]
]

print("\nInfo of high win rate attacker with least leader count:")
print_df(high_win_rate_attacker_summary, 20)

print("\nInfo of attacker with dominate leader")
print_df(
    high_win_rate_attacker_summary[
        high_win_rate_attacker_summary["exist_dominate"] == True
    ]
)

FileNotFoundError: [Errno 2] No such file or directory: 'data/slot_leader.csv'

In [None]:
# leader info
leader_list = sandwich_stat_high_win_rate["leader"].unique().tolist()
print(
    f"Total number of leader related to high win rate attacker: {len(leader_list)} (Total leader: {slot_leader['leader'].nunique()})"
)

# calculate:
# high_win_rate_attacker_count, high_win_rate_attacker_sandwich_count,
# average_high_win_rate_attacker_in_all_slot, average_sandwich_count_in_all_slot,
# average_profit_in_all_slot, average_win_rate_in_all_slot

leader_slot_stats = (
    sandwich_stat_high_win_rate.groupby(["leader", "slot"])
    .agg(
        high_win_rate_attacker_count=(
            "attacker_key",
            pd.Series.nunique,
        ),  # attacker number in slot
        high_win_rate_attacker_sandwich_count=(
            "sandwichId",
            "count",
        ),  # sandwich count in slot
        total_profit=("profit_in_usd", "sum"),  # total profit in slot
        win_sandwich_count=(
            "profitA",
            lambda x: (x > 0).sum(),
        ),  # win sandwich count in slot
    )
    .reset_index()
)

unknown_data = leader_slot_stats[leader_slot_stats["leader"] == "UNKNOWN"]
if len(unknown_data) != 0:
    print_df(unknown_data)

leader_slot_stats["win_rate_in_slot"] = (
    leader_slot_stats["win_sandwich_count"]
    / leader_slot_stats["high_win_rate_attacker_sandwich_count"]
)

# total slot of each leader
total_slots = slot_leader.groupby("leader")["slot"].nunique().to_dict()

leader_stats_list = []
for leader, group in leader_slot_stats.groupby("leader"):
    total_slot = total_slots[leader]
    high_win_rate_attacker_count_total = group["high_win_rate_attacker_count"].sum()
    high_win_rate_attacker_sandwich_count_total = group[
        "high_win_rate_attacker_sandwich_count"
    ].sum()
    average_high_win_rate_attacker_in_all_slot = (
        high_win_rate_attacker_count_total / total_slot
    )
    average_sandwich_count_in_all_slot = (
        high_win_rate_attacker_sandwich_count_total / total_slot
    )
    average_profit_in_all_slot = group["total_profit"].sum() / total_slot
    average_win_rate_of_sandwiches = (
        group["win_rate_in_slot"].sum() / high_win_rate_attacker_sandwich_count_total
    )

    leader_stats_list.append(
        {
            "leader": leader,
            "high_win_rate_attacker_freq": high_win_rate_attacker_count_total,
            "high_win_rate_attacker_sandwich_count": high_win_rate_attacker_sandwich_count_total,
            "average_high_win_rate_attacker_freq_in_all_slot": average_high_win_rate_attacker_in_all_slot,
            "average_sandwich_count_in_all_slot": average_sandwich_count_in_all_slot,
            "average_profit_in_all_slot": average_profit_in_all_slot,
            "average_win_rate_of_sandwiches": average_win_rate_of_sandwiches,
            "total_slot_count": total_slot,
            "slot_count_with_high_win_rate_sandwich": (
                group["high_win_rate_attacker_sandwich_count"] > 0
            ).sum(),
            "slot_with_attacker_ratio": (
                group["high_win_rate_attacker_sandwich_count"] > 0
            ).sum()
            / total_slot,
        }
    )

leader_stats = pd.DataFrame(leader_stats_list)

leader_stats.sort_values(
    by="average_sandwich_count_in_all_slot", ascending=False, inplace=True
)
print_df(leader_stats, n=10)