In [None]:
import pandas as pd
from tqdm import tqdm
import gc
import time
from typing import Callable, Optional
import pyarrow.dataset as ds, pandas as pd
import matplotlib.pyplot as plt
import os
from collections import defaultdict
import pandas.api.types as ptypes
import numpy as np
import seaborn as sns
import matplotlib.dates as mdates
from matplotlib.ticker import ScalarFormatter
from tabulate import tabulate

# Slot window analysed by this notebook. Both values are passed to the
# slot-leader query below, which uses them as inclusive BETWEEN bounds.
START_SLOT = 370656000  # Start of epoch 858
END_SLOT = 377135999  # End of epoch 872
# Values of the `type` column in the sandwich transaction table
# ('victim' and 'transfer' are filtered on later in this notebook).
TX_TYPES = ["frontRun", "backRun", "victim", "transfer"]
# Same list without the victim role, i.e. the attacker-side transaction types.
ATTACKER_TX_TYPES = ["frontRun", "backRun", "transfer"]
# NOTE(review): presumably a numeric tolerance for win/loss classification;
# it is not referenced in the visible part of the notebook -- confirm.
EPS_WIN = 1e-5

# Lower-case aliases used as arguments when querying the slot leaders.
start = START_SLOT
end = END_SLOT

In [None]:
def print_rows(df, idx, show_addr=False):
    """Pretty-print selected rows of ``df`` as a psql-style table.

    Args:
        df: DataFrame to print from.
        idx: Positional selector handed to ``df.iloc`` (list, slice, array).
            A single integer is also accepted and printed as a one-row table.
        show_addr: When False (default), the ``signer_addresses`` and
            ``signer_addresses_str`` columns are hidden if they are present.
    """
    # A bare integer would make ``iloc`` return a Series, which has no columns
    # to drop and does not tabulate as a row; keep the selection a DataFrame.
    if isinstance(idx, (int, np.integer)):
        idx = [idx]
    df_print = df.iloc[idx]
    if not show_addr:
        # errors="ignore": frames that lack the address columns still print
        # instead of raising KeyError.
        df_print = df_print.drop(
            columns=["signer_addresses", "signer_addresses_str"],
            errors="ignore",
        )
    print(tabulate(df_print, headers='keys', tablefmt='psql'))

def print_df(df, top_n=5):
    """Print the first ``top_n`` rows of ``df`` as a 'pretty' table without the index."""
    preview = df.head(top_n)
    rendered = tabulate(preview, headers='keys', tablefmt='pretty', showindex=False)
    print(rendered)

In [None]:
TX_PATH = "data/parquet_out"
SANDWICH_STAT_PATH = "data/sandwich_stat.csv"

# 1. Read the whole parquet dataset of sandwich transactions into pandas,
#    timing the read.
start_time = time.time()
sandwiches_txs = ds.dataset(TX_PATH, format="parquet").to_table().to_pandas()
end_time = time.time()

elapsed = end_time - start_time
print(f"Query took {elapsed:.2f} seconds")

# 2. Read the per-sandwich statistics table.
sandwich_stat = pd.read_csv(SANDWICH_STAT_PATH)

# 3. Headline numbers. The victim rows are selected once and reused.
victim_rows = sandwiches_txs[sandwiches_txs['type'] == 'victim']
first_slot = sandwiches_txs['tx_slot'].min()
last_slot = sandwiches_txs['tx_slot'].max()

print(f"Block {first_slot} - Block {last_slot}")
print(f" - Number of transactions: {len(sandwiches_txs)}")
print(f" - Number of sandwiches: {sandwiches_txs['sandwichId'].nunique()}")
print(f" - Number of victims: {victim_rows['signer'].nunique()}")
print(f" - Number of unique victim transactions: {victim_rows['signature'].nunique()}")
print(f" - Number of victim transactions: {len(victim_rows['signature'])}")

In [None]:
# Rows of the transaction table that are victim transactions.
victims = sandwiches_txs.loc[sandwiches_txs['type'] == 'victim']

# For every victim transaction signature: the number of distinct sandwiches
# it appears in, ordered from most- to least-attacked.
victim_attack_counts = (
    victims.groupby('signature')['sandwichId']
    .nunique()
    .reset_index(name='num_sandwiches')
    .sort_values(by='num_sandwiches', ascending=False)
)

# Mean number of sandwiches per victimized transaction.
avg_sandwiches_per_victim = victim_attack_counts['num_sandwiches'].mean()
print(f"Average sandwiches per victimized tx: {avg_sandwiches_per_victim:.3f}")

# For every sandwich: the number of distinct victim transactions it contains,
# ordered from largest to smallest.
victim_counts = (
    victims.groupby('sandwichId')['signature']
    .nunique()
    .reset_index(name='num_victims')
    .sort_values(by='num_victims', ascending=False)
)

# Mean number of victims per sandwich.
avg_victims_per_sandwich = victim_counts['num_victims'].mean()
print(f"Average victims per sandwich: {avg_victims_per_sandwich:.3f}")

In [None]:
# A sandwich is listed as "perfect" when at least one of its transaction rows
# has |relativeDiffB| strictly below 1e-6.
near_zero_diff = sandwiches_txs['relativeDiffB'].abs() < 1e-6
perfect_sandwiches = sandwiches_txs.loc[near_zero_diff, 'sandwichId'].unique().tolist()
perfect_share_pct = len(perfect_sandwiches) / len(sandwich_stat) * 100
print(f"Number of perfect sandwiches: {len(perfect_sandwiches)} ({perfect_share_pct:.2f}%)")

In [None]:
# Human-readable labels for the token addresses seen most often in `tokenA`.
# Addresses that are not listed keep their raw address as the label.
token_address_map = {
    'SOL': 'SOL',
    'EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v': 'USDC',
    'Es9vMFrzaCERmJfrF4H2FYD4KCoNkY11McCe8BenwNYB': 'USDT',
    'USD1ttGY1N17NEEHLmELoaybftRBUSErhqYiQzvEmuB': 'USD1',
    '7zoFNf4p3PuFMjL38TPZ4MsGBzofXQfoYPy7aWdBUG1S': 'DS Ai',
    'JUPyiwrYJFskUPiHa7hkeR8VUtAeFoSYbKedZNsDvCN': 'JUP',
    '7NvJpPcxaNaUqxjLSkx7gHHo9zkmDTA3YTvwB5hw7gpe': 'GROKAI',
    'AjkYmQPU1vg9oWQ2oJV3cxKJD3i6C8Xt73yyzLXYZ6FE': '$GROKAI',
}
mapped_labels = sandwich_stat['tokenA'].map(token_address_map)
sandwich_stat['tokenA_label'] = mapped_labels.fillna(sandwich_stat['tokenA'])

# Per token label: number of sandwiches and summed USD profit.
token_stats = sandwich_stat.groupby('tokenA_label', as_index=False).agg(
    sandwich_count=('sandwichId', 'count'),
    total_profit=('profit_in_usd', 'sum'),
)

# Each token's share of all counted sandwiches, in percent.
total_sandwiches = token_stats['sandwich_count'].sum()
token_stats['percentage'] = token_stats['sandwich_count'].div(total_sandwiches).mul(100)

# Five most frequently sandwiched tokens.
top_5_tokens = token_stats.sort_values('sandwich_count', ascending=False).iloc[:5]
print_df(top_5_tokens)

In [None]:
# Single "distance" column: the in-block distance for every sandwich that is
# not cross-block, and the cross-block distance otherwise.
sandwich_stat["distance"] = np.where(
    sandwich_stat["distance_type"] != "cross_block",
    sandwich_stat["inblock_distance"],
    sandwich_stat["crossblock_distance"],
)

# Atomic = consecutive in-block; everything else is non-atomic, which is then
# split into its in-block and cross-block parts.
atomic_sandwiches = sandwich_stat[sandwich_stat['distance_type'] == 'inblock_consec']
non_atomic_sandwiches = sandwich_stat[sandwich_stat['distance_type'] != 'inblock_consec']
non_atomic_inblock_sandwiches = non_atomic_sandwiches[non_atomic_sandwiches['distance_type'] == 'inblock_non_consec']
non_atomic_crossblock_sandwiches = non_atomic_sandwiches[non_atomic_sandwiches['distance_type'] == 'cross_block']


def _print_group_summary(title, group, population_size):
    """Print count, share, total/average profit and mean distance of ``group``.

    The average profit is the group's profit sum divided by the group's OWN
    size. The previous code divided the in-block and cross-block sums by the
    size of the whole non-atomic set, which understated both averages.

    Args:
        title: Heading printed before the figures.
        group: Subset of ``sandwich_stat`` to summarise.
        population_size: Number of sandwiches the share is measured against.
    """
    count = len(group)
    profit = group['profit_in_usd'].sum()
    nan = float("nan")
    share = count / population_size if population_size else nan
    average_profit = profit / count if count else nan
    # Reproduces the spacing that the original backslash-continued f-strings
    # embedded at the end of the first two lines.
    pad = " " * 9
    print(
        f"\n{title} - \n\tCount: {count}, \tShare: {share:.3f}{pad}"
        f"\n\tProfit: {profit}, \tAverage Profit: {average_profit:.3f}{pad}"
        f"\n\tAverage Distance: {group['distance'].mean()}"
    )


_print_group_summary("Atomic Sandwich", atomic_sandwiches, len(sandwich_stat))


_print_group_summary("Non-Atomic Sandwich", non_atomic_sandwiches, len(sandwich_stat))


_print_group_summary("Non-Atomic Inblock Sandwich", non_atomic_inblock_sandwiches, len(sandwich_stat))

_print_group_summary("Non-Atomic Crossblock Sandwich", non_atomic_crossblock_sandwiches, len(sandwich_stat))

In [None]:
from clickhouse_connect import get_client
from dotenv import load_dotenv

def load_env():
    """Read the ClickHouse connection settings from ``.env`` / the environment.

    Returns:
        dict with the keys ``host``, ``port`` (int), ``username`` and
        ``password``. ``host``, ``username`` and ``password`` are ``None``
        when the corresponding variable is not set.

    Raises:
        ValueError: if ``CLICKHOUSE_PORT`` is unset or is not an integer.
    """
    load_dotenv(dotenv_path=".env")

    # int(None) would raise an opaque TypeError; name the missing variable.
    raw_port = os.getenv("CLICKHOUSE_PORT")
    if raw_port is None:
        raise ValueError(
            "CLICKHOUSE_PORT is not set (expected in .env or the environment)"
        )
    try:
        port = int(raw_port)
    except ValueError as exc:
        raise ValueError(
            f"CLICKHOUSE_PORT must be an integer, got {raw_port!r}"
        ) from exc

    return {
        "host": os.getenv("CLICKHOUSE_HOST"),
        "port": port,
        "username": os.getenv("CLICKHOUSE_USERNAME"),
        "password": os.getenv("CLICKHOUSE_PASSWORD"),
    }

# Read the connection settings and open the ClickHouse client. The keys of the
# dict returned by load_env() (host, port, username, password) are exactly the
# keyword arguments passed to get_client, so the dict is unpacked directly.
config = load_env()
client = get_client(**config)

def query_slot_leader_in_DB(start_slot=0, end_slot=500000000):
    """
    Fetch the (slot, leader) rows of ``solwich.slot_leaders`` for a slot range.

    Args:
        start_slot: First slot to include (inclusive, SQL ``BETWEEN``).
        end_slot: Last slot to include (inclusive).

    Returns:
        DataFrame built from the query result rows, using the column names
        reported by the client, ordered by ascending slot.

    A warning is printed (the query still runs) when ``start_slot`` is below
    the module-level ``START_SLOT``.
    """
    starts_too_early = start_slot < START_SLOT
    if starts_too_early:
        print("Warning: start_slot is before the earliest valid slot in DB.")

    sql = f"""
    SELECT
        slot,
        leader
    FROM solwich.slot_leaders
    WHERE slot BETWEEN {start_slot} AND {end_slot}
    ORDER BY slot ASC
    """
    response = client.query(sql)
    return pd.DataFrame(response.result_rows, columns=response.column_names)

# Fetch the slot leaders for the analysis window, drop exact duplicate
# (slot, leader) pairs and cache the result as CSV.
slot_leader_df = query_slot_leader_in_DB(start, end)
slot_leader_df = slot_leader_df.drop_duplicates(subset=['slot', 'leader'])
slot_leader_df.to_csv('data/slot_leader.csv', index=False)

# Attach a timestamp to every row of the per-slot transaction counts.
# sandwiches_txs has one row per transaction, so a slot holding several
# sandwich transactions appears several times. Merging against it directly
# would repeat that slot's row once per transaction and inflate any later sum
# of its counts. Keep a single timestamp per slot before merging.
slot_timestamps = (
    sandwiches_txs[['tx_slot', 'timestamp']]
    .drop_duplicates(subset='tx_slot')
)

total_vol = pd.read_csv('data/slot_tx.csv')
total_vol = total_vol.merge(
    slot_timestamps,
    how='left',
    left_on='slot',
    right_on='tx_slot',
    validate='many_to_one',  # fail loudly if a slot ever maps to two timestamps
)

In [None]:
import matplotlib.ticker as mticker

# This function plots two layers of time-series summaries:
# The upper chart shows hourly profit in USD and hourly sandwich volume on dual y-axes.
# The lower chart shows daily user transaction volume, daily sandwich volume, and the percentage change of SOL price, also on dual y-axes.
def plot_time_summary_line(
    df: pd.DataFrame,
    sol_price_csv: str,
    interval_hours: int = 1,
    smooth_window: int = None,
    figsize=(12, 8),
    save_path=None,
    dpi: int = 600,
):
    """Plot hourly and daily sandwich time series and return both summaries.

    Args:
        df: Per-sandwich table; the columns ``timestamp``, ``profit_in_usd``
            and ``sandwichId`` are read.
        sol_price_csv: Path of a CSV that provides ``date`` and ``price``
            columns (daily SOL price).
        interval_hours: Width, in hours, of the time buckets of the upper chart.
        smooth_window: Rolling-mean window (in buckets) applied to the upper
            chart's series; ``None`` or a value <= 1 disables smoothing.
        figsize: Figure size passed to ``plt.subplots``.
        save_path: When truthy, the figure is saved to this path.
        dpi: Resolution used when saving.

    Returns:
        Tuple ``(hourly_summary, daily_summary)`` of DataFrames.

    Side effects:
        * ``df["timestamp"]`` is overwritten on the frame that was passed in
          (the assignment happens before any copy is made).
        * The module-level ``total_vol`` frame is read, and a ``date`` column
          is written to it; its ``timestamp`` column must support ``.dt``.
        * Global matplotlib style and rcParams are changed.
    """

    # Convert timestamps, remove invalid entries, and bucket into hourly intervals
    # NOTE: this assignment modifies the caller's DataFrame in place.
    df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce").dt.tz_localize(None)
    # dropna returns a new frame; the columns added below live on that local frame.
    df = df.dropna(subset=["timestamp"])
    df["time_bucket"] = df["timestamp"].dt.floor(f"{interval_hours}H")

    # Aggregate profit and sandwich volume by hour
    hourly_summary = (
        df.groupby("time_bucket", as_index=False)
          .agg(total_profit_usd=("profit_in_usd", "sum"),
               total_volume=("sandwichId", "count"))
    )
    # Optional rolling mean; min_periods=1 keeps the first buckets defined.
    if smooth_window and smooth_window > 1:
        hourly_summary["profit_smooth"] = hourly_summary["total_profit_usd"].rolling(smooth_window, min_periods=1).mean()
        hourly_summary["volume_smooth"] = hourly_summary["total_volume"].rolling(smooth_window, min_periods=1).mean()
    else:
        hourly_summary["profit_smooth"] = hourly_summary["total_profit_usd"]
        hourly_summary["volume_smooth"] = hourly_summary["total_volume"]

    # Aggregate sandwich volume by day
    df["date"] = df["timestamp"].dt.date
    daily_summary = (
        df.groupby("date", as_index=False)
          .agg(total_volume=("sandwichId", "count"))
    )

    # Load daily SOL price
    sol_price = pd.read_csv(sol_price_csv)
    sol_price["date"] = pd.to_datetime(sol_price["date"]).dt.date
    daily_summary = pd.merge(daily_summary, sol_price, on="date", how="left")

    # Load daily user transaction volume
    # NOTE: `total_vol` is a module-level frame, not a parameter; this line
    # adds a "date" column to it.
    total_vol["date"] = total_vol["timestamp"].dt.date
    daily_vol = (
        total_vol.groupby("date", as_index=False)
            .agg(total_tx_volume=("validTxCount", "sum"))
    )
    daily_summary = pd.merge(daily_summary, daily_vol, on="date", how='left')

    # Compute absolute percentage change of SOL price
    daily_summary = daily_summary.sort_values("date")
    daily_summary["price_diff"] = abs(daily_summary["price"].pct_change() * 100)
    # The first day has no previous price; its change is reported as 0.
    daily_summary["price_diff"] = daily_summary["price_diff"].fillna(0)

    # General plot settings
    plt.style.use("default")
    plt.rcParams.update({
        "font.size": 15,
        "axes.labelsize": 14,
        "axes.titlesize": 17,
        "xtick.labelsize": 12,
        "ytick.labelsize": 12,
        "axes.edgecolor": "black",
        "xtick.color": "black",
        "ytick.color": "black",
        "axes.labelcolor": "black",
    })

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=figsize, sharex=True)

    # Colors for the different metrics
    color_profit = "#1f77b4"
    color_volume = "#2ca02c"
    color_volume_d = "#ff7f0e"
    color_tx_volume = "#d62728"
    color_price = "#ffcc00"

    # Upper chart: hourly profit and hourly sandwich volume
    ax1b = ax1.twinx()
    ax1b.set_ylabel("Hourly Sandwich Volume", color="black")
    ax1b.plot(hourly_summary["time_bucket"], hourly_summary["volume_smooth"],
              color=color_volume, lw=3, label="Hourly Sandwich Volume")
    
    ax1.set_ylabel("Hourly Profit (USD)", color="black")
    ax1.plot(hourly_summary["time_bucket"], hourly_summary["profit_smooth"],
             color=color_profit, lw=3, label="Hourly Profit (USD)")

    # Lower chart: daily transaction volume, daily sandwich volume, and SOL price change
    ax2c = ax2.twinx()
    # Push the third y-axis outward so it does not overlap ax2b's right spine.
    ax2c.spines["right"].set_position(("outward", 65))
    ax2c.set_ylabel("Change of SOL Price (%)", color="black")
    ax2c.plot(daily_summary["date"], daily_summary["price_diff"],
              color=color_price, lw=3, label="SOL Price Change")
    
    ax2.set_ylabel("Daily Users Tx Volume", color="black")
    ax2.plot(daily_summary["date"], daily_summary["total_tx_volume"],
            color=color_tx_volume, lw=3, label="Users Tx Volume")

    ax2b = ax2.twinx()
    ax2b.set_ylabel("Daily Sandwich Volume", color="black")
    ax2b.plot(daily_summary["date"], daily_summary["total_volume"],
             color=color_volume_d, lw=3, label="Daily Sandwich Volume")

    # X-axis alignment across both charts
    t1 = hourly_summary["time_bucket"]
    t2 = pd.to_datetime(daily_summary["date"])
    all_times = pd.date_range(start=min(t1.min(), t2.min()), end=max(t1.max(), t2.max()), freq="D")
    min_time, max_time = all_times.min(), all_times.max()
    # Two days of empty space on each side of the data.
    padding = pd.Timedelta(days=2)
    ax1.set_xlim(min_time - padding, max_time + padding)
    ax2.set_xlim(min_time - padding, max_time + padding)

    # Use integer ticks for y-axes
    ax1.yaxis.set_major_locator(mticker.MaxNLocator(integer=True))
    ax2.yaxis.set_major_locator(mticker.MaxNLocator(integer=True))
    ax1b.yaxis.set_major_locator(mticker.MaxNLocator(integer=True))
    ax2b.yaxis.set_major_locator(mticker.MaxNLocator(integer=True))
    ax2c.yaxis.set_major_locator(mticker.MaxNLocator(integer=True))

    # Hard-coded negative lower bounds for each y-axis.
    # NOTE(review): presumably tuned to this data set so the curves do not
    # touch the bottom edge -- confirm when the data changes.
    ax1.set_ylim(bottom=-5000)
    ax1b.set_ylim(bottom=-5000)
    ax2.set_ylim(bottom=-5e7)
    ax2b.set_ylim(bottom=-25000)
    ax2c.set_ylim(bottom=-1)

    # Draw grid lines based on the left axis of each subplot
    # NOTE: x_grid_dates is not used below; the vertical lines are derived
    # from each axis' current x-limits instead.
    x_grid_dates = pd.date_range(min_time, max_time, freq="D")
    for ax, axb in [(ax1, ax1b), (ax2, ax2b)]:
        # Horizontal lines at the left axis' current tick positions.
        for y in ax.get_yticks():
            ax.axhline(y=y, color="#DDDDDD", linestyle="--", linewidth=0.7, alpha=0.6, zorder=0)

        # One vertical line per day across the visible x-range.
        l, r = ax.get_xlim()
        left_dt  = mdates.num2date(l).replace(tzinfo=None)
        right_dt = mdates.num2date(r).replace(tzinfo=None)
        for x in pd.date_range(left_dt, right_dt, freq="D"):
            ax.axvline(x=x, color="#DDDDDD", linestyle="--", linewidth=0.7, alpha=0.6, zorder=0)

        # Built-in grids are switched off; the lines above replace them.
        ax.grid(False)
        axb.grid(False)

    # Legend setup
    lines1, labels1 = ax1.get_legend_handles_labels()
    lines2, labels2 = ax1b.get_legend_handles_labels()
    lines3, labels3 = ax2.get_legend_handles_labels()
    lines4, labels4 = ax2b.get_legend_handles_labels()
    lines5, labels5 = ax2c.get_legend_handles_labels()

    upper_lines = lines1 + lines2
    upper_labels = labels1 + labels2

    lower_lines = lines3 + lines4 + lines5
    lower_labels = labels3 + labels4 + labels5

    # Two figure-level legends stacked below the plots: the upper chart's
    # entries first, the lower chart's entries beneath them.
    fig.legend(
        upper_lines,
        upper_labels,
        loc="lower center",
        ncol=2,
        bbox_to_anchor=(0.5, -0.01),
        fontsize=14,
        frameon=True,
        facecolor="white",
        edgecolor="#DDDDDD",
        handlelength=2.5,
        columnspacing=1.2,
    )

    fig.legend(
        lower_lines,
        lower_labels,
        loc="lower center",
        ncol=3,
        bbox_to_anchor=(0.5, -0.06),
        fontsize=14,
        frameon=True,
        facecolor="white",
        edgecolor="#DDDDDD",
        handlelength=2.5,
        columnspacing=1.2,
    )

    # Styling for axes
    # NOTE: ax2c is not part of this loop.
    for ax in [ax1, ax1b, ax2, ax2b]:
        for spine in ax.spines.values():
            spine.set_color("black")
        ax.tick_params(colors="black", direction="inout", length=5)

    # Date ticks every second day, shown as month-day.
    ax2.xaxis.set_major_locator(mdates.DayLocator(interval=2))
    ax2.xaxis.set_major_formatter(mdates.DateFormatter("%m-%d"))

    # Format y-axis using scientific notation when appropriate
    def format_axis(ax):
        """Apply a math-text ScalarFormatter to ``ax``'s y-axis and nudge the offset text."""
        from matplotlib.ticker import ScalarFormatter, MaxNLocator
        formatter = ScalarFormatter(useMathText=True)
        formatter.set_powerlimits((-3, 4))
        # Re-applies the integer locator that was already set above.
        ax.yaxis.set_major_locator(MaxNLocator(integer=True))
        ax.yaxis.set_major_formatter(formatter)
        ax.ticklabel_format(axis='y', style='sci', scilimits=(-3,4))

        # Restyle the exponent label and shift it slightly upward.
        offset_text = ax.yaxis.get_offset_text()
        offset_text.set_fontsize(11)
        offset_text.set_color('black')
        x, y = offset_text.get_position()
        offset_text.set_position((x, y + 0.02))

    for a in [ax1, ax1b, ax2, ax2b, ax2c]:
        format_axis(a)

    # Reserve the bottom 5% of the figure for the two legends.
    plt.tight_layout(rect=[0, 0.05, 1, 1])

    if save_path:
        plt.savefig(save_path, dpi=dpi, bbox_inches="tight")
        print(f"Saved: {save_path}")

    plt.show()

    return hourly_summary, daily_summary

# Render the time-series figure from the per-sandwich statistics: 1-hour
# buckets, smoothed with a 3-bucket rolling mean, saved as a PDF.
time_series_summaries = plot_time_summary_line(
    sandwich_stat,
    sol_price_csv="data/sol_price.csv",
    interval_hours=1,
    smooth_window=3,
    save_path="data/sandwich_line_profit_volume.pdf",
)
hourly_summary, daily_summary = time_series_summaries

In [None]:
def plot_bidirectional_bar_rotated(
    grouped, 
    df_raw, 
    title, 
    save_path,
    positive_color='#6BAED6',    
    negative_color='#F08080',    
    profit_line_color="#983232", 
    volume_line_color="#2C2C81"  
):
    """
    Draws a bidirectional bar chart rotated 90 degrees, combined with an average profit line
    and a total volume line using a secondary y-axis.
    - The x-axis represents distance bins.
    - The left y-axis shows average profit (positive and negative bars plus a profit line).
    - The right y-axis shows total volume.

    Args:
        grouped: DataFrame with the columns ``distance_bin``, ``positive``
            (mean of the non-negative profits per bin) and ``negative``
            (mean of the negative profits per bin).
        df_raw: Per-sandwich DataFrame with ``distance_bin``,
            ``profit_in_usd`` and ``sandwichId`` columns.
        title: Accepted for interface compatibility; it is not drawn.
        save_path: Output path; the figure is always saved there at 600 dpi.
        positive_color, negative_color: Bar colors for gains and losses.
        profit_line_color, volume_line_color: Colors of the two line series.

    The right axis is rescaled so that its zero sits at the same height as the
    left axis' zero. That rescaling is skipped when the left axis does not
    extend on both sides of zero: the previous code divided by a zero ratio in
    that case and passed infinite limits to ``set_ylim``.
    """

    # Prepare sorted data and extract bins
    g = grouped.sort_values('distance_bin').copy()
    bins = g['distance_bin'].astype(str).to_numpy()
    x_pos = np.arange(len(bins))

    # Positive and negative profit values
    pos = g['positive'].fillna(0).to_numpy()
    neg = g['negative'].fillna(0).to_numpy()

    # Average profit across all sandwiches within each distance bin.
    # observed=False pins the current pandas default for categorical keys;
    # bins without rows become 0 through reindex + fillna.
    avg_all = (
        df_raw.groupby('distance_bin', observed=False)['profit_in_usd']
        .mean()
        .reindex(g['distance_bin'])
        .fillna(0)
        .to_numpy()
    )

    # Total volume counted by distinct sandwichId in each bin
    total_volume = (
        df_raw.groupby('distance_bin', observed=False)['sandwichId']
        .nunique()
        .reindex(g['distance_bin'])
        .fillna(0)
        .to_numpy()
    )

    # Begin plotting
    fig, ax1 = plt.subplots(figsize=(9, 6))

    # Draw positive and negative bars on the left axis
    bar_w = 0.55
    ax1.bar(x_pos, pos, color=positive_color, width=bar_w,
            label='Average Gain', alpha=0.9)
    ax1.bar(x_pos, neg, color=negative_color, width=bar_w,
            label='Average Loss', alpha=0.9)

    # Average profit line
    ax1.plot(
        x_pos, avg_all, color=profit_line_color, marker='o',
        linewidth=1.5, label='Average Profit', alpha=0.9
    )

    # Left-axis styling
    ax1.axhline(0, color='#000000', linewidth=0.8)
    ax1.set_xticks(x_pos)
    ax1.set_xticklabels(bins, rotation=30, ha='right', fontsize=12)
    ax1.set_xlabel('Distance', fontsize=16)
    ax1.set_ylabel('Average Profit (USD)', fontsize=16)
    ax1.grid(True, axis='y', linestyle='--', alpha=0.5)
    ax1.grid(False, axis='x')
    ax1.set_facecolor('#F9FAFB')
    ax1.spines[['top', 'right']].set_visible(False)

    # Right axis for total volume
    ax2 = ax1.twinx()
    ax2.plot(
        x_pos, total_volume, color=volume_line_color, marker='s',
        linewidth=1.5, linestyle='--', label='Total Volume', alpha=0.9
    )
    ax2.axhline(0, color='black', linewidth=0.8, alpha=0.6)
    ax2.set_ylabel('Volume', fontsize=16)
    ax2.grid(False)

    # Compute axis limits for both sides so that the visual scale is balanced
    pos_max = np.nanmax(pos) if len(pos) else 0.0
    neg_min = np.nanmin(neg) if len(neg) else 0.0
    avg_max = np.nanmax(avg_all) if len(avg_all) else 0.0
    avg_min = np.nanmin(avg_all) if len(avg_all) else 0.0
    vol_max = np.nanmax(total_volume) if len(total_volume) else 0.0
    vol_min = np.nanmin(total_volume) if len(total_volume) else 0.0

    upper_left = max(pos_max, avg_max) * 1.2
    lower_left = min(neg_min, avg_min) * 1.05

    upper_right = max(vol_max, 0) * 1.2
    lower_right = min(vol_min, 0) * 1.05

    # Align the zero of the right axis with the zero of the left axis by giving
    # both axes the same above-zero / below-zero ratio. This needs a left axis
    # that spans both signs. Otherwise (no losses, or no gains) the limits
    # computed above are kept as they are.
    if upper_left > 0 and lower_left < 0:
        left_ratio = upper_left / abs(lower_left)
        if abs(upper_right) > abs(lower_right):
            lower_right = -abs(upper_right / left_ratio)
        else:
            upper_right = abs(lower_right * left_ratio)

    ax1.set_ylim(lower_left, upper_left)
    ax2.set_ylim(lower_right, upper_right)

    # Combine legends and control ordering
    lines_1, labels_1 = ax1.get_legend_handles_labels()
    lines_2, labels_2 = ax2.get_legend_handles_labels()

    handles_dict = dict(zip(labels_1 + labels_2, lines_1 + lines_2))
    order = ['Average Gain', 'Average Loss', 'Average Profit', 'Total Volume']

    ordered_handles = [handles_dict[o] for o in order if o in handles_dict]
    ordered_labels = [o for o in order if o in handles_dict]

    ax1.legend(
        ordered_handles, ordered_labels,
        frameon=False, loc='lower left', ncol=2, fontsize=14
    )
    
    # Final tick label sizes for all three axes.
    ax1.tick_params(axis='x', labelsize=14)
    ax1.tick_params(axis='y', labelsize=14)
    ax2.tick_params(axis='y', labelsize=14)

    for ax in [ax1, ax2]:
        for spine in ax.spines.values():
            spine.set_edgecolor('black')
            spine.set_linewidth(0.8)

    plt.tight_layout()
    plt.savefig(save_path, dpi=600, bbox_inches='tight')
    plt.show()

In [None]:
# Distance vs profit
# `df` is an alias of sandwich_stat, not a copy: the 'distance_bin' column
# created below is added to sandwich_stat itself.
df = sandwich_stat

inblock_non_consec_stat = sandwich_stat[sandwich_stat['distance_type']=='inblock_non_consec']
cross_block_stat = sandwich_stat[sandwich_stat['distance_type']=='cross_block']

# The cross-block lines previously reused the in-block labels, which made the
# two pairs of figures indistinguishable in the output.
print(f"\nAverage distance of all sandwiches: {df['distance'].mean()}")
print(f"Average distance of Non-consec. IB: {inblock_non_consec_stat['inblock_distance'].mean()}")
print(f"Average distance of cross-block: {cross_block_stat['crossblock_distance'].mean()}")
print(f"\nAverage profit of inblock-non-consec: {inblock_non_consec_stat['profit_in_usd'].mean()}")
print(f"Average profit of cross-block: {cross_block_stat['profit_in_usd'].mean()}")

# Short-distance comparison of consecutive vs non-consecutive sandwiches. The
# labels now state the threshold the filter actually applies (they used to say
# "1-9" while the filter kept distance <= 5).
small_distance_max = 5
is_small_distance = sandwich_stat['distance'] <= small_distance_max
small_dist_consec = sandwich_stat[(sandwich_stat['distance_type']=='inblock_consec') & is_small_distance]
small_dist_non_consec = sandwich_stat[(sandwich_stat['distance_type']!='inblock_consec') & is_small_distance]
print(f"\nAverage profit of consecutive sandwich with distance <= {small_distance_max}: {small_dist_consec['profit_in_usd'].mean()} ({len(small_dist_consec)})")
print(f"Average profit of non-consecutive sandwich with distance <= {small_distance_max}: {small_dist_non_consec['profit_in_usd'].mean()} ({len(small_dist_non_consec)})")

# Left-closed bins: [0, 10), [10, 100), ... , [10000, inf).
distance_bins = [0, 10, 100, 500, 1000, 3000, 6000, 10000, float('inf')]
distance_labels = [
    '1-9', '10–99', '100-499', '500–999', 
    '1000–2999','3000–5999', '6000–9999', '≥10000'
]

df['distance_bin'] = pd.cut(
    df['distance'],
    bins=distance_bins,
    labels=distance_labels,
    right=False
)

# Mean gain (profit >= 0) and mean loss (profit < 0) per distance bin.
# observed=False pins the current pandas default so that empty bins stay in
# the result as NaN rows.
grouped_profit = df.groupby(['distance_bin'], observed=False)['profit_in_usd'].agg(
    positive=lambda x: x[x >= 0].mean(),
    negative=lambda x: x[x < 0].mean()
).reset_index()

plot_bidirectional_bar_rotated(
    grouped_profit,
    df_raw=df,   
    title='Average Profit and Total Volume by Distance',
    save_path='data/average_profit_volume_by_distance.pdf'
)

# Remove the alias only; sandwich_stat (with 'distance_bin') stays available.
del df

In [None]:
# Evasive Strategies

# 1. Changing-signer: sandwiches whose `signerSame` flag is False.
diff_signer_sandwiches = sandwich_stat[sandwich_stat['signerSame']==False]
diff_signer_atomic = diff_signer_sandwiches[diff_signer_sandwiches['distance_type'] == 'inblock_consec']
diff_signer_non_atomic = diff_signer_sandwiches[diff_signer_sandwiches['distance_type'] != 'inblock_consec']
all_atomic_count = len(sandwich_stat[sandwich_stat['distance_type']=='inblock_consec'])
all_non_atomic_count = len(sandwich_stat[sandwich_stat['distance_type'] != 'inblock_consec'])

# NOTE(review): the atomic share below is printed as a percentage while the
# overall and non-atomic shares are printed as raw fractions -- confirm which
# convention is intended before comparing them.
print(f"\nNumber of sandwiches utilizing signer-changing: {len(diff_signer_sandwiches)} ({len(diff_signer_sandwiches) / len(sandwich_stat):.2f})")
print(f"Atomic: {len(diff_signer_atomic)} ({len(diff_signer_atomic) / all_atomic_count * 100:.2f}), Profit: {diff_signer_atomic['profit_in_usd'].sum()}")
print(f"Non-Atomic: {len(diff_signer_non_atomic)} \
        ({len(diff_signer_non_atomic) / all_non_atomic_count}), \
        Profit: {diff_signer_non_atomic['profit_in_usd'].sum()}")
print(f"Total Profit: {diff_signer_sandwiches['profit_in_usd'].sum()}")

# Sandwiches that contain at least one 'transfer' transaction.
diff_signer_sandwiches_with_transfer_list = sandwiches_txs[sandwiches_txs['type']=='transfer']['sandwichId'].unique().tolist()
diff_signer_sandwiches_with_transfer = sandwich_stat[sandwich_stat['sandwichId'].isin(diff_signer_sandwiches_with_transfer_list)]
print(f"\nNumber of sandwiches using transfer: {len(diff_signer_sandwiches_with_transfer_list)}")
print(f"Profit of transfer sandwiches: {diff_signer_sandwiches_with_transfer['profit_in_usd'].sum()}")

print(f"\nTop diff signer sandwiches")
top_diff_signer_attacks = diff_signer_sandwiches.sort_values(by='profit_in_usd', ascending=False).head(10)
print_df(top_diff_signer_attacks)

print(f"\nTop diff signer transfer sandwiches")
top_transfer_attacks = diff_signer_sandwiches_with_transfer.sort_values(by='profit_in_usd', ascending=False).head(10)
print_df(top_transfer_attacks)

# 2. Multi-front/backrunning: more than one front-run or more than one back-run.
multi_fb_sandwiches = sandwich_stat[(sandwich_stat['fr_count'] > 1) | (sandwich_stat['br_count'] > 1)]
multi_fb_atomic = multi_fb_sandwiches[multi_fb_sandwiches['distance_type'] == 'inblock_consec']
multi_fb_non_atomic = multi_fb_sandwiches[multi_fb_sandwiches['distance_type'] != 'inblock_consec']
multi_fb_cross_block = multi_fb_sandwiches[multi_fb_sandwiches['distance_type'] == 'cross_block']
multi_fb_inblock = multi_fb_sandwiches[multi_fb_sandwiches['distance_type'] == 'inblock_non_consec']

print(f"\nNumber of sandwiches utilizing multi-fb: {len(multi_fb_sandwiches)} ({len(multi_fb_sandwiches) / len(sandwich_stat):.2f})")
print(f"Atomic: {len(multi_fb_atomic)}, Profit: {multi_fb_atomic['profit_in_usd'].sum()}")
print(f"Non-Atomic: {len(multi_fb_non_atomic)}, Profit: {multi_fb_non_atomic['profit_in_usd'].sum()}")
print(f"\tCross Block: {len(multi_fb_cross_block)}, Profit: {multi_fb_cross_block['profit_in_usd'].sum()}")
print(f"\tIn-Block: {len(multi_fb_inblock)}, Profit: {multi_fb_inblock['profit_in_usd'].sum()}")
print(f"Total Profit: {multi_fb_sandwiches['profit_in_usd'].sum()}")

print(f"\nTop multi-fb sandwiches")
top_multi_fb_attacks = multi_fb_sandwiches.sort_values(by='profit_in_usd', ascending=False).head(10)
# Print the profit-sorted top 10. The unsorted frame was printed before, so
# the rows shown under this heading were not the top ones.
print_df(top_multi_fb_attacks, 10)

In [None]:
# Jito Usage
# Sandwiches whose bundle_status is 'all' are treated as Jito sandwiches.
# This cell also uses atomic_sandwiches / non_atomic_sandwiches, which are
# defined in the atomic vs non-atomic summary cell.
def _pct(part, whole):
    """Return ``part / whole`` as a percentage, or NaN when ``whole`` is 0."""
    return part / whole * 100 if whole else float("nan")


bundle_sandwich = sandwich_stat[
    (sandwich_stat['bundle_status']=='all') 
]
non_bundle_sandwich = sandwich_stat[
    (sandwich_stat['bundle_status']!='all') 
]

# bundle_sandwich is already restricted to bundle_status == 'all', so only the
# distance type needs to be tested here.
consec_bundle_sandwich = bundle_sandwich[
    bundle_sandwich['distance_type']=='inblock_consec'
]

non_consec_bundle_sandwich = bundle_sandwich[
    bundle_sandwich['distance_type']!='inblock_consec'
]

print(f"\nNumber of Non-Jito sandwiches: {len(non_bundle_sandwich)}")
print(f"\nNon-jito Max Loss: {non_bundle_sandwich['profit_in_usd'].min()}")
print(f"Non-jito Max Profit: {non_bundle_sandwich['profit_in_usd'].max()}")
print(f"Non-jito Average Profit: {non_bundle_sandwich['profit_in_usd'].mean()}")
print(f"Non-jito Median Profit: {non_bundle_sandwich['profit_in_usd'].median()}")

print(f"\nNumber of Jito sandwiches: {len(bundle_sandwich)}")
print(f"Share in all sandwiches: {len(bundle_sandwich)/len(sandwich_stat):.2f}")
print(f"Total Profit: {bundle_sandwich['profit_in_usd'].sum()}")
print(f"Average Profit: {bundle_sandwich['profit_in_usd'].mean()}")
print(f"Jito Max Loss: {bundle_sandwich['profit_in_usd'].min()}")
print(f"Jito Max Profit: {bundle_sandwich['profit_in_usd'].max()}")

print(f"\nNumber of consecutive Jito sandwiches: {len(consec_bundle_sandwich)} / {len(atomic_sandwiches)} ({len(consec_bundle_sandwich) / len(atomic_sandwiches)})")
print(f"Share in all sandwiches: {len(consec_bundle_sandwich)/len(sandwich_stat):.2f}")
print(f"Total Profit: {consec_bundle_sandwich['profit_in_usd'].sum()}")
print(f"Average Profit: {consec_bundle_sandwich['profit_in_usd'].mean()}")
print(f"Jito Max Loss: {consec_bundle_sandwich['profit_in_usd'].min()}")
print(f"Jito Max Profit: {consec_bundle_sandwich['profit_in_usd'].max()}")

print(f"\nNumber of non-consecutive Jito sandwiches: {len(non_consec_bundle_sandwich)} / {len(non_atomic_sandwiches)} ({len(non_consec_bundle_sandwich) / len(non_atomic_sandwiches)})")
print(f"Share in all sandwiches: {len(non_consec_bundle_sandwich)/len(sandwich_stat):.2f}")
print(f"Total Profit: {non_consec_bundle_sandwich['profit_in_usd'].sum()}")
print(f"Average Profit: {non_consec_bundle_sandwich['profit_in_usd'].mean()}")
# The largest loss is the minimum profit, as in every other "Max Loss" line;
# .max() reported the largest profit under this label.
print(f"Max Loss: {non_consec_bundle_sandwich['profit_in_usd'].min()}")

print(f"\nAverage victim count per Jito sandwich: {bundle_sandwich['victim_count'].mean()}")
print(f"Average victim count per Non-Jito sandwich: {non_bundle_sandwich['victim_count'].mean()}")

# Win = non-negative profitA. "Small loss" = a loss in profitA whose USD value
# is no worse than -1e-3.
win_bundle_sandwich = bundle_sandwich[bundle_sandwich['profitA']>=0]
lost_bundle_sandwich = bundle_sandwich[bundle_sandwich['profitA']<0]
print(f"\nJito Sandwich Win Rate: {_pct(len(win_bundle_sandwich), len(bundle_sandwich)):.2f}")
small_loss_bundle_sandwich = bundle_sandwich[(bundle_sandwich['profitA']<0) & (bundle_sandwich['profit_in_usd']>=-1e-3)]
print(f"Small loss bundle sandwich share: {_pct(len(small_loss_bundle_sandwich), len(bundle_sandwich)):.2f}")
print(f"Small loss bundle sandwich share in losed sandwich: {_pct(len(small_loss_bundle_sandwich), len(lost_bundle_sandwich)):.2f}")

win_consec_bundle_sandwich = consec_bundle_sandwich[consec_bundle_sandwich['profitA']>=0]
lost_consec_bundle_sandwich = consec_bundle_sandwich[consec_bundle_sandwich['profitA']<0]
print(f"\nConsec Jito Sandwich Win Rate: {_pct(len(win_consec_bundle_sandwich), len(consec_bundle_sandwich)):.2f}")
small_loss_consec_bundle_sandwich = consec_bundle_sandwich[(consec_bundle_sandwich['profitA']<0) & (consec_bundle_sandwich['profit_in_usd']>=-1e-3)]
print(f"Small loss bundle sandwich share: {_pct(len(small_loss_consec_bundle_sandwich), len(consec_bundle_sandwich)):.2f}")
print(f"Small loss bundle sandwich share in losed sandwich: {_pct(len(small_loss_consec_bundle_sandwich), len(lost_consec_bundle_sandwich)):.2f}")

In [None]:
# Consecutive in-block sandwiches whose bundle_status is 'all', plus a copy of
# the full table for comparison. Both are copies, so the 'profit_bin' column
# added below does not touch sandwich_stat.
jito_consec_mask = (
    (sandwich_stat['bundle_status'] == 'all')
    & (sandwich_stat['distance_type'] == 'inblock_consec')
)
jito_sandwich_stat = sandwich_stat.loc[jito_consec_mask].copy()

all_sandwich_stat = sandwich_stat.copy()

# Profit bin edges in USD with their LaTeX tick labels. The names
# distance_bins / distance_labels are reused here and overwrite the distance
# bins defined in the distance-vs-profit cell.
distance_bins = [-float('inf'), -10, -1, -1e-3, 0, 1e-3, 1, 10, float('inf')]
distance_labels = [
    r"$\leq -10$",
    r"$-10\sim-1$",
    r"$-1\sim -10^{-3}$",
    r"$-10^{-3}\sim 0$",
    r"$0\sim 10^{-3}$",
    r"$10^{-3}\sim 1$",
    r"$1\sim10$",
    r"$\geq 10$"
]

# Assign every sandwich to a left-closed profit bin.
for df in (jito_sandwich_stat, all_sandwich_stat):
    df['profit_bin'] = pd.cut(
        df['profit_in_usd'],
        bins=distance_bins,
        labels=distance_labels,
        right=False
    )

# Per bin: number of sandwiches and summed USD profit.
profit_bin_aggregations = dict(volume='count', total_profit='sum')

grouped_jito = (
    jito_sandwich_stat.groupby('profit_bin')['profit_in_usd']
    .agg(**profit_bin_aggregations)
    .reset_index()
)
print_df(grouped_jito)

grouped_all = (
    all_sandwich_stat.groupby('profit_bin')['profit_in_usd']
    .agg(**profit_bin_aggregations)
    .reset_index()
)
print_df(grouped_all)

# Index both summaries by the bin label text so that the series can be
# aligned to the label order, with 0 for any bin that has no row.
labels = distance_labels
grouped_jito = grouped_jito.set_index(grouped_jito['profit_bin'].astype(str))
grouped_all = grouped_all.set_index(grouped_all['profit_bin'].astype(str))


def _values_in_label_order(summary, column):
    """Return ``summary[column]`` as an array ordered like ``labels`` (missing -> 0)."""
    return summary[column].reindex(labels).fillna(0).values


jito_vol = _values_in_label_order(grouped_jito, 'volume')
all_vol = _values_in_label_order(grouped_all, 'volume')
jito_profit = _values_in_label_order(grouped_jito, 'total_profit')
all_profit = _values_in_label_order(grouped_all, 'total_profit')

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd

def plot_profit_distribution(bin_labels, volume, profit, *, bar_color, line_color,
                             marker, lower_frac, out_path):
    """Draw per-bin sandwich volume (bars) and total profit (line) on twin axes.

    Parameters
    ----------
    bin_labels : sequence of str
        Category labels used for the shared x axis.
    volume, profit : array-like
        One value per entry of ``bin_labels``.
    bar_color, line_color : str
        Colors of the volume bars and of the profit line.
    marker : str
        Marker style of the profit line.
    lower_frac : float
        Both y axes are linear and span ``[-lower_frac * max, 1.1 * max]``,
        where ``max`` is the largest volume (left axis) or the largest
        absolute profit (right axis). Using the same fraction on both axes
        puts y=0 at the same height on both.
    out_path : str
        File the figure is saved to.

    Returns
    -------
    tuple
        ``(fig, ax_vol, ax_profit)``.
    """
    volume = np.asarray(volume)
    profit = np.asarray(profit)

    fig, ax_vol = plt.subplots(figsize=(8, 6))
    ax_profit = ax_vol.twinx()

    ax_vol.bar(bin_labels, volume, color=bar_color, alpha=0.8, label="Volume")
    ax_profit.plot(bin_labels, profit, color=line_color, marker=marker, linewidth=2.5,
                   markersize=7, label="Profit (USD)")

    ax_vol.set_xlabel("Profit Range (USD)", fontsize=16)
    ax_vol.set_ylabel("Sandwich Volume", fontsize=16)
    ax_profit.set_ylabel("Total Profit (USD)", fontsize=16)

    # Linear volume axis extended below zero so the bars share a baseline
    # with the profit axis' zero line.
    vol_max = volume.max()
    ax_vol.set_ylim(bottom=-vol_max * lower_frac, top=vol_max * 1.1)

    # Profit axis scaled by the largest absolute profit, with sparse ticks.
    profit_max = np.abs(profit).max()
    ax_profit.set_ylim(-profit_max * lower_frac, profit_max * 1.1)
    ax_profit.yaxis.set_major_locator(ticker.MaxNLocator(5))
    ax_profit.axhline(0, color='gray', linewidth=1.2, linestyle='--', alpha=0.7)

    # Grid on the volume axis only (the profit-axis grid is fully transparent);
    # one combined legend for both axes.
    ax_vol.grid(axis='both', linestyle='--', alpha=0.4)
    ax_profit.grid(axis='both', linestyle='-', alpha=0)
    vol_handles, vol_names = ax_vol.get_legend_handles_labels()
    profit_handles, profit_names = ax_profit.get_legend_handles_labels()
    ax_vol.legend(vol_handles + profit_handles, vol_names + profit_names,
                  loc='upper left', fontsize=14)
    ax_vol.tick_params(axis='x', labelrotation=25, labelsize=12)
    ax_vol.tick_params(axis='y', labelsize=14)
    ax_profit.tick_params(axis='y', labelsize=14)

    for spine in ax_vol.spines.values():
        spine.set_edgecolor('black')
        spine.set_linewidth(1.5)

    plt.tight_layout()
    plt.savefig(out_path, dpi=600, bbox_inches="tight")
    plt.show()
    return fig, ax_vol, ax_profit


# First figure: the jito_* arrays (bundle_status == 'all', inblock_consec subset)
fig, ax1, ax2 = plot_profit_distribution(
    labels, jito_vol, jito_profit,
    bar_color="#87AAB9", line_color="#2F5763", marker='o',
    lower_frac=0.2, out_path="data/jito_plot.pdf",
)

# Second figure: the all_* arrays, which are computed from the unfiltered
# sandwich statistics; the output file name is kept as in the original.
fig, ax1, ax2 = plot_profit_distribution(
    labels, all_vol, all_profit,
    bar_color="#A8D5BA", line_color="#3C6F4A", marker='s',
    lower_frac=0.6, out_path="data/nonjito_plot.pdf",
)

In [None]:
import glob

# Load every parquet file in the bundle directory. glob returns paths in an
# unspecified order, so sort them to make the concatenated row order
# reproducible across runs and machines.
files = sorted(glob.glob("data/sandwich_bundle_parquet/*.parquet"))
if not files:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError("No parquet files found in data/sandwich_bundle_parquet/")
jito_fee = pd.concat([pd.read_parquet(f) for f in files], ignore_index=True)

# Scale the tip column by 1e9 (lamports -> SOL, matching the SOL axis labels
# used later). The column keeps its original "...Lamports" name.
jito_fee['totalLandedTipLamports'] /= 1e9

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.dates import DateFormatter
import numpy as np

LAMPORTS_PER_SOL = 1e9  # divisor applied to the *_fee columns (plotted as SOL later)

# Parse timestamps and add a per-day bucket. This writes to the shared
# `sandwich_stat` frame on purpose (same as before) so both columns remain
# available to other cells. `dt.floor` already returns datetimes, so no second
# to_datetime pass is needed.
sandwich_stat['timestamp'] = pd.to_datetime(sandwich_stat['timestamp'], errors='coerce')
sandwich_stat['date_bucket'] = sandwich_stat['timestamp'].dt.floor('1D')

atomic = sandwich_stat[sandwich_stat['distance_type'] == 'inblock_consec'].copy()
non_atomic = sandwich_stat[sandwich_stat['distance_type'] != 'inblock_consec'].copy()


def _add_avg_tx_fees(stat_df):
    """Add ``avg_fr_fee`` / ``avg_br_fee`` (fee per transaction, scaled by 1e9) in place.

    Rows whose transaction count is 0 get NaN. ``Series.where`` is used instead
    of ``replace(0, pd.NA)`` because the latter can upcast the column to object
    dtype, which makes the later divisions and group means operate on objects.
    """
    for side in ('fr', 'br'):
        tx_count = stat_df[f'{side}_count']
        stat_df[f'avg_{side}_fee'] = (
            stat_df[f'{side}_fee'] / tx_count.where(tx_count != 0) / LAMPORTS_PER_SOL
        )


def _daily_mean(stat_df, rename_map):
    """Mean of the ``rename_map`` key columns per ``date_bucket``, renamed to its values."""
    return (
        stat_df.groupby('date_bucket', as_index=False)[list(rename_map)]
        .mean()
        .rename(columns=rename_map)
    )


def _add_total_cost(stat_df):
    """Left-join landed tips onto ``stat_df`` and add ``total_cost``.

    ``fr_fee`` / ``br_fee`` are divided by 1e9 in the returned frame and
    ``total_cost`` is their sum plus ``totalLandedTipLamports`` (0 where no tip
    row matched). The merge keeps the existing ``timestamp`` / ``date_bucket``
    columns, so they are not recomputed.
    NOTE(review): if `jito_fee` holds several rows per sandwichId this join
    duplicates sandwiches and skews the daily means; confirm uniqueness.
    """
    merged = stat_df.merge(
        jito_fee[['sandwichId', 'totalLandedTipLamports']], on='sandwichId', how='left'
    )
    merged['totalLandedTipLamports'] = merged['totalLandedTipLamports'].fillna(0)
    merged['fr_fee'] /= LAMPORTS_PER_SOL
    merged['br_fee'] /= LAMPORTS_PER_SOL
    merged['total_cost'] = merged['fr_fee'] + merged['br_fee'] + merged['totalLandedTipLamports']
    return merged


# Average fee per front-run / back-run transaction, per day
for df in [atomic, non_atomic]:
    _add_avg_tx_fees(df)

atomic_daily = _daily_mean(
    atomic, {'avg_fr_fee': 'atomic_fr_fee', 'avg_br_fee': 'atomic_br_fee'}
)
non_atomic_daily = _daily_mean(
    non_atomic, {'avg_fr_fee': 'non_atomic_fr_fee', 'avg_br_fee': 'non_atomic_br_fee'}
)
df_fee_daily = atomic_daily.merge(non_atomic_daily, on='date_bucket', how='outer').sort_values('date_bucket')

# total cost data (tip + tx fee)
atomic = _add_total_cost(atomic)
non_atomic = _add_total_cost(non_atomic)

atomic_daily_cost = _daily_mean(atomic, {'total_cost': 'atomic_total_cost'})
non_atomic_daily_cost = _daily_mean(non_atomic, {'total_cost': 'non_atomic_total_cost'})
df_cost_daily = atomic_daily_cost.merge(non_atomic_daily_cost, on='date_bucket', how='outer').sort_values('date_bucket')

# Applies the style globally (it stays active for figures drawn afterwards).
plt.style.use('seaborn-v0_8-whitegrid')

colors = {
    'atomic_fr': '#1b4f72',
    'atomic_br': '#5dade2',
    'non_atomic_fr': '#922b21',
    'non_atomic_br': '#ec7063',
    'atomic_total': '#1b4f72',
    'non_atomic_total': '#922b21'
}

# (column, color key, legend label) for every line of each panel
fee_lines = [
    ('atomic_fr_fee', 'atomic_fr', 'Consecutive sandwich frontrun'),
    ('atomic_br_fee', 'atomic_br', 'Consecutive sandwich backrun'),
    ('non_atomic_fr_fee', 'non_atomic_fr', 'Non-consecutive sandwich frontrun'),
    ('non_atomic_br_fee', 'non_atomic_br', 'Non-consecutive sandwich backrun'),
]
cost_lines = [
    ('atomic_total_cost', 'atomic_total', 'Consecutive sandwich'),
    ('non_atomic_total_cost', 'non_atomic_total', 'Non-consecutive sandwich'),
]


def _draw_daily_panel(ax, daily_df, line_specs, ylabel, legend_fontsize):
    """Plot the given daily series on ``ax`` and apply the shared panel styling."""
    for column, color_key, legend_label in line_specs:
        ax.plot(daily_df['date_bucket'], daily_df[column], color=colors[color_key],
                label=legend_label, linewidth=2.2)
    ax.set_ylabel(ylabel, fontsize=16)
    ax.legend(fontsize=legend_fontsize, frameon=False, loc='upper right')

    ax.grid(True, linestyle='--', color='gray', alpha=0.6, linewidth=0.6)
    ax.yaxis.set_major_formatter(ticker.ScalarFormatter())
    ax.tick_params(axis='both', labelsize=14, rotation=30)
    for spine in ax.spines.values():
        spine.set_edgecolor('black')
        spine.set_linewidth(1.2)


fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10,10), sharex=True)

# Upper panel: average per-transaction fee; lower panel: average total cost
_draw_daily_panel(ax1, df_fee_daily, fee_lines, 'Average Transaction Fee (SOL)', 14)
_draw_daily_panel(ax2, df_cost_daily, cost_lines, 'Average Cost (SOL)', 16)

# Date formatting on the shared x axis; plt.xticks acts on the current axes
ax2.xaxis.set_major_formatter(DateFormatter("%Y-%m-%d"))
plt.xticks(rotation=30, fontsize=13)
plt.tight_layout(h_pad=2)

plt.savefig('data/daily_fee_cost_subplots.pdf', dpi=600)
plt.show()

In [None]:
def _print_attacker_group(group_label, group, profit_note=""):
    """Print size, total profit, sandwich count and mean profit of an attacker subset.

    ``profit_note`` is appended verbatim to the "Total profit" line.
    """
    group_profit = group['total_profit_in_usd'].sum()
    print(f"\nTotal {group_label}: {len(group)}")
    print(f"Total profit: {group_profit}{profit_note}")
    print(f"Total sandwich: {group['sandwich_count'].sum()}")
    print(f"Average Profit per Attacker: {group_profit/len(group):.2f}")


attacker_summary = pd.read_csv('data/attacker_summary.csv')
_print_attacker_group("identified attackers", attacker_summary)

# Attackers with more than 100 sandwiches
major_attacker = attacker_summary[attacker_summary['sandwich_count']>100]
_print_attacker_group("major attackers", major_attacker)

# Major attackers with win_rate >= 0.9; their profit is also shown as a
# percentage of the profit of all attackers
high_profit_attacker = major_attacker[major_attacker['win_rate']>=0.9]
high_profit_share = (
    high_profit_attacker['total_profit_in_usd'].sum()
    / attacker_summary['total_profit_in_usd'].sum()*100
)
_print_attacker_group(
    "high profit attackers",
    high_profit_attacker,
    profit_note=f" ({high_profit_share:.3f})",
)

In [None]:
# Ten high-profit attackers with the largest total profit, highest first
top_attackers = high_profit_attacker.sort_values(by='total_profit_in_usd', ascending=False)
print("\nTop attackers: ")
print_df(top_attackers, top_n=10)

In [None]:
from tqdm.notebook import tqdm
import pandas as pd
from collections import defaultdict
import ast
import matplotlib.pyplot as plt

tqdm.pandas()  # registers the pandas progress_apply helpers


def _parse_literal(value):
    """Evaluate a stringified Python literal; return non-string values unchanged."""
    return ast.literal_eval(value) if isinstance(value, str) else value


# Filter attacker transactions first and copy only the rows that are kept
# (avoids duplicating the whole transaction table and assigning to a slice).
program_view = sandwiches_txs[sandwiches_txs["type"].isin(ATTACKER_TX_TYPES)].copy()
program_view['programs'] = program_view['programs'].apply(_parse_literal)

# Build signer → attacker_key mapping table.
# The parsed lists are written back to attacker_summary; re-running is safe
# because already-parsed (non-string) values pass through unchanged.
attacker_summary['signer_addresses'] = attacker_summary['signer_addresses'].apply(_parse_literal)
signer_to_attacker = (
    attacker_summary[['attacker_key', 'signer_addresses']]
    .explode('signer_addresses')
    .rename(columns={'signer_addresses': 'signer'})
)
print_df(signer_to_attacker)

# Merge attacker info and per-sandwich profit onto every attacker transaction
program_view = program_view.merge(signer_to_attacker, on='signer', how='left')
program_view = program_view.merge(
    sandwich_stat[['sandwichId', 'profit_in_usd']],
    on='sandwichId',
    how='left'
)

# Explode program list into one row per (transaction, program)
program_view_explode = program_view.explode('programs')
program_view_explode = program_view_explode.dropna(subset=['programs'])

rows_by_program = program_view_explode.groupby('programs')

# Count unique sandwiches per program
program_to_unique_sandwich = (
    rows_by_program['sandwichId']
    .nunique()
    .reset_index(name='unique_sandwich_count')
)

# Count unique transactions per program
program_to_unique_transaction = (
    rows_by_program['signature']
    .nunique()
    .reset_index(name='unique_tx_count')
)

# Count unique attackers per program
program_to_unique_attacker = (
    rows_by_program['attacker_key']
    .nunique()
    .reset_index(name='unique_attacker_count')
)

# Sum total profit per program.
# profit_in_usd is a per-sandwich value repeated on every exploded row, so a
# sandwich whose front-run and back-run both touch a program would be counted
# several times. Keep one row per (program, sandwich) before summing.
# NOTE(review): assumes sandwich_stat has a single row per sandwichId.
program_total_profit = (
    program_view_explode
    .drop_duplicates(subset=['programs', 'sandwichId'])
    .groupby('programs')['profit_in_usd']
    .sum()
    .reset_index(name='total_profit_usd')
)

# Combine all program-level statistics
program_stats = (
    program_to_unique_sandwich
    .merge(program_to_unique_attacker, on='programs', how='outer')
    .merge(program_total_profit, on='programs', how='outer')
    .merge(program_to_unique_transaction, on='programs', how='outer')
    .sort_values(by='unique_sandwich_count', ascending=False)
)

In [None]:
# Express per-program counts as fractions of all rows in sandwich_stat and
# of all rows in attacker_summary, then list the 50 programs with most transactions.
n_sandwich_rows = len(sandwich_stat)
n_attacker_rows = len(attacker_summary)
program_stats = program_stats.assign(
    sandwich_share=program_stats['unique_sandwich_count'] / n_sandwich_rows,
    attacker_share=program_stats['unique_attacker_count'] / n_attacker_rows,
)
programs_by_tx_count = program_stats.sort_values(by='unique_tx_count', ascending=False)
print_df(programs_by_tx_count, 50)