In [16]:
# Import libraries
import json
from pathlib import Path

import duckdb
import pandas as pd
import plotly.express as px

In [17]:
# Configuration: where the parquet datasets live, relative to the notebook's
# working directory, plus the recursive globs handed to DuckDB.
PRICES_DIR = Path("..", "data", "prices")
MARKET_TOKENS_DIR = Path("..", "fetcher", "data", "market_tokens")  # under fetcher/data, not data/
GAMMA_MARKETS_DIR = Path("..", "data", "gamma_markets")

# "**" lets the glob match parquet files at any depth below each directory.
PRICES_GLOB, MARKET_TOKENS_GLOB, GAMMA_MARKETS_GLOB = (
    str(dataset_dir.joinpath("**", "*.parquet"))
    for dataset_dir in (PRICES_DIR, MARKET_TOKENS_DIR, GAMMA_MARKETS_DIR)
)

# Echo the resolved locations so a wrong working directory is obvious at a glance.
for dataset_label, dataset_dir in (
    ("Prices", PRICES_DIR),
    ("Market tokens", MARKET_TOKENS_DIR),
    ("Gamma markets", GAMMA_MARKETS_DIR),
):
    print(f"{dataset_label} directory: {dataset_dir.absolute()}")

Prices directory: c:\Users\User\Desktop\VibeCoding\PolyMarketScrapping\notebooks\..\data\prices
Market tokens directory: c:\Users\User\Desktop\VibeCoding\PolyMarketScrapping\notebooks\..\fetcher\data\market_tokens
Gamma markets directory: c:\Users\User\Desktop\VibeCoding\PolyMarketScrapping\notebooks\..\data\gamma_markets


In [18]:
# Open a throwaway in-memory DuckDB database and expose each parquet dataset
# as a SQL view, so later cells can query the files by name.
conn = duckdb.connect(":memory:")

# One statement per view; hive_partitioning=true makes DuckDB read partition
# folders as columns (presumably the source of the `dt` column - TODO confirm).
VIEW_SQL_TEMPLATE = """
    CREATE VIEW {view_name} AS 
    SELECT * FROM read_parquet('{parquet_glob}', hive_partitioning=true)
"""

view_sources = {
    "prices": PRICES_GLOB,
    "market_tokens": MARKET_TOKENS_GLOB,
    "gamma_markets": GAMMA_MARKETS_GLOB,
}

for view_name, parquet_glob in view_sources.items():
    conn.execute(VIEW_SQL_TEMPLATE.format(view_name=view_name, parquet_glob=parquet_glob))

print("Views created successfully!")

Views created successfully!


## Data Overview

In [19]:
# Show the column layout of the two views used for pricing, each under a banner.
schema_sections = [
    ("=== Prices Schema ===", "prices"),
    ("\n=== Market Tokens Schema ===", "market_tokens"),
]

for banner, view_name in schema_sections:
    print(banner)
    display(conn.execute(f"DESCRIBE {view_name}").fetchdf())

=== Prices Schema ===


Unnamed: 0,column_name,column_type,null,key,default,extra
0,timestamp,BIGINT,YES,,,
1,token_id,VARCHAR,YES,,,
2,price,DOUBLE,YES,,,
3,dt,DATE,YES,,,



=== Market Tokens Schema ===


Unnamed: 0,column_name,column_type,null,key,default,extra
0,condition_Id,VARCHAR,YES,,,
1,price,DOUBLE,YES,,,
2,token_id,VARCHAR,YES,,,
3,winner,BOOLEAN,YES,,,
4,dt,DATE,YES,,,


In [21]:
# Row count of every registered view
print("=== Record Counts ===")

counted_views = [
    ("Prices", "prices"),
    ("Market Tokens", "market_tokens"),
    ("Gamma Markets", "gamma_markets"),
]

for view_label, view_name in counted_views:
    row_count = conn.execute(f"SELECT COUNT(*) FROM {view_name}").fetchone()[0]
    print(f"{view_label}: {row_count:,}")

=== Record Counts ===
Prices: 1,861,996
Market Tokens: 354,240
Gamma Markets: 256,779


In [20]:
# Peek at the first five rows of the price and market-token views.
sample_sections = [
    ("=== Sample Prices ===", "prices"),
    ("\n=== Sample Market Tokens ===", "market_tokens"),
]

for banner, view_name in sample_sections:
    print(banner)
    display(conn.execute(f"SELECT * FROM {view_name} LIMIT 5").fetchdf())

=== Sample Prices ===


Unnamed: 0,timestamp,token_id,price,dt
0,1678932037,5639376173383048360109705185789934852249537686...,0.03,2025-12-20
1,1678935618,5639376173383048360109705185789934852249537686...,0.02,2025-12-20
2,1678939236,5639376173383048360109705185789934852249537686...,0.25,2025-12-20
3,1678942810,5639376173383048360109705185789934852249537686...,0.5,2025-12-20
4,1678946434,5639376173383048360109705185789934852249537686...,0.5,2025-12-20



=== Sample Market Tokens ===


Unnamed: 0,condition_Id,price,token_id,winner,dt
0,0xfdb0b338aefdac82810c39fa879e7930ca4b2a45efe5...,0.0,3819512056943756194045456255588549931251239094...,False,2025-12-19
1,0xfdb0b338aefdac82810c39fa879e7930ca4b2a45efe5...,1.0,4875503924034446208396352261914234081204342229...,True,2025-12-19
2,0x3aeaddfa5c56f561a169d75df4670345bd9f4547c8eb...,1.0,4197602046605014841487336786909858522525269556...,True,2025-12-19
3,0x3aeaddfa5c56f561a169d75df4670345bd9f4547c8eb...,0.0,6311994233488278760641913760686355576532388472...,False,2025-12-19
4,0x7fdfacb17deea641a039d33e560f93c5e01ab47c1d9c...,1.0,8903570439317510181328851103936744880929590707...,True,2025-12-19


## Find Arbitrage Opportunities (YES + NO < 1)

We map each token in `prices` to its market and outcome using the `clobTokenIds` column of `gamma_markets`, pair the YES and NO prices of each market at each timestamp, then keep the cases where the sum is less than 1.

In [22]:
# Build a token-to-outcome mapping from gamma_markets.
# gamma_markets stores clobTokenIds as a JSON array string; this notebook treats
# the first entry as the YES token and the second as the NO token
# (["yes_token_id", "no_token_id"]). Any entries beyond the first two are ignored.

# One row per (market, token list); empty / missing token lists are excluded in SQL.
token_mapping_query = """
SELECT DISTINCT
    conditionId,
    clobTokenIds
FROM gamma_markets
WHERE clobTokenIds IS NOT NULL 
  AND clobTokenIds != ''
  AND clobTokenIds != '[]'
"""

token_df = conn.execute(token_mapping_query).fetchdf()
print(f"Markets with token IDs: {len(token_df):,}")

# Lookup of token_id -> (condition_id, outcome)
token_to_outcome = {}
skipped_rows = 0  # rows we could not turn into a YES/NO pair

for condition_id, raw_token_ids in zip(token_df['conditionId'], token_df['clobTokenIds']):
    try:
        tokens = json.loads(raw_token_ids)
    except (TypeError, ValueError):
        # TypeError: value is not a str/bytes; ValueError covers json.JSONDecodeError.
        # Only parsing problems are skipped - anything else should surface.
        skipped_rows += 1
        continue

    # Require a real list whose first two entries are strings: a JSON scalar
    # string would otherwise be indexed character by character, and non-string
    # ids could never match the VARCHAR token_id column in `prices`.
    if (
        not isinstance(tokens, list)
        or len(tokens) < 2
        or not all(isinstance(token, str) for token in tokens[:2])
    ):
        skipped_rows += 1
        continue

    token_to_outcome[tokens[0]] = (condition_id, 'Yes')
    token_to_outcome[tokens[1]] = (condition_id, 'No')

print(f"Token mappings created: {len(token_to_outcome):,}")
if skipped_rows:
    # Surface silently dropped markets instead of hiding them.
    print(f"Rows skipped (unparseable or fewer than two token IDs): {skipped_rows:,}")

Markets with token IDs: 256,710
Token mappings created: 513,420


In [23]:
# Load every price row, then attach the market (condition_id) and outcome of its token.
prices_df = conn.execute("SELECT timestamp, token_id, price, dt FROM prices").fetchdf()
print(f"Total price entries: {len(prices_df):,}")


def map_token(token_id):
    """Return a 2-element Series ``[condition_id, outcome]`` for one token.

    Both elements are ``None`` when ``token_id`` is not a key of
    ``token_to_outcome``. Kept for ad-hoc single-token lookups; the bulk
    mapping below no longer calls it once per row.
    """
    if token_id in token_to_outcome:
        return pd.Series(token_to_outcome[token_id])
    return pd.Series([None, None])


# Split the (condition_id, outcome) tuples into two plain dicts so Series.map can
# do the lookup in bulk, instead of allocating one pd.Series per price row.
condition_by_token = {token: pair[0] for token, pair in token_to_outcome.items()}
outcome_by_token = {token: pair[1] for token, pair in token_to_outcome.items()}

prices_df['condition_id'] = prices_df['token_id'].map(condition_by_token)
prices_df['outcome'] = prices_df['token_id'].map(outcome_by_token)

# Tokens missing from the lookup come back as NaN and are dropped here.
prices_df = prices_df.dropna(subset=['condition_id', 'outcome'])
print(f"Prices with mapped tokens: {len(prices_df):,}")

Total price entries: 1,861,996
Prices with mapped tokens: 1,846,436


In [24]:
# Split the mapped prices by outcome, then line the two sides up per market and timestamp.
is_yes_row = prices_df['outcome'] == 'Yes'
is_no_row = prices_df['outcome'] == 'No'

yes_prices = (
    prices_df.loc[is_yes_row, ['timestamp', 'condition_id', 'price', 'dt']]
    .rename(columns={'price': 'yes_price'})
)
no_prices = (
    prices_df.loc[is_no_row, ['timestamp', 'condition_id', 'price']]
    .rename(columns={'price': 'no_price'})
)

# Inner join: a row survives only when both sides of the same market have a
# price at exactly the same timestamp.
paired_df = pd.merge(yes_prices, no_prices, on=['timestamp', 'condition_id'], how='inner')
paired_df = paired_df.assign(
    total_price=lambda pairs: pairs['yes_price'] + pairs['no_price'],
    arbitrage_gap=lambda pairs: 1 - pairs['total_price'],
)

print(f"Paired price entries (same timestamp & market): {len(paired_df):,}")

Paired price entries (same timestamp & market): 922,861


In [25]:
# Filter for arbitrage opportunities (YES + NO < 1)

# Both sides must carry a price above this floor to count as a usable pair
# (presumably this drops resolved or dead markets quoted at ~0 - TODO confirm).
MIN_VALID_PRICE = 0.001

# total_price is a sum of two binary floats, so a pair that "should" total
# exactly 1 can land a hair below it. Require the shortfall to exceed this
# tolerance so rounding noise is not reported as an opportunity; it is far
# below the smallest gap seen in the data.
PRICE_SUM_TOLERANCE = 1e-9

arbitrage_df = paired_df[
    (paired_df['yes_price'] > MIN_VALID_PRICE) &
    (paired_df['no_price'] > MIN_VALID_PRICE) &
    (paired_df['total_price'] < 1.0 - PRICE_SUM_TOLERANCE)
].sort_values('arbitrage_gap', ascending=False)  # widest gaps first

print(f"Found {len(arbitrage_df):,} price entries where YES + NO < 1")
arbitrage_df.head(20)

Found 505 price entries where YES + NO < 1


Unnamed: 0,timestamp,condition_id,yes_price,dt,no_price,total_price,arbitrage_gap
2564,1677103244,0xec8ac7ee6c07430720d094add42b65f9bb58eee4d50c...,0.22,2025-12-20,0.77,0.99,0.01
2565,1677106803,0xec8ac7ee6c07430720d094add42b65f9bb58eee4d50c...,0.22,2025-12-20,0.77,0.99,0.01
2566,1677110435,0xec8ac7ee6c07430720d094add42b65f9bb58eee4d50c...,0.22,2025-12-20,0.77,0.99,0.01
2567,1677114044,0xec8ac7ee6c07430720d094add42b65f9bb58eee4d50c...,0.22,2025-12-20,0.77,0.99,0.01
2568,1677117630,0xec8ac7ee6c07430720d094add42b65f9bb58eee4d50c...,0.22,2025-12-20,0.77,0.99,0.01
2569,1677121254,0xec8ac7ee6c07430720d094add42b65f9bb58eee4d50c...,0.22,2025-12-20,0.77,0.99,0.01
2570,1677124801,0xec8ac7ee6c07430720d094add42b65f9bb58eee4d50c...,0.22,2025-12-20,0.77,0.99,0.01
2571,1677128416,0xec8ac7ee6c07430720d094add42b65f9bb58eee4d50c...,0.22,2025-12-20,0.77,0.99,0.01
2572,1677132021,0xec8ac7ee6c07430720d094add42b65f9bb58eee4d50c...,0.22,2025-12-20,0.77,0.99,0.01
2573,1677135654,0xec8ac7ee6c07430720d094add42b65f9bb58eee4d50c...,0.22,2025-12-20,0.77,0.99,0.01


In [26]:
# Descriptive statistics for the gaps and prices of the filtered opportunities.
if len(arbitrage_df) == 0:
    print("No arbitrage opportunities found in the data.")
else:
    gap_series = arbitrage_df['arbitrage_gap']

    print("=== Arbitrage Gap Statistics ===")
    gap_stats = (
        ("Mean", gap_series.mean()),
        ("Max", gap_series.max()),
        ("Min", gap_series.min()),
        ("Median", gap_series.median()),
    )
    for stat_label, stat_value in gap_stats:
        print(f"{stat_label} gap: {stat_value:.4f}")

    # Plain sum over every matching row, so consecutive snapshots of the same
    # market each add their gap to this figure.
    print(f"\nTotal potential arbitrage (sum of gaps): {gap_series.sum():.4f}")

    print("\n=== Price Statistics ===")
    for price_label, price_column in (
        ("YES price", "yes_price"),
        ("NO price", "no_price"),
        ("total price", "total_price"),
    ):
        print(f"Mean {price_label}: {arbitrage_df[price_column].mean():.4f}")

=== Arbitrage Gap Statistics ===
Mean gap: 0.0100
Max gap: 0.0100
Min gap: 0.0005
Median gap: 0.0100

Total potential arbitrage (sum of gaps): 5.0310

=== Price Statistics ===
Mean YES price: 0.6155
Mean NO price: 0.3745
Mean total price: 0.9900


## Full Price Distribution (All Price Entries)

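For context, the overall distribution of YES + NO prices across all paired price entries is summarized later in this notebook; the next two cells first look at the distribution of the arbitrage gaps and at the markets where they occur most often.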

In [27]:
# Histogram of the gap (1 - YES - NO); skipped when there is nothing to plot.
if len(arbitrage_df) > 0:
    gap_hist_options = {
        'x': 'arbitrage_gap',
        'nbins': 50,
        'title': 'Distribution of Arbitrage Gaps (1 - YES - NO)',
        'labels': {'arbitrage_gap': 'Gap (1 - YES - NO)', 'count': 'Frequency'},
    }
    fig = px.histogram(arbitrage_df, **gap_hist_options)
    fig.show()

In [28]:
# Rank markets by how many of their price snapshots showed YES + NO < 1.
if len(arbitrage_df) > 0:
    per_market = arbitrage_df.groupby('condition_id').agg(
        count=pd.NamedAgg(column='timestamp', aggfunc='count'),
        avg_gap=pd.NamedAgg(column='arbitrage_gap', aggfunc='mean'),
        max_gap=pd.NamedAgg(column='arbitrage_gap', aggfunc='max'),
        total_gap=pd.NamedAgg(column='arbitrage_gap', aggfunc='sum'),
    )
    # Keep the 20 markets with the most matching snapshots.
    top_markets = per_market.sort_values('count', ascending=False).head(20)

    print("=== Top 20 Markets with Most Arbitrage Instances ===")
    display(top_markets)

=== Top 20 Markets with Most Arbitrage Instances ===


Unnamed: 0_level_0,count,avg_gap,max_gap,total_gap
condition_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0x0f72c2f02560b7b49542249c3ba32a5eabe86de8353ba36717187e67c2c17e1c,165,0.01,0.01,1.65
0x665b3ebaad5140dd834a56d90a222ee5a736d750e213b0904c0e1a40ffd307f8,81,0.01,0.01,0.81
0x9876cd5626a39e33e073159b813199f48ee860c8279f67244ee085dc7a9f0b41,47,0.01,0.01,0.47
0x6a3086722b7e5ba3646605202b0a87832e86c9b81ba5c33b2c8c9c3bb9de0def,43,0.01,0.01,0.43
0x5bc3cb44fed93bb2a225f4e29e855ea4a6a5b05f3ac5489d11985643ad67c556,24,0.01,0.01,0.24
0x02ba76b36ffb6ec21751d270905a642769d1508c102b1275ca3fa46abd3a2710,16,0.01,0.01,0.16
0xf12d3e7a87e218e66d1442e2d5bc7a914845795c954ae76fbd2fd7303cc3b7df,15,0.01,0.01,0.15
0xec8ac7ee6c07430720d094add42b65f9bb58eee4d50cf871eb283683266f0ce1,13,0.01,0.01,0.13
0xb5858726a0b48b44465ee6c6d0d8fc913bde224ebb1045a537a687cfe1171222,11,0.01,0.01,0.11
0x6013aa7243f1041e8416a35aa9adb9f9c7fc12bff5ecb9d93ffdcd7c2c2e7d71,11,0.01,0.01,0.11


## Arbitrage Betting Analysis

**How arbitrage works on Polymarket:**
- Each share pays out \$1 if it wins, \$0 if it loses
- If YES price = 0.48 and NO price = 0.51, total = 0.99 (gap = 0.01)
- Bet \$0.48 on YES → wins \$1 if YES happens
- Bet \$0.51 on NO → wins \$1 if NO happens  
- Total cost = \$0.99, guaranteed payout = \$1.00, profit = \$0.01

**Optimal stake calculation:**
For a total budget `B`, split proportionally:
- Stake on YES = B × p_yes / total_price
- Stake on NO = B × p_no / total_price (where total_price = p_yes + p_no)

This ensures equal payout regardless of outcome.

In [31]:
# Calculate optimal betting amounts for each arbitrage opportunity:
# for a fixed total stake, how much is guaranteed back whichever side wins?

STAKE = 100  # Total dollars to bet on each opportunity

# Built unconditionally so `arb_betting` always exists for the cells below.
# With no opportunities every derived column is simply empty, instead of the
# later cells failing with a NameError.
arb_betting = arbitrage_df.copy()

# Split the stake in proportion to each side's price, which buys the same
# number of shares on both sides and therefore equalizes the two payouts.
arb_betting['stake_yes'] = STAKE * (arb_betting['yes_price'] / arb_betting['total_price'])
arb_betting['stake_no'] = STAKE * (arb_betting['no_price'] / arb_betting['total_price'])

# Sanity column: the two stakes should add back up to STAKE.
arb_betting['total_stake'] = arb_betting['stake_yes'] + arb_betting['stake_no']

# Shares purchased on each side at the quoted price
arb_betting['shares_yes'] = arb_betting['stake_yes'] / arb_betting['yes_price']
arb_betting['shares_no'] = arb_betting['stake_no'] / arb_betting['no_price']

# Payout is $1 per share for the winning side
arb_betting['payout_if_yes'] = arb_betting['shares_yes'] * 1.0  # YES wins
arb_betting['payout_if_no'] = arb_betting['shares_no'] * 1.0   # NO wins

# Guaranteed amounts: take the smaller payout (the two should be equal with
# this staking), then express the profit in dollars and as a % of the stake.
arb_betting['guaranteed_payout'] = arb_betting[['payout_if_yes', 'payout_if_no']].min(axis=1)
arb_betting['guaranteed_profit'] = arb_betting['guaranteed_payout'] - STAKE
arb_betting['profit_pct'] = (arb_betting['guaranteed_profit'] / STAKE) * 100

if len(arb_betting) > 0:
    # Rows keep arbitrage_df's ordering, so head(20) shows its first 20 rows.
    display_cols = ['condition_id', 'timestamp', 'yes_price', 'no_price', 'total_price',
                    'stake_yes', 'stake_no', 'guaranteed_payout', 'guaranteed_profit', 'profit_pct']

    print(f"=== Top 20 Arbitrage Opportunities (with ${STAKE} stake) ===")
    display(arb_betting[display_cols].head(20).round(4))
else:
    print("No arbitrage opportunities found in the data.")

=== Top 20 Arbitrage Opportunities (with $100 stake) ===


Unnamed: 0,condition_id,timestamp,yes_price,no_price,total_price,stake_yes,stake_no,guaranteed_payout,guaranteed_profit,profit_pct
2564,0xec8ac7ee6c07430720d094add42b65f9bb58eee4d50c...,1677103244,0.22,0.77,0.99,22.2222,77.7778,101.0101,1.0101,1.0101
2565,0xec8ac7ee6c07430720d094add42b65f9bb58eee4d50c...,1677106803,0.22,0.77,0.99,22.2222,77.7778,101.0101,1.0101,1.0101
2566,0xec8ac7ee6c07430720d094add42b65f9bb58eee4d50c...,1677110435,0.22,0.77,0.99,22.2222,77.7778,101.0101,1.0101,1.0101
2567,0xec8ac7ee6c07430720d094add42b65f9bb58eee4d50c...,1677114044,0.22,0.77,0.99,22.2222,77.7778,101.0101,1.0101,1.0101
2568,0xec8ac7ee6c07430720d094add42b65f9bb58eee4d50c...,1677117630,0.22,0.77,0.99,22.2222,77.7778,101.0101,1.0101,1.0101
2569,0xec8ac7ee6c07430720d094add42b65f9bb58eee4d50c...,1677121254,0.22,0.77,0.99,22.2222,77.7778,101.0101,1.0101,1.0101
2570,0xec8ac7ee6c07430720d094add42b65f9bb58eee4d50c...,1677124801,0.22,0.77,0.99,22.2222,77.7778,101.0101,1.0101,1.0101
2571,0xec8ac7ee6c07430720d094add42b65f9bb58eee4d50c...,1677128416,0.22,0.77,0.99,22.2222,77.7778,101.0101,1.0101,1.0101
2572,0xec8ac7ee6c07430720d094add42b65f9bb58eee4d50c...,1677132021,0.22,0.77,0.99,22.2222,77.7778,101.0101,1.0101,1.0101
2573,0xec8ac7ee6c07430720d094add42b65f9bb58eee4d50c...,1677135654,0.22,0.77,0.99,22.2222,77.7778,101.0101,1.0101,1.0101


In [32]:
# Headline numbers for the per-opportunity staking table.
if len(arb_betting) > 0:
    profit_series = arb_betting['guaranteed_profit']
    pct_series = arb_betting['profit_pct']

    print(f"=== Arbitrage Profit Summary (per ${STAKE} stake) ===")
    print(f"Total opportunities: {len(arb_betting):,}")

    print("\nGuaranteed Profit Statistics:")
    for stat_label, stat_value in (
        ("Mean", profit_series.mean()),
        ("Max", profit_series.max()),
        ("Min", profit_series.min()),
    ):
        print(f"  {stat_label} profit: ${stat_value:.4f}")
    print(f"  Total if you hit all: ${profit_series.sum():.2f}")

    print("\nProfit % Statistics:")
    print(f"  Mean: {pct_series.mean():.4f}%")
    print(f"  Max: {pct_series.max():.4f}%")

    # Invert the mean return to get the stake needed for $1 of profit;
    # a non-positive mean return makes that stake unbounded.
    avg_profit_per_dollar = profit_series.mean() / STAKE
    if avg_profit_per_dollar > 0:
        stake_for_1_dollar = 1 / avg_profit_per_dollar
    else:
        stake_for_1_dollar = float('inf')

    print("\n=== Practical Considerations ===")
    print(f"Average profit per $1 staked: ${avg_profit_per_dollar:.6f}")
    print(f"Stake needed for $1 guaranteed profit: ${stake_for_1_dollar:.2f}")

=== Arbitrage Profit Summary (per $100 stake) ===
Total opportunities: 505

Guaranteed Profit Statistics:
  Mean profit: $1.0063
  Max profit: $1.0101
  Min profit: $0.0500
  Total if you hit all: $508.18

Profit % Statistics:
  Mean: 1.0063%
  Max: 1.0101%

=== Practical Considerations ===
Average profit per $1 staked: $0.010063
Stake needed for $1 guaranteed profit: $99.37


In [33]:
# Two views of the opportunities: how large the returns are, and where the
# YES/NO price pairs sit relative to the YES + NO = 1 line.
if len(arb_betting) > 0:
    # 1) Histogram of the guaranteed return, as a percentage of the stake
    pct_hist_options = {
        'x': 'profit_pct',
        'nbins': 50,
        'title': f'Distribution of Guaranteed Profit % (per ${STAKE} stake)',
        'labels': {'profit_pct': 'Profit %', 'count': 'Frequency'},
    }
    fig = px.histogram(arb_betting, **pct_hist_options)
    fig.show()

    # 2) Scatter of YES price against NO price, colored by return
    price_scatter_options = {
        'x': 'yes_price',
        'y': 'no_price',
        'color': 'profit_pct',
        'hover_data': ['condition_id', 'guaranteed_profit'],
        'title': 'Arbitrage Opportunities: YES vs NO Prices',
        'labels': {'yes_price': 'YES Price', 'no_price': 'NO Price', 'profit_pct': 'Profit %'},
    }
    fig2 = px.scatter(arb_betting, **price_scatter_options)

    # Dashed diagonal from (0, 1) to (1, 0): the points where YES + NO = 1.
    fig2.add_shape(
        type="line",
        x0=0, y0=1, x1=1, y1=0,
        line={"color": "red", "dash": "dash"},
        name="Fair Price Line",
    )
    fig2.show()

In [34]:
# Rescale the per-STAKE profits to a range of stake sizes.
if len(arb_betting) > 0:
    stake_amounts = [10, 50, 100, 500, 1000, 5000, 10000]

    print("=== Potential Profits at Different Stake Levels ===")
    print(f"(Based on {len(arb_betting):,} arbitrage opportunities)\n")

    # Aggregates at the reference stake; each table row multiplies them by
    # stake / STAKE because profit is linear in the amount staked.
    reference_profit = arb_betting['guaranteed_profit']
    reference_total = reference_profit.sum()
    reference_avg = reference_profit.mean()
    reference_max = reference_profit.max()

    results = [
        {
            'Stake': f'${stake:,}',
            'Avg Profit/Trade': f'${reference_avg * (stake / STAKE):.4f}',
            'Max Profit/Trade': f'${reference_max * (stake / STAKE):.4f}',
            'Total (all trades)': f'${reference_total * (stake / STAKE):.2f}',
        }
        for stake in stake_amounts
    ]

    display(pd.DataFrame(results))

=== Potential Profits at Different Stake Levels ===
(Based on 505 arbitrage opportunities)



Unnamed: 0,Stake,Avg Profit/Trade,Max Profit/Trade,Total (all trades)
0,$10,$0.1006,$0.1010,$50.82
1,$50,$0.5031,$0.5051,$254.09
2,$100,$1.0063,$1.0101,$508.18
3,$500,$5.0315,$5.0505,$2540.90
4,"$1,000",$10.0630,$10.1010,$5081.81
5,"$5,000",$50.3149,$50.5051,$25409.04
6,"$10,000",$100.6299,$101.0101,$50818.09


In [None]:
# Arbitrage opportunities by year
if len(arb_betting) > 0:
    # The timestamps are epoch values; the sample rows (e.g. 1678932037) are in
    # seconds. Parsing seconds as milliseconds does not fail - it silently
    # yields January-1970 dates - so "fall back when everything is NaT" never
    # triggers. Pick the unit from the magnitude instead: 1e11 seconds is
    # thousands of years away, while 1e11 milliseconds is early 1973.
    largest_timestamp = arb_betting['timestamp'].max()
    epoch_unit = 'ms' if largest_timestamp >= 1e11 else 's'

    arb_betting['datetime'] = pd.to_datetime(
        arb_betting['timestamp'], unit=epoch_unit, errors='coerce'
    )
    arb_betting['year'] = arb_betting['datetime'].dt.year

    # Per-year opportunity count and profit figures (profit is per STAKE dollars)
    by_year = arb_betting.groupby('year').agg(
        opportunities=('timestamp', 'count'),
        total_profit=('guaranteed_profit', 'sum'),
        avg_profit=('guaranteed_profit', 'mean'),
        max_profit=('guaranteed_profit', 'max')
    ).round(4)

    print("=== Arbitrage Opportunities by Year ===")
    display(by_year)

    # Bar chart of the yearly counts
    fig = px.bar(
        by_year.reset_index(),
        x='year',
        y='opportunities',
        title='Number of Arbitrage Opportunities by Year',
        labels={'year': 'Year', 'opportunities': 'Count'}
    )
    fig.show()

In [29]:
# Every pair where both sides carry a price above 0.001, whatever the total,
# followed by descriptive statistics of YES + NO.
yes_is_priced = paired_df['yes_price'].gt(0.001)
no_is_priced = paired_df['no_price'].gt(0.001)
all_valid_pairs = paired_df[yes_is_priced & no_is_priced]

print(f"Total valid paired price entries: {len(all_valid_pairs):,}")
print("\n=== Total Price (YES + NO) Statistics ===")
print(all_valid_pairs['total_price'].describe())

Total valid paired price entries: 896,814

=== Total Price (YES + NO) Statistics ===
count    896814.000000
mean          1.000358
std           0.001888
min           0.990000
25%           1.000000
50%           1.000000
75%           1.000000
max           1.010000
Name: total_price, dtype: float64


In [30]:
# Summary: how many price entries total under, at, or over 1?

# total_price is a sum of two binary floats, so testing it with == 1.0 can
# misfile pairs that differ from 1 only by rounding error. Anything within
# this tolerance of 1 counts as "exactly 1"; the three buckets stay disjoint.
PRICE_SUM_TOLERANCE = 1e-9

if len(all_valid_pairs) > 0:
    deviation = all_valid_pairs['total_price'] - 1.0
    under_1 = (deviation < -PRICE_SUM_TOLERANCE).sum()
    equal_1 = (deviation.abs() <= PRICE_SUM_TOLERANCE).sum()
    over_1 = (deviation > PRICE_SUM_TOLERANCE).sum()
    pair_count = len(all_valid_pairs)

    print("=== Total Price Distribution Summary ===")
    print(f"Under 1.0 (arbitrage): {under_1:,} ({100*under_1/pair_count:.2f}%)")
    print(f"Exactly 1.0: {equal_1:,} ({100*equal_1/pair_count:.2f}%)")
    print(f"Over 1.0 (negative edge): {over_1:,} ({100*over_1/pair_count:.2f}%)")

    # Distribution histogram, with a reference line at the expected total of 1.0
    fig = px.histogram(
        all_valid_pairs,
        x='total_price',
        nbins=100,
        title='Distribution of Total Price (YES + NO) from Prices Table',
        labels={'total_price': 'YES + NO Price', 'count': 'Frequency'}
    )
    fig.add_vline(x=1.0, line_dash="dash", line_color="red", annotation_text="Expected: 1.0")
    fig.show()

# Close connection. Cells that query `conn` cannot be re-run after this point
# without first re-running the cell that creates the connection and views.
conn.close()
print("\nAnalysis complete!")

=== Total Price Distribution Summary ===
Under 1.0 (arbitrage): 505 (0.06%)
Exactly 1.0: 863,692 (96.31%)
Over 1.0 (negative edge): 32,617 (3.64%)



Analysis complete!
