# Polymarket Book Event Analysis

This notebook analyzes Polymarket market events data, focusing specifically on the 'book' event type.
For each timestamp, we take the two lowest ask prices of each asset_id and sum them across all asset_ids.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import glob
import os

# Set up plotting style. The 'seaborn-v0_8' style sheet only exists in
# matplotlib >= 3.6; older releases ship the same style under the name
# 'seaborn', so fall back to that name instead of failing the whole cell.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')
sns.set_palette("husl")

print("Libraries imported successfully!")

In [ ]:
# Define path to trading data (using repository's data directory)
data_path = "../data/"  # Relative path from notebooks/ to data/
pattern = "*polymarket_market_events.csv"

# Get all polymarket market events files. glob returns matches in arbitrary,
# filesystem-dependent order, so sort them: this notebook later picks files[0]
# and the first three files, and those choices should be reproducible.
files = sorted(glob.glob(os.path.join(data_path, pattern)))
print(f"Found {len(files)} polymarket market events files:")
for file in files[:5]:  # Show first 5 files
    print(f"  - {os.path.basename(file)}")
if len(files) > 5:
    print(f"  ... and {len(files) - 5} more files")

In [None]:
def load_and_filter_data(file_path):
    """
    Read one polymarket events CSV and keep only the ask-side 'book' rows.

    A 'datetime' column is added by interpreting 'timestamp' as milliseconds.
    Any failure while reading or filtering is reported on stdout and an empty
    DataFrame is returned instead of raising.

    Args:
        file_path: Path of the CSV file to read

    Returns:
        pd.DataFrame: Copy of the matching rows, or an empty DataFrame on error
    """
    try:
        raw = pd.read_csv(file_path)

        # Build the two row masks separately, then combine them
        is_book = raw['event_type'] == 'book'
        is_ask = raw['side'] == 'ask'
        asks = raw.loc[is_book & is_ask].copy()

        # Timestamps are converted with unit='ms' for easier time-based analysis
        asks['datetime'] = pd.to_datetime(asks['timestamp'], unit='ms')

        print(f"Loaded {len(raw)} total rows, {len(asks)} book ask events from {os.path.basename(file_path)}")
        return asks
    except Exception as e:
        # Best-effort loader: report the problem and hand back an empty frame
        print(f"Error loading {file_path}: {str(e)}")
        return pd.DataFrame()

# Test with the first file. test_data is always bound (to an empty frame when
# no file was found or nothing could be loaded) so that later cells can safely
# check test_data.empty instead of failing with a NameError.
test_data = load_and_filter_data(files[0]) if files else pd.DataFrame()
if not test_data.empty:
    print(f"\nSample data structure:")
    print(test_data.head())
    print(f"\nUnique asset_ids: {test_data['asset_id'].nunique()}")
    print(f"Unique timestamps: {test_data['timestamp'].nunique()}")
else:
    # An empty frame may have no columns at all, so skip the column lookups
    print("\nNo book ask events available for the sample file.")

In [None]:
def get_lowest_ask_prices_by_timestamp(df, n_lowest=2):
    """
    Sum the lowest ask prices of every asset for each timestamp.

    For each timestamp, the ``n_lowest`` cheapest ask prices of each asset_id
    are added up, and those per-asset sums are then added across all asset_ids
    present at that timestamp.

    Args:
        df: DataFrame with book ask events; must provide 'timestamp',
            'asset_id' and 'price' columns unless it is empty
        n_lowest: How many of the cheapest asks to take per asset (default 2).
            Assets with fewer price levels contribute all the levels they have

    Returns:
        pd.DataFrame: One row per timestamp (ascending) with the columns
        'timestamp', 'datetime', 'combined_lowest_asks' and 'num_assets'.
        The columns are present even when there is nothing to aggregate.

    Raises:
        ValueError: If n_lowest is smaller than 1
    """
    if n_lowest < 1:
        raise ValueError(f"n_lowest must be at least 1, got {n_lowest}")

    columns = ['timestamp', 'datetime', 'combined_lowest_asks', 'num_assets']

    # An empty input (possibly without any columns) still yields a frame with
    # the documented schema, so callers can index the result columns safely.
    if df.empty:
        return pd.DataFrame(columns=columns)

    results = []

    # groupby sorts its keys, so rows come out in ascending timestamp order
    for timestamp, timestamp_group in df.groupby('timestamp'):
        # Per asset: sort ascending and add up the first n_lowest prices
        asset_lowest_sums = [
            prices.sort_values().values[:n_lowest].sum()
            for _, prices in timestamp_group.groupby('asset_id')['price']
        ]

        results.append({
            'timestamp': timestamp,
            'datetime': pd.to_datetime(timestamp, unit='ms'),
            # Sum across all assets for this timestamp
            'combined_lowest_asks': sum(asset_lowest_sums),
            'num_assets': len(asset_lowest_sums)
        })

    return pd.DataFrame(results, columns=columns)

# Test the function with our sample data. result_data is always bound (it
# stays an empty frame when there is no sample data) so that later cells can
# check result_data.empty instead of failing with a NameError.
result_data = pd.DataFrame()
if not test_data.empty:
    result_data = get_lowest_ask_prices_by_timestamp(test_data)
    print(f"Processed {len(result_data)} timestamps")
    print("\nSample results:")
    print(result_data.head())
    
    # Show some statistics
    print(f"\nStatistics:")
    print(f"Min combined lowest asks: {result_data['combined_lowest_asks'].min():.4f}")
    print(f"Max combined lowest asks: {result_data['combined_lowest_asks'].max():.4f}")
    print(f"Mean combined lowest asks: {result_data['combined_lowest_asks'].mean():.4f}")

In [None]:
def create_visualization(result_df, title_suffix=""):
    """
    Draw the combined lowest ask prices against time as a single line chart.

    Args:
        result_df: DataFrame providing 'datetime' and 'combined_lowest_asks' columns
        title_suffix: Text appended to the plot title
    """
    plt.figure(figsize=(14, 8))

    # Main series: one marker per data point, connected by a line
    times = result_df['datetime']
    asks = result_df['combined_lowest_asks']
    plt.plot(times, asks, linewidth=2, marker='o', markersize=3, alpha=0.8)

    # Title and axis labels
    plt.title(f'Sum of Two Lowest Ask Prices Over Time{title_suffix}', fontsize=16, fontweight='bold')
    plt.xlabel('Time', fontsize=12)
    plt.ylabel('Combined Lowest Ask Prices', fontsize=12)

    # Rotated tick labels and a faint grid for readability
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3)

    # Summary statistics shown in a box in the top-left corner of the axes
    summary_lines = [
        f"Data Points: {len(result_df)}",
        f"Min: {asks.min():.4f}",
        f"Max: {asks.max():.4f}",
        f"Mean: {asks.mean():.4f}",
    ]
    stats_text = "\n".join(summary_lines)
    box_style = dict(boxstyle='round', facecolor='white', alpha=0.8)
    plt.text(0.02, 0.98, stats_text, transform=plt.gca().transAxes,
             verticalalignment='top', bbox=box_style)

    plt.tight_layout()
    plt.show()

# Plot the sample results, labelling the chart with the sample file's name
if not result_data.empty:
    create_visualization(
        result_data,
        title_suffix=f" - {os.path.basename(files[0])}",
    )

In [None]:
def process_all_files(file_list, max_files=None):
    """
    Process multiple polymarket events files and combine results.

    Each file is loaded with load_and_filter_data and aggregated with
    get_lowest_ask_prices_by_timestamp; files that yield no book ask rows are
    reported and skipped.

    Args:
        file_list: List of file paths to process
        max_files: Maximum number of files to process (None for all,
            0 for none)

    Returns:
        dict: Dictionary with the file's base name as key and its result
        DataFrame as value
    """
    # Compare against None explicitly: with a plain truthiness test,
    # max_files=0 would be treated as "no limit" and process every file.
    files_to_process = file_list if max_files is None else file_list[:max_files]
    total = len(files_to_process)

    all_results = {}

    for position, file_path in enumerate(files_to_process, start=1):
        filename = os.path.basename(file_path)
        print(f"Processing file {position}/{total}: {filename}")

        # Load and filter data
        book_data = load_and_filter_data(file_path)

        if book_data.empty:
            print(f"  -> No valid data found\n")
            continue

        # Calculate lowest ask prices
        result = get_lowest_ask_prices_by_timestamp(book_data)
        all_results[filename] = result

        print(f"  -> Generated {len(result)} data points\n")

    return all_results

# Run the pipeline on the first 3 files as an example
# (change max_files, or pass None, to process more of them)
print("Processing multiple files...")
all_results = process_all_files(files, max_files=3)

# Report how many data points each processed file produced
print(f"\nProcessed {len(all_results)} files successfully:")
for name, frame in all_results.items():
    point_count = len(frame)
    print(f"  - {name}: {point_count} data points")

In [None]:
# Draw one chart per processed file, skipping files without data points
for filename, result_df in all_results.items():
    if result_df.empty:
        continue
    print(f"\n=== Visualization for {filename} ===")
    create_visualization(result_df, title_suffix=f" - {filename}")

In [None]:
# Optional: overlay every file's series in one figure for comparison
if len(all_results) > 1:
    plt.figure(figsize=(16, 10))

    # One tab10 colour per file, sampled evenly across the colormap;
    # files without data keep their colour slot but draw nothing
    colors = plt.cm.tab10(np.linspace(0, 1, len(all_results)))

    for (filename, result_df), line_color in zip(all_results.items(), colors):
        if result_df.empty:
            continue
        # Strip the common file-name suffix to keep the legend short
        series_label = filename.replace('-polymarket_market_events.csv', '')
        plt.plot(
            result_df['datetime'],
            result_df['combined_lowest_asks'],
            linewidth=2,
            marker='o',
            markersize=2,
            alpha=0.7,
            label=series_label,
            color=line_color,
        )

    plt.title('Combined View: Sum of Two Lowest Ask Prices Over Time', fontsize=16, fontweight='bold')
    plt.xlabel('Time', fontsize=12)
    plt.ylabel('Combined Lowest Ask Prices', fontsize=12)
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3)
    # Legend sits outside the axes, to the right of the plot area
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

    print("\nCombined visualization created!")

In [None]:
# Summary analysis: one statistics row per file that produced data points
print("=== SUMMARY ANALYSIS ===")
print(f"Total files processed: {len(all_results)}")

all_stats = []
for filename, result_df in all_results.items():
    if result_df.empty:
        continue
    asks = result_df['combined_lowest_asks']
    all_stats.append({
        'filename': filename,
        'data_points': len(result_df),
        'min_price': asks.min(),
        'max_price': asks.max(),
        'mean_price': asks.mean(),
        'std_price': asks.std(),
        # Asset count of the file's first row; the frame is non-empty here,
        # so the first row always exists
        'num_assets': result_df['num_assets'].iloc[0],
    })

if not all_stats:
    print("No valid data found in any files.")
else:
    summary_df = pd.DataFrame(all_stats)
    print("\nSummary Statistics:")
    print(summary_df.to_string(index=False, float_format='%.4f'))

    # Aggregates over all per-file rows
    data_points = summary_df['data_points']
    print(f"\nOverall Statistics:")
    print(f"Total data points across all files: {data_points.sum()}")
    print(f"Average data points per file: {data_points.mean():.1f}")
    print(f"Average number of assets per market: {summary_df['num_assets'].mean():.1f}")