# Mock Market Data Analysis

Load and analyze the generated mock market data using Polars and Plotly.

In [1]:
import polars as pl
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

## Load Data with Polars

In [2]:
# Load parquet file from root data directory
df = pl.read_parquet('../../data/mock.parquet')

print(f"📊 Data Overview:")
print(f"   Shape: {df.shape}")
print(f"   Columns: {df.columns}")
print(f"   Memory usage: {df.estimated_size('mb'):.2f} MB")

print(f"\n📈 Data Info:")
print(df.describe())

print(f"\n🔍 First 5 rows:")
print(df.head())

📊 Data Overview:
   Shape: (47333, 3)
   Columns: ['timestamp', 'price', 'message_rate']
   Memory usage: 1.08 MB

📈 Data Info:
shape: (9, 4)
┌────────────┬─────────────┬────────────┬──────────────┐
│ statistic  ┆ timestamp   ┆ price      ┆ message_rate │
│ ---        ┆ ---         ┆ ---        ┆ ---          │
│ str        ┆ f64         ┆ f64        ┆ f64          │
╞════════════╪═════════════╪════════════╪══════════════╡
│ count      ┆ 47333.0     ┆ 47333.0    ┆ 47333.0      │
│ null_count ┆ 0.0         ┆ 0.0        ┆ 0.0          │
│ mean       ┆ 1.7849e7    ┆ 111.602542 ┆ 2.160924     │
│ std        ┆ 1.0334e7    ┆ 8.107221   ┆ 0.833614     │
│ min        ┆ 3029.0      ┆ 91.976018  ┆ 0.016667     │
│ 25%        ┆ 9.187631e6  ┆ 107.808052 ┆ 1.8          │
│ 50%        ┆ 1.7480216e7 ┆ 113.327369 ┆ 2.333333     │
│ 75%        ┆ 2.7060573e7 ┆ 117.86285  ┆ 2.783333     │
│ max        ┆ 3.5999604e7 ┆ 126.307449 ┆ 3.783333     │
└────────────┴─────────────┴────────────┴──────────────┘

🔍 

## Calculate Time Intervals

In [3]:
# Calculate time intervals between ticks
df_with_intervals = df.with_columns([
    (pl.col('timestamp').diff().fill_null(0.0)).alias('time_interval'),
    pl.col('price').diff().fill_null(0.0).alias('price_change')
])

# Calculate rolling statistics
window_size = 100  # 100 tick window
df_analysis = df_with_intervals.with_columns([
    pl.col('time_interval').rolling_mean(window_size).alias('avg_interval'),
    pl.col('price').rolling_std(window_size).alias('price_volatility'),
    (1.0 / pl.col('time_interval')).alias('instantaneous_frequency')
])

print(f"⏱️ Time Interval Statistics:")
print(df_analysis.select(['time_interval', 'price_change', 'instantaneous_frequency']).describe())

# Identify burst periods (high frequency)
freq_threshold = df_analysis['instantaneous_frequency'].quantile(0.95)
burst_count = df_analysis.filter(pl.col('instantaneous_frequency') > freq_threshold).height
print(f"\n💥 Burst Analysis:")
print(f"   Frequency threshold (95th percentile): {freq_threshold:.1f} Hz")
print(f"   Burst ticks: {burst_count} ({burst_count/len(df_analysis)*100:.1f}%)")

⏱️ Time Interval Statistics:
shape: (9, 4)
┌────────────┬───────────────┬──────────────┬─────────────────────────┐
│ statistic  ┆ time_interval ┆ price_change ┆ instantaneous_frequency │
│ ---        ┆ ---           ┆ ---          ┆ ---                     │
│ str        ┆ f64           ┆ f64          ┆ f64                     │
╞════════════╪═══════════════╪══════════════╪═════════════════════════╡
│ count      ┆ 47333.0       ┆ 47333.0      ┆ 47333.0                 │
│ null_count ┆ 0.0           ┆ 0.0          ┆ 0.0                     │
│ mean       ┆ 760.496377    ┆ 0.000158     ┆ inf                     │
│ std        ┆ 1534.8239     ┆ 0.041453     ┆ NaN                     │
│ min        ┆ 0.0           ┆ -0.491768    ┆ 0.000025                │
│ 25%        ┆ 136.0         ┆ -0.016918    ┆ 0.001357                │
│ 50%        ┆ 337.0         ┆ 0.001043     ┆ 0.002967                │
│ 75%        ┆ 737.0         ┆ 0.018415     ┆ 0.007353                │
│ max        ┆ 39292.

## Price Time Series Visualization

In [4]:
# Convert to minutes for better readability
df_plot = df_analysis.with_columns([
    (pl.col('timestamp') / 60).alias('time_minutes')
])

# Create main price plot
fig = px.line(df_plot.to_pandas(), x='time_minutes', y='price', 
              title='Mock Market Data: Price Over Time',
              labels={'time_minutes': 'Time (minutes)', 'price': 'Price ($)'})

fig.update_layout(
    width=900, height=500,
    showlegend=False,
    hovermode='x unified'
)

fig.show()

## Dual-Axis Plot: Price + Activity

In [5]:
# Create subplot with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add price trace
fig.add_trace(
    go.Scatter(
        x=df_plot['time_minutes'].to_numpy(),
        y=df_plot['price'].to_numpy(),
        mode='lines',
        name='Price',
        line=dict(color='blue', width=1.5)
    ),
    secondary_y=False,
)

# Add frequency trace
fig.add_trace(
    go.Scatter(
        x=df_plot['time_minutes'].to_numpy(),
        y=df_plot['instantaneous_frequency'].to_numpy(),
        mode='lines',
        name='Tick Frequency',
        line=dict(color='red', width=1),
        opacity=0.7
    ),
    secondary_y=True,
)

# Update axes labels
fig.update_xaxes(title_text="Time (minutes)")
fig.update_yaxes(title_text="Price ($)", secondary_y=False)
fig.update_yaxes(title_text="Tick Frequency (Hz)", secondary_y=True)

# Update layout
fig.update_layout(
    title="Mock Market Data: Price vs Tick Frequency",
    width=900, height=600,
    hovermode='x unified'
)

fig.show()

## Activity Burst Analysis

In [6]:
# Highlight burst periods
burst_data = df_plot.filter(pl.col('instantaneous_frequency') > freq_threshold)

# Create histogram of time intervals
fig_hist = px.histogram(
    df_plot.to_pandas(), 
    x='time_interval', 
    nbins=50,
    title='Distribution of Time Intervals Between Ticks',
    labels={'time_interval': 'Time Interval (seconds)', 'count': 'Frequency'}
)

fig_hist.update_layout(width=700, height=400)
fig_hist.show()

# Summary statistics
total_duration = df_plot['timestamp'].max()
avg_frequency = len(df_plot) / total_duration

print(f"\n📊 Final Statistics:")
print(f"   Total duration: {total_duration/60:.1f} minutes")
print(f"   Total ticks: {len(df_plot):,}")
print(f"   Average frequency: {avg_frequency:.2f} ticks/second")
print(f"   Price range: ${df_plot['price'].min():.2f} - ${df_plot['price'].max():.2f}")
print(f"   Price volatility: {df_plot['price'].std():.3f}")
print(f"   Burst periods: {len(burst_data)} ticks ({len(burst_data)/len(df_plot)*100:.1f}%)")


📊 Final Statistics:
   Total duration: 599993.4 minutes
   Total ticks: 47,333
   Average frequency: 0.00 ticks/second
   Price range: $91.98 - $126.31
   Price volatility: 8.107
   Burst periods: 2301 ticks (4.9%)


## Price Movement Analysis

In [7]:
# Analyze price movements
price_changes = df_plot.filter(pl.col('price_change') != 0)

# Create price change distribution
fig_changes = px.histogram(
    price_changes.to_pandas(),
    x='price_change',
    nbins=50,
    title='Distribution of Price Changes',
    labels={'price_change': 'Price Change ($)', 'count': 'Frequency'}
)

fig_changes.add_vline(x=0, line_dash="dash", line_color="red")
fig_changes.update_layout(width=700, height=400)
fig_changes.show()

print(f"\n💹 Price Movement Analysis:")
print(f"   Mean price change: ${price_changes['price_change'].mean():.4f}")
print(f"   Price change std: ${price_changes['price_change'].std():.4f}")
print(f"   Largest gain: ${price_changes['price_change'].max():.4f}")
print(f"   Largest loss: ${price_changes['price_change'].min():.4f}")


💹 Price Movement Analysis:
   Mean price change: $0.0002
   Price change std: $0.0415
   Largest gain: $0.1777
   Largest loss: $-0.4918
