# Temporal Epoch Splits

Create 10-minute temporal epochs for time series cross-validation and analyze distributions.

In [1]:
import polars as pl
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

## Load Data and Create Epochs

In [2]:
# Load original data
df = pl.read_parquet('../../data/mock.parquet')

print(f"📊 Original Data:")
print(f"   Shape: {df.shape}")
print(f"   Time range: {df['timestamp'].min():.1f} - {df['timestamp'].max():.1f} seconds")
print(f"   Duration: {(df['timestamp'].max() - df['timestamp'].min())/60:.1f} minutes")

📊 Original Data:
   Shape: (223754, 2)
   Time range: 1.5 - 35999.8 seconds
   Duration: 600.0 minutes


In [3]:
# Create epoch_id column (10-minute epochs = 600 seconds)
df_epochs = df.with_columns([
    (pl.col('timestamp') // 600).cast(pl.Int64).alias('epoch_id')
])

print(f"🕐 Epoch Information:")
print(f"   Number of epochs: {df_epochs['epoch_id'].n_unique()}")
print(f"   Epoch range: {df_epochs['epoch_id'].min()} - {df_epochs['epoch_id'].max()}")
print(f"   First 5 rows:")
print(df_epochs.head())

🕐 Epoch Information:
   Number of epochs: 60
   Epoch range: 0 - 59
   First 5 rows:
shape: (5, 3)
┌───────────┬────────────┬──────────┐
│ timestamp ┆ price      ┆ epoch_id │
│ ---       ┆ ---        ┆ ---      │
│ f64       ┆ f64        ┆ i64      │
╞═══════════╪════════════╪══════════╡
│ 1.505061  ┆ 100.0      ┆ 0        │
│ 1.961532  ┆ 100.036535 ┆ 0        │
│ 2.04633   ┆ 100.103935 ┆ 0        │
│ 3.051946  ┆ 100.165622 ┆ 0        │
│ 3.667571  ┆ 100.205532 ┆ 0        │
└───────────┴────────────┴──────────┘


## Epoch Statistics

In [4]:
# Calculate statistics per epoch
epoch_stats = df_epochs.group_by('epoch_id').agg([
    pl.col('price').mean().alias('avg_price'),
    pl.col('price').std().alias('price_std'),
    pl.col('price').min().alias('min_price'),
    pl.col('price').max().alias('max_price'),
    pl.count().alias('num_points'),
    pl.col('timestamp').min().alias('start_time'),
    pl.col('timestamp').max().alias('end_time')
]).sort('epoch_id')

print(f"📈 Epoch Statistics Summary:")
print(epoch_stats.describe())

print(f"\n🔍 First 10 epochs:")
print(epoch_stats.head(10))

📈 Epoch Statistics Summary:
shape: (9, 9)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ statistic ┆ epoch_id  ┆ avg_price ┆ price_std ┆ … ┆ max_price ┆ num_point ┆ start_tim ┆ end_time │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ s         ┆ e         ┆ ---      │
│ str       ┆ f64       ┆ f64       ┆ f64       ┆   ┆ f64       ┆ ---       ┆ ---       ┆ f64      │
│           ┆           ┆           ┆           ┆   ┆           ┆ f64       ┆ f64       ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ count     ┆ 60.0      ┆ 60.0      ┆ 60.0      ┆ … ┆ 60.0      ┆ 60.0      ┆ 60.0      ┆ 60.0     │
│ null_coun ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ … ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ 0.0      │
│ t         ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
│ mean      ┆ 29.5      ┆ 782.08312 ┆ 7.720332  ┆

(Deprecated in version 0.20.5)
  pl.count().alias('num_points'),


## Distribution Visualizations

In [5]:
# Create subplot with two distributions
fig = make_subplots(
    rows=2, cols=1,
    subplot_titles=['Average Price per Epoch', 'Number of Points per Epoch'],
    vertical_spacing=0.12
)

# Average price distribution
fig.add_trace(
    go.Histogram(
        x=epoch_stats['avg_price'].to_numpy(),
        nbinsx=20,
        name='Avg Price',
        marker_color='blue',
        opacity=0.7
    ),
    row=1, col=1
)

# Number of points distribution
fig.add_trace(
    go.Histogram(
        x=epoch_stats['num_points'].to_numpy(),
        nbinsx=20,
        name='Num Points',
        marker_color='red',
        opacity=0.7
    ),
    row=2, col=1
)

# Update layout
fig.update_xaxes(title_text="Average Price ($)", row=1, col=1)
fig.update_xaxes(title_text="Number of Points", row=2, col=1)
fig.update_yaxes(title_text="Frequency", row=1, col=1)
fig.update_yaxes(title_text="Frequency", row=2, col=1)

fig.update_layout(
    title="Epoch Distributions",
    width=800, height=700,
    showlegend=False
)

fig.show()

In [6]:
# Additional statistics
print(f"📊 Distribution Statistics:")
print(f"\n💰 Average Price per Epoch:")
print(f"   Mean: ${epoch_stats['avg_price'].mean():.2f}")
print(f"   Std:  ${epoch_stats['avg_price'].std():.2f}")
print(f"   Min:  ${epoch_stats['avg_price'].min():.2f}")
print(f"   Max:  ${epoch_stats['avg_price'].max():.2f}")

print(f"\n📈 Number of Points per Epoch:")
print(f"   Mean: {epoch_stats['num_points'].mean():.0f}")
print(f"   Std:  {epoch_stats['num_points'].std():.0f}")
print(f"   Min:  {epoch_stats['num_points'].min()}")
print(f"   Max:  {epoch_stats['num_points'].max()}")

# Check for any gaps in epochs
expected_epochs = set(range(epoch_stats['epoch_id'].min(), epoch_stats['epoch_id'].max() + 1))
actual_epochs = set(epoch_stats['epoch_id'].to_list())
missing_epochs = expected_epochs - actual_epochs

print(f"\n🔍 Epoch Validation:")
print(f"   Expected epochs: {len(expected_epochs)}")
print(f"   Actual epochs: {len(actual_epochs)}")
print(f"   Missing epochs: {missing_epochs if missing_epochs else 'None ✅'}")

📊 Distribution Statistics:

💰 Average Price per Epoch:
   Mean: $782.08
   Std:  $528.68
   Min:  $102.03
   Max:  $1674.24

📈 Number of Points per Epoch:
   Mean: 3729
   Std:  523
   Min:  2544
   Max:  4915

🔍 Epoch Validation:
   Expected epochs: 60
   Actual epochs: 60
   Missing epochs: None ✅


## Time Series Visualization with Epochs

In [7]:
# Create time series plot with epoch boundaries
df_plot = df_epochs.with_columns([
    (pl.col('timestamp') / 60).alias('time_minutes')
])

fig = go.Figure()

# Add price trace
fig.add_trace(
    go.Scatter(
        x=df_plot['time_minutes'].to_numpy(),
        y=df_plot['price'].to_numpy(),
        mode='lines',
        name='Price',
        line=dict(color='blue', width=1),
        opacity=0.8
    )
)

# Add epoch boundaries as vertical lines
for epoch_id in range(0, epoch_stats['epoch_id'].max() + 1, 5):  # Every 5th epoch
    epoch_time = epoch_id * 10  # 10 minutes per epoch
    fig.add_vline(
        x=epoch_time,
        line_dash="dash",
        line_color="red",
        opacity=0.5,
        annotation_text=f"Epoch {epoch_id}",
        annotation_position="top"
    )

fig.update_layout(
    title="Price Time Series with Epoch Boundaries",
    xaxis_title="Time (minutes)",
    yaxis_title="Price ($)",
    width=1000, height=500,
    hovermode='x unified'
)

fig.show()

## Save Processed Data

In [9]:
# Save the processed data with epoch_id
output_path = '../../data/mock_processed.parquet'
df_epochs.write_parquet(output_path)

print(f"💾 Saved processed data to: {output_path}")
print(f"   Shape: {df_epochs.shape}")
print(f"   Columns: {df_epochs.columns}")
print(f"   File size: {df_epochs.estimated_size('mb'):.2f} MB")

# Verify the saved file
df_verify = pl.read_parquet(output_path)
print(f"\n✅ Verification:")
print(f"   Loaded shape: {df_verify.shape}")
print(f"   Epoch range: {df_verify['epoch_id'].min()} - {df_verify['epoch_id'].max()}")
print(f"   Sample data:")
print(df_verify.head(3))

💾 Saved processed data to: ../../data/mock_processed.parquet
   Shape: (223754, 3)
   Columns: ['timestamp', 'price', 'epoch_id']
   File size: 5.12 MB

✅ Verification:
   Loaded shape: (223754, 3)
   Epoch range: 0 - 59
   Sample data:
shape: (3, 3)
┌───────────┬────────────┬──────────┐
│ timestamp ┆ price      ┆ epoch_id │
│ ---       ┆ ---        ┆ ---      │
│ f64       ┆ f64        ┆ i64      │
╞═══════════╪════════════╪══════════╡
│ 1.505061  ┆ 100.0      ┆ 0        │
│ 1.961532  ┆ 100.036535 ┆ 0        │
│ 2.04633   ┆ 100.103935 ┆ 0        │
└───────────┴────────────┴──────────┘
