### Testing Parquet compression when storing metadata

In [1]:
import os
import numpy as np
import pandas as pd
from pathlib import Path

# Scratch directory (relative to the working directory) that receives every
# CSV/Parquet file written by this notebook; created on first run.
out_dir = Path("tmp")
os.makedirs(out_dir, exist_ok=True)


def create_dataframe(rows, scenario, **kwargs):
    """Create a single-column dataframe with controlled repetitiveness.

    All values are drawn from NumPy's global RNG (``np.random``); seed it
    beforehand if reproducible data is required.

    Args:
        rows: Number of rows to generate (non-negative integer).
        scenario: One of ``"unique"``, ``"repeated_value"``, ``"cyclic"``,
            ``"low_cardinality"``, ``"high_cardinality"`` or ``"sorted"``.
        **kwargs: Scenario-specific parameter, clamped to ``rows``:
            ``repeat_count`` for ``"repeated_value"``, ``unique_values`` for
            ``"cyclic"`` and ``cardinality`` for both cardinality scenarios.

    Returns:
        A ``pd.DataFrame`` with one float column, ``"val1"``, of length ``rows``.

    Raises:
        ValueError: If ``scenario`` is unknown, ``rows`` is negative, or a
            scenario parameter is smaller than 1.
        KeyError: If the parameter required by ``scenario`` is missing.
    """
    if rows < 0:
        raise ValueError(f"rows must be non-negative, got {rows}")

    def _param(name):
        # Clamp the parameter into [1, rows]. The lower bound of 1 keeps the
        # integer divisions below well-defined even when rows == 0.
        value = kwargs[name]
        if value < 1:
            raise ValueError(f"{name} must be >= 1, got {value}")
        return max(1, min(value, rows))

    if scenario == "unique":
        # All unique values (no repetition)
        values = np.random.rand(rows)

    elif scenario == "repeated_value":
        # Each random value is repeated `repeat_count` times in a row
        repeat_count = _param("repeat_count")
        values = np.repeat(np.random.rand(rows // repeat_count), repeat_count)
        if len(values) < rows:
            # rows is not a multiple of repeat_count: pad with the last value
            values = np.append(values, np.repeat(values[-1], rows - len(values)))

    elif scenario == "cyclic":
        # Cyclic pattern of n unique values
        unique_values = _param("unique_values")
        pattern = np.random.rand(unique_values)
        cycles = rows // unique_values + 1
        values = np.tile(pattern, cycles)[:rows]

    elif scenario in ("low_cardinality", "high_cardinality"):
        # Both scenarios sample with replacement from a pool of `cardinality`
        # random values; they differ only in the pool size the caller passes.
        cardinality = _param("cardinality")
        values = np.random.choice(np.random.rand(cardinality), size=rows)

    elif scenario == "sorted":
        # Sorted values (good for delta encoding); np.sort keeps the data in a
        # float array instead of building a Python list of scalars.
        values = np.sort(np.random.rand(rows))

    else:
        raise ValueError(f"Unknown scenario: {scenario}")

    return pd.DataFrame({"val1": values})


def measure_file_size(df, filename, compression=None):
    """Write ``df`` to ``<out_dir>/<filename>.parquet`` and report its size.

    ``compression`` is forwarded unchanged to ``DataFrame.to_parquet``.
    The return value is the size of the written file in bytes.
    """
    parquet_path = out_dir / "{}.parquet".format(filename)
    df.to_parquet(parquet_path, compression=compression)
    return os.path.getsize(parquet_path)

In [2]:
"""Run various experiments measuring parquet compression efficiency."""
# Configuration
rows = 1_000_000  # lower this (e.g. to 100) for a quick smoke test
results = []

# Seed NumPy's global RNG so that Restart & Run All regenerates identical data
# (create_dataframe draws every value from np.random).
SEED = 42
np.random.seed(SEED)

# Test scenarios. Each one produces a single-column frame ("val1") whose
# repetitiveness is selected by "func" and tuned by the optional "kwargs".
scenarios = [
    {"name": "Unique Values", "func": "unique"},
    {"name": "Repeated Values (10x)", "func": "repeated_value", "kwargs": {"repeat_count": 10}},
    {"name": "Repeated Values (100x)", "func": "repeated_value", "kwargs": {"repeat_count": 100}},
    {"name": "Repeated Values (1000x)", "func": "repeated_value", "kwargs": {"repeat_count": 1000}},
    {"name": "Repeated Values (10_000x)", "func": "repeated_value", "kwargs": {"repeat_count": 10_000}},
    {"name": "Repeated Values (100_000x)", "func": "repeated_value", "kwargs": {"repeat_count": 100_000}},
    {"name": "Repeated Values (1_000_000x)", "func": "repeated_value", "kwargs": {"repeat_count": 1_000_000}},
    {"name": "Cyclic (10 values)", "func": "cyclic", "kwargs": {"unique_values": 10}},
    {"name": "Cyclic (100 values)", "func": "cyclic", "kwargs": {"unique_values": 100}},
    {"name": "Low Cardinality (5 values)", "func": "low_cardinality", "kwargs": {"cardinality": 5}},
    {"name": "Low Cardinality (100 values)", "func": "low_cardinality", "kwargs": {"cardinality": 100}},
    {"name": f"High Cardinality ({rows // 10} values)", "func": "high_cardinality", "kwargs": {"cardinality": rows // 10}},
    {"name": "Sorted Values", "func": "sorted"},
]

# Compression methods to test (other candidates: None, 'snappy', 'gzip')
compression_methods = ['brotli']

In [3]:
# Start from an empty list so that re-running this cell does not append
# duplicate rows to results left over from a previous execution.
results = []

for scenario in scenarios:
    kwargs = scenario.get('kwargs', {})
    print(f"Processing scenario: {scenario['name']}...")

    # Create the dataframe for this scenario
    df = create_dataframe(rows, scenario['func'], **kwargs)
    print(df.head())

    # Get base CSV size for comparison.
    # NOTE: file names are keyed on 'func', so scenarios sharing a func
    # overwrite each other's files; sizes are read immediately after writing,
    # so the recorded numbers are unaffected.
    csv_filepath = out_dir / f"test_{scenario['func']}.csv"
    df.to_csv(csv_filepath, index=False)
    csv_size = os.path.getsize(csv_filepath)

    # Test different compression methods
    for compression in compression_methods:
        compression_name = compression if compression else 'uncompressed'
        print(f"  Testing with {compression_name}...")

        # Measure size
        filename = f"test_{scenario['func']}_{compression_name}"
        parquet_size = measure_file_size(df, filename, compression)

        # Record results; the ratio is Parquet size relative to CSV size,
        # so smaller means the Parquet file is more compact.
        results.append({
            'Scenario': scenario['name'],
            'Compression': compression_name,
            'CSV Size': csv_size,
            'Parquet Size': parquet_size,
            'Compression Ratio': parquet_size / csv_size
        })

# Convert to DataFrame
results_df = pd.DataFrame(results)

# Print summary
print("\nResults Summary:")
print(results_df.to_string())

# Save results
results_df.to_csv(out_dir / "parquet_compression_results.csv", index=False)

Processing scenario: Unique Values...
       val1
0  0.613677
1  0.549333
2  0.529441
3  0.075059
4  0.697883
  Testing with brotli...
Processing scenario: Repeated Values (10x)...
       val1
0  0.482386
1  0.482386
2  0.482386
3  0.482386
4  0.482386
  Testing with brotli...
Processing scenario: Repeated Values (100x)...
      val1
0  0.84589
1  0.84589
2  0.84589
3  0.84589
4  0.84589
  Testing with brotli...
Processing scenario: Repeated Values (1000x)...
       val1
0  0.385295
1  0.385295
2  0.385295
3  0.385295
4  0.385295
  Testing with brotli...
Processing scenario: Repeated Values (10_000x)...
       val1
0  0.552992
1  0.552992
2  0.552992
3  0.552992
4  0.552992
  Testing with brotli...
Processing scenario: Repeated Values (100_000x)...
       val1
0  0.627064
1  0.627064
2  0.627064
3  0.627064
4  0.627064
  Testing with brotli...
Processing scenario: Repeated Values (1_000_000x)...
       val1
0  0.094813
1  0.094813
2  0.094813
3  0.094813
4  0.094813
  Testing with brot

In [4]:
# Create visualization with plotly
import plotly.graph_objects as go

# Scenario labels (a NumPy array, despite the name) in the order they were
# recorded. The configured `compression_methods` list is deliberately NOT
# reassigned here: overwriting it with result labels such as 'uncompressed'
# would break a later re-run of the experiment loop.
scenarios_df = results_df['Scenario'].unique()

# Create figure
fig = go.Figure()

# Add one horizontal bar trace per scenario, in reverse order so that the
# legend (also reversed below) lines up with the bars.
for scenario_name in reversed(scenarios_df):
    data = results_df[results_df['Scenario'] == scenario_name]

    fig.add_trace(go.Bar(
        y=data['Compression'],
        x=data['Compression Ratio'],
        name=scenario_name,
        orientation='h'
    ))

# Update layout
fig.update_layout(
    title='Parquet Compression Efficiency by Scenario and Method',
    xaxis_title='Compression Ratio (Parquet Size / CSV Size)',
    yaxis_title='Compression Method',
    barmode='group',
    height=800,
    width=1000,
    legend=dict(
        x=1.05,
        y=1,
        xanchor='left',
        yanchor='top',
        traceorder='reversed'  # Ensure legend order matches visual order
    ),
    margin=dict(l=100, r=100, t=100, b=100)
)

# Optional static export (left disabled):
# fig.write_image(out_dir / 'parquet_compression_chart.png')

# Show interactive plot
fig.show()

In [27]:
# Create visualization with plotly
# NOTE(review): this cell repeats the plotting cell directly above it and
# carries an out-of-sequence execution count; consider deleting one of them.
import plotly.graph_objects as go

# Scenario labels (a NumPy array, despite the name) in the order they were
# recorded. The configured `compression_methods` list is deliberately NOT
# reassigned here: overwriting it with result labels such as 'uncompressed'
# would break a later re-run of the experiment loop.
scenarios_df = results_df['Scenario'].unique()

# Create figure
fig = go.Figure()

# Add one horizontal bar trace per scenario, in reverse order so that the
# legend (also reversed below) lines up with the bars.
for scenario_name in reversed(scenarios_df):
    data = results_df[results_df['Scenario'] == scenario_name]

    fig.add_trace(go.Bar(
        y=data['Compression'],
        x=data['Compression Ratio'],
        name=scenario_name,
        orientation='h'
    ))

# Update layout
fig.update_layout(
    title='Parquet Compression Efficiency by Scenario and Method',
    xaxis_title='Compression Ratio (Parquet Size / CSV Size)',
    yaxis_title='Compression Method',
    barmode='group',
    height=800,
    width=1000,
    legend=dict(
        x=1.05,
        y=1,
        xanchor='left',
        yanchor='top',
        traceorder='reversed'  # Ensure legend order matches visual order
    ),
    margin=dict(l=100, r=100, t=100, b=100)
)

# Show interactive plot
fig.show()