
# ParTES Plot Notebook

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import glob
import os


## V-1 Raw Data
### V-1.1 Single rank inverted CDF 

Plot the inverted CDF of a single rank's measured times (in nanoseconds), with one curve per `ng` value.

**Usage**: 
- folder: the path of the csv files
- rank: the MPI rank
- ntiles: the number of tiles
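- cut_tile: the fraction of tiles to draw, counted from the lowest tile; tiles above this fraction are omitted
- y_scale: the y-axis scaling: 'linear', 'log10', or 'log2'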

In [None]:

# V-1.1: inverted CDF of one rank's measured times, one curve per ng value.
folder = '.'      # Directory containing the meas_r<rank>_ng<ng>.csv files
rank = 2          # MPI rank whose measurements are plotted
# Set total tiles and cut percentage
ntiles = 100      # Number of quantile points sampled from each file
cut_tile = 0.99   # Fraction of those points (lowest first) that is drawn

# Choose y-axis scale, available: 'log10', 'log2', 'linear'
y_scale = 'linear'

# Transform and axis label for each non-linear scale. Any other value of
# y_scale (including 'linear') leaves both the data and the label untouched.
scale_funcs = {'log10': np.log10, 'log2': np.log2}
y_labels = {'log10': 'log10(Time (ns))', 'log2': 'log2(Time (ns))'}


def _ng_token(path):
    """Return the text between '_ng' and '.csv' in the file name of *path*."""
    return os.path.basename(path).split('_ng')[1].split('.csv')[0]


def _ng_sort_key(path):
    """Order files numerically by ng; non-numeric tokens sort last, by text."""
    token = _ng_token(path)
    try:
        return (0, int(token), token)
    except ValueError:
        return (1, 0, token)


pattern = os.path.join(folder, f'meas_r{rank}_ng*.csv')
# glob returns matches in arbitrary order; sort them so the legend is stable.
csv_files = sorted(glob.glob(pattern), key=_ng_sort_key)

# Number of leading quantile points to draw, clamped to [0, ntiles].
max_tiles = max(0, min(ntiles, int(cut_tile * ntiles)))

if not csv_files:
    print(f"No CSV files found for rank {rank} in folder {folder}")
else:
    plt.figure(figsize=(10, 6))
    for csv_file in csv_files:
        # np.loadtxt returns a 0-d array for a one-value file; normalise to 1-D
        # so that sizing and indexing below always work.
        data = np.atleast_1d(np.loadtxt(csv_file)).ravel()
        n = data.size
        if n == 0:
            print(f"Warning: no samples in {csv_file}, skipping")
            continue
        sorted_t = np.sort(data)

        # Pick ntiles evenly spaced positions in the sorted samples and keep
        # only the first max_tiles of them (the lowest quantiles).
        q_ids = np.linspace(0, n - 1, ntiles, dtype=int)[:max_tiles]
        # Quantile of each picked sample: its 1-based position divided by n.
        qs = (q_ids + 1) / n
        filt_t = sorted_t[q_ids]

        # Apply y-axis scaling ('linear' needs no transformation)
        if y_scale in scale_funcs:
            filt_t = scale_funcs[y_scale](filt_t)

        plt.plot(qs, filt_t, label=f'ng={_ng_token(csv_file)}', marker='x', markersize=1, linestyle='--')

    plt.xlabel('Quantile')
    # Label the axis with the transformation that was actually applied.
    plt.ylabel(y_labels.get(y_scale, 'Time (ns)'))
    plt.title(f'ICDF Comparison for Rank {rank}')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()


### V-1.2 Single rank normalized box

Draw a box plot of all measured times of one rank, with one box per `ng` value. Runtimes are normalized to the minimum measured runtime of the same `ng`.

**Usage**: 
- folder: the path of the csv files
- rank: the MPI rank
- ntiles: the number of tiles (not used by this plot)
- cut_tile: the fraction of samples to keep, counted from the fastest; the slowest samples are dropped
- y_scale: the y-axis scaling: 'linear', 'log10', or 'log2'

In [None]:
# V-1.2
# Box plot of one rank's measured times, one box per ng value, each normalized
# to the minimum measured runtime of that ng.
# Parameters for box plot
folder = '.'  # Folder containing CSV files
rank = 0         # MPI rank to analyze
ntiles = 100     # Number of quantiles (not used for box plot but kept for consistency)
cut_tile = 0.95   # Fraction of the fastest samples kept for each box
y_scale = 'linear'  # Y-axis scaling: 'linear', 'log10', or 'log2'

# Transform for each non-linear scale; any other value leaves data untouched.
scale_funcs = {'log10': np.log10, 'log2': np.log2}

# Load data for box plot
pattern = os.path.join(folder, f'meas_r{rank}_ng*.csv')
csv_files = glob.glob(pattern)

# Samples and minimum runtime per ng value (keyed by the ng text of the file name)
data_dict = {}
min_times = {}

# First pass: collect all data and find minimum times
for csv_file in csv_files:
    ng = os.path.basename(csv_file).split('_ng')[1].split('.csv')[0]
    # np.loadtxt returns a 0-d array for a one-value file; normalise to 1-D.
    samples = np.atleast_1d(np.loadtxt(csv_file)).ravel()
    if samples.size == 0:
        print(f"Warning: no samples in {csv_file}, skipping")
        continue
    data_dict[ng] = samples
    min_times[ng] = np.min(samples)

# Prepare data for box plot
box_data = []
labels = []

if not data_dict:
    # Nothing to draw: report it instead of handing empty input to boxplot.
    print(f"No CSV files found for rank {rank} in folder {folder}")
else:
    for ng in sorted(data_dict, key=int):
        # Sort data and keep only the lowest cut_tile fraction of the samples
        sorted_t = np.sort(data_dict[ng])
        n_keep = int(len(sorted_t) * cut_tile)
        filt_t = sorted_t[:n_keep]

        # Normalize filtered data to minimum measured runtime
        norm_t = filt_t / min_times[ng]

        # Apply y-axis scaling ('linear' needs no transformation)
        if y_scale in scale_funcs:
            norm_t = scale_funcs[y_scale](norm_t)

        box_data.append(norm_t)
        labels.append(f'ng={ng}')

    # Label the axis with the transformation that was actually applied.
    y_label = 'Normalized Time (relative to minimum)'
    if y_scale in scale_funcs:
        y_label = f'{y_scale}({y_label})'

    # Create normalized box plot
    plt.figure(figsize=(12, 6))
    plt.boxplot(box_data, tick_labels=labels, patch_artist=True)
    plt.xlabel('ng values')
    plt.ylabel(y_label)
    plt.title(f'Normalized Runtime Distribution for Rank {rank} (lowest {cut_tile*100:.0f}% of samples)')
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

### V-1.3 Single rank normalized violin

Draw a violin plot of all measured times of one rank, with one violin per `ng` value. Runtimes are normalized to the minimum measured runtime of the same `ng`.

**Usage**: 
- folder: the path of the csv files
- rank: the MPI rank
- ntiles: the number of tiles (not used by this plot)
- cut_tile: the fraction of samples to keep, counted from the fastest; the slowest samples are dropped
- y_scale: the y-axis scaling: 'linear', 'log10', or 'log2'

In [None]:
# V-1.3: violin plot of one rank's measured times, one violin per ng value,
# each normalized to the minimum measured runtime of that ng.
folder = "."
rank = 0
cut_tile = 0.95  # Fraction of the fastest samples kept for each violin
y_scale = 'linear'  # Options: 'linear', 'log10', 'log2'

# Transform for each non-linear scale; any other value leaves data untouched.
scale_funcs = {'log10': np.log10, 'log2': np.log2}

# Collect data for the specified rank (keyed by integer ng)
data_dict = {}
min_times = {}

for fname in os.listdir(folder):
    if not (fname.startswith(f'meas_r{rank}_ng') and fname.endswith('.csv')):
        continue

    # Extract ng value from filename
    ng_str = fname.split('_ng')[1].split('.csv')[0]
    try:
        ng = int(ng_str)
    except ValueError:
        # One oddly named file should not abort the whole cell.
        print(f"Warning: cannot parse ng from file name {fname}, skipping")
        continue

    # Read CSV data; np.loadtxt returns a 0-d array for a one-value file,
    # so normalise to 1-D before sizing or slicing.
    fpath = os.path.join(folder, fname)
    times = np.atleast_1d(np.loadtxt(fpath)).ravel()
    if times.size == 0:
        print(f"Warning: no samples in {fpath}, skipping")
        continue

    data_dict[ng] = times
    min_times[ng] = np.min(times)

# Prepare data for violin plot
violin_data = []
labels = []

if not data_dict:
    # Report and stop here; there is nothing to plot.
    print(f"No CSV files found for rank {rank} in folder {folder}")
else:
    print(f"Found data for ng values: {sorted(data_dict.keys())}")

    for ng in sorted(data_dict):
        # Sort data and keep only the lowest cut_tile fraction of the samples
        sorted_t = np.sort(data_dict[ng])
        n_keep = int(len(sorted_t) * cut_tile)
        filt_t = sorted_t[:n_keep]
        if filt_t.size == 0:
            # A violin cannot be built from zero samples.
            print(f"Warning: no samples left for ng={ng} after cut_tile, skipping")
            continue

        # Normalize filtered data to minimum measured runtime
        norm_t = filt_t / min_times[ng]

        # Apply y-axis scaling ('linear' needs no transformation)
        if y_scale in scale_funcs:
            norm_t = scale_funcs[y_scale](norm_t)

        violin_data.append(norm_t)
        labels.append(f'ng={ng}')

    if violin_data:
        # Label the axis with the transformation that was actually applied.
        y_label = 'Normalized Time (relative to minimum)'
        if y_scale in scale_funcs:
            y_label = f'{y_scale}({y_label})'

        # Create violin plot
        plt.figure(figsize=(10, 6))
        parts = plt.violinplot(violin_data, positions=range(1, len(violin_data)+1), showmeans=True, showmedians=True)

        # Customize violin plot colors
        for pc in parts['bodies']:
            pc.set_facecolor('lightblue')
            pc.set_alpha(0.7)

        plt.xticks(range(1, len(labels)+1), labels, rotation=45)
        plt.xlabel('ng values')
        plt.ylabel(y_label)
        plt.title(f'Normalized Runtime Distribution for Rank {rank} (lowest {cut_tile*100:.0f}% of samples)')
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()


### V-1.4 Single ng inverted CDF

Plot the inverted CDF of all measured times for a single `ng` value, with one curve per rank.

**Usage**: 
- folder: the path of the csv files
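- ng: the `ng` value whose measurement files are plotted
- ranks: the MPI ranks to plot; leave the list empty to plot every rank found for this `ng`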
- ntiles: the number of tiles
- cut_tile: the fraction of tiles to draw, counted from the lowest tile; tiles above this fraction are omitted
- y_scale: the y-axis scaling: 'linear', 'log10', or 'log2'

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import glob
import os

# V-1.4: inverted CDF of the measured times for one ng value, one curve per rank.
folder = '.'      # Directory containing the meas_r<rank>_ng<ng>.csv files
ng = 3300000000   # ng value whose files are plotted
# Set total tiles and cut percentage
ntiles = 100      # Number of quantile points sampled from each file
cut_tile = 0.99   # Fraction of those points (lowest first) that is drawn
ranks = []        # Ranks to plot; empty means "every rank found for this ng"

# Choose y-axis scale, available: 'log10', 'log2', 'linear'
y_scale = 'linear'

# Transform and axis label for each non-linear scale. Any other value of
# y_scale (including 'linear') leaves both the data and the label untouched.
scale_funcs = {'log10': np.log10, 'log2': np.log2}
y_labels = {'log10': 'log10(Time (ns))', 'log2': 'log2(Time (ns))'}


def _rank_of(path):
    """Return the integer rank encoded in a meas_r<rank>_ng<ng>.csv file name."""
    rank_part = os.path.basename(path).split('_')[1]  # 'r<rank>'
    return int(rank_part[1:])  # drop the leading 'r'


# If ranks is empty, find all available ranks from CSV files
if not ranks:
    pattern_all = os.path.join(folder, f'meas_r*_ng{ng}.csv')
    # A set removes duplicates; sorting gives a stable plotting order.
    ranks = sorted({_rank_of(csv_file) for csv_file in glob.glob(pattern_all)})
    print(f"Found ranks: {ranks}")

# Keep only the files that actually exist for the requested ranks
filtered_files = []
for rank in ranks:
    rank_pattern = os.path.join(folder, f'meas_r{rank}_ng{ng}.csv')
    filtered_files.extend(glob.glob(rank_pattern))

if not filtered_files:
    # Raise instead of calling exit(): exit() terminates the notebook kernel,
    # whereas an exception only stops this cell and says what is missing.
    raise FileNotFoundError(
        f"No measurement files found for ng={ng} (ranks={ranks}) in folder {folder}"
    )

# Number of leading quantile points to draw, clamped to [0, ntiles].
max_tiles = max(0, min(ntiles, int(cut_tile * ntiles)))

plt.figure(figsize=(10, 6))
for csv_file in filtered_files:
    # np.loadtxt returns a 0-d array for a one-value file; normalise to 1-D
    # so that sizing and indexing below always work.
    data = np.atleast_1d(np.loadtxt(csv_file)).ravel()
    n = data.size
    if n == 0:
        print(f"Warning: no samples in {csv_file}, skipping")
        continue
    sorted_t = np.sort(data)

    # Pick ntiles evenly spaced positions in the sorted samples and keep only
    # the first max_tiles of them (the lowest quantiles).
    q_ids = np.linspace(0, n - 1, ntiles, dtype=int)[:max_tiles]
    # Quantile of each picked sample: its 1-based position divided by n.
    qs = (q_ids + 1) / n
    filt_t = sorted_t[q_ids]

    # Apply y-axis scaling ('linear' needs no transformation)
    if y_scale in scale_funcs:
        filt_t = scale_funcs[y_scale](filt_t)

    # Use the numeric rank in the legend (the raw file-name token is 'r<rank>').
    plt.plot(qs, filt_t, label=f'rank={_rank_of(csv_file)}', marker='x', markersize=1, linestyle='--')

plt.xlabel('Quantile')
# Label the axis with the transformation that was actually applied.
plt.ylabel(y_labels.get(y_scale, 'Time (ns)'))
plt.title(f'ICDF Comparison for ng={ng}')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()


### V-1.5 All ranks aggregated violin

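Aggregate the measured times of all ranks per `ng` value and draw three panels: the inverted CDF for each `ng`, a violin plot of the normalized runtimes, and the minimum and median runtime versus `ng`.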

**Usage**: 
- folder: the path of the csv files
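- ntiles: the number of quantile levels sampled per `ng`
- ntile_cut: the fraction of quantile levels to use, counted from the lowest (e.g. 0.9 keeps the levels up to the 90th percentile); applies to the first two panels
- y1_scale, y2_scale, y3_scale: the y-axis scaling of the three panels: 'linear', 'log10', or 'log2'
- x3_scale: the x-axis scaling of the third panel: 'linear', 'log10', or 'log2'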

In [None]:
import os
import glob
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

# V-1.5: aggregate the samples of all ranks per ng value and draw three panels:
# (1) inverted CDF per ng, (2) violin plot of normalized runtimes,
# (3) minimum and median runtime versus ng.

# Configuration
folder = './'  # Set the path to your CSV files
ntiles = 100  # Number of quantile levels
ntile_cut = 0.9  # Cut-off for quantiles (e.g., 0.9 means use only up to 90th percentile)
y1_scale = 'log10'  # Choose y-axis scale: 'linear', 'log10', or 'log2'
y2_scale = 'linear'  # Choose y-axis scale: 'linear', 'log10', or 'log2'
x3_scale = 'log10'  # Choose x-axis scale: 'linear', 'log10', or 'log2'
y3_scale = 'log10'  # Choose y-axis scale: 'linear', 'log10', or 'log2'

# Transform for each non-linear scale; any other value leaves data untouched.
scale_funcs = {'log10': np.log10, 'log2': np.log2}


def _apply_scale(values, scale):
    """Return *values* transformed by *scale*; other scales (e.g. 'linear') pass through."""
    func = scale_funcs.get(scale)
    return values if func is None else func(values)


def _scaled_label(label, scale):
    """Return 'log10(label)' / 'log2(label)' for log scales, else *label* unchanged."""
    return f'{scale}({label})' if scale in scale_funcs else label


# Calculate maximum number of tiles to use
max_tiles = int(ntile_cut * ntiles)

# Find all CSV files matching the pattern
pattern = os.path.join(folder, 'meas_r*_ng*.csv')
csv_files = glob.glob(pattern)

# Parse files and group the per-file sample arrays by ng value. Arrays are
# collected and concatenated once per ng, which avoids turning every sample
# into a separate Python object.
ng_chunks = defaultdict(list)
n_loaded = 0  # files that were actually read, for the summary line
for csv_file in csv_files:
    # Extract ng from filename: meas_r<rankid>_ng<ng>.csv
    name_parts = os.path.basename(csv_file).split('_')
    if len(name_parts) < 3 or not name_parts[2].startswith('ng'):
        continue
    ng_str = name_parts[2][2:].split('.')[0]  # Remove 'ng' prefix and '.csv' suffix
    try:
        ng = int(ng_str)
        # Read the single column CSV
        data = np.loadtxt(csv_file)
    except (ValueError, OSError):
        print(f"Warning: Could not process file {csv_file}")
        continue
    # atleast_1d: np.loadtxt returns a 0-d array for a one-value file.
    ng_chunks[ng].append(np.atleast_1d(data).ravel())
    n_loaded += 1

# All samples per ng as one 1-D array; ng values without any sample are dropped
# because no quantile, minimum or median can be computed for them.
ng_data = {}
for ng, chunks in ng_chunks.items():
    samples = np.concatenate(chunks)
    if samples.size:
        ng_data[ng] = samples
    else:
        print(f"Warning: no samples for ng={ng}, skipping")

# Sort ng values for consistent plotting
sorted_ngs = sorted(ng_data)

if not sorted_ngs:
    print(f"No usable CSV files found in folder {folder}")
else:
    # Sort each ng's samples once; panels 1 and 2 both index into the result.
    sorted_samples = {ng: np.sort(ng_data[ng]) for ng in sorted_ngs}

    # Create figure with three subplots
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))

    # Plot 1: ICDF for each ng
    for ng in sorted_ngs:
        sorted_data = sorted_samples[ng]
        n = len(sorted_data)

        # Generate quantile data only up to max_tiles
        q_ids = np.linspace(0, n-1, ntiles, dtype=int)[:max_tiles]
        # Quantile of each picked sample: its 1-based position divided by n.
        qs = (q_ids + 1) / n
        quantile_times = _apply_scale(sorted_data[q_ids], y1_scale)

        ax1.plot(qs, quantile_times, label=f'ng={ng}', marker='o', markersize=2)

    ax1.set_xlabel('Quantile')
    ax1.set_ylabel(_scaled_label('Time (ns)', y1_scale))
    ax1.set_title('Inverse CDF for Different ng Values')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Plot 2: Violin plot with normalized data
    violin_data = []
    violin_labels = []
    for ng in sorted_ngs:
        sorted_data = sorted_samples[ng]
        n = len(sorted_data)

        # Generate quantile data only up to max_tiles
        q_ids = np.linspace(0, n-1, ntiles, dtype=int)[:max_tiles]
        limited_data = sorted_data[q_ids]

        # Normalize to the smallest selected sample of this ng
        min_time = np.min(limited_data)
        normalized_data = _apply_scale(limited_data / min_time, y2_scale)
        violin_data.append(normalized_data)
        violin_labels.append(f'{ng}')

    # Create violin plot
    parts = ax2.violinplot(violin_data, positions=range(len(sorted_ngs)), showmeans=True)

    # Customize violin plot
    for pc in parts['bodies']:
        pc.set_alpha(0.7)

    ax2.set_xlabel('ng')
    # Label the axis with the transformation that was actually applied.
    ax2.set_ylabel(_scaled_label('Normalized Time (ns / min_ns)', y2_scale))
    ax2.set_title('Violin Plot: Normalized Runtime Distribution')
    ax2.set_xticks(range(len(sorted_ngs)))
    ax2.set_xticklabels(violin_labels)
    ax2.grid(True, alpha=0.3)

    # Plot 3: Minimum and median runtime for each ng (over all samples,
    # not limited by ntile_cut)
    min_times = [np.min(ng_data[ng]) for ng in sorted_ngs]
    median_times = [np.median(ng_data[ng]) for ng in sorted_ngs]

    # Apply y-axis scaling for plot 3
    min_times_scaled = _apply_scale(np.array(min_times), y3_scale)
    median_times_scaled = _apply_scale(np.array(median_times), y3_scale)

    # Apply x-axis scaling for plot 3. The scaled values get their own name so
    # that sorted_ngs keeps holding the real ng values.
    x3_values = _apply_scale(np.array(sorted_ngs, dtype=float), x3_scale)

    ax3.plot(x3_values, min_times_scaled, label='Minimum', marker='o', linestyle='-', markersize=4)
    ax3.plot(x3_values, median_times_scaled, label='Median', marker='s', linestyle='-', markersize=4)

    ax3.set_xlabel(_scaled_label('ng', x3_scale))
    ax3.set_ylabel(_scaled_label('Time (ns)', y3_scale))
    ax3.set_title('Minimum and Median Runtime vs ng')
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

# Report only the files that were actually read.
print(f"Processed {n_loaded} CSV files for {len(sorted_ngs)} different ng values")


## V2 Derived/Statistical Metrics
### V2.1 