
# ParTES Plot Notebook

## V-1 Raw Data
### V-1.1 Single rank inverted CDF 

Plot the inverse CDF (quantile function) of a single rank's measured times, in nanoseconds.

**Usage**: 
- folder: the path of the csv files
- rank: the MPI rank
- ntiles: the number of tiles
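- cut_tile: the fraction (0–1) of the tiles to draw, counted from the lowest; the tiles above it are cut off
- y_scale: the y-axis scaling: 'linear', 'log10', or 'log2'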

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import glob
import os

# V-1.1: inverse-CDF plot of one rank's measured times, one curve per ng file.
folder = '.'
rank = 2
# Set total tiles and cut fraction
ntiles = 100     # number of quantile points sampled from each file
cut_tile = 0.99  # fraction (0-1) of the tiles to draw, counted from the lowest

# Choose y-axis scale, available: 'log10', 'log2', 'linear'
y_scale = 'linear'


def _ng_sort_key(path):
    """Order files by the ng value in their name (numeric first, then text)."""
    ng = os.path.basename(path).split('_ng')[1].split('.csv')[0]
    return (0, int(ng), '') if ng.isdecimal() else (1, 0, ng)


pattern = os.path.join(folder, f'meas_r{rank}_ng*.csv')
# glob returns matches in arbitrary order; sort them so the legend and the
# colour assigned to each ng are the same on every run.
csv_files = sorted(glob.glob(pattern), key=_ng_sort_key)

# Any value other than 'log10'/'log2' is treated as 'linear' (no transform).
log_fn = {'log10': np.log10, 'log2': np.log2}.get(y_scale)
# The plotted values are log-transformed, so the axis label must say so.
y_label = 'Time (ns)' if log_fn is None else f'{y_scale}(Time (ns))'

# Number of tiles kept after the cut; identical for every file.
max_tiles = max(int(cut_tile * ntiles), 0)

if not csv_files:
    print(f"No CSV files found for rank {rank} in folder {folder}")
else:
    plt.figure(figsize=(10, 6))
    n_plotted = 0
    for csv_file in csv_files:
        # atleast_1d: loadtxt returns a 0-d array for a single-value file,
        # which has no len().
        sorted_t = np.sort(np.atleast_1d(np.loadtxt(csv_file)))
        n = len(sorted_t)
        if n == 0:
            # Indexing an empty array below would raise IndexError.
            print(f"Skipping empty file {csv_file}")
            continue

        # Generate ntiles quantile data
        # Map array's ID with quantiles
        q_ids = np.linspace(0, n - 1, ntiles, dtype=int)
        # Map percentile with quantiles
        qs = (q_ids + 1) / n
        filt_t = sorted_t[q_ids]

        # Keep only the lowest max_tiles tiles
        qs = qs[:max_tiles]
        filt_t = filt_t[:max_tiles]

        # Apply y-axis scaling
        if log_fn is not None:
            filt_t = log_fn(filt_t)

        ng = os.path.basename(csv_file).split('_ng')[1].split('.csv')[0]
        plt.plot(qs, filt_t, label=f'ng={ng}', marker='x', markersize=1, linestyle='--')
        n_plotted += 1

    plt.xlabel('Quantile')
    plt.ylabel(y_label)
    plt.title(f'ICDF Comparison for Rank {rank}')
    if n_plotted:
        # legend() warns when there is nothing labelled to show.
        plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()


### V-1.2 Single rank normalized box

Draw a box plot of all measured times for one rank. Runtimes are normalized to the minimum measured runtime.

**Usage**: 
- folder: the path of the csv files
- rank: the MPI rank
- ntiles: the number of tiles (not used by the box plot)
- cut_tile: the fraction (0–1) of the lowest samples to keep; samples above it are dropped before plotting
- y_scale: the y-axis scaling: 'linear', 'log10', or 'log2'

In [None]:
# V-1.2: box plot of one rank's runtimes, one box per ng file, each
# normalized to that file's minimum measured runtime.
# Parameters for box plot
folder = '.'        # Folder containing CSV files
rank = 0            # MPI rank to analyze
ntiles = 100        # Number of quantiles (not used for box plot but kept for consistency)
cut_tile = 0.95     # Fraction (0-1) of the lowest samples kept for each ng
y_scale = 'linear'  # Y-axis scaling: 'linear', 'log10', or 'log2'

# Load data for box plot
pattern = os.path.join(folder, f'meas_r{rank}_ng*.csv')
csv_files = glob.glob(pattern)

# Raw samples and minimum runtime for each ng value (keyed by the ng string)
data_dict = {}
min_times = {}

# First pass: collect all data and find minimum times
for csv_file in csv_files:
    # atleast_1d: loadtxt returns a 0-d array for a single-value file.
    data = np.atleast_1d(np.loadtxt(csv_file))
    if data.size == 0:
        # np.min of an empty array would raise ValueError.
        print(f"Skipping empty file {csv_file}")
        continue
    ng = os.path.basename(csv_file).split('_ng')[1].split('.csv')[0]
    data_dict[ng] = data
    min_times[ng] = np.min(data)

# Any value other than 'log10'/'log2' is treated as 'linear' (no transform).
log_fn = {'log10': np.log10, 'log2': np.log2}.get(y_scale)
y_label = 'Normalized Time (relative to minimum)'
if log_fn is not None:
    # The plotted values are log-transformed, so the axis label must say so.
    y_label = f'{y_scale}({y_label})'

# Prepare data for box plot
box_data = []
labels = []

for ng in sorted(data_dict, key=int):
    # Sort data and keep only the lowest cut_tile fraction of the samples
    sorted_t = np.sort(data_dict[ng])
    n_keep = int(len(sorted_t) * cut_tile)
    if n_keep == 0:
        print(f"Skipping ng={ng}: no samples left after applying cut_tile={cut_tile}")
        continue
    filt_t = sorted_t[:n_keep]

    # Normalize filtered data to minimum measured runtime
    norm_t = filt_t / min_times[ng]

    # Apply y-axis scaling
    if log_fn is not None:
        norm_t = log_fn(norm_t)

    box_data.append(norm_t)
    labels.append(f'ng={ng}')

if not box_data:
    print(f"No CSV files found for rank {rank} in folder {folder}")
else:
    # Create normalized box plot
    plt.figure(figsize=(12, 6))
    plt.boxplot(box_data, patch_artist=True)
    # Boxes are drawn at positions 1..N by default. Labelling them through
    # xticks avoids the boxplot keyword that was renamed from `labels` to
    # `tick_labels` in Matplotlib 3.9, so this works on old and new versions.
    plt.xticks(range(1, len(labels) + 1), labels, rotation=45)
    plt.xlabel('ng values')
    plt.ylabel(y_label)
    plt.title(f'Normalized Runtime Distribution for Rank {rank} (lowest {cut_tile*100:.0f}% of samples)')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

### V-1.3 Single rank normalized violin

Draw a violin plot of all measured times for one rank. Runtimes are normalized to the minimum measured runtime.

**Usage**: 
- folder: the path of the csv files
- rank: the MPI rank
- ntiles: the number of tiles (not used by the violin plot)
- cut_tile: the fraction (0–1) of the lowest samples to keep; samples above it are dropped before plotting
- y_scale: the y-axis scaling: 'linear', 'log10', or 'log2'

In [None]:
# V-1.3: violin plot of one rank's runtimes, one violin per ng file, each
# normalized to that file's minimum measured runtime.
folder = "."
rank = 0
cut_tile = 0.95     # Fraction (0-1) of the lowest samples kept for each ng
y_scale = 'linear'  # Options: 'linear', 'log10', 'log2'

# Collect data for the specified rank (keyed by the integer ng value)
data_dict = {}
min_times = {}

for fname in os.listdir(folder):
    if fname.startswith(f'meas_r{rank}_ng') and fname.endswith('.csv'):
        # Extract ng value from filename
        ng_str = fname.split('_ng')[1].split('.csv')[0]
        ng = int(ng_str)

        # Read CSV data
        fpath = os.path.join(folder, fname)
        # atleast_1d: loadtxt returns a 0-d array for a single-value file.
        times = np.atleast_1d(np.loadtxt(fpath))
        if times.size == 0:
            # np.min of an empty array would raise ValueError.
            print(f"Skipping empty file {fpath}")
            continue

        data_dict[ng] = times
        min_times[ng] = np.min(times)

# Any value other than 'log10'/'log2' is treated as 'linear' (no transform).
log_fn = {'log10': np.log10, 'log2': np.log2}.get(y_scale)
y_label = 'Normalized Time (relative to minimum)'
if log_fn is not None:
    # The plotted values are log-transformed, so the axis label must say so.
    y_label = f'{y_scale}({y_label})'

# Prepare data for violin plot
violin_data = []
labels = []

if not data_dict:
    print(f"No CSV files found for rank {rank} in folder {folder}")
else:
    print(f"Found data for ng values: {sorted(data_dict.keys())}")

    for ng in sorted(data_dict):
        # Sort data and keep only the lowest cut_tile fraction of the samples
        sorted_t = np.sort(data_dict[ng])
        n_keep = int(len(sorted_t) * cut_tile)
        if n_keep == 0:
            # violinplot cannot draw an empty dataset.
            print(f"Skipping ng={ng}: no samples left after applying cut_tile={cut_tile}")
            continue
        filt_t = sorted_t[:n_keep]

        # Normalize filtered data to minimum measured runtime
        norm_t = filt_t / min_times[ng]

        # Apply y-axis scaling
        if log_fn is not None:
            norm_t = log_fn(norm_t)

        violin_data.append(norm_t)
        labels.append(f'ng={ng}')

# Only plot when there is something to draw: violinplot fails on an empty
# list, so the "no files" case must not fall through to it.
if violin_data:
    # Create violin plot
    plt.figure(figsize=(10, 6))
    parts = plt.violinplot(violin_data, positions=range(1, len(violin_data)+1), showmeans=True, showmedians=True)

    # Customize violin plot colors
    for pc in parts['bodies']:
        pc.set_facecolor('lightblue')
        pc.set_alpha(0.7)

    plt.xticks(range(1, len(labels)+1), labels, rotation=45)
    plt.xlabel('ng values')
    plt.ylabel(y_label)
    plt.title(f'Normalized Runtime Distribution for Rank {rank} (lowest {cut_tile*100:.0f}% of samples)')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()


## V2 Derived/Statistical Metrics
### V2.1 