In [None]:
import matplotlib.pyplot as plt
import csv
import itertools
import numpy as np

def load_data_from_csv(filename):
    """Load the numeric values found in the third column of a CSV file.

    Rows with fewer than three columns (blank lines included) are skipped.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        list[float]: The third-column values, in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a third-column value cannot be parsed as a float.
    """
    values = []
    # newline='' leaves newline handling to the csv module, as its docs recommend.
    with open(filename, 'r', newline='') as f:
        for row in csv.reader(f):
            # row[2] requires at least three columns; a weaker guard would let
            # two-column rows through and raise IndexError.
            if len(row) > 2:
                values.append(float(row[2]))
    return values

In [None]:

# Load the third-column samples of both runs, in (direct, broker) order.
# NOTE(review): the "broker" series is read from a file named *_direct.csv --
# confirm this is the intended input and not a copy-paste of the direct path.
direct_values, broker_values = map(
    load_data_from_csv,
    ('./sample_data/ds1-1_direct.csv', './sample_data/ds5-1_direct.csv'),
)

def calculate_cdf(values):
    """Return the ascending-sorted samples and their empirical CDF.

    Args:
        values: Sequence of numeric samples.

    Returns:
        tuple: ``(ordered, cumulative)`` where ``ordered`` is ``np.sort(values)``
        and ``cumulative[i]`` equals ``(i + 1) / len(ordered)``, i.e. the
        fraction of samples less than or equal to ``ordered[i]``.
    """
    ordered = np.sort(values)
    sample_count = len(ordered)
    # Ranks 1..n divided by n: the last entry is always exactly 1.0.
    cumulative = np.arange(1, sample_count + 1) / sample_count
    return ordered, cumulative

# Calculate the empirical CDFs for both datasets
direct_sorted, direct_cdf = calculate_cdf(direct_values)
broker_sorted, broker_cdf = calculate_cdf(broker_values)

# Build the figure through the explicit Figure/Axes interface rather than the
# pyplot state machine, so every call names the axes it draws on.
fig, ax = plt.subplots(figsize=(10, 6))
# CDF fractions are scaled to percentages; where='post' holds each level
# until the next sample value.
ax.step(direct_sorted, direct_cdf * 100, where='post',
        label='Direct', linewidth=2)
ax.step(broker_sorted, broker_cdf * 100, where='post',
        label='Broker', linewidth=2)

# Formatting
ax.set_xlabel('Time (seconds)', fontsize=12)
ax.set_ylabel('Percentage of Data (%)', fontsize=12)
ax.set_title('Cumulative Distribution Function (CDF) Comparison', fontsize=14)
# Leave 10% headroom to the right of the largest sample of either series.
# ndarray.max() avoids iterating the NumPy arrays element by element in Python.
ax.set_xlim(0, max(direct_sorted.max(), broker_sorted.max()) * 1.1)
ax.set_ylim(0, 100)
ax.grid(True, linestyle='--', alpha=0.7)
ax.legend(fontsize=12)
fig.tight_layout()

plt.show()


In [None]:
# Plot latency on the y-axis against start time on the x-axis (start times shifted so the earliest is 0)

import csv
import matplotlib.pyplot as plt

def read_data(filename):
    """Parse a CSV file into parallel lists of timestamps and latencies.

    Each row is expected to look like
    ``[pub_id, dissemination, latency, timestamp]``. Rows with fewer than
    four columns, or whose latency/timestamp fields are not valid floats,
    are silently skipped.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        tuple[list[float], list[float]]: ``(timestamps, latencies)``, aligned
        index by index and in file order.
    """
    timestamps, latencies = [], []
    with open(filename, "r") as file:
        for fields in csv.reader(file):
            if len(fields) >= 4:
                try:
                    # Parse both fields before storing either one, so the two
                    # lists can never get out of step.
                    latency_value, start_time = float(fields[2]), float(fields[3])
                except ValueError:
                    # Non-numeric row (e.g. a header line): ignore it.
                    continue
                latencies.append(latency_value)
                timestamps.append(start_time)
    return timestamps, latencies

def scale_timestamps(timestamps):
    """
    Shift the timestamps so that the earliest one becomes 0.

    Each result is ``original - min(timestamps)``; the values are only offset,
    not divided by the range. An empty input yields an empty list.
    """
    if timestamps:
        origin = min(timestamps)
        return [stamp - origin for stamp in timestamps]
    return []

def plot_latency(scaled_timestamps, latencies):
    """Draw latency (y-axis) against scaled start time (x-axis) and show it.

    Args:
        scaled_timestamps: X coordinates, one per sample.
        latencies: Y coordinates, aligned with ``scaled_timestamps``.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    # Points are joined in the order given, with a circle marker on each sample.
    ax.plot(scaled_timestamps, latencies, marker='o', linestyle='-', color='blue')
    ax.set_xlabel("Start Time (scaled)")
    ax.set_ylabel("Latency")
    ax.set_title("Latency vs. Scaled Start Time")
    ax.grid(True)
    fig.tight_layout()
    plt.show()

if __name__ == "__main__":
    # Read the samples, then plot them only when at least one valid row exists.
    filename = "output.csv"
    timestamps, latencies = read_data(filename)
    if timestamps:
        scaled_timestamps = scale_timestamps(timestamps)
        plot_latency(scaled_timestamps, latencies)
    else:
        print("No valid data found in the CSV file.")
