In [2]:
import math
import os.path
import csv
import pickle
import time

import copy
import warnings
from typing import List, Dict
from collections import OrderedDict

import matplotlib.pyplot as plt
from scipy.stats import expon, gamma, pareto
import numpy as np

# Global debug switch, off by default.
# NOTE(review): none of the cells visible in this section read DEBUG — confirm
# it is used further down the notebook before relying on it.
DEBUG = False

In [3]:
def preprocess_azure_v1_trace(trace_dir, n_day=14):
    """Merge the per-day Azure Functions v1 invocation CSVs into one pickle.

    For every day ``d`` in ``1..n_day`` the file
    ``invocations_per_function_md.anon.d{dd}.csv`` inside ``trace_dir`` is
    read. Each row contributes the integer columns named ``"1"`` .. ``"1440"``
    (presumably one bucket per minute of the day) for the function named by
    its ``HashFunction`` column. The per-day rows are laid out back to back in
    one ``np.int32`` array of length ``1440 * n_day`` per function; days on
    which a function has no row stay zero. The resulting ``OrderedDict``
    (function name -> array, ordered by first appearance) is pickled to
    ``<trace_dir>/azure_v1.pkl`` and summary statistics are printed.

    Args:
        trace_dir: Directory holding the daily CSV files. The pickle is
            written into the same directory.
        n_day: Number of consecutive daily files to read, starting at day 1.

    Returns:
        None. The result is only written to disk.

    Raises:
        RuntimeError: If ``trace_dir`` does not exist.
        ValueError: If the same ``HashFunction`` appears more than once in a
            single daily file.
    """
    if not os.path.exists(trace_dir):
        raise RuntimeError(f"Trace directory does not exist: {trace_dir}")

    minutes_per_day = 1440
    total_minutes = minutes_per_day * n_day
    # Column names are identical for every row, so build them only once.
    minute_columns = [str(j) for j in range(1, minutes_per_day + 1)]

    tracelines = OrderedDict()
    print(f"Reading azure v1 trace in {n_day} days; it might take a while...")
    tic = time.time()
    for day in range(1, n_day + 1):
        filename = os.path.join(trace_dir, f"invocations_per_function_md.anon.d{day:02d}.csv")
        print(f"Read file: {filename}")
        day_start = minutes_per_day * (day - 1)
        # Tracks functions already seen in *this* file so that a duplicate row
        # is reported clearly instead of silently overwriting the earlier one.
        seen_today = set()
        with open(filename, newline="") as csvfile:
            for row in csv.DictReader(csvfile):
                function_name = row["HashFunction"]
                if function_name in seen_today:
                    raise ValueError(
                        f"Duplicate HashFunction {function_name!r} in {filename}")
                seen_today.add(function_name)
                histogram_1min = np.array([int(row[col]) for col in minute_columns],
                                          dtype=np.int32)
                if function_name not in tracelines:
                    # Preallocate the whole trace; days without a row for this
                    # function therefore remain zero without explicit padding.
                    tracelines[function_name] = np.zeros((total_minutes,), dtype=np.int32)
                tracelines[function_name][day_start:day_start + minutes_per_day] = histogram_1min
    print(f"Reading takes: {time.time() - tic}s.")

    # report the stats.
    num_function_invocations = [np.sum(histogram_1min) for histogram_1min in tracelines.values()]
    num_functions = len(tracelines)
    # Guard the aggregates so an empty trace set reports zeros instead of
    # failing on max()/min() of an empty list.
    avg_invocations = np.mean(num_function_invocations) if num_function_invocations else 0.0
    print(f"Azure trace v1, stats: #days: {n_day}, #functions: {num_functions}, "
          f"total invocations: {sum(num_function_invocations)}, "
          f"max: {max(num_function_invocations, default=0)}, "
          f"min: {min(num_function_invocations, default=0)}, "
          f"avg: {avg_invocations:.2f}")

    # pickle it to disk
    save_path = os.path.join(trace_dir, "azure_v1.pkl")
    with open(save_path, "wb") as handle:
        pickle.dump(tracelines, handle)
    print(f"Dump the data into {save_path}, file size: {os.path.getsize(save_path) // 1e6} MB.")

In [6]:
# Run the v1 preprocessing on the local dataset copy with the default n_day=14;
# the function writes azure_v1.pkl into the same directory.
preprocess_azure_v1_trace(trace_dir="/home/zy/data/datasets/azurefunctions-dataset2019")

Reading azure v1 trace in 14 days; it might take a while...
Read file: /home/zy/data/datasets/azurefunctions-dataset2019/invocations_per_function_md.anon.d01.csv
Read file: /home/zy/data/datasets/azurefunctions-dataset2019/invocations_per_function_md.anon.d02.csv
Read file: /home/zy/data/datasets/azurefunctions-dataset2019/invocations_per_function_md.anon.d03.csv
Read file: /home/zy/data/datasets/azurefunctions-dataset2019/invocations_per_function_md.anon.d04.csv
Read file: /home/zy/data/datasets/azurefunctions-dataset2019/invocations_per_function_md.anon.d05.csv
Read file: /home/zy/data/datasets/azurefunctions-dataset2019/invocations_per_function_md.anon.d06.csv
Read file: /home/zy/data/datasets/azurefunctions-dataset2019/invocations_per_function_md.anon.d07.csv
Read file: /home/zy/data/datasets/azurefunctions-dataset2019/invocations_per_function_md.anon.d08.csv
Read file: /home/zy/data/datasets/azurefunctions-dataset2019/invocations_per_function_md.anon.d09.csv
Read file: /home/zy/da

In [7]:
def preprocess_azure_v2_trace(trace_dir):
    """Convert the Azure v2 invocation trace into per-function sorted arrays.

    Reads ``AzureFunctionsInvocationTraceForTwoWeeksJan2021.txt`` from
    ``trace_dir`` as a CSV with a header row. For each record the value
    ``end_timestamp - duration`` (presumably the invocation's start time) is
    collected under the record's ``func`` column. Every function's values are
    then sorted ascending into a NumPy array, summary statistics are printed,
    and the ``OrderedDict`` (function -> array, ordered by first appearance)
    is pickled to ``<trace_dir>/azure_v2.pkl``.

    Args:
        trace_dir: Directory containing the trace file; the pickle is written
            into the same directory.

    Returns:
        None. The result is only written to disk.

    Raises:
        RuntimeError: If ``trace_dir`` does not exist.
    """
    if not os.path.exists(trace_dir):
        raise RuntimeError(f"{trace_dir}")

    trace_path = os.path.join(trace_dir, "AzureFunctionsInvocationTraceForTwoWeeksJan2021.txt")
    print("Reading azure v2 trace in 14 days...")
    started_at = time.time()

    # First pass: gather the raw per-function values in file order.
    collected = OrderedDict()
    with open(trace_path, newline="") as trace_file:
        for record in csv.DictReader(trace_file):
            func_id = record["func"]
            finished = float(record["end_timestamp"])
            elapsed = float(record["duration"])
            collected.setdefault(func_id, []).append(finished - elapsed)

    # Second pass: freeze each list into a sorted array, keeping key order.
    tracelines = OrderedDict(
        (func_id, np.sort(np.array(values))) for func_id, values in collected.items()
    )
    print(f"Reading takes: {time.time() - started_at}s.")

    # Do some check and report stats:
    function_count = len(tracelines)
    invocation_counts = [len(series) for series in tracelines.values()]
    total_invocations = sum(invocation_counts)
    most_invocations = max(invocation_counts)
    fewest_invocations = min(invocation_counts)
    mean_invocations = np.mean(invocation_counts)
    print(f"Azure trace v2, stats: #days: 14, #functions: {function_count}, "
          f"total invocations: {total_invocations}, "
          f"max: {most_invocations}, min: {fewest_invocations}, "
          f"avg: {mean_invocations:.2f}")

    # pickle it to disk
    save_path = os.path.join(trace_dir, "azure_v2.pkl")
    with open(save_path, "wb") as out_file:
        pickle.dump(tracelines, out_file)
    size_mb = os.path.getsize(save_path) // 1e6
    print(f"Dump the data into {save_path}, file size: {size_mb} MB.")

In [8]:
# Run the v2 preprocessing; the trace file is looked up inside this directory
# and azure_v2.pkl is written next to it.
preprocess_azure_v2_trace(trace_dir="/home/zy/data/datasets")

Reading azure v2 trace in 14 days...
Reading takes: 8.449933052062988s.
Azure trace v2, stats: #days: 14, #functions: 424, total invocations: 1980951, max: 535667, min: 1, avg: 4672.05
Dump the data into /home/zy/data/datasets/azure_v2.pkl, file size: 15.0 MB.
