In [1]:
import pandas as pd
import os
import json

In [2]:
def _groupby_instance_sequence(grp):
    return pd.Series({"sequence" : " ".join(grp["op_id"].to_list())})

def _groupby_sequence_occ(grp):
    return pd.Series({"occ" : len(grp)})

def sequences_and_frequencies(df_sequence_occ, occ_threshold=2):
    """Keep the frequent sequences and compute their relative frequencies.

    Parameters
    ----------
    df_sequence_occ : pandas.DataFrame
        One row per distinct sequence, with a ``sequence`` column
        (space-separated tokens) and an ``occ`` column (occurrence count).
    occ_threshold : int, optional
        Only sequences whose ``occ`` is strictly greater than this value are
        kept. Defaults to 2.

    Returns
    -------
    (list of tuple of str, list of float)
        The kept sequences, each split on single spaces into a tuple, and the
        share of each kept sequence among the kept occurrences (``occ``
        divided by the sum of the kept ``occ`` values). Both lists are empty
        when no sequence exceeds the threshold.
    """
    frequent = df_sequence_occ[df_sequence_occ["occ"] > occ_threshold]
    sequences = frequent["sequence"].apply(lambda seq: tuple(seq.split(" ")))
    # Normalise over the kept rows only, so the frequencies sum to 1.
    p = frequent["occ"] / frequent["occ"].sum()
    return sequences.to_list(), p.to_list()

def _flat(LOL):
    return [e for L in LOL for e in L]

def to_patterns(filepath):
    """Extract the frequent operation sequences from a CSV log.

    The CSV must contain an ``instance`` column and an ``op_id`` column. The
    ``op_id`` values of each instance are joined into one sequence, identical
    sequences are counted, and the frequent ones are kept (see
    ``sequences_and_frequencies``).

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read.

    Returns
    -------
    (list, list of tuple, list of float)
        ``vocabulary``: the distinct tokens occurring in the kept patterns,
        sorted; ``patterns``: the kept sequences as tuples of tokens; ``p``:
        the relative frequency of each pattern.
    """
    # "Unnamed: 0" only exists when the CSV was saved together with its index;
    # errors="ignore" keeps files saved without it from raising KeyError.
    df = pd.read_csv(filepath).drop("Unnamed: 0", axis=1, errors="ignore")
    df_instance_sequence = df.groupby("instance").apply(_groupby_instance_sequence).reset_index()
    df_sequence_occ = df_instance_sequence.groupby("sequence").apply(_groupby_sequence_occ).reset_index()

    patterns, p = sequences_and_frequencies(df_sequence_occ)
    # Sort so the vocabulary order is the same on every run; iterating a set
    # of strings gives an order that can change between interpreter sessions.
    vocabulary = sorted(set(_flat(patterns)))

    return vocabulary, patterns, p

def write(filename, vocabulary, patterns, p, dir_out):
    """Save the extracted parameters as ``sim_<filename>.json`` in ``dir_out``.

    Parameters
    ----------
    filename : str
        Name of the source dataset; stored under ``"original_filename"`` and
        used to build the output file name.
    vocabulary : list
        Stored under ``"vocabulary"``.
    patterns : list
        Stored under ``"patterns"``. JSON has no tuple type, so tuples are
        written as arrays.
    p : list
        Stored under ``"p"``.
    dir_out : str
        Output directory; created (including parents) if it does not exist.
    """
    data = {
        "original_filename": filename,
        "vocabulary": vocabulary,
        "patterns": patterns,
        "p": p,
    }
    os.makedirs(dir_out, exist_ok=True)
    pathout = os.path.join(dir_out, "sim_{}.json".format(filename))
    # The context manager flushes and closes the file even if dump() raises.
    with open(pathout, "w") as fh:
        json.dump(data, fh)

def read(filename, dir_in):
    """Load the parameters stored in ``sim_<filename>.json`` inside ``dir_in``.

    Parameters
    ----------
    filename : str
        Dataset name used to build the file name.
    dir_in : str
        Directory containing the JSON file.

    Returns
    -------
    (list, list of tuple, list)
        The stored ``vocabulary``, the stored ``patterns`` with every pattern
        converted to a tuple, and the stored ``p``.

    Raises
    ------
    AssertionError
        If the JSON file does not exist.
    """
    pathin = os.path.join(dir_in, "sim_{}.json".format(filename))
    assert os.path.isfile(pathin), "no such file: {}".format(pathin)
    # The context manager closes the file as soon as parsing is done.
    with open(pathin) as fh:
        data = json.load(fh)

    # JSON stores each pattern as an array; turn them back into tuples.
    patterns = [tuple(pattern) for pattern in data["patterns"]]

    return data["vocabulary"], patterns, data["p"]
    

In [3]:
def make_static_params(filename, dir_out=None):
    """Extract the patterns of ``<filename>.csv``, save them and verify the file.

    The CSV is read from the current working directory. After writing, the
    JSON file is read back and compared with the in-memory values.

    Parameters
    ----------
    filename : str
        Dataset name without the ``.csv`` extension.
    dir_out : str, optional
        Directory receiving the JSON file. Defaults to the
        ``static-openstack`` directory next to the current working directory.

    Raises
    ------
    AssertionError
        If the values read back differ from the ones that were written.
    """
    if dir_out is None:
        # Build the path with os.path.join: a literal "..\\static-openstack"
        # is only a parent-relative path on Windows; elsewhere the backslash
        # is an ordinary character of the directory name.
        dir_out = os.path.join("..", "static-openstack")
    vocabulary, patterns, p = to_patterns("{}.csv".format(filename))

    write(filename, vocabulary, patterns, p, dir_out)

    # Round-trip check: what was saved must load back unchanged.
    ret_vocabulary, ret_patterns, ret_p = read(filename, dir_out)
    assert ret_vocabulary == vocabulary
    assert ret_patterns == patterns
    assert ret_p == p

In [4]:
# Build and verify the static parameter file of every validation dataset.
VALIDATION_DATASETS = ("openstack_val_n1", "openstack_val_n2")
for filename in VALIDATION_DATASETS:
    make_static_params(filename)