In [None]:
import os
import sys

import itertools
import inspect
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats
import functools
import tabulate
import time

sys.path.append("../")
from helper import load_data, preprocess, data_source_release, dist_helper

sc_dict = data_source_release.get_sc_dict()
data_dir = data_source_release.get_data_dir()

%matplotlib inline
plt.style.use('seaborn-deep')
dpi=300

In [None]:
# Scenario names; each one is used as a key into sc_dict and passed to
# load_data.get_data as `scenario`.
SCENARIOS = [
    "yalsat_swgcp",
    "spear_swgcp",
    "clasp_factoring",
    "saps-CVVAR",
    "lpg-zeno",
    "yalsat_qcp",
    "spear_qcp",
]

# Distribution names handed to the dist_helper functions as `dist_name`.
DISTS = [
    "invgauss_floc",
    "norm",
    "lognorm_floc",
    "expon_floc",
]

In [None]:
#%%time
# Fit every distribution in DISTS to every instance of every scenario and keep,
# per scenario and distribution, one list entry per instance:
#   param_dict -> parameters returned by dist_helper.fit_dist
#   p_dict     -> value returned by dist_helper.kstest (presumably the KS-test
#                 p-value; it is thresholded as "p" in the cells below)
#   lkh_dict   -> value returned by dist_helper.nllh (reported as "nll" below)
p_dict = dict()
lkh_dict = dict()
param_dict = dict()

for SC in SCENARIOS:
    p_dict[SC] = dict()
    lkh_dict[SC] = dict()
    param_dict[SC] = dict()
    print("Load %s" % SC)

    runtimes, _, _ = load_data.get_data(scenario=SC, data_dir=data_dir,
                                        sc_dict=sc_dict, retrieve=sc_dict[SC]['use'])

    print(runtimes.shape)
    # Normalise all runtimes of the scenario by their overall maximum.
    y_max_ = np.max(np.max(runtimes))
    print(y_max_)
    runtimes = (runtimes) / y_max_

    # A progress dot is printed about every tenth of the instances. The step is
    # clamped to 1 so that a scenario with fewer than 10 instances does not
    # cause a modulo-by-zero.
    progress_step = max(1, len(runtimes) // 10)

    for dist_name in DISTS:
        #if "floc" in dist_name:
        #    continue
        start = time.time()
        print("{:>20s}".format(dist_name), end="")
        p_dict[SC][dist_name] = list()
        lkh_dict[SC][dist_name] = list()
        param_dict[SC][dist_name] = list()

        for idx, instance in enumerate(runtimes):
            # Every instance is expected to hold exactly 100 runtime values.
            assert len(instance) == 100
            if idx % progress_step == 0: print(".", end="")
            instance = np.array(instance)
            param = dist_helper.fit_dist(x=instance, dist_name=dist_name)
            param_dict[SC][dist_name].append(param)

            p = dist_helper.kstest(x=instance, dist_name=dist_name, param=param)
            lkh = dist_helper.nllh(instance, param, dist_name)

            p_dict[SC][dist_name].append(p)
            lkh_dict[SC][dist_name].append(lkh)
        dur = time.time() - start
        # The instance count is an integer, so it is printed with %d; a %g
        # format with two significant digits would render 100 as "1e+02".
        print("%3.2gsec len: %3d mean_p: %3.2g, mean lkh: %3.2g" % (dur, len(runtimes), np.mean(p_dict[SC][dist_name]), np.mean(lkh_dict[SC][dist_name])))

In [None]:
# Short labels used for the distributions in the LaTeX table.
dist_trans = {
    "expon_floc": "EXP",
    "invgauss_floc": "INV",
    "lognorm_floc": "LOG",
    "norm": "N",
}
tabular_data = list()

# Neither value depends on the scenario, so both are computed once.
keys = sorted(DISTS)
TOP = len(DISTS)  # number of ranked distributions shown per scenario

for SC in SCENARIOS:
    perc_ls = list()  # % of instances with p <= 0.01, one entry per distribution
    l_ls = list()     # mean of the lkh_dict entries, one entry per distribution
    d_ls = list()     # short label, one entry per distribution
    for dist_name in keys:
        #if "floc" in dist_name:
        #    continue
        p_values = p_dict[SC][dist_name]
        perc_low_p = np.mean([1 if p <= 0.01 else 0 for p in p_values]) * 100
        perc_ls.append(perc_low_p)
        l_ls.append(np.mean(lkh_dict[SC][dist_name]))
        d_ls.append(dist_trans[dist_name])

    # Rank distributions by ascending mean of the lkh_dict entries.
    sort_idx = np.argsort(l_ls)
    ranked = sort_idx[:TOP]
    labels = [d_ls[i] for i in ranked]

    # Two rows per scenario. The first carries the scenario name and the ranked
    # labels twice: once heading the likelihood columns and once heading the
    # percentage columns of the second row.
    tabular_data.append([SC, ] + labels + labels)
    tabular_data.append([" ", ]
                        + [round(l_ls[i], 3) for i in ranked]
                        + [round(perc_ls[i], 1) for i in ranked])

a = tabulate.tabulate(tabular_data, tablefmt="latex", floatfmt="7.4")
print(a)
#with open("results/Kolmo_res.tex", "w") as fh:
#    fh.write(a)

In [None]:
# For every scenario: print a per-distribution summary ordered by the share of
# instances with p > 0.05 (largest first), then plot the empirical CDFs of the
# first instances next to the fitted CDFs of the best-ranked distributions.
for SC in SCENARIOS:
    keys = sorted(DISTS)

    perc_ls = list()  # % of instances with p > 0.05
    mean_ls = list()  # mean of the p_dict entries
    l_ls = list()     # mean of the lkh_dict entries (printed as "nll")
    for dist_name in keys:
        #if "floc" in dist_name: continue
        num_instances = len(p_dict[SC][dist_name])
        num_equal = np.sum([1 if p > 0.05 else 0 for p in p_dict[SC][dist_name]])
        perc_ls.append(float(num_equal)/num_instances*100)
        mean_ls.append(np.mean(p_dict[SC][dist_name]))
        l_ls.append(np.mean(lkh_dict[SC][dist_name]))

    # argsort is ascending, so the summary iterates it in reverse and the plots
    # take the tail of it.
    sort_idx = np.argsort(perc_ls)
    #with open("plots/distributions.txt", "a") as fh:
    print("{:>20}: {:>10s} {:>10s} {:>5s}".format("name", "% p>0.05", "mean p", "nll"))
    #fh.write("%s\n" % SC)
    #fh.write("{:>20}: {:>10s} {:>10s} {:>5s}\n".format("name", "% p>0.05", "mean p", "nll"))
    for idx in reversed(sort_idx):
        print("{:>20}: {: 10.2f} {: 10.3f} {: 1.3f}".format(keys[idx], perc_ls[idx],
                                                            mean_ls[idx], l_ls[idx]))
        #fh.write("{:>20}: {: 10.2f} {: 10.3f} {: 1.3f}\n".format(keys[idx], perc_ls[idx],
        #                                                         mean_ls[idx], l_ls[idx]))
    #fh.write("\n")
    print()

    # Plot fitted distribution
    runtimes, _, _ = load_data.get_data(scenario=SC, data_dir=data_dir, sc_dict=sc_dict, retrieve=sc_dict[SC]['use'])

    # Same normalisation by the overall maximum as in the fitting cell.
    y_max_ = np.max(np.max(runtimes))
    runtimes = (runtimes) / y_max_

    # Log-spaced evaluation grid for the fitted CDFs.
    # NOTE(review): the runtimes were divided by y_max_ above, but this grid
    # still extends to the unscaled cutoff; plt.xlim below clips the view to the
    # observed data range -- confirm the upper bound is intended.
    x_axis = 10**np.arange(np.log10(0.000001), np.log10(sc_dict[SC]['cutoff']), 0.01)

    # Two independent cycles over the same palette, one for the empirical and
    # one for the fitted curves; both advance once per drawn curve.
    cs = ['#e41a1c','#377eb8','#4daf4a','#984ea3','#ff7f00','#a65628','#f781bf','#999999']
    colors1 = itertools.cycle(cs)
    colors2 = itertools.cycle(cs)

    num_plots = 2   # number of best-ranked distributions to plot
    num_runs = 10   # number of instances drawn per subplot
    plt.figure(figsize=(10, num_plots*4.5))
    for d_idx, s_idx in enumerate(reversed(sort_idx[-num_plots:])):
        dist_name = keys[s_idx]
        #if "floc" in dist_name: continue

        plt.subplot(num_plots, 1, d_idx+1)
        # Running min/max of the plotted data, used for the x-limits below.
        min_ = 100
        max_ = -100
        for idx, instance in enumerate(runtimes[:num_runs]):
            # Empirical CDF of this instance as a step function.
            srtd = np.sort(instance)
            yvals = np.arange(1,len(srtd)+1)/float(len(srtd))
            min_= min(np.min(instance), min_)
            max_= max(np.max(instance), max_)
            plt.step(srtd, yvals, c=next(colors1))
            #plt.hist(instance, normed=True)

        for param in param_dict[SC][dist_name][:num_runs]:
            # Only the fitted CDF is drawn, so dist_helper.lhood_dist is not
            # evaluated here; its result was never used by the plot.
            y = dist_helper.cdf_dist(x_axis, dist_name=dist_name, param=param)
            plt.plot(x_axis, y, c=next(colors2))

        plt.title("%s; %% p: %3.3g; nll: %3.3g" % (dist_name, perc_ls[s_idx], l_ls[s_idx]))
        plt.xscale("log")
        plt.xlim([min_, max_])
        #plt.ylim([-0.01, 1.01])

    plt.tight_layout()
    #plt.savefig("plots/fitted_distributions_%s.png" % SC)

In [None]:
# Histograms of the first two fitted parameters of one distribution: one row
# of subplots per scenario, one column per parameter.
dist_name = "invgauss_floc"

plt.figure(figsize=(5, len(SCENARIOS)*2))
plt_idx = 0
cs = ['#e41a1c','#377eb8','#4daf4a','#984ea3','#ff7f00','#a65628','#f781bf','#999999']
colors1 = itertools.cycle(cs)

for SC in SCENARIOS:
    # One row per instance, one column per fitted parameter.
    p = np.array(param_dict[SC][dist_name])

    for param_idx in (0,1):
        plt_idx += 1
        plt.subplot(len(SCENARIOS), 2, plt_idx)
        # Column titles go on the first row only.
        # NOTE(review): the labels assume that parameter columns 0 and 1 are
        # "mu" and "scale" -- confirm against dist_helper.fit_dist.
        if plt_idx == 1: plt.title("mu")
        if plt_idx == 2: plt.title("scale")
        if plt_idx%2 == 1: plt.ylabel(SC)
        # `normed` was removed from hist in matplotlib 3.1; `density=True` is
        # its replacement and gives the same normalised histogram.
        plt.hist(p[:, param_idx], facecolor=next(colors1), density=True)

plt.tight_layout()
#plt.savefig("plots/%s_parameter_distribution.png" % dist_name)
plt.show()