In [1]:
# import external packages
import numpy as np
import cvxpy as cp
import mosek
from sklearn.model_selection import train_test_split
import time

# import internal packages
import phi_divergence as phi
from iter_gen_and_eval_alg import iter_gen_and_eval_alg

The toy problem we examine is as follows:

\begin{align}\label{toy_model_2}
    \begin{split}
        \max_{\mathbf{0} \leq \mathbf{x} \leq \mathbf{10}}\{\mathbf{e}^T \mathbf{x}: \mathbf{\xi}^T \mathbf{x} \leq 1,~\sum_{j=1}^{k-1}x_j-x_k\leq -1,~\mathbf{x}\leq 10\}.
    \end{split}
\end{align}


Here $\mathbf{x} \in \mathbb{R}^k$ is the decision vector and $\mathbf{\xi}\in [-1,1]^{k}$ is the uncertain parameter, which we assume to be uniformly distributed.

We would like to obtain solutions for which we can make the following probabilistic guarantee regarding their feasibility: $$\mathbb{P}^*(\mathbf{\xi}^T \mathbf{x} \leq 1)\geq 1 - \epsilon$$

In [2]:
# import external packages
import numpy as np
import cvxpy as cp
from sklearn.model_selection import train_test_split
import time
import math

# import internal packages
import phi_divergence as phi
from iter_gen_and_eval_alg import iter_gen_and_eval_alg
import util

# problem specific functions:
def generate_data(random_seed, N, **kwargs):
    """Draw N scenarios with components uniform on [-1, 1).

    Parameters
    ----------
    random_seed : seed passed to ``np.random.seed`` (re-seeds NumPy's global RNG).
    N : number of scenarios (rows) to generate.
    **kwargs : only ``dim_x`` is read (number of columns, default 2).

    Returns
    -------
    numpy.ndarray of shape (N, dim_x).
    """
    n_cols = kwargs.get('dim_x', 2)
    np.random.seed(random_seed)
    return np.random.uniform(-1, 1, size=(N, n_cols))

def generate_data_with_nominal(random_seed, N, **kwargs):
    """Return N scenarios whose first row is the nominal (all-zero) scenario.

    The remaining N-1 rows are drawn uniformly from [-1, 1) after re-seeding
    NumPy's global RNG with ``random_seed``. Only the ``dim_x`` keyword
    (number of columns, default 2) is read from ``kwargs``.
    """
    n_cols = kwargs.get('dim_x', 2)
    np.random.seed(random_seed)
    sampled = np.random.uniform(-1, 1, size=(N - 1, n_cols))
    nominal = np.zeros((1, n_cols))
    # nominal case goes first, followed by the sampled scenarios
    return np.vstack((nominal, sampled))

def solve_P_SCP(S, **kwargs):
    """Solve the sampled convex program for the scenario set S.

    Maximizes sum(x) (posed as minimizing -sum(x)) subject to
    0 <= x <= 10, sum(x[:-1]) <= x[-1] - 1 and, for every scenario xi in S,
    the scalar constraint xi^T x <= 1.

    Parameters
    ----------
    S : iterable of scenarios, each of length dim_x.
    **kwargs : ``dim_x`` (default 2) and ``time_limit`` in seconds
        (default 2 hours); the time spent building the model is deducted
        from the limit handed to the solver.

    Returns
    -------
    (x.value, prob.value); note prob.value is the *minimization* objective,
    i.e. -sum(x). Returns (None, None) if the time budget is exhausted during
    setup or the solver raises ``cp.error.SolverError``.
    """
    dim_x = kwargs.get('dim_x', 2)
    x = cp.Variable(dim_x, nonneg = True)
    setup_time_start = time.time()
    constraints = [cp.sum(x[0:(dim_x-1)]) <= x[dim_x-1]-1, x<=10]
    for scenario in S:
        # The uncertain constraint is the inner product xi^T x <= 1 (this is what
        # unc_func evaluates). cp.multiply alone is elementwise and would impose
        # xi_j * x_j <= 1 per component, so the products must be summed.
        constraints.append(cp.sum(cp.multiply(scenario, x)) - 1 <= 0)
    obj = cp.Minimize(- cp.sum(x)) # formulate as a minimization problem
    prob = cp.Problem(obj,constraints)
    time_limit = kwargs.get('time_limit', 2*60*60) - (time.time() - setup_time_start)
    if time_limit < 0:
        print("Error: did not provide sufficient time for setting up & solving problem")
        return (None, None)
    try:
#         prob.solve(solver=cp.MOSEK, mosek_params = {mosek.dparam.optimizer_max_time: time_limit})
        prob.solve(solver=cp.GUROBI, verbose=False, TimeLimit=time_limit)
    except cp.error.SolverError:
        return (None, None)
    return (x.value, prob.value)

def unc_func(x, data, **kwargs):
    """Evaluate the uncertain constraint xi^T x - 1 for every scenario in ``data``.

    A value <= 0 means the scenario is satisfied by ``x``. ``kwargs`` is
    accepted but not used.
    """
    lhs = np.dot(data, x)
    return lhs - 1
    
def get_true_prob(x, dim_x):
    """Return 1/2 + 1/(2 * x[dim_x-1]), the closed-form feasibility probability
    the notebook uses for a solution ``x`` (depends only on the last component).
    """
    x_last = x[dim_x - 1]
    return 1/2 + 1/(2*x_last)
    
def solve_toyproblem_true_prob(desired_rhs, dim_x):
    """Solve the toy problem with the chance constraint written in closed form.

    Maximizes sum(x) over 0 <= x <= 10 subject to
    (1 - 2*desired_rhs) * x[-1] + 1 >= 0 and sum(x[:-1]) <= x[-1] - 1.

    Returns (x.value, prob.value) as reported by cvxpy (solved with GUROBI).
    """
    x = cp.Variable(dim_x, nonneg = True)
    x_last = x[dim_x-1]
    prob_constr = (1-2*desired_rhs)*x_last + 1 >= 0
    coupling_constr = cp.sum(x[0:(dim_x-1)]) <= x_last-1
    box_constr = x<=10
    prob = cp.Problem(cp.Maximize(cp.sum(x)), [prob_constr, coupling_constr, box_constr])
#     prob.solve(solver=cp.MOSEK)
    prob.solve(solver=cp.GUROBI)
    return (x.value, prob.value)

In [11]:
from phi_divergence import mod_chi2_cut
import scipy.stats
import itertools

def solve_with_ihsan2013(dim_x,risk_param_epsilon,conf_param_alpha,data,m_j=10,
                         omega_init=0.1,step_size=0.01,use_robist_lb=False):
    """Run the iterative scheme the notebook labels "ihsan2013" on the toy problem.

    The support [-1, 1]^dim_x is split into m_j equal intervals per dimension.
    Starting from ``omega_init``, a robust counterpart is solved, a lower bound
    on the feasibility probability of its solution is computed, and omega is
    increased by ``step_size`` until that bound reaches 1 - risk_param_epsilon.

    Parameters
    ----------
    dim_x : number of decision variables / uncertain components.
    risk_param_epsilon : allowed violation probability.
    conf_param_alpha : confidence parameter used in both lower bounds.
    data : array-like of shape (N, dim_x) with scenarios in [-1, 1].
    m_j : number of intervals per dimension.
    omega_init, step_size : initial value and increment of omega.
    use_robist_lb : if True the loop stops on the ROBIST-style bound instead
        of the cell-based bound.

    Returns
    -------
    (runtime, x, obj, lb_ihsan2013, avg_lb_gap_ihsan2013, avg_lb_gap_robist)
    where the last two are the mean relative gaps between the true probability
    and each lower bound over all iterations.

    Raises
    ------
    RuntimeError if the robust counterpart returns no solution.
    """
    def make_center(lb,ub,m_j):
        # Midpoints of the m_j equal-width intervals partitioning [lb, ub].
        delta = (ub-lb)/m_j
        return np.arange(lb+delta/2,ub,delta)

    def get_freq_v2(data,lb,ub,m_j,indices):
        # Empirical frequency of every cell of the m_j^dim grid, ordered like
        # itertools.product(range(m_j), repeat=dim) (first dimension slowest).
        data = np.asarray(data, dtype=float)
        n_samples, n_dims = data.shape
        delta = (ub-lb)/m_j
        bins = np.floor((data-lb)/delta).astype(int)
        # A coordinate exactly equal to ub would land in bin m_j (out of range)
        # and values below lb would wrap to a negative index; assign both to
        # the nearest boundary cell instead.
        bins = np.clip(bins, 0, m_j-1)
        weights = m_j ** np.arange(n_dims-1, -1, -1)
        flat_index = bins @ weights
        counts = np.bincount(flat_index, minlength=len(indices))
        return counts/n_samples

    def solve_rc(omega,a,b):
        # Robust counterpart with uncertainty set {||.||_inf <= 1} intersected
        # with {||.||_2 <= omega}, plus the toy problem's deterministic constraints.
        d = len(a[0])
        x = cp.Variable(d, nonneg = True)
        z = cp.Variable(d)
        w = cp.Variable(d)
        constraints = [cp.norm(z,1)+omega*cp.norm(w,2) + a[0] @ x <= b]
        for i in range(d):
            constraints.append(z[i] + w[i] == -a[i+1] @ x)

        # add our additional constraints
        constraints.append(cp.sum(x[0:(d-1)]) <= x[d-1]-1)
        constraints.append(x<=10)

        obj = cp.Maximize(cp.sum(x))
        prob = cp.Problem(obj,constraints)
        # prob.solve(solver = cp.MOSEK)
        prob.solve(solver = cp.GUROBI)
        return(x.value)

    def lower_bound(alpha,p,S,N,phi_dot=2):
        # Minimum total mass q can place on feasible cells (S[i] == 1) while
        # staying within radius r of the empirical frequencies p.
        N_v = len(p)
        q = cp.Variable(N_v, nonneg = True)
        t = cp.Variable(N_v, nonneg = True)
        r = phi_dot/(2*N)*scipy.stats.chi2.ppf(1-alpha, N_v-1)
        constraints = [cp.sum(q) == 1]
        f_obj = 0
        for i in range(N_v):
            if S[i] == 1:
                f_obj = f_obj + q[i]
            z = cp.vstack([2*(q[i]-p[i]),(t[i]-q[i])])
            constraints.append(cp.norm(z,2) <= (t[i]+q[i]))
        constraints.append(cp.sum(t) <= r)
        obj = cp.Minimize(f_obj)
        prob = cp.Problem(obj,constraints)
        # prob.solve(solver = cp.MOSEK)
        prob.solve(solver = cp.GUROBI)
        return(prob.value)

    def cpt_feas(cpt_arr,x,a,b,indices):
        # S[i] = 1 when the constraint holds at the center point of cell i.
        d = len(cpt_arr)
        S = np.zeros(len(indices))
        for i in range(len(S)):
            const = a[0]
            for j in range(d):
                const = const + cpt_arr[j][indices[i][j]] * a[j+1]
            if const.dot(x) <= b:
                S[i] = 1
        return(S)

    def lower_bound_ROBIST(data, x, conf_param_alpha, numeric_precision=1e-6):
        # Bound based only on the fraction of scenarios in `data` violated by x.
        N = len(data)
        constr_evals = (np.dot(data,x)) - 1
        N_vio = sum(constr_evals>(0+numeric_precision))
        p_vio = N_vio/N
        if p_vio == 0:
            return 1
        elif p_vio == 1:
            return 0
        return util.compute_cc_lb_chi2_analytic(1-p_vio, N, conf_param_alpha)

    def get_true_prob(x, dim_x):
        return(1/2+1/(2*x[dim_x-1]))

    # see notation from ihsan2013: a[0] is the nominal coefficient vector (zero
    # here) and a[i] is the i-th unit vector, so the constraint is xi^T x <= b.
    # e.g. a = [np.array([1,1]), np.array([1,0]), np.array([0,1])], b = 10
    a = [np.zeros(dim_x, dtype=int)] + list(np.eye(dim_x, dtype=int))
    b = 1

    N = len(data)

    lb = -1
    ub = 1
    # The support is split into m_j equal intervals per dimension, even as dim_x
    # increases. An earlier variant assumed independent components and built p
    # as a product of per-dimension marginal frequencies; the joint histogram
    # below makes no such assumption.
    cpt_arr = [make_center(lb,ub,m_j) for _ in range(dim_x)]
    indices = np.asarray(list((itertools.product(np.arange(m_j), repeat = dim_x))))
    p = get_freq_v2(data,lb,ub,m_j,indices)

    start_time = time.time()
    omega = omega_init
    lowerbound = -np.inf
    lb_gap_ihsan2013 = []
    lb_gap_robist = []

    while lowerbound < 1-risk_param_epsilon:
        x = solve_rc(omega,a,b)
        if x is None:
            # Fail loudly instead of crashing later inside cpt_feas / np.dot.
            raise RuntimeError("Robust counterpart returned no solution for omega = " + str(omega))
        S = cpt_feas(cpt_arr,x,a,b,indices)
        lb_ihsan2013 = lower_bound(conf_param_alpha,p,S,N)
        lb_robist = lower_bound_ROBIST(data,x,conf_param_alpha)

        if use_robist_lb:
            lowerbound = lb_robist
        else:
            lowerbound = lb_ihsan2013

        true_prob = get_true_prob(x, dim_x)

        lb_gap_ihsan2013.append(abs(true_prob - lb_ihsan2013)/true_prob)
        lb_gap_robist.append(abs(true_prob - lb_robist)/true_prob)

        obj = np.sum(x)
        omega = omega + step_size
    runtime = time.time() - start_time
    avg_lb_gap_ihsan2013 = sum(lb_gap_ihsan2013) / len(lb_gap_ihsan2013)
    avg_lb_gap_robist = sum(lb_gap_robist) / len(lb_gap_robist)
    return runtime, x, obj, lb_ihsan2013, avg_lb_gap_ihsan2013, avg_lb_gap_robist

In [None]:
# set parameter values
dim_x = 2
problem_instance = {'dim_x': dim_x, 'time_limit': 10*60}

In [None]:
# generate and split data into train and test
random_seed = 0
N_total = 10000
data = generate_data(random_seed, N_total, dim_x=dim_x)

# Keep the split sizes as integer sample counts (an int train_size is an
# absolute number of samples), and define N_test explicitly.
N_train = N_total // 2
N_test = N_total - N_train
data_train, data_test = train_test_split(data, train_size=N_train, random_state=random_seed)

In [None]:
# set our own algorithm parameter values
risk_param_epsilon = 0.10
conf_param_alpha = 0.05
add_strategy = 'random_vio'
remove_strategy = 'random_any'

In [None]:
# provide functions and other info for generating & evaluating solutions
solve_SCP = solve_P_SCP
eval_unc_obj = None
eval_unc_constr = [{'function': unc_func,
                    'info': {'risk_measure': 'probability', # must be either 'probability' or 'expectation'
                             'desired_rhs': 1 - risk_param_epsilon}}]

In [None]:
# run the algorithm
alg = iter_gen_and_eval_alg(solve_SCP, problem_instance, eval_unc_obj, eval_unc_constr, 
                            data_train, data_test, conf_param_alpha=conf_param_alpha,
                            add_strategy=add_strategy ,remove_strategy=remove_strategy,
                            verbose=True)

stop_criteria={'max_elapsed_time': 0.5*60} # in seconds (time provided to search algorithm)

(best_sol, runtime, num_iter, pareto_frontier, S_history) = alg.run(stop_criteria=stop_criteria)

In [None]:
runtime

In [None]:
num_iter

In [None]:
len(solutions)

In [None]:
best_sol

In [None]:
#OPTIONAL:
# Independent evaluation sample. generate_data only reads the `dim_x` keyword,
# so the dimension must be passed under that name (any other keyword is
# silently ignored and the data falls back to 2 columns).
N_eval = 50000
data_eval = generate_data(random_seed + 99, N_eval, dim_x=dim_x)

In [None]:
# Relative gap (%) between the true chance-constrained optimum and the
# objective of the best solution found by the algorithm.
beta = 1 - risk_param_epsilon  # desired probability guarantee
x_true, obj_true = solve_toyproblem_true_prob(beta, dim_x)
# solve_P_SCP minimizes -sum(x), so the reported objective is negated to get
# the maximization objective that obj_true refers to.
obj_alg = - best_sol['obj']
obj_gap_true =  100*(obj_true - obj_alg)/obj_true
obj_gap_true

In [None]:
# Determine optimal solution given data_test
# NOTE(review): `phi_div` and `phi_dot` are not assigned in any visible cell, and
# `beta` only exists if one of the cells that sets it has been run — confirm
# where these come from before re-running on a fresh kernel.
# NOTE(review): this overwrites `runtime` as returned by alg.run.
runtime, opt_x, opt_sum_y, opt_obj, opt_lb = util.compute_opt_given_data(conf_param_alpha, beta, phi_div, phi_dot, data_test)
# NOTE(review): the experiment cells use `- best_sol['obj']` as the maximization
# objective; confirm whether the sign should be flipped here as well.
obj_alg = best_sol['obj']
# Relative gap in percent between the data-driven optimum and the algorithm's objective.
obj_gap_opt = 100*(opt_obj - obj_alg)/opt_obj
obj_gap_opt

In [None]:
# Plot the first eight entries of `solutions` (indices 0..7): for i == 0 only a
# training-data plot is produced, for later entries both a training and a test plot.
# NOTE(review): `solutions`, `dataio`, `k`, `alpha`, `beta` and `N_test` are not
# defined by the visible cells that precede this one — presumably left over from
# an earlier version of the notebook; confirm before running.
for i,sol in enumerate(solutions):
    if i<=7:
        # scenarios of the training data that were used to generate this solution
        Z_arr = data_train[sol['scenario_set']]
        true_prob = get_true_prob(sol['sol'], k)
        if i == 0:
            dataio.plot_iter(i, data_train, Z_arr, sol['sol'], sol['obj'], 
                             sol['p_train'], sol['lb_train'], true_prob,
                             True, "png", True, N_train, alpha, beta)
        else:
            dataio.plot_iter(i, data_train, Z_arr, sol['sol'], sol['obj'], 
                             sol['p_train'], sol['lb_train'], true_prob,
                             True, "png", False, N_train, alpha, beta)
        
            dataio.plot_iter(i, data_test, None, sol['sol'], sol['obj'], 
                             sol['p_test'], sol['lb_test'], true_prob,
                             True, "png", False, N_test, alpha, beta)
    else:
        break

In [None]:
dataio.plot_pareto_curve(pareto_solutions, beta, None, None, None, None)

In [None]:
dataio.plot_obj_over_time(solutions, best_sol, None, None, None, None)

In [None]:
dataio.plot_size_set_over_time(solutions, best_sol, None, None, None, None)

In [None]:
# Plot final solution found by algorithm
name = 'Strategy: '+ str(add_strategy)
save_plot = False
plot_type = "eps"
show_legend = True
Z_values = data_train[best_sol['scenario_set']]
dataio.plot_solution(name, data_train, Z_values, best_sol['sol'], 
              best_sol['obj'], best_sol['lb_test'], save_plot, plot_type, show_legend, N, alpha, beta)

In [None]:
# Compute optimal solution with true probability constraint
# NOTE(review): `uncertain_constraint`, `rs`, `par`, `phi_div`, `phi_dot`, `alpha`,
# `N_test` and `k` are not defined by the visible cells above, and
# `numeric_precision` is only assigned in later cells — confirm before running.
prob_true = beta
[x_true, obj_true] = solve_toyproblem_true_prob(prob_true, k)
constr = uncertain_constraint(data_test, x_true)
# scenarios whose constraint value exceeds the numerical tolerance count as violations
vio = constr[constr>(0+numeric_precision)]   
# NOTE(review): violations are counted on data_test but divided by N_train;
# presumably N_test was intended — confirm.
p_vio = len(vio)/N_train
# two-point empirical distribution: [share satisfied, share violated]
p = np.array([1-p_vio, p_vio])
r = phi_dot/(2*N_test)*scipy.stats.chi2.ppf(1-alpha, 1)
lb = rs.compute_lb(p, r, par, phi_div)
print(p)
print(lb)
print(obj_true)

In [None]:
name = "TrueProb="+str(prob_true)
save_plot = False
plot_type = "eps"
show_legend = True
dataio.plot_solution(name, data_test, None, x_true, obj_true, lb, save_plot, plot_type, show_legend, N, alpha, beta)

In [None]:
# Determine optimal solution given data_test
runtime, opt_x, opt_sum_y, opt_obj, opt_lb = util.compute_opt_given_data(alpha, beta, par, phi_div, data_test, time_limit_mosek)

In [None]:
runtime

In [None]:
# Plot optimal solution given data_test
name = 'Opt_given_test_data'
save_plot = False
plot_type = "eps"
show_legend = True
dataio.plot_solution(name, data_test, None, opt_x, opt_obj, opt_lb, save_plot, plot_type, show_legend, N, alpha, beta)

In [None]:
# Compute solution via Campi method
data = generate_data(k, N_campi)
runtime, campi_x, campi_obj, campi_true_prob, Z_arr = util.solve_with_campi_N(alpha, beta, numeric_precision, data, time_limit_mosek)

In [None]:
# Plot Campi solution
name = 'Campi method'
save_plot = False
plot_type = "eps"
show_legend = True
dataio.plot_solution(name, data, Z_arr, campi_x, campi_obj, 0, save_plot, plot_type, show_legend, N, alpha, beta)

In [None]:
# Compute Garatti2022 solution
# NOTE(review): this cell overwrites the notebook-level `dim_x` (and sets `k`,
# `beta`, `alpha`) — later cells see these values.
k = 1000
dim_x = k
beta = 0.95
# NOTE(review): 10e-6 evaluates to 1e-5; if 10^-6 was intended this should be 1e-6 — confirm.
alpha = 10e-6
time_limit_solve = 5*60
numeric_precision = 1e-6

set_sizes, time_determine_set_sizes = util.Garatti2022_determine_set_sizes(dim_x, beta, alpha)

In [None]:
set_sizes

In [None]:
# Compute Calafiore2016 solution
# NOTE(review): this cell overwrites the notebook-level `dim_x`, `k`, `beta`,
# `alpha`, `N_eval` and `N` — later cells see these values.
k = 10
dim_x = k
beta = 0.95
# NOTE(review): 10e-6 evaluates to 1e-5; if 10^-6 was intended this should be 1e-6 — confirm.
alpha = 10e-6
time_limit_solve = 5*60
numeric_precision = 1e-6

scale_eps_prime = 0.7
N_eval = 10000

N, time_determine_set_sizes = util.determine_N_calafiore2016(dim_x, beta, alpha, scale_eps_prime, N_eval)

In [None]:
N

# Numerical Experiments

In [None]:
# Experiment: compare the ihsan2013 scheme with ROBIST on the toy problem
# (dim_x = 3) over random seeds 1..10, checkpointing results after every run.
output_file_name = 'tp_ihsan2013_k=3_mj=10_eps=0.05_alpha=0.01_seeds=1-10'

# LaTeX column headers. Backslashes are doubled so the strings contain a single
# literal backslash without relying on invalid escape sequences (\g, \#, \m).
headers = ['seed', '$k$', '$m_j$', '$m$', '$N_{min}$', 
           'N', '$N_1$', '$N_2$', '$T$ (ishan2013)', '$T$ (ROBIST)', 
           'obj. (ishan2013)', 'obj. (ROBIST)', 'opt. obj. (true)',
           '$\\gamma$ (ishan2013)', '$\\gamma$ (ROBIST)', 
           'true prob. (ishan2013)', 'true prob. (ROBIST)',
           'prob. MAPE (ishan2013)', 'prob. MAPE (ROBIST)',
           '\\#Iter.~(\\texttt{add})', '\\#Iter.~(\\texttt{remove})', 
           '$\\mu_{|\\mathcal{S}_i|}$', '$\\max_{i}|\\mathcal{S}_i|$']

# Write headers to .txt file
with open(r'output/ToyProblem/headers_'+output_file_name+'.txt','w+') as f:
    f.write(str(headers))

output_data = {}

# set parameter values
risk_param_epsilon = 0.05
conf_param_alpha = 0.01
dim_x = 3
m_j = 10
m = m_j**dim_x   # number of grid cells used by ihsan2013
N_min = 5*m
N = 2*N_min

N_train = math.floor(N/2)
N_test = N - N_train
# str_tmp = "("+str(N_train)+", "+str(N_test)+")"
# print(dim_x, m_j, m, N_min, N, str_tmp)

opt_x_true, opt_obj_true = solve_toyproblem_true_prob(1-risk_param_epsilon, dim_x)

problem_instance = {}
problem_instance['dim_x'] = dim_x
problem_instance['time_limit'] = 1*60*60 

# ROBIST settings:
stop_criteria={'max_elapsed_time': 1*60} 
solve_SCP = solve_P_SCP
eval_unc_obj = None
eval_unc_constr = [{'function': unc_func,
                   'info': {'risk_measure': 'probability', # must be either 'probability' or 'expectation'
                            'desired_rhs': 1 - risk_param_epsilon}}]

random_seed_settings = [i for i in range(1,11)] #101
run_count = 0
for random_seed in random_seed_settings:
    
    data = generate_data(random_seed, N, dim_x=dim_x)               
    data_train, data_test = train_test_split(data, train_size=(N_train/N), random_state=random_seed)

    # ihsan2013:
    # The keyword is `omega_init` (lower case) and the function takes no
    # `random_seed` argument; passing either of the old names raises TypeError.
    (runtime_ihsan2013, x, obj_ihsan2013, lb_ihsan2013,
     avg_lb_gap_ihsan2013, avg_lb_gap_robist) = solve_with_ihsan2013(dim_x,risk_param_epsilon,conf_param_alpha,data,
                                                                     m_j=m_j,omega_init=0.0,step_size=0.01)
    true_prob_ihsan2013 = get_true_prob(x, dim_x)

    # ROBIST:
    alg = iter_gen_and_eval_alg(solve_SCP, problem_instance, eval_unc_obj, eval_unc_constr, 
                                data_train, data_test, conf_param_alpha=conf_param_alpha,
                                verbose=False)
    
    (best_sol, runtime_alg, num_iter, pareto_frontier, S_history) = alg.run(stop_criteria=stop_criteria)
    
    lb_alg = best_sol['feas'][0]
    obj_alg = - best_sol['obj']   # solve_P_SCP minimizes -sum(x)
    true_prob_alg = get_true_prob(best_sol['sol'], dim_x)
    S_avg = sum(len(S_i) for S_i in S_history) / len(S_history)
    S_max = max(len(S_i) for S_i in S_history)
    num_iter_add = num_iter['add']
    num_iter_remove = num_iter['remove']
    
    # # turn off:   
    # (best_sol, runtime_alg, num_iter, pareto_frontier, S_history) = (np.nan, np.nan, np.nan, np.nan, np.nan)
    # lb_alg = np.nan
    # obj_alg = np.nan
    # true_prob_alg = np.nan
    # S_avg = np.nan
    # S_max = np.nan
    # num_iter_add = np.nan
    # num_iter_remove = np.nan

    output_data[(random_seed, dim_x, m_j)] = [m_j, m, N_min, N, N_train, N_test, 
                                              runtime_ihsan2013, runtime_alg, 
                                              obj_ihsan2013, obj_alg, opt_obj_true,
                                              lb_ihsan2013, lb_alg,
                                              true_prob_ihsan2013, true_prob_alg,
                                              avg_lb_gap_ihsan2013, avg_lb_gap_robist,
                                              num_iter_add, num_iter_remove,
                                              S_avg, S_max]
    
    # NOTE(review): results are checkpointed under a different file name than the
    # headers written above; presumably renamed by hand afterwards — confirm.
    output_file_name = 'new_output_data'
    with open(r'output/ToyProblem/'+output_file_name+'.txt','w+') as f:
        f.write(str(output_data))
    
    run_count += 1
    print("Completed run: " + str(run_count))

In [19]:
# Experiment: ROBIST on the toy problem with dim_x = 5 over random seeds 1..10.
# The ihsan2013 run is switched off here and its columns are filled with NaN.
output_file_name = 'tp_ihsan2013_k=5_mj=10_eps=0.05_alpha=0.01_seeds=1-10'

# LaTeX column headers. Backslashes are doubled so the strings contain a single
# literal backslash without relying on invalid escape sequences (\g, \#, \m).
headers = ['seed', '$k$', '$m_j$', '$m$', '$N_{min}$', 
           'N', '$N_1$', '$N_2$', '$T$ (ishan2013)', '$T$ (ROBIST)', 
           'obj. (ishan2013)', 'obj. (ROBIST)', 'opt. obj. (true)',
           '$\\gamma$ (ishan2013)', '$\\gamma$ (ROBIST)', 
           'true prob. (ishan2013)', 'true prob. (ROBIST)',
           'prob. MAPE (ishan2013)', 'prob. MAPE (ROBIST)',
           '\\#Iter.~(\\texttt{add})', '\\#Iter.~(\\texttt{remove})', 
           '$\\mu_{|\\mathcal{S}_i|}$', '$\\max_{i}|\\mathcal{S}_i|$']

# Write headers to .txt file
with open(r'output/ToyProblem/headers_'+output_file_name+'.txt','w+') as f:
    f.write(str(headers))

output_data = {}

# set parameter values
risk_param_epsilon = 0.05
conf_param_alpha = 0.01
dim_x = 5
m_j = 10
m = m_j**dim_x   # number of grid cells ihsan2013 would use
N_min = 5*m
N = 2*N_min

N_train = math.floor(N/2)
N_test = N - N_train
# str_tmp = "("+str(N_train)+", "+str(N_test)+")"
# print(dim_x, m_j, m, N_min, N, str_tmp)

opt_x_true, opt_obj_true = solve_toyproblem_true_prob(1-risk_param_epsilon, dim_x)

problem_instance = {}
problem_instance['dim_x'] = dim_x
problem_instance['time_limit'] = 1*60*60 

# ROBIST settings:
stop_criteria={'max_elapsed_time': 1*60} 
solve_SCP = solve_P_SCP
eval_unc_obj = None
eval_unc_constr = [{'function': unc_func,
                   'info': {'risk_measure': 'probability', # must be either 'probability' or 'expectation'
                            'desired_rhs': 1 - risk_param_epsilon}}]

random_seed_settings = [i for i in range(1,11)] #101
run_count = 0
for random_seed in random_seed_settings:
    
    data = generate_data(random_seed, N, dim_x=dim_x)               
    data_train, data_test = train_test_split(data, train_size=(N_train/N), random_state=random_seed)

    # ihsan2013 (switched off; note the keyword is `omega_init` and the function
    # takes no `random_seed` argument):
#     (runtime_ihsan2013, x, obj_ihsan2013, lb_ihsan2013,
#      avg_lb_gap_ihsan2013, avg_lb_gap_robist) = solve_with_ihsan2013(dim_x,risk_param_epsilon,conf_param_alpha,data,
#                                                                      m_j=m_j,omega_init=0.0,step_size=0.01)
#     true_prob_ihsan2013 = get_true_prob(x, dim_x)

    # turn off:
    runtime_ihsan2013 = np.nan
    obj_ihsan2013 = np.nan
    lb_ihsan2013 = np.nan
    avg_lb_gap_ihsan2013 = np.nan
    avg_lb_gap_robist = np.nan
    true_prob_ihsan2013 = np.nan

    # ROBIST:
    alg = iter_gen_and_eval_alg(solve_SCP, problem_instance, eval_unc_obj, eval_unc_constr, 
                                data_train, data_test, conf_param_alpha=conf_param_alpha,
                                verbose=False)
    
    (best_sol, runtime_alg, num_iter, pareto_frontier, S_history) = alg.run(stop_criteria=stop_criteria)
    
    lb_alg = best_sol['feas'][0]
    obj_alg = - best_sol['obj']   # solve_P_SCP minimizes -sum(x)
    true_prob_alg = get_true_prob(best_sol['sol'], dim_x)
    S_avg = sum(len(S_i) for S_i in S_history) / len(S_history)
    S_max = max(len(S_i) for S_i in S_history)
    num_iter_add = num_iter['add']
    num_iter_remove = num_iter['remove']
    
    # # turn off:   
    # (best_sol, runtime_alg, num_iter, pareto_frontier, S_history) = (np.nan, np.nan, np.nan, np.nan, np.nan)
    # lb_alg = np.nan
    # obj_alg = np.nan
    # true_prob_alg = np.nan
    # S_avg = np.nan
    # S_max = np.nan
    # num_iter_add = np.nan
    # num_iter_remove = np.nan

    output_data[(random_seed, dim_x, m_j)] = [m_j, m, N_min, N, N_train, N_test, 
                                              runtime_ihsan2013, runtime_alg, 
                                              obj_ihsan2013, obj_alg, opt_obj_true,
                                              lb_ihsan2013, lb_alg,
                                              true_prob_ihsan2013, true_prob_alg,
                                              avg_lb_gap_ihsan2013, avg_lb_gap_robist,
                                              num_iter_add, num_iter_remove,
                                              S_avg, S_max]
    
    # NOTE(review): results are checkpointed under a different file name than the
    # headers written above; presumably renamed by hand afterwards — confirm.
    output_file_name = 'new_output_data_2'
    with open(r'output/ToyProblem/'+output_file_name+'.txt','w+') as f:
        f.write(str(output_data))
    
    run_count += 1
    print("Completed run: " + str(run_count))

Completed run: 1
Completed run: 2
Completed run: 3
Completed run: 4
Completed run: 5
Completed run: 6
Completed run: 7
Completed run: 8
Completed run: 9
Completed run: 10


In [17]:
from numpy import nan

output_file_name = 'tp_ihsan2013_k=5_mj=10_eps=0.05_alpha=0.01_seeds=1-10'
# Read from .txt file
file_path = 'output/ToyProblem/'+output_file_name+'.txt'
with open(file_path,'r') as f:
    # The results were written as a single str(dict) line; keep the last
    # non-empty line that is not a bare "nan".
    candidate_lines = [line.strip() for line in f if line.strip() and line.strip() != "nan"]
if not candidate_lines:
    raise ValueError("No stored results found in " + file_path)
dic = candidate_lines[-1] #string
# SECURITY: eval executes whatever the file contains, so only use this on files
# written by this notebook. Builtins are disabled and `nan` is the only name
# made available (str(dict) prints NaN values as the bare name `nan`).
output_data_read = eval(dic, {"__builtins__": {}}, {"nan": float("nan")})
output_data_read

{(1, 5, 10): [10,
  100000,
  500000,
  1000000,
  500000,
  500000,
  nan,
  60.365718126297,
  nan,
  1.2055159426503064,
  1.2222222222222223,
  nan,
  0.9526946574297765,
  nan,
  0.9534086472294225,
  nan,
  nan,
  80,
  78,
  2.867088607594937,
  8],
 (2, 5, 10): [10,
  100000,
  500000,
  1000000,
  500000,
  500000,
  nan,
  60.13407039642334,
  nan,
  1.1848663110687707,
  1.2222222222222223,
  nan,
  0.9568579127506267,
  nan,
  0.9576939078303743,
  nan,
  nan,
  80,
  78,
  2.7151898734177213,
  7],
 (3, 5, 10): [10,
  100000,
  500000,
  1000000,
  500000,
  500000,
  nan,
  60.185465574264526,
  nan,
  1.2119132843865374,
  1.2222222222222223,
  nan,
  0.9508121760515322,
  nan,
  0.9520972892829046,
  nan,
  nan,
  93,
  92,
  2.9783783783783786,
  8],
 (4, 5, 10): [10,
  100000,
  500000,
  1000000,
  500000,
  500000,
  nan,
  60.12677764892578,
  nan,
  1.2070594954042226,
  1.2222222222222223,
  nan,
  0.9518621549221422,
  nan,
  0.9530915465044363,
  nan,
  nan,
  

In [18]:
output_data = output_data_read
output_data

{(1, 5, 10): [10,
  100000,
  500000,
  1000000,
  500000,
  500000,
  nan,
  60.365718126297,
  nan,
  1.2055159426503064,
  1.2222222222222223,
  nan,
  0.9526946574297765,
  nan,
  0.9534086472294225,
  nan,
  nan,
  80,
  78,
  2.867088607594937,
  8],
 (2, 5, 10): [10,
  100000,
  500000,
  1000000,
  500000,
  500000,
  nan,
  60.13407039642334,
  nan,
  1.1848663110687707,
  1.2222222222222223,
  nan,
  0.9568579127506267,
  nan,
  0.9576939078303743,
  nan,
  nan,
  80,
  78,
  2.7151898734177213,
  7],
 (3, 5, 10): [10,
  100000,
  500000,
  1000000,
  500000,
  500000,
  nan,
  60.185465574264526,
  nan,
  1.2119132843865374,
  1.2222222222222223,
  nan,
  0.9508121760515322,
  nan,
  0.9520972892829046,
  nan,
  nan,
  93,
  92,
  2.9783783783783786,
  8],
 (4, 5, 10): [10,
  100000,
  500000,
  1000000,
  500000,
  500000,
  nan,
  60.12677764892578,
  nan,
  1.2070594954042226,
  1.2222222222222223,
  nan,
  0.9518621549221422,
  nan,
  0.9530915465044363,
  nan,
  nan,
  

In [19]:
# obtain average and std dev
import pandas as pd
df_output = pd.DataFrame.from_dict(output_data, orient='index')
df_output.mean()

0          10.000000
1      100000.000000
2      500000.000000
3     1000000.000000
4      500000.000000
5      500000.000000
6                NaN
7          60.145310
8                NaN
9           1.209914
10          1.222222
11               NaN
12          0.951665
13               NaN
14          0.952515
15               NaN
16               NaN
17        117.900000
18        116.400000
19          2.846271
20          8.000000
dtype: float64

In [None]:
# Convert each result row into display values: the first five entries are
# formatted with two decimals ('-' for NaN), the sixth with zero decimals, and
# all remaining entries are passed through unchanged.
output_data_str = {}
for row_key, row_values in output_data.items():
    formatted_row = []
    for position, value in enumerate(row_values):
        if position > 5:
            formatted_row.append(value)
        elif position == 5:
            formatted_row.append(f'{round(value,0):.0f}') 
        elif np.isnan(value):
            formatted_row.append('-')
        else:
            formatted_row.append(f'{round(value,2):.2f}') 

    output_data_str[row_key] = formatted_row

In [None]:
# LaTeX column headers. Backslashes are doubled so each string contains a single
# literal backslash without relying on invalid escape sequences (\m, \%).
headers = ['$k$', 'seed', 'remove strategy', '$n_{\\mathcal{X}}$',
           'Obj.~(RS)', 'Obj.~(TP)', 'Gap TP.~(\\%)', 
           'Obj.~($\\mathcal{D}^{\\text{test}}_{N_2}$)', 'Gap $\\mathcal{D}^{\\text{test}}_{N_2}$ (\\%)',
           'Time', '$|\\mathcal{X}|$']

In [None]:
dataio.write_output_to_latex(4, headers, output_data_str)

In [None]:
output_file_name = 'new_output'

In [None]:
# Write headers + output to .txt file
with open(r'output/headers_'+output_file_name+'.txt','w+') as f:
    f.write(str(headers))

# with open(r'output/'+output_file_name+'.txt','w+') as f:
#     f.write(str(output_data))

In [None]:
output_file_name = 'eval_gap_as_L_to_inf_k=[2,10]'

In [None]:
# Read from .txt file
file_path = 'output/'+output_file_name+'.txt'
dic = ''
with open(file_path,'r') as f:
         for i in f.readlines():
            dic=i #string
output_data_read = eval(dic)
output_data_read

In [None]:
output_data = output_data_read
output_data

In [None]:
# Read from .txt file
file_path = 'output/headers_'+output_file_name+'.txt'
dic = ''
with open(file_path,'r') as f:
         for i in f.readlines():
            dic=i #string
output_data_headers_read = eval(dic)

In [None]:
headers = output_data_headers_read
headers

In [None]:
dataio.write_output_to_latex(3, headers, output_data_str)

In [None]:
import pandas as pd

# Aggregate stored results per (N_train, N_test) setting into two table rows:
# 'Single Split Run' (statistics over all stored runs) and 'Best of 10 Split Runs'
# (per data seed, the feasible run with the largest row-0 value).
# Rows of each result column are addressed by position; judging by the variable
# names below: 0 = objective, 1 = gap, 2 = lb_train, 3 = lb_test, 4 = true prob.,
# 6 = number of test-feasible solutions found, 8 = time spent.
# NOTE(review): assumes `output_data` is keyed by tuples starting with
# (N_train, N_test, data seed, ...) — confirm against the file that was loaded.
# NOTE(review): `k` and `random_seed_split_settings` are not used in this cell.
k = 1000
beta = 0.9
N_total_settings = [100]
p_train_settings = [0.25, 0.5, 0.75]
random_seed_data_settings = [i for i in range(1, 7)]
random_seed_split_settings = [i for i in range(1, 11)]

output_data_agg = {}
for N_total in N_total_settings:
    for p_train in p_train_settings:
        
        N_train = round(p_train * N_total)
        N_test = N_total - N_train
        
        # one column per stored run that matches this split and one of the data seeds
        df = pd.DataFrame({key: pd.Series(val) for key, val in output_data.items() if (key[0] == N_train
                                                                                       and key[1] == N_test
                                                                                       and key[2] in random_seed_data_settings)})
        df = df.astype(float)
        df_agg = df.agg(["mean","std"], axis="columns")

        # runs whose row-3 value (lb_test) reaches beta
        df_feas = df.loc[:,df.iloc[3,:] >= beta]
        df_feas_agg = df_feas.agg(["mean","std"], axis="columns")

        # share of runs with row 3 >= beta, and share with row 4 (true prob.) >= beta
        prob_FF = sum(df.iloc[3,:] >= beta) / len(df.columns)
        true_prob_FF = sum(df.iloc[4,:] >= beta) / len(df.columns)

        # only referenced by the commented-out formatting line further down
        avg_obj = df_agg.loc[0,'mean']
        std_obj = df_agg.loc[0,'std']

        if prob_FF > 0:
            avg_obj_F = df_feas_agg.loc[0, 'mean']
            std_obj_F = df_feas_agg.loc[0, 'std']
            avg_gap_F = df_feas_agg.loc[1, 'mean']
            std_gap_F = df_feas_agg.loc[1, 'std']
        else:
            avg_obj_F = 0
            std_obj_F = 0
            avg_gap_F = 0
            std_gap_F = 0

        avg_lb_train = df_agg.loc[2,'mean']
        std_lb_train = df_agg.loc[2,'std']
        avg_lb_test = df_agg.loc[3,'mean']
        std_lb_test = df_agg.loc[3,'std']
        avg_true_prob = df_agg.loc[4,'mean']
        std_true_prob = df_agg.loc[4,'std']
        
        avg_num_test_feas_found = df_agg.loc[6,'mean']
        std_num_test_feas_found = df_agg.loc[6,'std']
        
        avg_time_spent = df_agg.loc[8,'mean']
        std_time_spent = df_agg.loc[8,'std']

        # table row: each statistic is rendered as "mean (std)"
        li = []
        li.append(f'{prob_FF:.2f}')
        li.append(f'{true_prob_FF:.2f}')
        #li.append(f'{round(avg_obj,3):.3f}' + " ("+f'{round(std_obj,3):.3f}'+")")
        if prob_FF > 0:
            li.append(f'{round(avg_obj_F,3):.3f}' + " ("+f'{round(std_obj_F,3):.3f}'+")")
            li.append(f'{round(avg_gap_F,3):.3f}' + " ("+f'{round(std_gap_F,3):.3f}'+")")
        else:
            li.append("-")
            li.append("-")
        li.append(f'{round(avg_lb_train,3):.3f}' + " ("+f'{round(std_lb_train,3):.3f}'+")")
        li.append(f'{round(avg_lb_test,3):.3f}' + " ("+f'{round(std_lb_test,3):.3f}'+")")
        li.append(f'{round(avg_true_prob,3):.3f}' + " ("+f'{round(std_true_prob,3):.3f}'+")")
        li.append(f'{round(avg_num_test_feas_found,1):.1f}' + " ("+f'{round(std_num_test_feas_found,1):.1f}'+")")
        li.append(f'{round(avg_time_spent,1):.1f}' + " ("+f'{round(std_time_spent,1):.1f}'+")")

        output_data_agg[(N_train, N_test, 'Single Split Run')] = li
        
        
        # Second row: per data seed, keep the best feasible run, then average over seeds.
        count_FF = 0
        count_true_FF = 0
        best_obj = []
        best_gap = []
        lb_train = []
        lb_test = []
        true_prob = []
        num_test_feas_found = []
        time_spent = []
        
        for random_seed_data in random_seed_data_settings:
            # NOTE(review): unlike the frame above, this one is not cast to float.
            df = pd.DataFrame({key: pd.Series(val) for key, val in output_data.items() if (key[0] == N_train
                                                                                       and key[1] == N_test
                                                                                       and key[2] == random_seed_data)})
            
            # a data seed counts as feasible if at least one of its runs reaches beta
            if sum(df.iloc[3,:] >= beta) > 0:
                count_FF += 1
            if sum(df.iloc[4,:] >= beta) > 0:
                count_true_FF += 1
                
            df_feas = df.loc[:,df.iloc[3,:] >= beta]
            #df_feas_agg = df_feas.agg(["mean","std","max", "min"], axis="columns")
            if not df_feas.empty:
                # column (run) holding the maximum of row 0 among the feasible runs
                best_i = df_feas.idxmax(axis=1)[0]
                best_obj.append(df_feas.loc[0, best_i])
                best_gap.append(df_feas.loc[1, best_i])
                lb_train.append(df_feas.loc[2, best_i])
                lb_test.append(df_feas.loc[3, best_i])
                true_prob.append(df_feas.loc[4, best_i])
            
            # totals over all runs of this data seed
            num_test_feas_found.append(sum(df.iloc[6,:]))
            time_spent.append(sum(df.iloc[8,:]))
            
        prob_FF = count_FF / len(random_seed_data_settings)
        true_prob_FF = count_true_FF / len(random_seed_data_settings)
        # NOTE(review): if no seed had a feasible run these lists are empty and
        # np.mean / np.std return nan with a RuntimeWarning.
        avg_obj_F = np.mean(best_obj)
        std_obj_F = np.std(best_obj)
        avg_gap_F = np.mean(best_gap)
        std_gap_F = np.std(best_gap)
        avg_lb_train = np.mean(lb_train)
        std_lb_train = np.std(lb_train)
        avg_lb_test = np.mean(lb_test)
        std_lb_test = np.std(lb_test)
        avg_true_prob = np.mean(true_prob)
        std_true_prob = np.std(true_prob)
        avg_num_test_feas_found = np.mean(num_test_feas_found)
        std_num_test_feas_found = np.std(num_test_feas_found)
        avg_time_spent = np.mean(time_spent)
        std_time_spent = np.std(time_spent)
        
        li = []
        li.append(f'{prob_FF:.2f}')
        li.append(f'{true_prob_FF:.2f}')
        #li.append(f'{round(avg_obj,3):.3f}' + " ("+f'{round(std_obj,3):.3f}'+")")
        if prob_FF > 0:
            li.append(f'{round(avg_obj_F,3):.3f}' + " ("+f'{round(std_obj_F,3):.3f}'+")")
            li.append(f'{round(avg_gap_F,3):.3f}' + " ("+f'{round(std_gap_F,3):.3f}'+")")
        else:
            li.append("-")
            li.append("-")
        li.append(f'{round(avg_lb_train,3):.3f}' + " ("+f'{round(std_lb_train,3):.3f}'+")")
        li.append(f'{round(avg_lb_test,3):.3f}' + " ("+f'{round(std_lb_test,3):.3f}'+")")
        li.append(f'{round(avg_true_prob,3):.3f}' + " ("+f'{round(std_true_prob,3):.3f}'+")")
        li.append(f'{round(avg_num_test_feas_found,1):.1f}' + " ("+f'{round(std_num_test_feas_found,1):.1f}'+")")
        li.append(f'{round(avg_time_spent,1):.1f}' + " ("+f'{round(std_time_spent,1):.1f}'+")")
        
        output_data_agg[(N_train, N_test, 'Best of 10 Split Runs')] = li

In [None]:
# LaTeX column headers for the aggregated table. Backslashes are doubled so each
# string contains a single literal backslash without invalid escapes (\%, \#).
headers_agg = ['$N_{train}$', '$N_{test}$', 'Strategy',
               'Prob.~FF', 'True Prob.~FF', 'Obj.~(Feas)', 'Gap (\\%)', '$LB_{train}$', '$LB_{test}$', 'True Prob.',
              '\\# FF (test)', 'Time']

dataio.write_output_to_latex(3, headers_agg, output_data_agg)

In [None]:
count_FF

In [None]:
# To plot histograms for random seed output
# Read from .txt file
output_file_name = 'new_output'
file_path = 'output/'+output_file_name+'.txt'
dic = ''
# Keep only the last line of the file; it is expected to hold a str(dict).
with open(file_path,'r') as f:
         for i in f.readlines():
            dic=i #string
# SECURITY NOTE(review): eval executes whatever the file contains — only use on
# files written by this notebook.
output_data_read = eval(dic)

# NOTE(review): `df` is built but not used below.
df = pd.DataFrame.from_dict(output_data_read, orient='index')
#li1 = [col for col in df.columns if 'add + improve' == col[1]]

# one column per stored result whose key has 'add + improve + remove' in position 1
df2 = pd.DataFrame({key: pd.Series(val) for key, val in output_data_read.items() if key[1] == 'add + improve + remove'})
# first row of every selected column, as floats
obj2 = df2.iloc[0,:].astype(float)

# NOTE(review): the title and variable name say "objective" while the x-axis
# label passed below is 'Gap (%)' — confirm which quantity row 0 holds.
# NOTE(review): `dataio` is not imported in any visible cell.
title = 'Distribution of best found solution objective for random add + improve + remove'# for $\beta = 0.95$, $\alpha=10^{-6}$, $N_{1} = 1,000$, $N_{2} = 10,000$ and time limit $\mathcal{L} = 1$ minute'
dataio.plot_hist(obj2, 'Gap (%)', 'Frequency', title, 20, 0.75)