In [1]:
# import external packages
import numpy as np
import cvxpy as cp
from sklearn.model_selection import train_test_split
import time
import math

# import internal packages
import phi_divergence as phi
from iter_gen_and_eval_alg import iter_gen_and_eval_alg
import util

The problem we examine is as follows:

\begin{align}
\label{math_form:examples:pm2}
    \max_{\mathbf{x}, \theta}&~\theta \\
    \text{s.t.}&~\mathbf{r}^T \mathbf{x} \geq \theta, \\
    &~\mathbf{e}^T \mathbf{x} = 1, \\
    &~\mathbf{x} \geq 0,
\end{align}

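where $\mathbf{x}, \mathbf{r} \in \mathbb{R}^{k}$ are the portfolio weights and the (uncertain) asset returns, $\mathbf{e}$ is the vector of all ones, and $\theta$ is the return level being maximized.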

In order to compare with Bertsimas, D., Gupta, V., & Kallus, N. (2018), we randomly generate $N$ synthetic returns for $k$ assets, following Natarajan et al. (2008):

\begin{equation}
\tilde{r}_{i}=\left\{\begin{array}{ll}
\frac{\sqrt{\left(1-\gamma_{i}\right) \gamma_{i}}}{\gamma_{i}} & \text { with probability } \gamma_{i} \\[2mm]
-\frac{\sqrt{\left(1-\gamma_{i}\right) \gamma_{i}}}{1-\gamma_{i}} & \text { with probability } 1-\gamma_{i}
\end{array}\right., \quad \gamma_{i}=\frac{1}{2}\left(1+\frac{i}{k + 1}\right), \quad i=1, \ldots, k.
\end{equation}

In [2]:
# Problem specific functions:
def generate_data_natarajan2008(random_seed, N, **kwargs):
    """Sample N synthetic two-point asset returns as in Natarajan et al. (2008).

    With gamma_i = (1 + i/(k+1)) / 2 for assets i = 1..k, asset i takes the value
    sqrt((1-gamma_i)*gamma_i)/gamma_i with probability gamma_i and
    -sqrt((1-gamma_i)*gamma_i)/(1-gamma_i) otherwise.

    Parameters
    ----------
    random_seed : int
        Passed to ``np.random.seed``; note that this re-seeds NumPy's global generator.
    N : int
        Number of scenarios (rows) to generate.
    **kwargs
        Must contain ``k``, the number of assets (columns).

    Returns
    -------
    numpy.ndarray
        Array of shape (N, k) with one scenario per row.
    """
    k = kwargs['k']
    np.random.seed(random_seed)

    asset_idx = np.arange(1, k + 1)
    gamma = (1/2) * (1 + (asset_idx / (k + 1)))
    spread = np.sqrt((1 - gamma) * gamma)
    return_pos = spread / gamma
    return_neg = -(spread / (1 - gamma))

    data = np.empty([N, k])
    # Draw the uniforms a block of rows at a time instead of one scalar per Python
    # iteration. A (rows, k) request is filled row by row, i.e. in the same order as
    # a nested "for n / for i" loop of scalar draws, and chunking keeps the temporary
    # arrays small even for millions of scenarios.
    rows_per_chunk = 4096
    for start in range(0, N, rows_per_chunk):
        stop = min(start + rows_per_chunk, N)
        draws = np.random.uniform(size=(stop - start, k))
        data[start:stop] = np.where(draws <= gamma, return_pos, return_neg)
    return data

def generate_data_mohajerin2018(random_seed, N, **kwargs):
    """Sample N synthetic asset returns made of a shared and a per-asset normal term.

    Each scenario draws one systematic return ~ Normal(0, 0.02) that is shared by all
    assets, plus, for asset i = 1..k, an unsystematic return ~ Normal(0.03*i, 0.025*i).
    The second argument is used as a standard deviation here.

    Parameters
    ----------
    random_seed : int
        Passed to ``np.random.seed``; note that this re-seeds NumPy's global generator.
    N : int
        Number of scenarios (rows) to generate.
    **kwargs
        Must contain ``k``, the number of assets (columns).

    Returns
    -------
    numpy.ndarray
        Array of shape (N, k) with one scenario per row.
    """
    k = kwargs['k']
    np.random.seed(random_seed)
    # NOTE: not entirely clear in Esfahani & Kuhn paper whether they refer to stdev or var
    sys_risk_mean = 0
    #sys_risk_stdev = math.sqrt(0.02)
    sys_risk_stdev = 0.02
    asset_idx = np.arange(1, k + 1)
    unsys_risk_mean = asset_idx * 0.03
    #unsys_risk_stdev = sqrt(i * 0.025)
    unsys_risk_stdev = asset_idx * 0.025

    # Column 0 holds the systematic draw, columns 1..k the per-asset draws. Filling a
    # (rows, k+1) block row by row reproduces the order "systematic, asset 1, ..., asset k"
    # of the scalar-draw loop this replaces.
    loc = np.concatenate(([sys_risk_mean], unsys_risk_mean))
    scale = np.concatenate(([sys_risk_stdev], unsys_risk_stdev))

    data = np.empty([N, k])
    rows_per_chunk = 4096  # bounds the size of the temporary draw array
    for start in range(0, N, rows_per_chunk):
        stop = min(start + rows_per_chunk, N)
        draws = np.random.normal(loc, scale, size=(stop - start, k + 1))
        data[start:stop] = draws[:, :1] + draws[:, 1:]
    return data

def solve_SCP(S, **kwargs):
    """Solve the sampled portfolio problem for the scenario matrix S.

    Minimizes theta subject to -(S @ x) <= theta for every row of S, sum(x) == 1 and
    x >= 0, using GUROBI through cvxpy.

    Parameters
    ----------
    S : 2-D array
        One scenario per row; the number of columns is the number of assets.
    **kwargs
        ``time_limit`` (seconds, default 2 hours) is forwarded to the solver;
        other keys are ignored.

    Returns
    -------
    tuple
        ``(solution, objective)`` where ``solution`` is theta followed by x.
    """
    num_assets = S.shape[1]
    weights = cp.Variable(num_assets, nonneg = True)
    theta = cp.Variable(1)
    problem = cp.Problem(
        cp.Minimize(theta),  # must formulate as min problem
        [- (S @ weights) <= theta, cp.sum(weights) == 1],
    )
    max_seconds = kwargs.get('time_limit', 2*60*60)
    # MOSEK alternative: problem.solve(solver=cp.MOSEK, mosek_params = {mosek.dparam.optimizer_max_time: max_seconds})
    problem.solve(solver=cp.GUROBI, verbose=False, TimeLimit=max_seconds)
    # Combine theta and x into 1 single solution vector (theta first)
    solution = np.concatenate((theta.value, weights.value))
    return solution, problem.value

# def unc_obj_func(x, data, **kwargs):
#     return np.dot(data,x[1:]) # Assume that x[0] contains theta variable 

# def eval_x_OoS(x, obj, data, eval_unc_obj, **kwargs):
#     unc_obj_func = eval_unc_obj['function']
#     desired_rhs = eval_unc_obj['info']['desired_rhs']
    
#     evals = unc_obj_func(x, data, **kwargs)  
#     p_vio = sum(evals<(-obj+(1e-6))) / len(data) 
#     VaR = - np.quantile(evals, desired_rhs, method='inverted_cdf')
#     return p_vio, VaR

def unc_obj_func(x, data, **kwargs):
    """Return the negated portfolio return of solution ``x`` for each scenario in ``data``.

    ``x[0]`` is assumed to hold the theta variable, so only ``x[1:]`` is used as weights.
    """
    portfolio_weights = x[1:]
    scenario_returns = np.dot(data, portfolio_weights)
    return -scenario_returns

def eval_x_OoS(x, obj, data, eval_unc_obj, **kwargs):
    """Evaluate a solution on (out-of-sample) scenarios.

    Parameters
    ----------
    x : array
        Solution vector handed to the uncertain objective function.
    obj : float
        Objective value the solution claims; a scenario counts as a violation when
        its evaluation exceeds ``obj`` by more than 1e-6.
    data : array
        Scenarios, one per row.
    eval_unc_obj : dict
        ``eval_unc_obj['function']`` is called as ``f(x, data, **kwargs)`` and
        ``eval_unc_obj['info']['desired_rhs']`` is the quantile level used for VaR.
    **kwargs
        Forwarded unchanged to the uncertain objective function.

    Returns
    -------
    tuple
        ``(p_vio, VaR)``: the fraction of scenarios that violate ``obj`` and the
        negated ``desired_rhs``-quantile (``inverted_cdf`` method) of the evaluations.
    """
    unc_obj_func = eval_unc_obj['function']
    desired_rhs = eval_unc_obj['info']['desired_rhs']

    evals = unc_obj_func(x, data, **kwargs)
    # np.sum counts the violations inside NumPy; the builtin sum would walk the
    # boolean array element by element in Python, which is slow for large samples.
    p_vio = np.sum(evals > (obj + (1e-6))) / len(data)
    VaR = - np.quantile(evals, desired_rhs, method='inverted_cdf')
    return p_vio, VaR

In [4]:
# calafiore2013 method:
def _calafiore2013_required_N(z_tol_cal, beta_cal, q, n_cal):
    """Sample-size bound (rhs of eq. 12, calafiore2013) for discarding q scenarios."""
    return 2/z_tol_cal * math.log(1/beta_cal) + 4/z_tol_cal * (q+n_cal)


def _calafiore2013_max_q(N, n_cal, z_tol_cal, beta_cal):
    """Largest q in [0, N - n_cal - 1] whose sample-size bound does not exceed N.

    Returns 0 when no such q exists (including when even q = 0 violates the bound, or
    when N - n_cal - 1 <= 0), so the caller then simply discards nothing.
    """
    upper = N - n_cal - 1
    if upper <= 0:
        # Empty search interval: without this guard a bisection on [0, upper] never ends.
        return 0
    if _calafiore2013_required_N(z_tol_cal, beta_cal, upper, n_cal) <= N:
        return upper

    # The bound increases with q. Invariant: `hi` violates it; `lo` satisfies it
    # (or is the fallback 0).
    lo, hi = 0, upper
    while hi - lo > 1:
        mid = (lo + hi) // 2
        if _calafiore2013_required_N(z_tol_cal, beta_cal, mid, n_cal) > N:
            hi = mid
        else:
            lo = mid
    return lo


def solve_with_calafiore2013(solve_SCP, problem_instance, dim_x, data, risk_param_epsilon, conf_param_alpha, q=-1):
    """Scenario approach with discarded scenarios (calafiore2013 method).

    Parameters
    ----------
    solve_SCP : callable
        ``solve_SCP(S, **problem_instance) -> (x, obj)``; used for the final solve.
    problem_instance : dict
        Keyword arguments forwarded to the solvers (e.g. ``time_limit``).
    dim_x : int
        Dimension used as ``n`` in the sample-size bound.
    data : 2-D array
        Scenarios, one per row. The caller's array is not modified.
    risk_param_epsilon : float
        Violation tolerance used in the bound.
    conf_param_alpha : float
        Confidence parameter used in the bound.
    q : int, optional
        Number of scenarios to discard. The default -1 means "determine the largest
        q allowed by the bound for N = len(data)".

    Returns
    -------
    tuple
        ``(x, obj, runtime_seconds, q)`` with ``x, obj`` as returned by ``solve_SCP``
        on the remaining scenarios and ``q`` the number of discarded scenarios.
    """
    start_time = time.time()
    # 1) given N, determine maximum q such that rhs of eq 12 is no greater than N
    N = len(data)
    if q == -1:
        q = _calafiore2013_max_q(N, dim_x, risk_param_epsilon, conf_param_alpha)

    # 2) iteratively, using Lagrange multiplier-based rule, discard q scenarios
    def solve_SCP_w_duals(S, **kwargs):
        k = S.shape[1]
        x = cp.Variable(k, nonneg = True)
        theta = cp.Variable(1)
        constraints = [theta - (S @ x) <= 0, cp.sum(x) == 1]
        obj = cp.Minimize(-theta) # must formulate as min problem
        prob = cp.Problem(obj,constraints)
        time_limit = kwargs.get('time_limit', 2*60*60)
        prob.solve(solver=cp.GUROBI, verbose=False, TimeLimit=time_limit)
        x_value = np.concatenate((theta.value,x.value)) # Combine x and theta into 1 single solution vector

        duals = constraints[0].dual_value
        return(x_value, prob.value, duals)

    # Start with all N scenarios and remove one-by-one: each round re-solves and drops
    # the scenario whose constraint has the largest dual value.
    num_removed = 0
    while num_removed < q:
        _, _, duals = solve_SCP_w_duals(data, **problem_instance)
        scen_i = np.argmax(duals)
        data = np.delete(data, scen_i, axis=0)  # np.delete returns a new array
        num_removed += 1

    # return final solution
    x, obj = solve_SCP(data, **problem_instance)
    return x, obj, (time.time() - start_time), q

In [None]:
# Check gamma values: print the success probabilities and the two possible returns
# per asset used by the Natarajan et al. (2008) generator, for k = 10 assets.
k = 10
gamma = (1/2) * (1 + (np.arange(1, k + 1) / (k + 1)))
return_pos = np.sqrt((1 - gamma) * gamma) / gamma
return_neg = -(np.sqrt((1 - gamma) * gamma) / (1 - gamma))
print(gamma, return_pos, return_neg, sep='\n')

In [None]:
# Set parameter values (as in Bertsimas paper):
# k assets, confidence / risk levels, and an even train/test split of N_total samples.
k = 10
risk_param_epsilon = 0.10
conf_param_alpha = 0.10
N_total = 500
N_train = N_total // 2
N_test = N_total - N_train

In [None]:
# provide functions and other info for generating & evaluating solutions
problem_instance = {'time_limit': 2*60*60}  # solver time limit in seconds

# No uncertain constraints in this problem, only the uncertain objective below.
eval_unc_constr = None

eval_unc_obj = {
    'function': unc_obj_func,
    'info': {
        'risk_measure': 'probability', # must be either 'probability' or 'expectation'
        'desired_rhs': 1-risk_param_epsilon,
    },
}

In [None]:
# Determine data generation function
# (swap the commented line in to use the two-point Natarajan et al. (2008) returns instead)
# generate_data = generate_data_natarajan2008
generate_data = generate_data_mohajerin2018

In [None]:
# Get generated data
# The same seed drives both the data generator and the train/test shuffle.
random_seed = 0
data = generate_data(random_seed, N_total, k=k)
# train_size is given as a fraction, so data_train receives N_train of the N_total rows.
data_train, data_test = train_test_split(data, train_size=(N_train/N_total), random_state=random_seed)

In [None]:
# Generate extra out-of-sample (OoS) data
# A separate seed is used so the OoS sample differs from the train/test data above.
random_seed_OoS = 1234
N_OoS = int(1e6)
data_OoS = generate_data(random_seed_OoS, N_OoS, k=k)

In [None]:
# run the algorithm
alg = iter_gen_and_eval_alg(solve_SCP, problem_instance, eval_unc_obj, eval_unc_constr, 
                            data_train, data_test, conf_param_alpha=conf_param_alpha,
                            verbose=False)
time_limit_alg = 60
stop_criteria={'max_elapsed_time': time_limit_alg} # in seconds (time provided to search algorithm)

# NOTE(review): this calls a private method of the algorithm object and stores the
# result in eval_unc_obj *after* `alg` was constructed — presumably `alg` keeps a
# reference to the same dict so the value is visible to it; confirm.
N2_min = alg._determine_N_min(N_test, 1-risk_param_epsilon)
eval_unc_obj['info']['N2_min'] = N2_min

(best_sol, runtime, num_iter, pareto_frontier, S_history) = alg.run(stop_criteria=stop_criteria)

# solve_SCP minimizes theta under -(S @ x) <= theta, so the sign is flipped to report a return.
obj_alg = - best_sol['obj']
# Evaluate the best solution on the large out-of-sample data set.
p_vio_alg, VaR_alg = eval_x_OoS(best_sol['sol'], best_sol['obj'], data_OoS, eval_unc_obj, **problem_instance)
print(N_train, N_total-N_train, runtime, obj_alg, p_vio_alg, VaR_alg)

In [None]:
# Display the iteration counts returned by alg.run(...).
num_iter

In [None]:
# NOTE(review): `runtime_eval` is not assigned anywhere in the visible notebook
# (alg.run(...) returns `runtime`); this cell presumably relies on stale kernel
# state and would fail after Restart & Run All — confirm or remove.
runtime_eval

In [None]:
# NOTE(review): `dataio` is not imported in the visible notebook; confirm the module
# still exists and add the import to the first cell, or remove this cell.
dataio.print_solution_info(best_sol)

In [None]:
# First entry of the solution vector is theta (solve_SCP puts theta before x).
VaR_bound = best_sol['sol'][0]
VaR_bound

In [None]:
# Remaining entries of the solution vector are the portfolio weights x.
best_sol['sol'][1:]

In [None]:
# Portfolio return of the best solution in each test scenario.
returns_test = np.dot(data_test, best_sol['sol'][1:])

In [None]:
# This should be the empirical VaR observed on the test data:
# the risk_param_epsilon-quantile (given to np.percentile in percent) of the test returns.
np.percentile(a=returns_test, q=100*risk_param_epsilon, method='inverted_cdf')

In [None]:
# NOTE(review): `solutions` is only assigned in a later cell (`solutions = df_Sol`);
# at this position the cell fails on a fresh kernel — confirm intent or remove.
len(solutions)

In [None]:
# NOTE(review): `pareto_solutions` is not assigned anywhere in the visible notebook
# (alg.run(...) returns `pareto_frontier`) — confirm or remove.
len(pareto_solutions)

In [None]:
# NOTE(review): `dataio` and `pareto_solutions` are not defined in the visible notebook,
# and `beta` is only assigned in a later cell — confirm or remove.
dataio.plot_pareto_curve(pareto_solutions, beta, None, None, None, None)

In [None]:
# NOTE(review): exact duplicate of the previous cell; same undefined names apply.
dataio.plot_pareto_curve(pareto_solutions, beta, None, None, None, None)

In [None]:
# NOTE(review): `dataio` is not imported and `solutions` is only assigned in a later
# cell of the visible notebook — confirm or remove.
dataio.plot_obj_over_time(solutions, best_sol, None, None, None, None)

In [None]:
# NOTE(review): `dataio` is not imported and `solutions` is only assigned in a later
# cell of the visible notebook — confirm or remove.
dataio.plot_size_set_over_time(solutions, best_sol, None, None, None, None)

# Here we calculate the a-priori phi-divergence RO solution assuming no prior knowledge of the distribution of the returns

In [None]:
# NOTE(review): this cell uses `scipy.stats`, `N` and `returns`, none of which is
# imported or assigned in the visible notebook; it cannot run on a fresh kernel as is.
phi_dot = 2 # This is the phi_dot for modified chi squared.
phi_conj = phi.mod_chi2_conj
# Radius of the phi-divergence set, from the 0.1-quantile of a chi-square with N-1 dof.
r = phi_dot/(2*N)*scipy.stats.chi2.ppf(0.1, N-1)
# Uniform nominal probabilities over the N scenarios.
p = np.zeros(N)+1/N
beta = 0.95
print(util.af_RC_exp_pmin(p,returns,r,phi_conj,np.array([1/(1-beta),0]),np.array([0,1])))

In [None]:
# Development helper: re-import `util` after editing it on disk.
# NOTE(review): not needed in a finished notebook; `%autoreload 2` in the first cell
# would make this cell unnecessary.
import importlib
importlib.reload(util)

# Now for the computational experiments

In [9]:
# Descriptive label of the full experiment; currently only used by the commented-out
# header export below (the results themselves go to 'new_output_data', see further down).
output_file_name = 'pm_calafiore2013_generate_data_mohajerin2018_k=100_N=2000_eps=0.10_seeds=1-100'

# LaTeX column labels matching (seed, N) + the value list stored per run in output_data.
# Raw strings keep the backslashes literal without relying on invalid escape sequences.
headers = ['seed', '$N$', '$N_1$', '$N_2$', '$T$', 'Obj.', r'$p_{vio}^{OoS}$', r'$VaR^{OoS}$',
           r'\#Iter.~(\texttt{add})', r'\#Iter.~(\texttt{remove})',
           r'$\mu_{|\mathcal{S}_i|}$', r'$\max_{i}|\mathcal{S}_i|$',
           'N (CM)', 'q (CM)', 'runtime (CM)',
           'Obj. (CM)', r'$p_{vio}^{OoS}$ (CM)', r'$VaR^{OoS}$ (CM)',]

# # Write headers to .txt file
# with open(r'output/PortfolioManagement/headers_'+output_file_name+'.txt','w+') as f:
#     f.write(str(headers))

output_data = {}

# Set parameter values (as in Bertsimas paper)
k = 100
conf_param_alpha = 0.10
risk_param_epsilon = 0.10

N_cal_min = util.determine_campi_N_min(k, 1-risk_param_epsilon, conf_param_alpha)
print("N campi min:", N_cal_min)

N_total = 2000
N_train = math.floor(N_total / 2)
N_test = N_total - N_train

problem_instance = {}
problem_instance['time_limit'] = 1*60*60

eval_unc_obj = {'function': unc_obj_func,
                'info': {'risk_measure': 'probability', # must be either 'probability' or 'expectation'
                         'desired_rhs': 1 - risk_param_epsilon}}

eval_unc_constr = None

# Determine data generation function
# generate_data = generate_data_natarajan2008
generate_data = generate_data_mohajerin2018

# Generate extra out-of-sample (OoS) data
random_seed_OoS = 1234
N_OoS = int(1e6)
data_OoS = generate_data(random_seed_OoS, N_OoS, k=k)

random_seed_settings = list(range(1,4)) #101
# -1 lets the first calafiore2013 run determine q; later runs reuse that value.
q_max = -1
run_count = 0

# Results are (re)written to this file after every seed so partial runs are kept.
# The target directory must already exist.
output_file_name = 'new_output_data'

for random_seed in random_seed_settings:

    data = generate_data(random_seed, N_total, k=k)
    data_train, data_test = train_test_split(data, train_size=(N_train/N_total), random_state=random_seed)

    # our method
    alg = iter_gen_and_eval_alg(solve_SCP, problem_instance, eval_unc_obj, eval_unc_constr,
                                data_train, data_test, conf_param_alpha=conf_param_alpha,
                                verbose=False)

    stop_criteria={'max_elapsed_time': 1*60}
    (best_sol, runtime_alg, num_iter, pareto_frontier, S_history) = alg.run(stop_criteria=stop_criteria)

    # solve_SCP minimizes theta under -(S @ x) <= theta, so flip the sign to report a return.
    obj_alg = - best_sol['obj']
    p_vio_alg, VaR_alg = eval_x_OoS(best_sol['sol'], best_sol['obj'], data_OoS, eval_unc_obj, **problem_instance)
    S_avg = sum(len(S_i) for S_i in S_history) / len(S_history)
    S_max = max(len(S_i) for S_i in S_history)

    # calafiore2013 method (uses all N_total scenarios, not only the training half)
    # NOTE(review): dim_x is passed as k although the SCP has k weights plus theta —
    # confirm whether k + 1 is intended.
    x, obj, runtime_cal, q = solve_with_calafiore2013(solve_SCP, problem_instance, k, data, risk_param_epsilon,
                                                      conf_param_alpha, q=q_max)
    obj_cal = - obj
    q_max = q
    p_vio_cal, VaR_cal = eval_x_OoS(x, obj, data_OoS, eval_unc_obj, **problem_instance)

    output_data[(random_seed, N_total)] = [N_train, N_test, runtime_alg, obj_alg, p_vio_alg, VaR_alg,
                                           num_iter['add'], num_iter['remove'], S_avg, S_max,
                                           N_total, q, runtime_cal,
                                           obj_cal, p_vio_cal, VaR_cal]

    # NOTE(review): the dict is stored via str(); NumPy scalar values are written with
    # their repr, which depends on the installed NumPy version — confirm that the
    # eval-based reader further down can still parse the file.
    with open(r'output/PortfolioManagement/'+output_file_name+'.txt','w') as f:
        f.write(str(output_data))

    run_count += 1
    print("Completed run: " + str(run_count))

N campi min: 599
Completed run: 1
Completed run: 2
Completed run: 3


In [None]:
# `nan` and `array` must be in scope because the file content is evaluated as Python below.
from numpy import nan
from numpy import array # add if the .txt file contains numpy arrays

output_file_name = 'pm_bertsimas2018_N=500_eps=0.10_seeds=1-100'
# Read from .txt file, skipping any line that consists of exactly "nan"
file_path = 'output/PortfolioManagement/'+output_file_name+'.txt'
with open(file_path,'r') as f:
    dic = ''.join(line for line in f.readlines() if line != "nan") #string
# SECURITY: eval executes whatever the file contains; only use on files this notebook wrote.
output_data_read = eval(dic)
output_data_read

In [None]:
# Continue the analysis with the results read from disk
# (this overwrites any `output_data` produced by the experiment loop in this session).
output_data = output_data_read
output_data

In [None]:
# obtain average and std dev
# One row per dictionary key, one column per stored value.
import pandas as pd
df_output = pd.DataFrame.from_dict(output_data, orient='index')
df_output

In [None]:
# Column-wise mean over all runs.
df_output.mean()

In [None]:
# Column-wise standard deviation over all runs.
df_output.std()

In [None]:
# Read in previous output from .txt file
# `array` must be in scope because the file content is evaluated as Python below.
from numpy import array # add if the .txt file contains numpy arrays
output_file_name = 'PM_compare_bertsimas_phidot=1_N1=250_N2=250_alpha=0.1_beta=0.9_100_seeds_L=1min'

file_path = 'output/'+output_file_name+'.txt'
with open(file_path,'r') as f:
    dic = f.read()
# SECURITY: eval executes whatever the file contains; only use on files this notebook wrote.
output_data_read = eval(dic)

In [None]:
# Aggregate data to get avg and stddev across random seed runs
import pandas as pd

# NOTE(review): `output_data_agg` is not used again in the visible notebook.
output_data_agg = {}
# NOTE(review): this reads `output_data`, not the `output_data_read` loaded in the
# previous cell — confirm which one is intended. The indices below presumably match the
# layout of that older file (val[0] a solution vector, val[5] VaR values), which differs
# from the layout written by the experiment loop in this notebook — verify.
df_VaR = pd.DataFrame({key: pd.Series(val[5]) for key, val in output_data.items()})
df_Sol = pd.DataFrame({key: pd.Series(val[0]) for key, val in output_data.items()})
# Transpose so that each row corresponds to one dictionary key.
df_Sol = df_Sol.T

In [None]:
# Mean across columns (one column per dictionary key).
df_VaR.mean(axis=1)

In [None]:
# Standard deviation across columns (one column per dictionary key).
df_VaR.std(axis=1)

In [None]:
# Drop column 0 — presumably the theta entry of each solution vector; verify.
# NOTE(review): in-place drop makes this cell non-idempotent; running it a second
# time raises KeyError because column 0 is already gone.
df_Sol.drop(0, axis=1, inplace=True)
df_Sol

In [None]:
# Alias (not a copy) of the solutions DataFrame, used by the plotting call below.
solutions = df_Sol

In [None]:
# NOTE(review): `dataio` is not imported in the visible notebook — confirm or add the import.
dataio.plot_portfolio_holdings(solutions)