In [None]:
# Import packages
import numpy as np
import cvxpy as cp
import mosek
import time
import math
from sklearn.model_selection import train_test_split

import phi_divergence as phi
import robust_sampling as rs
import dataio
import util

The problem we examine is as follows:

\begin{align}
\label{math_form:examples:pm2}
    \max_{\mathbf{x}, \theta}&~\theta \\
    \text{s.t.}&~\mathbf{r}^T \mathbf{x} \geq \theta, \\
    &~\mathbf{e}^T \mathbf{x} = 1, \\
    &~\mathbf{x} \geq 0,
\end{align}

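where $\mathbf{x}, \mathbf{r} \in \mathbb{R}^{k}$, $\mathbf{r}$ is the vector of uncertain asset returns, and $\mathbf{e}$ is the all-ones vector (so the portfolio weights $\mathbf{x}$ sum to one).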

To compare with Bertsimas, Gupta, and Kallus (2018), we randomly generate $N$ synthetic return vectors for $k$ assets, following Natarajan et al. (2008):

\begin{equation}
\tilde{r}_{i}=\left\{\begin{array}{ll}
\frac{\sqrt{\left(1-\gamma_{i}\right) \gamma_{i}}}{\gamma_{i}} & \text { with probability } \gamma_{i} \\[2mm]
-\frac{\sqrt{\left(1-\gamma_{i}\right) \gamma_{i}}}{1-\gamma_{i}} & \text { with probability } 1-\gamma_{i}
\end{array}, \quad \gamma_{i}=\frac{1}{2}\left(1+\frac{i}{k + 1}\right), \quad i=1, \ldots, k. \right.
\end{equation}

In [None]:
# Problem specific functions:
def generate_data(random_seed, k, N):
    """Generate N synthetic return scenarios for k assets.

    Asset i (1-based) returns sqrt((1 - gamma_i) * gamma_i) / gamma_i with
    probability gamma_i and -sqrt((1 - gamma_i) * gamma_i) / (1 - gamma_i)
    otherwise, where gamma_i = 0.5 * (1 + i / (k + 1)).

    Parameters
    ----------
    random_seed : int
        Seed passed to ``np.random.seed``. Note that this reseeds NumPy's
        global legacy generator as a side effect.
    k : int
        Number of assets (columns of the result).
    N : int
        Number of scenarios (rows of the result).

    Returns
    -------
    numpy.ndarray
        Float array of shape (N, k) holding the sampled returns.
    """
    np.random.seed(random_seed)
    gamma = 0.5 * (1 + np.arange(1, k + 1) / (k + 1))
    spread = np.sqrt((1 - gamma) * gamma)
    return_pos = spread / gamma
    return_neg = -(spread / (1 - gamma))
    # A single (N, k) draw is filled row by row, i.e. in the same order as one
    # scalar draw per (scenario, asset) pair, but without N*k Python-level calls.
    draws = np.random.uniform(size=(N, k))
    # gamma and the two return vectors broadcast across the rows (scenarios).
    return np.where(draws <= gamma, return_pos, return_neg)

def solve_SCP(S, time_limit):
    """Solve the sampled portfolio problem for the scenario matrix S.

    Maximizes theta subject to theta - S @ x <= 0 (one constraint per row of
    S), sum(x) == 1 and x >= 0, using MOSEK through CVXPY.

    Parameters
    ----------
    S : 2-D array
        Scenario matrix; ``S.shape[1]`` sets the number of portfolio weights.
    time_limit : float
        Value passed to MOSEK as ``optimizer_max_time``.

    Returns
    -------
    tuple
        ``(solution, objective)`` where ``solution`` is theta followed by the
        weights x, and ``objective`` is the optimal value reported by CVXPY.
    """
    num_assets = S.shape[1]
    weights = cp.Variable(num_assets, nonneg=True)
    theta = cp.Variable(1)
    scenario_constraints = theta - (S @ weights) <= 0
    budget_constraint = cp.sum(weights) == 1
    problem = cp.Problem(cp.Maximize(theta), [scenario_constraints, budget_constraint])
    solver_options = {mosek.dparam.optimizer_max_time: time_limit}
    problem.solve(solver=cp.MOSEK, mosek_params=solver_options)
    # Pack theta first so that index 0 of the solution vector holds theta.
    solution = np.concatenate((theta.value, weights.value))
    return (solution, problem.value)

def uncertain_constraint(S, x):
    """Return theta - S @ weights for the packed solution vector x.

    x[0] is taken to be theta and x[1:] the portfolio weights; the result is
    the left-hand side of the constraint ``theta - S @ weights <= 0``.
    """
    theta = x[0]
    weights = x[1:]
    return theta - np.dot(S, weights)

In [None]:
# Sanity check: show the up-probabilities gamma_i and both return levels per asset
k = 10
gamma = 0.5 * (1 + np.arange(1, k + 1) / (k + 1))
return_pos = np.sqrt((1 - gamma) * gamma) / gamma
return_neg = -(np.sqrt((1 - gamma) * gamma) / (1 - gamma))
print(gamma, return_pos, return_neg, sep="\n")

In [None]:
# Experiment parameters (as in the Bertsimas paper)
k, alpha, beta = 10, 0.10, 0.90
# Split the sample budget evenly; the test set takes the remainder
N_total = 500
N_train = N_total // 2
N_test = N_total - N_train

In [None]:
# Remaining settings that are forwarded to the rs.* routines
numeric_precision = 1e-6  # tolerance that absorbs floating-point round-off
phi_div, phi_dot = phi.mod_chi2_cut, 2
par = 1

In [None]:
# Generate the synthetic returns and split them into a training and a test set
random_seed = 3
data = generate_data(random_seed, k, N_total)
# Pass the absolute row count: a float fraction N_train/N_total is converted
# back into a count with floor(), which can lose a row to floating-point
# round-off for some N_total. For the values used here the split is unchanged.
data_train, data_test = train_test_split(data, train_size=N_train, random_state=random_seed)

In [None]:
# Run RS algorithms: first the search on the training data, then the
# evaluation of the collected solutions on the test data.
time_limit_search = 0.1*60 # in seconds (time provided to search algorithm)
time_limit_solve = 5*60 # in seconds (for individual solves of SCP)
max_nr_solutions = 10000 # for easy problems with long time limits, we may want extra restriction
add_remove_threshold = 0.00 # This determines when randomness is introduced in add/removal decision

# Strategy settings are passed to rs.search_alg unchanged
add_strategy = 'random_vio'
remove_strategy = 'random_active'
clean_strategy = (15, 'all_inactive')

# All arguments are positional, so their order must match the signature of
# rs.search_alg. Note that the search receives the training data together
# with the size of the test set (N_test).
runtime_search, num_iter, solutions = rs.search_alg(data_train, N_test, beta, alpha, time_limit_search, time_limit_solve, 
                                                   max_nr_solutions, add_strategy, remove_strategy, clean_strategy, 
                                                   add_remove_threshold, 
                                                   par, phi_div, phi_dot, numeric_precision,
                                                   solve_SCP, uncertain_constraint, random_seed)

# Evaluate the solutions found by the search on the held-out test data
runtime_eval, best_sol, pareto_solutions = rs.evaluate_alg(solutions, data_test, beta, alpha, par, phi_div, phi_dot, 
                                                           uncertain_constraint, numeric_precision)

In [None]:
# Second value returned by rs.search_alg (indexed elsewhere with 'add', 'remove' and 'clean')
num_iter

In [None]:
# First value returned by rs.evaluate_alg (presumably its runtime in seconds -- confirm)
runtime_eval

In [None]:
# Print information about the solution selected by rs.evaluate_alg
dataio.print_solution_info(best_sol)

In [None]:
# Entry 0 of the solution vector is theta (solve_SCP concatenates theta ahead of x)
VaR_bound = best_sol['sol'][0]
VaR_bound

In [None]:
# Remaining entries of the solution vector: the portfolio weights x
best_sol['sol'][1:]

In [None]:
# Portfolio return of the selected weights in every test scenario
returns_test = np.dot(data_test, best_sol['sol'][1:])

In [None]:
# This should be the empirical VaR observed on the test data: the (1-beta)
# quantile of the test returns, taken from the empirical CDF without interpolation.
np.percentile(a=returns_test, q=100*(1-beta), method='inverted_cdf')

In [None]:
# Number of solutions returned by rs.search_alg
len(solutions)

In [None]:
# Number of solutions in the Pareto set returned by rs.evaluate_alg
len(pareto_solutions)

In [None]:
# Plot the Pareto solutions; the four trailing arguments are left as None
dataio.plot_pareto_curve(pareto_solutions, beta, None, None, None, None)

In [None]:
# NOTE(review): this call is identical to the one in the preceding cell --
# likely a leftover duplicate; confirm whether different arguments were intended.
dataio.plot_pareto_curve(pareto_solutions, beta, None, None, None, None)

In [None]:
# Plot for all found solutions, with best_sol passed alongside; trailing arguments left as None
dataio.plot_obj_over_time(solutions, best_sol, None, None, None, None)

In [None]:
# Plot for all found solutions, with best_sol passed alongside; trailing arguments left as None
dataio.plot_size_set_over_time(solutions, best_sol, None, None, None, None)

# The following cells are used to obtain the output that is written to the LaTeX tables

In [None]:
# Column names for the results table. Raw strings keep the LaTeX backslashes
# literal instead of relying on invalid escape sequences in ordinary strings,
# which Python reports with a warning.
headers = ['seed',
           'sol', 'obj', 'lb_{train}', 'lb_{test}', 'True Prob.', 'True VaR', 'True CVaR',
           r'\#Iter.~(add)', r'\#Iter.~(remove)', r'\#Iter.~(clean)', r'$|\mathcal{X}|$',
           r'$|\mathcal{S}_{max}|$', 'Time Found',
           'N_{Campi}', 'obj_{Campi}', 'solvetime_{Campi}']

# Maps random seed -> result row; seed plus row entries line up with `headers`
output_data = {}

# Set parameter values (as in Bertsimas paper)
k = 10
alpha = 0.10
beta = 0.90
N_total = 2000
N_train = N_total // 2
N_test = N_total - N_train
N_campi = util.determine_campi_N_min(k, beta, alpha)

# Large independent sample used to estimate the "true" performance of a solution
N_eval = 1000000
seed_eval = 987
data_eval = generate_data(seed_eval, k, N_eval)

# Other parameter values
par = 1
phi_div = phi.mod_chi2_cut
phi_dot = 2
numeric_precision = 1e-6 # To correct for floating-point math operations

# Alg parameters
time_limit_search = 1*60 # in seconds (time provided to search algorithm)
time_limit_solve = 5*60 # in seconds (for individual solves of SCP)
max_nr_solutions = 10000 # for easy problems with long time limits, we may want extra restriction
add_remove_threshold = 0.00 # This determines when randomness is introduced in add/removal decision
add_strategy = 'random_vio'
remove_strategy = 'random_active'
clean_strategy = (10, 'all_inactive')

random_seed_settings = list(range(1, 11))

run_count = 0
for random_seed in random_seed_settings:

    data = generate_data(random_seed, k, N_total)
    # Pass the absolute row count so the split size cannot be affected by
    # floating-point round-off in a train_size fraction.
    data_train, data_test = train_test_split(data, train_size=N_train, random_state=random_seed)
    # NOTE(review): this reuses random_seed, and generate_data reseeds and
    # draws row by row, so these scenarios repeat the leading rows of `data`
    # rather than forming an independent sample -- confirm this is intended.
    data_campi = generate_data(random_seed, k, N_campi)
    campi_runtime, campi_x, campi_obj = util.solve_with_campi_N(solve_SCP, data_campi, time_limit_solve)

    runtime_search, num_iter, solutions = rs.search_alg(
        data_train, N_test, beta, alpha, time_limit_search, time_limit_solve,
        max_nr_solutions, add_strategy, remove_strategy, clean_strategy,
        add_remove_threshold,
        par, phi_div, phi_dot, numeric_precision,
        solve_SCP, uncertain_constraint, random_seed)

    runtime_eval, best_sol, pareto_solutions = rs.evaluate_alg(
        solutions, data_test, beta, alpha, par, phi_div, phi_dot,
        uncertain_constraint, numeric_precision)

    # Get "true" performance using eval_data: share of evaluation scenarios
    # in which the constraint is not violated beyond the numeric tolerance
    constr = uncertain_constraint(data_eval, best_sol['sol'])
    num_violations = np.count_nonzero(constr > (0+numeric_precision))
    p_eval = 1 - (num_violations/N_eval)

    # We are also interested in the "true" VaR and CVaR under data_eval
    returns_eval = np.dot(data_eval, best_sol['sol'][1:])
    VaR_eval = np.percentile(returns_eval, 100*(1-beta), method='inverted_cdf')
    CVaR_eval = np.mean(returns_eval[(returns_eval <= VaR_eval)])

    # Get max scenario_set size (0 when no solutions were found)
    max_S = max((len(sol['scenario_set']) for sol in solutions), default=0)

    output_data[random_seed] = [best_sol['sol'], best_sol['obj'], best_sol['lb_train'], best_sol['lb_test'],
                                p_eval, VaR_eval, CVaR_eval, num_iter['add'], num_iter['remove'], num_iter['clean'],
                                len(solutions), max_S, best_sol['time'],
                                N_campi, campi_obj, campi_runtime]

    run_count += 1
    print("Completed run: " + str(run_count))

# Persist the headers and the per-seed rows as the text form of the Python objects
output_file_name = 'new_output_data'
with open(r'output/headers_'+output_file_name+'.txt','w+') as f:
    f.write(str(headers))
with open(r'output/'+output_file_name+'.txt','w+') as f:
    f.write(str(output_data))

In [None]:
# Read from file
import json
from numpy import array # add if the .txt file contains numpy arrays

output_file_name = 'PM_compare_bertsimas_phidot=1_N1=250_N2=250_alpha=0.1_beta=0.9_100_seeds_L=1min' 
# Read from .txt file
file_path = 'output/'+output_file_name+'.txt'
with open(file_path,'r') as f:
    dic = f.read() # whole file as one string
# SECURITY: eval() executes whatever the file contains. Only use this on
# output files produced by this notebook; never on files from an untrusted source.
# `array` must be in scope because the stored text contains array(...) literals.
output_data_read = eval(dic)

In [None]:
# Display the dictionary that was read back from file
output_data_read

In [None]:
# Use the results read from file for the aggregation below (this rebinds
# output_data, replacing any results held from a run in this session)
output_data = output_data_read

In [None]:
# Collect per-seed results into data frames so that the average and standard
# deviation across random seed runs can be computed
import pandas as pd

output_data_agg = {}
# Entry 5 of each row feeds df_VaR and entry 0 feeds df_Sol (assuming the row
# layout written by the experiment cell: solution vector first, VaR sixth)
df_VaR = pd.DataFrame({seed: pd.Series(row[5]) for seed, row in output_data.items()})
# Transposed so that every row of df_Sol belongs to one seed
df_Sol = pd.DataFrame({seed: pd.Series(row[0]) for seed, row in output_data.items()}).T

In [None]:
# Mean across the seed columns of df_VaR
df_VaR.mean(axis=1)

In [None]:
# Standard deviation across the seed columns of df_VaR
df_VaR.std(axis=1)

In [None]:
# Drop column 0 (the first entry of each solution vector) so that only the
# remaining entries are kept. Reassigning with errors='ignore' keeps this cell
# idempotent: re-running it no longer raises a KeyError for the missing column.
df_Sol = df_Sol.drop(columns=0, errors='ignore')
df_Sol

In [None]:
# NOTE(review): this rebinds `solutions`, which other cells use for the
# object returned by rs.search_alg, to the df_Sol data frame.
solutions = df_Sol

In [None]:
# Plot the portfolio holdings from the data frame assigned to `solutions`
dataio.plot_portfolio_holdings(solutions)