# Generating data

In [None]:
import os

import jax
import numpy as np
import pandas as pd
from scipy.special import logsumexp

In [None]:
import jax.numpy as jnp
import dill

In [None]:
# Feature generation: seed NumPy's global (legacy) random state so that the
# np.random.* draws made afterwards are repeatable from one run to the next.
np.random.seed(7)  # For reproducibility

In [None]:
def gen_xr_MELD(T=500, A=2, K=4, sigma=0.10):
    '''
    Simulate synthetic contexts and rewards under a three-phase reward rule.
    Inspired by the semisynthetic MELD example from (Huyuk, 2022).

    Every candidate is described by four features, stored in this order:
    waiting time, creatinine, INR, bilirubin. The T rounds are split into
    thirds and each third uses a different noiseless reward:

    * first third:  waiting_time + log(creatinine)
    * second third: 9.57*log(creatinine) + 11.2*log(INR) + 3.78*log(bilirubin)
    * last third:   waiting_time + log(INR) + log(bilirubin)

    Gaussian noise with mean 0 and standard deviation sqrt(sigma) is added to
    every reward.
    NOTE(review): run_experiment in this file uses sigma**2 as a variance,
    whereas here sigma itself plays the role of the variance -- confirm which
    convention is intended.

    Draws come from NumPy's global random state, in the same order as before
    (all features first, then all noise terms), so seeded runs are unchanged.

    Parameters
    ----------
    T : int
        Number of rounds.
    A : int
        Number of candidates (actions) per round.
    K : int
        Number of features per candidate. Must be 4, because exactly four
        features are generated.
    sigma : float
        Noise level; the noise standard deviation is sqrt(sigma).

    Returns
    -------
    data_x : ndarray of shape (T, A, K)
        Generated contexts.
    rewards : ndarray of shape (T, A)
        Noisy reward of every candidate in every round.

    Raises
    ------
    ValueError
        If K is not 4.
    '''
    n_features = 4  # waiting_time, creatinine, INR, bilirubin
    if K != n_features:
        # Fail before touching the random state instead of crashing with a
        # broadcast error halfway through the first round.
        raise ValueError(
            f"gen_xr_MELD generates exactly {n_features} features per "
            f"candidate, got K={K}"
        )

    # Generate context data
    data_x = np.zeros((T, A, K))
    for t in range(T):
        for a in range(A):
            waiting_time = np.random.exponential(scale=20)
            creatinine, INR, bilirubin = np.random.lognormal(mean=1, sigma=0.5, size=3)
            data_x[t, a, :] = [waiting_time, creatinine, INR, bilirubin]

    # Phase boundaries: [0, pre_end) pre-MELD, [pre_end, meld_end) MELD,
    # [meld_end, T) post-MELD.
    pre_end = T // 3
    meld_end = 2 * T // 3
    noise_std = np.sqrt(sigma)

    # Reward generation based on policy period
    rewards = np.zeros((T, A))
    for t in range(T):
        for a in range(A):
            if t < pre_end:
                rewards[t, a] = data_x[t, a, 0] + np.log(data_x[t, a, 1])
            elif t < meld_end:
                rewards[t, a] = 9.57 * np.log(data_x[t, a, 1]) + 11.2 * np.log(data_x[t, a, 2]) + 3.78 * np.log(data_x[t, a, 3])
            else:  # post_MELD
                rewards[t, a] = data_x[t, a, 0] + np.log(data_x[t, a, 2]) + np.log(data_x[t, a, 3])
            rewards[t, a] += np.random.normal(loc=0, scale=noise_std)

    return data_x, rewards

In [None]:
# ALGORITHMS AND THEIR POLICIES
def optimistic(beta_mean, x, beta_cov, alpha):
    '''
    Draw an action from a softmax over optimism-adjusted scores.

    The score of action i is alpha * (x[i] . beta_mean) plus the quadratic
    form x[i]^T beta_cov x[i]; note that alpha scales the mean term only.
    The returned index is sampled with np.random.choice.
    '''
    n_actions = x.shape[0]
    mean_term = np.einsum('ij,j->i', x, beta_mean)
    spread_term = np.einsum('ij,jk,ki->i', x, beta_cov, x.T)
    scores = alpha * mean_term + spread_term
    # Subtracting logsumexp normalises in log space before exponentiating.
    log_probs = scores - logsumexp(scores)
    return np.random.choice(np.arange(n_actions), p=np.exp(log_probs))

def softmax(beta_mean, x, alpha):
    '''
    Draw an action from a softmax over the predicted rewards.

    The logit of action i is alpha * (x[i] . beta_mean); the returned index
    is sampled with np.random.choice.
    '''
    n_actions = x.shape[0]
    logits = alpha * np.einsum('ij,j->i', x, beta_mean)
    # Subtracting logsumexp normalises in log space before exponentiating.
    log_probs = logits - logsumexp(logits)
    return np.random.choice(np.arange(n_actions), p=np.exp(log_probs))

def greedy(beta_mean, x, alpha):
    '''
    Pick the action with the largest predicted reward x[i] . beta_mean.

    alpha is not used; it is accepted so the signature matches the other
    policies.
    '''
    predicted_rewards = np.einsum('ij,j->i', x, beta_mean)
    return predicted_rewards.argmax()

def ucb(beta_mean, x, beta_cov, alpha):
    '''
    Pick the action with the largest upper confidence bound.

    The bound of action i is the predicted reward x[i] . beta_mean plus
    alpha times sqrt(x[i]^T beta_cov x[i]).
    '''
    quad_form = np.einsum('ij,jk,ki->i', x, beta_cov, x.T)
    upper_bounds = np.einsum('ij,j->i', x, beta_mean) + alpha * np.sqrt(quad_form)
    return upper_bounds.argmax()

def ts(beta_mean, x, beta_cov, alpha, num_samples=10):
    '''
    Thompson sampling, Monte Carlo version.

    Draws num_samples parameter vectors from N(beta_mean, beta_cov), records
    which action each sample ranks best, and then samples the returned action
    from the resulting empirical frequencies.

    Parameters
    ----------
    beta_mean : array of shape (K,)
        Mean of the parameter belief.
    x : array of shape (A, K)
        Feature vectors of the A available actions.
    beta_cov : array of shape (K, K)
        Covariance of the parameter belief.
    alpha : float
        Not used; accepted so the signature matches the other policies.
    num_samples : int
        Number of Monte Carlo draws; must be at least 1.

    Returns
    -------
    Index of the chosen action, as returned by np.random.choice.

    Raises
    ------
    ValueError
        If num_samples is smaller than 1 (the frequencies would be undefined).
    '''
    if num_samples < 1:
        raise ValueError(f"num_samples must be at least 1, got {num_samples}")

    n_actions = x.shape[0]

    # Shape (num_samples, K): one sampled parameter vector per row.
    sampled_betas = np.random.multivariate_normal(beta_mean, beta_cov, size=num_samples)

    # Shape (num_samples, A): score of every action under every sample.
    scores = np.dot(sampled_betas, x.T)

    # Shape (num_samples,): the winning action of each sample.
    best_actions = np.argmax(scores, axis=1)

    # Share of samples won by each action; minlength keeps never-winning
    # actions in the vector with frequency 0.
    freq = np.bincount(best_actions, minlength=n_actions) / num_samples

    return np.random.choice(np.arange(n_actions), p=freq)

def igw(beta_mean, x, alpha):
    '''
    Inverse-gap-weighted action choice.

    Every action other than the one with the highest predicted reward gets
    probability 1 / (A + alpha * gap), where gap is its shortfall from the
    best prediction and A is the number of actions. The best action receives
    the remaining probability mass. The returned index is sampled with
    np.random.choice.
    '''
    n_actions = x.shape[0]  # x is (A, K)
    predicted = np.einsum('ij,j->i', x, beta_mean)
    leader = np.argmax(predicted)

    # Inverse-gap weights for all actions; the leader's entry is a placeholder.
    probs = 1 / (n_actions + alpha * (predicted[leader] - predicted))
    probs[leader] = 0

    # The leader absorbs whatever mass the other actions left over.
    probs[leader] = 1 - np.sum(probs)

    return np.random.choice(np.arange(n_actions), p=probs)


In [None]:
def algo_wrapper(algo_name, beta_mean, x, beta_cov, alpha, **kwargs):
    '''
    Call the policy function named algo_name with the arguments it expects.

    The function is looked up by name among this module's globals.
    "greedy", "igw" and "softmax" are called without beta_cov; every other
    name (including "ucb", "ts" and "optimistic") is called with it. Extra
    keyword arguments are forwarded unchanged.

    Returns whatever the policy function returns.

    Raises
    ------
    KeyError
        If no global named algo_name exists.
    '''
    try:
        policy = globals()[algo_name]
    except KeyError:
        raise KeyError(
            f"unknown algorithm {algo_name!r}: no function with that name is defined"
        ) from None

    if algo_name in ("greedy", "igw", "softmax"):
        # These policies do not take the belief covariance.
        return policy(beta_mean, x, alpha=alpha, **kwargs)
    # All remaining policies share the covariance-aware signature.
    return policy(beta_mean, x, beta_cov, alpha=alpha, **kwargs)
            

In [None]:
def run_experiment(data_x, rewards, algo, sigma, alpha):
    '''
    Run one algorithm over pre-generated contexts and record its trajectory.

    The belief over the reward parameters is kept as a pair (beta_N, beta_y):
    each round uses mean inv(beta_N) @ beta_y and covariance
    sigma**2 * inv(beta_N), asks algo_wrapper for an action, observes that
    action's reward and adds the chosen features to beta_N and beta_y.

    Parameters
    ----------
    data_x : ndarray of shape (T, A, K)
        Contexts; data_x[t] holds the feature vectors offered in round t.
    rewards : ndarray indexed as rewards[t, a]
        Reward of every action in every round.
    algo : str
        Name of the algorithm, passed on to algo_wrapper.
    sigma : float
        Noise scale used for the belief covariance (sigma**2 * inv(beta_N)).
    alpha : float
        Algorithm parameter, passed on to algo_wrapper.

    Returns
    -------
    dict with keys
        'x'          : list of the T per-round context arrays,
        'a'          : list of the T chosen actions,
        'betas_mean' : list of the belief means used in each round,
        'betas_cov'  : list of the belief covariances used in each round,
        'rhox'       : the belief mean after the last update, divided by the
                       sum of its absolute values, as a plain list (an empty
                       list when T is 0).
    '''
    T, _, K = data_x.shape

    # Prior strength grows with the horizon. It is floored at 1 because a
    # strength of 0 (any T < 50) would make beta_N the zero matrix, which
    # cannot be inverted.
    prior_scale = max(T // 50, 1)
    beta_y = -np.ones(K) / K * prior_scale
    beta_N = np.eye(K) * prior_scale

    data = {'x': [], 'a': [], 'rhox': [], 'betas_mean': [], 'betas_cov': []}

    # Run the algorithm for T rounds
    for t in range(T):
        # Current beliefs; the inverse is shared by the mean and the covariance.
        N_inv = np.linalg.inv(beta_N)
        beta_mean = N_inv @ beta_y
        beta_cov = sigma**2 * N_inv

        # (x, a, r)
        x = data_x[t]
        a = algo_wrapper(algo, beta_mean, x, beta_cov, alpha)
        r = rewards[t, a]

        # Record what the algorithm saw and believed when it acted.
        data['x'].append(x)
        data['a'].append(a)
        data['betas_mean'].append(beta_mean)
        data['betas_cov'].append(beta_cov)

        # Update the belief statistics with the observed sample.
        beta_y = beta_y + r * x[a]
        beta_N = beta_N + np.outer(x[a], x[a])

    # Only the estimate after the final update is kept, so compute it once
    # here instead of recomputing and overwriting it every round.
    if T > 0:
        rhox = np.linalg.inv(beta_N) @ beta_y
        data['rhox'] = (rhox / np.abs(rhox).sum()).tolist()

    return data

In [None]:
# Experiment configuration.
output_dir = "../Datasets"
algos = ["igw", "optimistic", "softmax", "greedy", "ucb", "ts"]
alpha = 20
T = 1000
A = 3
K = 4
sigma = 0.10

# Make sure the target directory exists before any experiment is run, so a
# missing folder does not abort the loop at the first write.
os.makedirs(output_dir, exist_ok=True)

# One synthetic environment per dataset index (11 through 99); every
# algorithm is run on the same contexts and rewards of that environment.
for i in range(11, 100):
    data_x, rewards = gen_xr_MELD(T=T, A=A, K=K, sigma=sigma)

    for algo in algos:
        # Run the experiment with the same sigma that generated the data.
        experiment_data = run_experiment(data_x, rewards, algo, sigma=sigma, alpha=alpha)

        # Save the results
        filename = f'{output_dir}/dataset_{i}_{algo}.dill'
        with open(filename, 'wb') as f:
            dill.dump(experiment_data, f)

In [None]:
# Load one saved experiment back from its .dill file. The dataset index is
# fixed at 0 and `algo` is whatever value that name holds when this cell runs.
# NOTE(review): the generation loop in this file writes indices 11-99 only --
# confirm that a dataset with index 0 exists on disk.
i = 0
file_path = f'{output_dir}/dataset_{i}_{algo}.dill'
with open(file_path, 'rb') as f:
    data = dill.load(f)

In [None]:
# Sanity check: unpack the loaded experiment and print its main components.
data_x = np.array(data['x'])
data_a = np.array(data['a'])
rhox = data['rhox']
betas_mean = data['betas_mean']
betas_cov = data['betas_cov']  # unpacked for later use; not printed here

for _label, _value in (
    ("Contexts (data_x):", data_x),
    ("Actions (data_a):", data_a),
    ("Estimated rhox:", rhox),
    ("Beta means:", betas_mean),
):
    print(_label, _value)