In [None]:
import warnings
from itertools import combinations, permutations

import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import kendalltau, mannwhitneyu
warnings.filterwarnings('ignore')

In [None]:


# Render floats with at most six significant digits in displayed frames.
pd.set_option('display.float_format', '{:.6g}'.format)
df_save_path = 'tmp/evaluation_results'

# Read the pickled evaluation results.
# NOTE(review): fillna(np.nan) presumably normalises None-style gaps to NaN - confirm.
results_df = pd.read_pickle(df_save_path).fillna(np.nan)
# Keep rows whose save_type is 'x_interval' or is missing altogether.
results_df = results_df.loc[
    results_df['save_type'].eq('x_interval') | results_df['save_type'].isna()
]
# Distinct algorithm names found in the filtered results.
algos = results_df['algo'].unique().tolist()
# Subset restricted to runs flagged trained_model == True.
trained_results_df = results_df.loc[results_df['trained_model'].eq(True)]
results_df

#

# Test if training helps
$H_{0}$: Training the discretization model has no effect on the final rewards
$H_{a}$: Training the discretization model results in greater rewards

In [None]:
# One-sided Mann-Whitney U test per algorithm: are rewards with trained_model == True
# stochastically greater than rewards with trained_model == False?
training_hypothesis_columns = ['algo','x','y','p-value']
training_hypothesis_rows = []
for algo in results_df['algo'].unique():
    # Filter once per algorithm and build the trained/untrained masks on that same
    # subset, so no mask has to be implicitly reindexed against a different frame.
    algo_df = results_df[results_df['algo'] == algo]
    trained_rewards = algo_df.loc[algo_df['trained_model'] == True, 'rewards'].tolist()
    untrained_rewards = algo_df.loc[algo_df['trained_model'] == False, 'rewards'].tolist()
    sns.displot(algo_df, x="rewards", hue="trained_model", kind="kde", fill=True, label=algo)
    # mannwhitneyu returns (U statistic, p-value); only the p-value is reported.
    _, p_value = mannwhitneyu(trained_rewards, untrained_rewards, alternative='greater')
    training_hypothesis_rows.append([algo, True, False, p_value])
# Build the summary frame once instead of concatenating inside the loop.
training_hypothesis_df = pd.DataFrame(training_hypothesis_rows, columns=training_hypothesis_columns)
print(training_hypothesis_df)

In [None]:

# Every ordered pair (A, B) of distinct algorithms; both (A, B) and (B, A) are present.
algo_combinations = [pair for pair in permutations(algos, 2)]
algo_combinations

# Test if the results of one algorithm are better than those of another
$H_{0}$: Algorithm A results in the same rewards as algorithm B
$H_{a}$: Algorithm A results in greater rewards than algorithm B

In [None]:
# One-sided Mann-Whitney U test for every ordered algorithm pair (A, B), using only
# runs with trained_model == True: are A's rewards stochastically greater than B's?
algo_comp_hypothesis_columns = ['algo_A','algo_B','p-value']
algo_comp_hypothesis_rows = []
# Select the trained runs once; the selection does not depend on the pair being tested.
trained_only_df = results_df[results_df['trained_model'] == True]
for algo_combination in algo_combinations:
    print(algo_combination)
    # Rewards of the two algorithms, in the same order as the pair.
    trained_rewards_algo = [
        trained_only_df.loc[trained_only_df['algo'] == algo, 'rewards'].tolist()
        for algo in algo_combination
    ]

    sns.displot(trained_only_df[trained_only_df['algo'].isin(algo_combination)], x="rewards", hue="algo", kind="kde", fill=True)
    # mannwhitneyu returns (U statistic, p-value); only the p-value is reported.
    _, p_value = mannwhitneyu(trained_rewards_algo[0], trained_rewards_algo[1], alternative='greater')
    algo_comp_hypothesis_rows.append([algo_combination[0], algo_combination[1], p_value])
# Build the summary frame once instead of concatenating inside the loop.
algo_comp_hypothesis_df = pd.DataFrame(algo_comp_hypothesis_rows, columns=algo_comp_hypothesis_columns)
print(algo_comp_hypothesis_df)

In [None]:
def calculate_corr(df,x,y):
    """Compute Kendall's tau between two columns separately for each algorithm.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an 'algo' column plus the columns named by ``x`` and ``y``.
    x, y : str
        Names of the two columns to correlate.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value of ``df['algo']`` (in order of first appearance)
        with columns 'algo', 'corr' (Kendall's tau) and 'p_value'. The index is a
        plain 0..n-1 range. An empty ``df`` yields an empty frame with these columns.
    """
    col = ['algo','corr','p_value']
    rows = []
    for algo in df['algo'].unique():
        # Filter once per algorithm rather than once per column.
        algo_df = df[df['algo'] == algo]
        corr, p_value = kendalltau(algo_df[x].tolist(), algo_df[y].tolist())
        rows.append([algo, corr, p_value])
    # Build the result in one go: avoids quadratic concat-in-loop and the duplicate
    # index labels it produced.
    return pd.DataFrame(rows, columns=col)

# Correlation between rewards and dataset size
$H_{0}$: Rewards are independent from the dataset size used for the offline algorithms
$H_{a}$: Rewards are dependent on the dataset size used for the offline algorithms

In [None]:
import scipy

# Algorithms treated as offline in this analysis.
offline_algos = ['rmin', 'policy_iteration']
# Trained runs belonging to one of the offline algorithms.
is_offline_algo = trained_results_df['algo'].isin(offline_algos)
offline_trained_df = trained_results_df.loc[is_offline_algo]

# Per-algorithm correlation between dataset size and rewards.
dataset_size_corr_df = calculate_corr(offline_trained_df, x='dataset_size', y='rewards')
print(dataset_size_corr_df)

In [None]:
# Inspect the offline trained runs of the 'rmin' algorithm.
offline_trained_df.loc[offline_trained_df['algo'].eq('rmin')]

# Correlation between rewards and model loss
$H_{0}$: Rewards are independent from the loss of the discretizing model
$H_{a}$: Rewards are dependent on the loss of the discretizing model

In [None]:

# Keep trained runs whose dataset_size is either missing (NaN) or exactly 10000.
dataset_size_filter = trained_results_df['dataset_size'].isin([np.nan,10000])
loss_test_df = trained_results_df.loc[dataset_size_filter]
# Per-algorithm correlation between model loss and rewards.
model_loss_corr_df = calculate_corr(loss_test_df, x='model_loss', y='rewards')
print(model_loss_corr_df)

# Correlation between rewards and total_states
$H_{0}$: Rewards are independent from the total states in the policy
$H_{a}$: Rewards are dependent on the total states in the policy

In [None]:
# Per-algorithm correlation between the total_states column and rewards.
total_states_corr_df = calculate_corr(loss_test_df, x='total_states', y='rewards')
print(total_states_corr_df)

In [None]:
# Inspect the rows of the loss-test subset that belong to 'q_learning'.
loss_test_df.loc[loss_test_df['algo'].eq('q_learning')]

# Correlation between rewards and observed states
$H_{0}$: Rewards are independent from the observed states during evaluation
$H_{a}$: Rewards are dependent on the observed states during evaluation

In [None]:
# Per-algorithm correlation between the unique_obs column and rewards.
observed_states_corr_df = calculate_corr(loss_test_df, x='unique_obs', y='rewards')
print(observed_states_corr_df)

log of model loss

box plot