In [None]:
%load_ext autoreload 
%autoreload 2

In [None]:
from utils.helper_functions import * 
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

In [None]:
# Experiment grid: benchmark location, reference sets, reference-set time windows,
# simulation seeds and simulated abundance levels.
benchmark_dirs = ["Connecticut"]
ref_sets = ["Connecticut", "USA", "North_America", "Global"]
dates = [
    "2020-01-01_till_2022-01-01",
    "2020-06-01_till_2022-01-01",
    "2021-01-01_till_2022-01-01",
    "2021-06-01_till_2022-01-01",
]
# Seeds 1..20.
seeds = list(range(1, 21))
# Abundances 1..9 in steps of 1, then 10..100 in steps of 10.
abundances = list(range(1, 10)) + list(range(10, 101, 10))

Write abundance predictions as JSON files

In [None]:
# Export the predictions of every reference set; each call covers all date
# windows and seeds of that reference set.
for reference_set in ref_sets:
    output_results_to_json_3_dirs(
        first_dir=reference_set,
        second_dirs=dates,
        third_dirs=seeds,
        threshold="0.1",
        abundances=abundances,
        seq_name="AY.103_sequence",
        prediction_level="VOC",
        VOC="delta",
    )

Load abundance predictions (absolute errors are calculated in the next step)

In [None]:
# One (initially empty) entry per reference set.
results = {dir_name: dict() for dir_name in ref_sets}

# Parsed content of the per-reference-set prediction files, keyed by reference set.
results_who = dict()
for dir_name in ref_sets:
    predictions_file = "statistical_analysis/results_who_{}.json".format(dir_name)
    with open(predictions_file) as json_file:
        results_who[dir_name] = json.load(json_file)

calculate absolute errors 

In [None]:
# Absolute errors of the loaded predictions. Later cells index the result as
# absolute_errors_who[ref_set][date][seed][abundance] with string seed/abundance keys.
# NOTE(review): presumably |predicted - simulated abundance|; confirm in utils.helper_functions.
absolute_errors_who = calculate_absolute_errors_af(results_who, seeds, abundances, dates, ref_sets)

scatter plot

In [None]:
def plot_with_scatterplots_custom(abundances, dates, results, ref_sets, special_plot_name = None, seed = "1"):
    """Draw a 2x2 grid of predicted-vs-simulated abundance scatter plots and save it as a PDF.

    One panel is drawn per reference set (the grid is filled row by row, so only
    the first four reference sets get a panel) and one scatter series per date
    window. A dashed black x = y line marks the true abundance in every panel.

    Parameters
    ----------
    abundances : sequence of numbers
        Simulated abundances. Used as x values and, converted with ``str``, as
        keys into ``results``.
    dates : sequence of str
        Date-window keys of ``results``; underscores are shown as spaces in the legend.
    results : dict
        Nested mapping indexed as ``results[ref_set][date][seed][str(abundance)]``.
    ref_sets : sequence of str
        Reference-set keys of ``results``; underscores are shown as spaces in the titles.
    special_plot_name : str, optional
        Basename (without extension) of the PDF written to ``figures/``.
        When omitted the file is ``figures/scatter_plot.pdf``.
    seed : str or int, optional
        Seed whose predictions are plotted; converted with ``str`` before lookup.
        Defaults to ``"1"``, the seed that was previously hard-coded.

    Returns
    -------
    None
        The figure is left open so the notebook can display it.
    """
    seed_key = str(seed)

    fig, ax = plt.subplots(2, 2, figsize=(20, 20))

    # ax.flat walks the 2x2 grid row by row: (0,0), (0,1), (1,0), (1,1).
    for ref_set, panel in zip(ref_sets, ax.flat):
        # Per-panel settings do not depend on the date, so set them once.
        panel.set_xscale('log')
        panel.set_title("{}".format(ref_set.replace("_", " ")), fontsize = 25, pad = 10)

        for date in dates:
            predicted = [results[ref_set][date][seed_key][str(abundance)] for abundance in abundances]
            panel.scatter(abundances, predicted, label = date.replace("_", " "), s = 170, alpha = 0.65)

        # plot x=y line black and dashed
        panel.plot(abundances, abundances, color='black', linestyle='dashed', label = "True abundance")
        panel.set_yticks([0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
        panel.tick_params(axis='both', labelsize=20)

    # One common legend outside the panels; all panels share the same series,
    # so the handles of the first panel are representative.
    handles, labels = ax[0][0].get_legend_handles_labels()
    fig.legend(handles, labels, loc='center right', bbox_to_anchor=(1.2, 0.5), fontsize = 20)

    # plot common x and y label
    fig.text(0.5, 0.04, 'Simulated abundance', ha='center', fontsize = 25)
    fig.text(0.04, 0.5, 'Predicted abundance', va='center', rotation='vertical', fontsize = 25)

    # save figure as pdf with tight layout
    fig.subplots_adjust(hspace=0.19)
    plot_name = "scatter_plot" if special_plot_name is None else special_plot_name
    fig.savefig("figures/{}.pdf".format(plot_name), bbox_inches='tight')

    return


# Scatter plot of the loaded predictions, written to figures/scatter_plot_who_custom.pdf.
plot_with_scatterplots_custom(
    abundances,
    dates,
    results_who,
    ref_sets,
    special_plot_name="scatter_plot_who_custom",
)

dataset info

In [None]:
dirs = ["Connecticut", "USA", "North_America", "Global"]
start_dates = ["2020-01-01_till_2022-01-01", "2020-06-01_till_2022-01-01", "2021-01-01_till_2022-01-01", "2021-06-01_till_2022-01-01"]

# Folder that holds one sub-folder per reference set, each with one sub-folder per date window.
reference_sets_root = "../../../data/Timeframe_Experiments/HPC/reference_sets"

# Write the dataset info for every (reference set, date window) combination.
# The loop variable is deliberately NOT called `dates`, so the global `dates`
# list used by the other cells is not overwritten with a single string.
for ref_set in dirs:
    for date_range in start_dates:
        path = "{}/{}/{}".format(reference_sets_root, ref_set, date_range)
        selection_path = "{}/processed".format(path)
        output_dataset_info("AY.103", path, selection_path)

# merge csv per directory
# NOTE(review): presumably combines the per-date-window CSVs below each reference-set
# folder; confirm the meaning of the last two arguments in utils.helper_functions.
for ref_set in dirs:
    merge_csv_from_subdirectory("{}/{}".format(reference_sets_root, ref_set), "Reference set", True)

statistical analysis

In [None]:
# Experiment grid for the statistical analysis (re-declared for this section).
benchmark_dirs = ["Connecticut"]
ref_sets = ["Connecticut", "USA", "North_America", "Global"]
dates = [
    "2020-01-01_till_2022-01-01",
    "2020-06-01_till_2022-01-01",
    "2021-01-01_till_2022-01-01",
    "2021-06-01_till_2022-01-01",
]
seeds = [*range(1, 21)]  # 1..20
abundances = [*range(1, 10), *range(10, 101, 10)]  # 1..9, then 10..100 in steps of 10

In [None]:
# Re-key the absolute errors from [ref_set][date][seed][abundance] to
# [date][ref_set][seed][abundance]. Seed and abundance keys are strings.
absolute_errors_resrtuctured = {
    date: {
        ref_set: {
            str(seed): {
                str(abundance): absolute_errors_who[ref_set][date][str(seed)][str(abundance)]
                for abundance in abundances
            }
            for seed in seeds
        }
        for ref_set in ref_sets
    }
    for date in dates
}

In [None]:
# One-sided Welch t-test (unequal variances) per earlier date window, reference set
# and abundance: are the absolute errors obtained with the most recent date window
# smaller than those obtained with the earlier one? The samples are the per-seed errors.

t_test_results = dict()

# The last entry of `dates` is the reference point; every other entry is compared to it.
recent_date = dates[-1]
earlier_dates = dates[:-1]

for earlier_date in earlier_dates:
    tests_for_date = t_test_results[earlier_date] = dict()
    for ref_set in ref_sets:
        tests_for_ref_set = tests_for_date[ref_set] = dict()
        for abundance_key in map(str, abundances):
            # absolute errors over all seeds for the earlier date window
            errors_earlier = [
                absolute_errors_resrtuctured[earlier_date][ref_set][str(seed)][abundance_key]
                for seed in seeds
            ]
            # absolute errors over all seeds for the most recent date window
            errors_recent = [
                absolute_errors_resrtuctured[recent_date][ref_set][str(seed)][abundance_key]
                for seed in seeds
            ]
            # alternative="less": mean(recent errors) < mean(earlier errors)
            tests_for_ref_set[abundance_key] = stats.ttest_ind(
                errors_recent, errors_earlier, equal_var=False, alternative="less"
            )


# Store the p-values per reference set as CSV: one row per abundance, one column per earlier date window.
for ref_set in ref_sets:
    p_value_table = pd.DataFrame({
        earlier_date: [
            t_test_results[earlier_date][ref_set][str(abundance)].pvalue
            for abundance in abundances
        ]
        for earlier_date in earlier_dates
    })
    p_value_table.index = abundances
    p_value_table.to_csv("statistical_analysis/t-test_results_{}.csv".format(ref_set))
            



In [None]:
# Heatmap of the t-test p-values (abundance vs. date window), one figure per reference set.
cmap = sns.diverging_palette(220, 20, as_cmap=True)

# Enlarge seaborn's fonts.
sns.set(font_scale=2)

for ref_set in ref_sets:
    p_values_csv = "statistical_analysis/t-test_results_{}.csv".format(ref_set)
    p_values = pd.read_csv(p_values_csv, index_col=0)
    # Column labels are date windows: show them with spaces instead of underscores.
    p_values.columns = p_values.columns.str.replace("_", " ")

    fig, ax = plt.subplots(figsize=(20, 20))
    # The colour scale is clipped to [0, 0.1].
    sns.heatmap(
        p_values,
        ax=ax,
        cmap=cmap,
        vmin=0,
        vmax=0.1,
        linewidths=0.5,
        cbar_kws={"shrink": 0.5},
    )

    ax.set_title(ref_set.replace("_", " "), fontsize=25, pad=10)
    ax.set_xlabel("Date", fontsize=25)
    ax.set_ylabel("Abundance", fontsize=25)
    ax.tick_params(axis='both', labelsize=20)

    # Save as PDF with a tight bounding box, then display.
    heatmap_pdf = "statistical_analysis/heatmap_{}.pdf".format(ref_set)
    plt.savefig(heatmap_pdf, bbox_inches='tight')

    plt.show()

distribution over absolute errors

In [None]:
# Experiment grid for the error-distribution plots (re-declared for this section).
benchmark_dirs = ["Connecticut"]
ref_sets = ["Connecticut", "USA", "North_America", "Global"]
dates = [
    "2020-01-01_till_2022-01-01",
    "2020-06-01_till_2022-01-01",
    "2021-01-01_till_2022-01-01",
    "2021-06-01_till_2022-01-01",
]
# Seeds 1..20.
seeds = list(range(1, 21))
# Abundances 1..10 in steps of 1, then 20..100 in steps of 10.
abundances = list(range(1, 11)) + list(range(20, 101, 10))


# One (initially empty) entry per reference set.
results = {dir_name: dict() for dir_name in ref_sets}

# Reload the per-reference-set prediction files, keyed by reference set.
results_who = dict()
for dir_name in ref_sets:
    predictions_file = "statistical_analysis/results_who_{}.json".format(dir_name)
    with open(predictions_file) as json_file:
        results_who[dir_name] = json.load(json_file)

# Absolute errors; indexed below as [ref_set][date][seed][abundance].
absolute_errors_who = calculate_absolute_errors_af(results_who, seeds, abundances, dates, ref_sets)

# Swap the two outer levels of the error mapping so that it is indexed as
# [date][ref_set][seed][abundance]; seed and abundance keys are strings.
absolute_errors_resrtuctured = {
    date: {
        ref_set: {
            str(seed): {
                str(abundance): absolute_errors_who[ref_set][date][str(seed)][str(abundance)]
                for abundance in abundances
            }
            for seed in seeds
        }
        for ref_set in ref_sets
    }
    for date in dates
}

# Plot the distribution of the absolute errors over the seeds, one histogram per
# (date window, reference set, abundance) combination.

for date in dates:
    for ref_set in ref_sets:
        for ab in abundances:
            errors_over_seeds = [
                absolute_errors_resrtuctured[date][ref_set][str(seed)][str(ab)]
                for seed in seeds
            ]

            fig, ax = plt.subplots(figsize=(20, 20))
            ax.hist(errors_over_seeds, bins=100)
            ax.set_title("{} {} {}".format(ref_set, date, ab), fontsize = 25, pad = 10)
            ax.set_xlabel("Absolute error", fontsize = 25)
            ax.set_ylabel("Frequency", fontsize = 25)
            ax.tick_params(axis='both', labelsize=20)
            plt.show()
            # This loop creates one large figure per combination; close each one
            # after it has been shown so the figures do not accumulate in memory.
            plt.close(fig)