In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import os

import h2o
import seaborn as sns
sys.path.insert(0, '..')
from DataModule.Data_Preparation import CoronnaCERTAINDataset

from sklearn import metrics
import matplotlib
matplotlib.__version__

In [None]:
def responseClassify_binary(row, baseline, _next):
    """Label one row as "Responder" or "Nonresponder".

    ``row[_next]`` is read as the change (delta) relative to ``row[baseline]``;
    the follow-up score is derived as ``row[baseline] - row[_next]``.

    Rules:
      * change <= 0.6                      -> "Nonresponder"
      * 0.6 < change <= 1.2                -> "Nonresponder" if the follow-up
                                              score is above 5.1, else "Responder"
      * change > 1.2                       -> "Responder"
      * anything else (e.g. NaN change)    -> the integer 2 (kept as the
                                              sentinel the original returned)

    Args:
        row: mapping-like object (dict, pandas Series) indexable by column name.
        baseline: key of the baseline score in ``row``.
        _next: key of the change value in ``row``.

    Returns:
        "Responder", "Nonresponder", or 2 when no comparison holds.

    Unlike the previous version, ``row`` is NOT modified: the follow-up score
    is kept in a local variable instead of being written back to ``row[_next]``
    (pandas documents mutating the object passed to ``apply`` as unsupported).
    """
    # Thresholds on the change and on the follow-up score.
    lower_change = 0.6
    upper_change = 1.2
    high_score = 5.1

    change = row[_next]
    followup_score = row[baseline] - change

    if change <= lower_change:
        return "Nonresponder"

    if change <= upper_change:
        # Moderate change only counts when the follow-up score is not high.
        return "Nonresponder" if followup_score > high_score else "Responder"

    if change > upper_change:
        # The original split this branch on followup_score > 3.2, but both
        # sides returned "Responder", so the split is collapsed here.
        return "Responder"

    # Reached only when every comparison is False (e.g. change is NaN).
    return 2
    
def MSE(true, pred):
    """Return sklearn's mean squared error of ``pred`` against ``true``."""
    mse = metrics.mean_squared_error(y_true=true, y_pred=pred)
    return mse
    
def Classification_Accuracy(true, pred):
    """Return sklearn's accuracy score of ``pred`` against ``true``."""
    accuracy = metrics.accuracy_score(y_true=true, y_pred=pred)
    return accuracy

def R2(true, pred):
    """Return sklearn's R^2 (coefficient of determination) of ``pred`` against ``true``."""
    r_squared = metrics.r2_score(y_true=true, y_pred=pred)
    return r_squared

def Adjusted_R2(true, pred, p):
    """Return the adjusted R^2: ``1 - (1 - R2) * (n - 1) / (n - p - 1)``.

    Args:
        true: observed values; ``n`` is taken as ``len(true)``.
        pred: predicted values.
        p: number of predictors used in the adjustment.

    Raises:
        ZeroDivisionError: when ``n - p - 1`` is zero.
    """
    sample_count = len(true)
    unexplained = 1 - R2(true, pred)
    degrees_of_freedom = sample_count - p - 1
    return 1 - (unexplained * (sample_count - 1)) / degrees_of_freedom

def F1_Score(true, pred):
    """Return sklearn's macro-averaged F1 score of ``pred`` against ``true``."""
    macro_f1 = metrics.f1_score(y_true=true, y_pred=pred, average='macro')
    return macro_f1

def RPT(stability,performance,beta=1):
    """Combine stability and performance into one F-beta-style score.

    Computes ``(beta^2 + 1) * stability * performance / (beta^2 * stability + performance)``.

    Raises:
        ZeroDivisionError: when ``beta^2 * stability + performance`` is zero.
    """
    weight = beta**2
    numerator = (weight+1)*stability*performance
    denominator = weight*stability+performance
    return numerator / denominator

def calculate_RPT(row, metrics):
    """Compute the RPT score of one leaderboard row for the given metric name.

    Reads ``row['<metrics>_std']`` as stability and ``row['<metrics>_mean']`` as
    performance. For ``'mse'`` both values are replaced by their reciprocals
    before being passed to ``RPT`` with ``beta=1``.

    NOTE(review): the parameter name ``metrics`` shadows the ``sklearn.metrics``
    module inside this function; it is kept because callers may pass it by name.
    """
    stability = row[f'{metrics}_std']
    performance = row[f'{metrics}_mean']
    if metrics == 'mse':
        # Reciprocals of both values for MSE.
        performance, stability = 1 / performance, 1 / stability
    return RPT(stability, performance, beta=1)

In [None]:
# Load the leaderboard CSV; later cells read its model_id, mse_mean and
# mse_std columns.
df = pd.read_csv('../leaderboard/output.csv')
df

In [None]:
def add_model_family(row):
    """Derive the model family from ``row['model_id']``.

    Ids containing "StackedEnsemble" map to their first two underscore-separated
    tokens (e.g. ``StackedEnsemble_BestOfFamily``); every other id maps to its
    first token (e.g. ``GLM``).
    """
    model_id = row['model_id']
    tokens = model_id.split('_')
    if "StackedEnsemble" not in model_id:
        return tokens[0]
    return '_'.join(tokens[:2])

In [None]:
# Build the working frame with a model_family column derived from model_id.
# `assign` returns a new frame, so `df` (already displayed above) is left
# untouched and re-running this cell always gives the same result.
df_dev = df.assign(model_family=df.apply(add_model_family, axis=1))
df_dev

In [None]:
# Persist the frame (including model_family) one directory above the notebook,
# without the pandas index column.
df_dev.to_csv("../imputation_comparison.csv",index=False)

In [None]:
# Show the 20 rows with the lowest mean MSE, together with their MSE spread.
df_dev.sort_values(by='mse_mean',ascending=True)[['model_id','mse_mean','mse_std']].head(20)

In [None]:
def error_bar_plot(df, evaluation_metrics):
    """Plot the best model of each family with its spread, and mean vs. std.

    For every ``model_family`` the first row after sorting by
    ``<evaluation_metrics>_mean`` is kept (descending for 'r2', 'RPT' and
    'accuracy', ascending otherwise). The left axes shows one bar per family;
    unless the metric is 'RPT', it also gets bar labels and error bars from
    ``<evaluation_metrics>_std``, and the right axes shows a mean/std scatter
    with one text label per family.

    Args:
        df: frame with ``model_family``, ``<evaluation_metrics>_mean`` and
            ``<evaluation_metrics>_std`` columns.
        evaluation_metrics: metric prefix, e.g. 'mse', 'accuracy', 'r2', 'RPT'.

    Side effects:
        For every metric except 'RPT', saves the region of the right axes to
        ``<evaluation_metrics>.png`` in the working directory.

    Changes from the previous version:
      * rows are read by iteration instead of ``y[i]``; integer keys on a
        label-indexed Series are a deprecated positional fallback in pandas 2.x;
      * the figure is saved after the right axes is drawn and laid out (it was
        saved while still empty) and is named after the metric, not always
        'mse.png';
      * accuracy error bars are scaled to percent, matching the bars;
      * 'accuracy' is ranked best-first (highest mean) like 'r2' and 'RPT'.
    """
    mean_col = f'{evaluation_metrics}_mean'
    std_col = f'{evaluation_metrics}_std'

    # Metrics where a larger mean is better are ranked in descending order.
    ascending = evaluation_metrics not in ('r2', 'RPT', 'accuracy')
    selected_models = df.sort_values(by=mean_col, ascending=ascending).groupby('model_family').first()
    selected_models = selected_models.sort_values(by=mean_col, ascending=ascending)
    y = selected_models[mean_col]
    x = y.index
    y_error = selected_models[std_col]

    y_1 = [round(a, 2) for a in y]
    y_perc = [round(a * 100, 1) for a in y]
    # The first (best-ranked) bar is highlighted.
    colors = ["#F9665E" if i == 0 else "#799FCB" for i in range(len(x))]

    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(20, 6)

    # `scale` keeps the error bars in the same unit as the plotted bars.
    scale = 1
    if evaluation_metrics == 'mse':
        ax1.set_ylim(0, max(y + y_error) + 0.1)
        bar_heights = y_1
    elif evaluation_metrics == 'accuracy':
        ax1.set_ylim(50, 100)
        bar_heights = y_perc
        scale = 100
    elif evaluation_metrics == 'RPT':
        ax1.set_ylim(1, 1.75)
        bar_heights = y_1
    else:
        bar_heights = y
    bars = ax1.bar(x, bar_heights, color=colors)

    ax1.set_xlabel('Model', fontsize=20)
    ax1.set_ylabel(evaluation_metrics.upper(), fontsize=20)

    draw_spread = evaluation_metrics != 'RPT'
    if draw_spread:
        ax1.bar_label(bars, label_type='center', color='white', fontsize=12)

        ax1.errorbar(x, y * scale, yerr=y_error * scale,
                     fmt='o', color='orange', ecolor='orange',
                     elinewidth=3, capsize=10)

        ax2.scatter(y, y_error)
        for family, mean_value, std_value in zip(x, y, y_error):
            # 1.22 is a hand-tuned x position that flips the label side.
            alignment = 'left' if mean_value < 1.22 else 'right'
            ax2.text(x=mean_value, y=std_value, s=family,
                     horizontalalignment=alignment, verticalalignment='bottom',
                     fontdict=dict(color='black', alpha=0.8, size=16))
        ax2.plot([min(y), max(y)], [min(y_error), max(y_error)], ls="--", c=".3")
        ax2.set_xlabel(mean_col, fontsize=20)
        ax2.set_ylabel(std_col, fontsize=20)

    plt.setp(ax1.get_xticklabels(), rotation=30, horizontalalignment='right')

    plt.subplots_adjust(left=0.1,
                        bottom=0.1,
                        right=0.9,
                        top=0.9,
                        wspace=0.2,
                        hspace=0.6)

    if draw_spread:
        # Save only after drawing and layout so the exported region has content.
        extent = ax2.get_window_extent().transformed(fig.dpi_scale_trans.inverted())
        fig.savefig(f'{evaluation_metrics}.png', bbox_inches=extent)

In [None]:
# Plot the lowest-MSE model of each family with its spread.
error_bar_plot(df_dev, 'mse')

In [None]:
# Build the regression-delta dataset and load its train/test splits into H2O.
# The inline "#option:" notes list the values each argument accepts.
dataset = CoronnaCERTAINDataset(
    library_root = 'Dataset/',
    challenge = 'regression_delta', #option: regression, regression_delta, classification, binary_classification, regression_delta_binary
    dataset = 'CORRONA CERTAIN', 
    process_approach = 'SC', #option: KVB, SC
    imputation = None, #option: SimpleFill, KNN, SoftImpute, BiScaler, NuclearNormMinimization, IterativeImputer, IterativeSVD, None(raw)
    patient_group = ['bionaive TNF'], #option: "all", "bioexp nTNF", "bionaive TNF", "bionaive orencia", "KVB"
    drug_group = 'all', #option: "all", "actemra", "cimzia", "enbrel", "humira", "orencia", "remicade", "rituxan", "simponi"
    time_points = (0,3), 
    train_test_rate = 0.8,
    remove_low_DAS = True,
    save_csv = False, 
    random_state = 2022,
    verbose=False)

# Each getter returns the split together with a location that is handed to
# h2o.import_file below.
train, train_loc = dataset.get_train()
test, test_loc = dataset.get_test()

# Start the H2O cluster (locally)
h2o.init()

# Load the train/test files at train_loc / test_loc into H2O frames.
# train_h2o = h2o.upload_file(str(train_loc))
# test_h2o = h2o.upload_file(str(test_loc))
train_h2o = h2o.import_file(str(train_loc))
test_h2o = h2o.import_file(str(test_loc))

# Identify predictors and response
# NOTE(review): taking every column but the last as predictors assumes the
# target is the last column — TODO confirm.
x = train_h2o.columns[:-1]
# y = "DAS28_CRP_3M"
y = dataset.target

# Mark categorical features as factors in both frames.
for feature in dataset.categorical:
    train_h2o[feature] = train_h2o[feature].asfactor()
    test_h2o[feature] = test_h2o[feature].asfactor()
# For classification challenges the target is converted to a factor as well.
if "classification" in dataset.challenge:
    train_h2o[y] = train_h2o[y].asfactor()
    test_h2o[y] = test_h2o[y].asfactor()

In [None]:
# Regression: evaluate the lowest-MSE model of each family on the test split.
evaluation_metrics = 'mse'
model_path_dir = '../leaderboard/model_saved/'
# One row per model family (the one with the smallest mean MSE), best first.
best_per_family = df_dev.loc[df_dev.groupby('model_family')[f'{evaluation_metrics}_mean'].idxmin()]
model_id_list = list(
    best_per_family.reset_index(drop=True)
    .sort_values(by=f'{evaluation_metrics}_mean')['model_id'].values)

for i, model_id in enumerate(model_id_list):
    model_path = os.path.join(model_path_dir, model_id)
    uploaded_model = h2o.upload_model(model_path)
    m = h2o.get_model(model_id)
    print(m)
    print(m.model_performance(test_h2o))

    X = test.drop(columns=dataset.target)
    true = test[dataset.target]
    # Predict once and reuse the result; the previous version ran predict()
    # three times and model_performance() twice per model.
    pred = m.predict(test_h2o).as_data_frame()['predict']

    print("R2", R2(true, pred))
    # p must count predictors only, so the target column is excluded
    # (the previous version passed len(test.columns), which includes it).
    # NOTE(review): assumes every remaining column of `test` is a predictor.
    print("Adjusted_R2", Adjusted_R2(true, pred, len(X.columns)))
    print()

    baseline = test['DAS28_CRP_0M']

    baseline, true, pred = np.array(baseline), np.array(true), np.squeeze(np.array(pred))
    results_df = pd.DataFrame(list(zip(baseline, true, pred)),
                              columns=['baseline', 'true', 'pred'])

    # Turn predicted and observed values into responder labels.
    classification_pred = results_df.apply(
        lambda row: responseClassify_binary(row, 'baseline', 'pred'), axis=1)

    classification_true = results_df.apply(
        lambda row: responseClassify_binary(row, 'baseline', 'true'), axis=1)

    print("classification accuracy", Classification_Accuracy(classification_true, classification_pred))
    print("F1 score", F1_Score(classification_true, classification_pred))

    # Column-normalised: each prediction column sums to 1.
    contingency_matrix = pd.crosstab(classification_true, classification_pred,
                                     rownames=['true'], colnames=['prediction'],
                                     normalize='columns')
    plt.figure(i)
    sns.heatmap(contingency_matrix.T, annot=True,
                fmt='.2f', cmap="YlGnBu", cbar=False)
    plt.show()


In [None]:
# Upload one saved GLM model to the H2O cluster and fetch it by id for inspection.
model_path = '../leaderboard/model_saved/GLM_1_AutoML_10_20220803_231402'
uploaded_model = h2o.upload_model(model_path)
m = h2o.get_model('GLM_1_AutoML_10_20220803_231402')
m

In [None]:
# Variable-importance plot for the model held in `m`.
m.varimp_plot()

In [None]:
# Variable-importance heatmap across the best model of each family.
# NOTE(review): relies on `evaluation_metrics` and `test_h2o` left in the
# kernel by earlier cells.
df = pd.read_csv('../leaderboard/SC_regression_Aug3_final_output.csv')
df.loc[:,'model_family'] = df.apply(lambda row:add_model_family(row),axis=1)

# Keep the first row of each family after sorting by the mean metric, best first.
selected_models = df.sort_values(by=f'{evaluation_metrics}_mean',ascending=True).groupby('model_family').first()
selected_models = selected_models.sort_values(by=f'{evaluation_metrics}_mean', ascending=True)
model_list = selected_models['model_id'].values
        
# model_list = df['model_id'].values
# NOTE(review): `df` is rebound here from a pandas DataFrame to an H2OFrame.
df = h2o.H2OFrame(selected_models)

# Upload every selected model to the H2O cluster.
for model in model_list:
    model_path = os.path.join('../leaderboard/model_saved/', model)
    h2o.upload_model(model_path)
    
print(df)
    
h2o.varimp_heatmap(df, test_h2o,num_of_features=10)

# Binary Classification

In [None]:
# Load the binary-classification leaderboard CSV.
df = pd.read_csv('../leaderboard/SC_binary_classification_Aug4_final_output.csv')
df

In [None]:
# Build the working frame with a model_family column derived from model_id.
# `assign` returns a new frame, so `df` (already displayed above) is left
# untouched and re-running this cell always gives the same result.
df_dev = df.assign(model_family=df.apply(add_model_family, axis=1))
df_dev

In [None]:
def addlabels(ax,x,y):
    """Write each value of ``y`` in white on ``ax``, centred on its bar.

    The label for bar ``i`` is placed at x position ``i`` and at height
    ``(y[i] + 50) // 2``. ``x`` is only used for its length.
    """
    label_style = dict(ha = 'center', color='white', fontsize=12)
    for position, _ in enumerate(x):
        value = y[position]
        ax.text(position, (value+50)//2, value, **label_style)

In [None]:
def error_bar_plot(df, evaluation_metrics):
    """Plot the highest-scoring model of each family (accuracy layout).

    For every ``model_family`` the first row after sorting by
    ``<evaluation_metrics>_mean`` in descending order is kept. The left axes
    shows one bar per family with percentage labels (via ``addlabels``) and
    percentage error bars; the right axes shows a mean/std scatter with one
    text label per family.

    Args:
        df: frame with ``model_family``, ``<evaluation_metrics>_mean`` and
            ``<evaluation_metrics>_std`` columns.
        evaluation_metrics: metric prefix, e.g. 'accuracy'.

    Change from the previous version: rows are read by iteration instead of
    ``y[i]`` / ``y_error[i]``; integer keys on a label-indexed Series are a
    deprecated positional fallback in pandas 2.x.

    NOTE(review): the y-axis label, the labels and the error bars are always in
    percent, so only the 'accuracy' branch produces a consistent left panel.
    """
    mean_col = f'{evaluation_metrics}_mean'
    std_col = f'{evaluation_metrics}_std'

    selected_models = df.sort_values(by=mean_col, ascending=False).groupby('model_family').first()
    selected_models = selected_models.sort_values(by=mean_col, ascending=False)
    y = selected_models[mean_col]
    x = y.index
    y_error = selected_models[std_col]

    y_1 = [round(a, 2) for a in y]
    y_error_perc = [a * 100 for a in y_error]
    y_perc = [round(a * 100, 1) for a in y]
    # The first (best-ranked) bar is highlighted.
    colors = ["#F9665E" if i == 0 else "#799FCB" for i in range(len(x))]

    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(20, 5)

    if evaluation_metrics == 'mse':
        ax1.set_ylim(0, max(y + y_error) + 0.5)
        ax1.bar(x, y_1, color=colors)
    elif evaluation_metrics == 'accuracy':
        ax1.set_ylim(50, 100)
        ax1.bar(x, y_perc, color=colors)
    else:
        ax1.bar(x, y, color=colors)

    addlabels(ax1, x, y_perc)
    ax1.set_xlabel('Model', fontsize=20)
    ax1.set_ylabel('Accuracy(%)', fontsize=20)

    ax1.errorbar(x, y_perc, yerr=y_error_perc,
                 fmt='o', color='orange', ecolor='orange',
                 elinewidth=3, capsize=10)

    ax2.scatter(y, y_error)
    for family, mean_value, std_value in zip(x, y, y_error):
        # 0.800 is a hand-tuned x position that flips the label side.
        alignment = 'left' if mean_value < 0.800 else 'right'
        ax2.text(x=mean_value, y=std_value, s=family,
                 horizontalalignment=alignment, verticalalignment='bottom',
                 fontdict=dict(color='black', alpha=0.8, size=16))

    # Dashed guide from (lowest mean, largest std) to (highest mean, smallest std).
    ax2.plot([min(y), max(y)], [max(y_error), min(y_error)], ls="--", c=".3")
    ax2.set_xlabel(mean_col, fontsize=20)
    ax2.set_ylabel(std_col, fontsize=20)

    plt.setp(ax1.get_xticklabels(), rotation=30, horizontalalignment='right')

    plt.subplots_adjust(left=0.1,
                        bottom=0.1,
                        right=0.9,
                        top=0.9,
                        wspace=0.3,
                        hspace=0.4)
    

In [None]:
# Plot the highest-accuracy model of each family with its spread.
error_bar_plot(df_dev, 'accuracy')

In [None]:
def error_bar_plot(df, evaluation_metrics):
    """Plot the highest-scoring model of each family (AUC layout).

    For every ``model_family`` the first row after sorting by
    ``<evaluation_metrics>_mean`` in descending order is kept. The left axes
    shows one labelled bar per family with error bars from
    ``<evaluation_metrics>_std``; the right axes shows a mean/std scatter with
    one text label per family.

    Args:
        df: frame with ``model_family``, ``<evaluation_metrics>_mean`` and
            ``<evaluation_metrics>_std`` columns.
        evaluation_metrics: metric prefix, e.g. 'auc'.

    Change from the previous version: rows are read by iteration instead of
    ``y[i]`` / ``y_error[i]``; integer keys on a label-indexed Series are a
    deprecated positional fallback in pandas 2.x.

    NOTE(review): the y-axis label is always 'Area under ROC', whatever the
    metric.
    """
    mean_col = f'{evaluation_metrics}_mean'
    std_col = f'{evaluation_metrics}_std'

    selected_models = df.sort_values(by=mean_col, ascending=False).groupby('model_family').first()
    selected_models = selected_models.sort_values(by=mean_col, ascending=False)
    y = selected_models[mean_col]
    x = y.index
    y_error = selected_models[std_col]

    y_1 = [round(a, 2) for a in y]
    # NOTE(review): the SECOND bar (i == 1) is highlighted here, unlike the
    # accuracy plot, which highlights the first — confirm this is intended.
    colors = ["#F9665E" if i == 1 else "#799FCB" for i in range(len(x))]

    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(20, 5)

    if evaluation_metrics == 'mse':
        ax1.set_ylim(0, max(y + y_error) + 0.5)
        bars = ax1.bar(x, y_1, color=colors)
    elif evaluation_metrics == 'auc':
        ax1.set_ylim(0, 1)
        bars = ax1.bar(x, y_1, color=colors)
    else:
        bars = ax1.bar(x, y, color=colors)

    ax1.bar_label(bars, label_type='center', color='white', fontsize=12)
    ax1.set_xlabel('Model', fontsize=20)
    ax1.set_ylabel('Area under ROC', fontsize=20)

    ax1.errorbar(x, y, yerr=y_error,
                 fmt='o', color='orange', ecolor='orange',
                 elinewidth=3, capsize=10)

    ax2.scatter(y, y_error)
    for family, mean_value, std_value in zip(x, y, y_error):
        # 0.800 is a hand-tuned x position that flips the label side.
        alignment = 'left' if mean_value < 0.800 else 'right'
        ax2.text(x=mean_value, y=std_value, s=family,
                 horizontalalignment=alignment, verticalalignment='bottom',
                 fontdict=dict(color='black', alpha=0.8, size=16))

    # Dashed guide from (lowest mean, largest std) to (highest mean, smallest std).
    ax2.plot([min(y), max(y)], [max(y_error), min(y_error)], ls="--", c=".3")
    ax2.set_xlabel(mean_col, fontsize=20)
    ax2.set_ylabel(std_col, fontsize=20)

    plt.setp(ax1.get_xticklabels(), rotation=30, horizontalalignment='right')

    plt.subplots_adjust(left=0.1,
                        bottom=0.1,
                        right=0.9,
                        top=0.9,
                        wspace=0.3,
                        hspace=0.2)
    

In [None]:
# Plot the highest-AUC model of each family with its spread.
error_bar_plot(df_dev, 'auc')

In [None]:
# Build the binary-classification dataset and load its train/test splits into H2O.
# The inline "#option:" notes list the values each argument accepts.
dataset = CoronnaCERTAINDataset(
    library_root = 'Dataset/',
    challenge = 'binary_classification', #option: regression, regression_delta, classification, binary_classification, regression_delta_binary
    dataset = 'CORRONA CERTAIN', 
    process_approach = 'SC', #option: KVB, SC
    imputation = None, #option: SimpleFill, KNN, SoftImpute, BiScaler, NuclearNormMinimization, IterativeImputer, IterativeSVD, None(raw)
    patient_group = ['bionaive TNF'], #option: "all", "bioexp nTNF", "bionaive TNF", "bionaive orencia", "KVB"
    drug_group = 'all', #option: "all", "actemra", "cimzia", "enbrel", "humira", "orencia", "remicade", "rituxan", "simponi"
    time_points = (0,3), 
    train_test_rate = 0.8,
    remove_low_DAS = True,
    save_csv = False, 
    random_state = 2022,
    verbose=False)

# Each getter returns the split together with a location that is handed to
# h2o.import_file below.
train, train_loc = dataset.get_train()
test, test_loc = dataset.get_test()

# Start the H2O cluster (locally)
h2o.init()

# Load the train/test files at train_loc / test_loc into H2O frames.
# train_h2o = h2o.upload_file(str(train_loc))
# test_h2o = h2o.upload_file(str(test_loc))
train_h2o = h2o.import_file(str(train_loc))
test_h2o = h2o.import_file(str(test_loc))

# Identify predictors and response
# NOTE(review): taking every column but the last as predictors assumes the
# target is the last column — TODO confirm.
x = train_h2o.columns[:-1]
# y = "DAS28_CRP_3M"
y = dataset.target

# Mark categorical features as factors in both frames.
for feature in dataset.categorical:
    train_h2o[feature] = train_h2o[feature].asfactor()
    test_h2o[feature] = test_h2o[feature].asfactor()
# For classification challenges the target is converted to a factor as well.
if "classification" in dataset.challenge:
    train_h2o[y] = train_h2o[y].asfactor()
    test_h2o[y] = test_h2o[y].asfactor()

In [None]:
# Binary classification: evaluate the highest-accuracy model of each family
# on the test split. (The previous header comment said "regression".)
evaluation_metrics = 'accuracy'
model_path_dir = '../leaderboard/model_saved/'
# One row per model family (the one with the largest mean accuracy), best first.
best_per_family = df_dev.loc[df_dev.groupby('model_family')[f'{evaluation_metrics}_mean'].idxmax()]
model_id_list = list(
    best_per_family.reset_index(drop=True)
    .sort_values(by=f'{evaluation_metrics}_mean', ascending=False)['model_id'].values)

for i, model_id in enumerate(model_id_list):
    model_path = os.path.join(model_path_dir, model_id)
    uploaded_model = h2o.upload_model(model_path)
    m = h2o.get_model(model_id)

    # Predict once; the previous version ran predict() twice, computed a
    # model_performance() whose result was discarded, and built unused
    # X / baseline / regression_* variables.
    true = np.array(test[dataset.target])
    pred = np.squeeze(np.array(m.predict(test_h2o).as_data_frame()['predict']))

    print("classification accuracy", Classification_Accuracy(true, pred))
    print("F1 score", F1_Score(true, pred))

    # Column-normalised: each prediction column sums to 1.
    contingency_matrix = pd.crosstab(true, pred, rownames=['true'],
                                     colnames=['prediction'], normalize='columns')
    plt.figure(i)
    sns.heatmap(contingency_matrix.T, annot=True,
                fmt='.2f', cmap="YlGnBu", cbar=False)
    plt.show()


# 3-Class Classification

In [None]:
# Load the 3-class classification leaderboard CSV.
df = pd.read_csv('../leaderboard/SC_3_Class_classification_Aug4_final_output.csv')
df

In [None]:
# Build the working frame with a model_family column derived from model_id.
# `assign` returns a new frame, so `df` (already displayed above) is left
# untouched and re-running this cell always gives the same result.
df_dev = df.assign(model_family=df.apply(add_model_family, axis=1))
df_dev

In [None]:
def addlabels(ax,x,y):
    """Write each value of ``y`` in white on ``ax``, at half the bar height.

    The label for bar ``i`` is placed at x position ``i`` and at height
    ``y[i] // 2``. ``x`` is only used for its length.
    """
    label_style = dict(ha = 'center', color='white', fontsize=12)
    for position, _ in enumerate(x):
        value = y[position]
        ax.text(position, (value)//2, value, **label_style)

In [None]:
def error_bar_plot(df, evaluation_metrics):
    """Plot the highest-scoring model of each family (3-class accuracy layout).

    For every ``model_family`` the first row after sorting by
    ``<evaluation_metrics>_mean`` in descending order is kept. The left axes
    shows one bar per family with percentage labels (via ``addlabels``) and
    percentage error bars; the right axes shows a mean/std scatter with one
    text label per family.

    Args:
        df: frame with ``model_family``, ``<evaluation_metrics>_mean`` and
            ``<evaluation_metrics>_std`` columns.
        evaluation_metrics: metric prefix, e.g. 'accuracy'.

    Change from the previous version: rows are read by iteration instead of
    ``y[i]`` / ``y_error[i]``; integer keys on a label-indexed Series are a
    deprecated positional fallback in pandas 2.x.

    NOTE(review): the y-axis label, the labels and the error bars are always in
    percent, so only the 'accuracy' branch produces a consistent left panel.
    """
    mean_col = f'{evaluation_metrics}_mean'
    std_col = f'{evaluation_metrics}_std'

    selected_models = df.sort_values(by=mean_col, ascending=False).groupby('model_family').first()
    selected_models = selected_models.sort_values(by=mean_col, ascending=False)
    y = selected_models[mean_col]
    x = y.index
    y_error = selected_models[std_col]

    y_1 = [round(a, 2) for a in y]
    y_error_perc = [a * 100 for a in y_error]
    y_perc = [round(a * 100, 1) for a in y]
    # The first (best-ranked) bar is highlighted.
    colors = ["#F9665E" if i == 0 else "#799FCB" for i in range(len(x))]

    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(20, 5)

    if evaluation_metrics == 'mse':
        ax1.set_ylim(0, max(y + y_error) + 0.5)
        ax1.bar(x, y_1, color=colors)
    elif evaluation_metrics == 'accuracy':
        # Full 0-100 range for the 3-class task.
        ax1.set_ylim(0, 100)
        ax1.bar(x, y_perc, color=colors)
    else:
        ax1.bar(x, y, color=colors)

    addlabels(ax1, x, y_perc)
    ax1.set_xlabel('Model', fontsize=20)
    ax1.set_ylabel('Accuracy(%)', fontsize=20)

    ax1.errorbar(x, y_perc, yerr=y_error_perc,
                 fmt='o', color='orange', ecolor='orange',
                 elinewidth=3, capsize=10)

    ax2.scatter(y, y_error)
    for family, mean_value, std_value in zip(x, y, y_error):
        # 0.800 is a hand-tuned x position that flips the label side.
        alignment = 'left' if mean_value < 0.800 else 'right'
        ax2.text(x=mean_value, y=std_value, s=family,
                 horizontalalignment=alignment, verticalalignment='bottom',
                 fontdict=dict(color='black', alpha=0.8, size=16))

    # Dashed guide from (lowest mean, largest std) to (highest mean, smallest std).
    ax2.plot([min(y), max(y)], [max(y_error), min(y_error)], ls="--", c=".3")
    ax2.set_xlabel(mean_col, fontsize=20)
    ax2.set_ylabel(std_col, fontsize=20)

    plt.setp(ax1.get_xticklabels(), rotation=30, horizontalalignment='right')

    plt.subplots_adjust(left=0.1,
                        bottom=0.1,
                        right=0.9,
                        top=0.9,
                        wspace=0.3,
                        hspace=0.4)
    

In [None]:
# Plot the highest-accuracy model of each family with its spread.
error_bar_plot(df_dev, 'accuracy')

In [None]:
def error_bar_plot(df, evaluation_metrics):
    """Plot the highest-scoring model of each family (AUC layout).

    For every ``model_family`` the first row after sorting by
    ``<evaluation_metrics>_mean`` in descending order is kept. The left axes
    shows one labelled bar per family with error bars from
    ``<evaluation_metrics>_std``; the right axes shows a mean/std scatter with
    one text label per family.

    Args:
        df: frame with ``model_family``, ``<evaluation_metrics>_mean`` and
            ``<evaluation_metrics>_std`` columns.
        evaluation_metrics: metric prefix, e.g. 'auc'.

    Change from the previous version: rows are read by iteration instead of
    ``y[i]`` / ``y_error[i]``; integer keys on a label-indexed Series are a
    deprecated positional fallback in pandas 2.x.

    NOTE(review): the y-axis label is always 'Area under ROC', whatever the
    metric.
    """
    mean_col = f'{evaluation_metrics}_mean'
    std_col = f'{evaluation_metrics}_std'

    selected_models = df.sort_values(by=mean_col, ascending=False).groupby('model_family').first()
    selected_models = selected_models.sort_values(by=mean_col, ascending=False)
    y = selected_models[mean_col]
    x = y.index
    y_error = selected_models[std_col]

    y_1 = [round(a, 2) for a in y]
    # NOTE(review): the SECOND bar (i == 1) is highlighted here, unlike the
    # accuracy plot, which highlights the first — confirm this is intended.
    colors = ["#F9665E" if i == 1 else "#799FCB" for i in range(len(x))]

    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(20, 5)

    if evaluation_metrics == 'mse':
        ax1.set_ylim(0, max(y + y_error) + 0.5)
        bars = ax1.bar(x, y_1, color=colors)
    elif evaluation_metrics == 'auc':
        ax1.set_ylim(0, 1)
        bars = ax1.bar(x, y_1, color=colors)
    else:
        bars = ax1.bar(x, y, color=colors)

    ax1.bar_label(bars, label_type='center', color='white', fontsize=12)
    ax1.set_xlabel('Model', fontsize=20)
    ax1.set_ylabel('Area under ROC', fontsize=20)

    ax1.errorbar(x, y, yerr=y_error,
                 fmt='o', color='orange', ecolor='orange',
                 elinewidth=3, capsize=10)

    ax2.scatter(y, y_error)
    for family, mean_value, std_value in zip(x, y, y_error):
        # 0.800 is a hand-tuned x position that flips the label side.
        alignment = 'left' if mean_value < 0.800 else 'right'
        ax2.text(x=mean_value, y=std_value, s=family,
                 horizontalalignment=alignment, verticalalignment='bottom',
                 fontdict=dict(color='black', alpha=0.8, size=16))

    # Dashed guide from (lowest mean, largest std) to (highest mean, smallest std).
    ax2.plot([min(y), max(y)], [max(y_error), min(y_error)], ls="--", c=".3")
    ax2.set_xlabel(mean_col, fontsize=20)
    ax2.set_ylabel(std_col, fontsize=20)

    plt.setp(ax1.get_xticklabels(), rotation=30, horizontalalignment='right')

    plt.subplots_adjust(left=0.1,
                        bottom=0.1,
                        right=0.9,
                        top=0.9,
                        wspace=0.3,
                        hspace=0.2)
    