In [None]:
import pandas as pd
import numpy as np
import shap
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor
from tqdm.auto import tqdm
from sklearn.utils import resample
from utils import bayescv, cv_scores, reg_all_scores

In [None]:
# Display options: show up to 1000 columns and print floats with four decimals.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.float_format', '{:.4f}'.format)
sns.set_theme(style='whitegrid')

# Regressor family used by the model cells below.
model_types = ['extratree', 'gradientboost']
cls = model_types[0]

# Value of `fnc_ber_com_composite` separating the two strata (<= thres vs. > thres).
thres = 1.03
# Seed handed to the splits, the search helper and the final estimator.
random_state = 42
# Third positional argument of `bayescv`; presumably the number of search
# iterations -- confirm in utils.
n_iter = 30

# Switches: z-score the P-tau217 feature (and load BF1) / run the search
# instead of using stored parameters.
zscore = False
bayes = True

## Load BF2 data

In [None]:
# APOE is recoded from float codes (e.g. 34.0) to string labels.
apoe_labels = {22.: '22', 23.: '23', 24.: '24', 33.: '33', 34.: '34', 44.: '44'}

load_bf2_df = pd.read_csv('csv/BF2_R.csv')
# Assign the result back rather than calling `replace(..., inplace=True)` on a
# column selection: with pandas Copy-on-Write the in-place form only changes a
# temporary object and leaves the DataFrame untouched.
load_bf2_df['APOE'] = load_bf2_df['APOE'].replace(apoe_labels)
if zscore:
    # The second cohort file is only needed for the z-score workflow.
    load_bf1_df = pd.read_csv('csv/BF1_R_Z.csv')
    load_bf1_df['APOE'] = load_bf1_df['APOE'].replace(apoe_labels)

In [None]:
# Candidate P-tau217 columns; `ptau217_index` selects one of them.
ptau217 = [
    'Plasma WashU %P-tau217',
    'Plasma Lilly P-tau217',
    'CSF Lilly P-tau217',
    'CSF WashU P-tau217',
]

# Columns kept next to the chosen P-tau217 column; the last entry is the target.
common = [
    'CSF Aβ42/Aβ40',
    'Age',
    'APOE',
    'ADAS',
    'Education',
    'Sex',
    'Cognitive status',
    'MMSE',
    'CSF Abnormal Ratio',
    'Diagnosis status',
    'fnc_ber_com_composite',
]

# Columns dropped again before the model matrices are built.
cd_drop = [
    # 'CSF Aβ42/Aβ40',
    'ADAS',
    'Education',
    'Sex',
    'Cognitive status',
    'MMSE',
    'CSF Abnormal Ratio',
    'Diagnosis status',
]

# Short display names, index-aligned with `ptau217` (used in figure titles).
name = ['BF2-P-MS', 'BF2-P-IA', 'BF2-C-IA', 'BF2-C-MS']

In [None]:
# Index into `ptau217` / `name`: which P-tau217 column is modelled.
ptau217_index = 0
features = [ptau217[ptau217_index], *common]
# Keep only rows that are complete for the selected columns.
select_df = load_bf2_df[features].dropna(how='any').reset_index(drop=True)
select_df.info()

## Stratified Data Split

In [None]:
# Split each stratum (target <= thres vs. target > thres) 80/20 on its own and
# recombine, so both sides of the cut-point end up in both partitions.
neg_idx = select_df['fnc_ber_com_composite'] <= thres
pos_idx = ~neg_idx
neg_df, pos_df = select_df[neg_idx], select_df[pos_idx]
neg_tv_df, neg_test_df = train_test_split(neg_df, test_size=0.2, random_state=random_state)
pos_tv_df, pos_test_df = train_test_split(pos_df, test_size=0.2, random_state=random_state)
tv_df = pd.concat([neg_tv_df, pos_tv_df])
test_df = pd.concat([neg_test_df, pos_test_df])

if zscore:
    zs_feature = ptau217[ptau217_index]
    # Reference rows come from the train/validation part only: cognitive status
    # Normal or SCD, and CSF Abnormal Ratio equal to 0.
    status_ok = (tv_df['Cognitive status'] == 'Normal') | (tv_df['Cognitive status'] == 'SCD')
    control_df = tv_df[status_ok & (tv_df['CSF Abnormal Ratio'] == 0)]
    z_mean = control_df[zs_feature].mean()
    z_std = control_df[zs_feature].std()
    # Both partitions are standardised with the same reference mean and std.
    for frame in (tv_df, test_df):
        frame[zs_feature] = (frame[zs_feature] - z_mean) / z_std

    # Keep the matching Lilly column of BF1 and give it the name of the BF2 feature.
    if ptau217_index in [0, 1]:
        bf1_df = load_bf1_df.drop('CSF Lilly P-tau217', axis=1).rename(
            columns={'Plasma Lilly P-tau217': select_df.columns[0]})
    elif ptau217_index in [2, 3]:
        bf1_df = load_bf1_df.drop('Plasma Lilly P-tau217', axis=1).rename(
            columns={'CSF Lilly P-tau217': select_df.columns[0]})
    bf1_df = bf1_df.dropna(how='any').drop(cd_drop, axis=1)
    X_bf1 = bf1_df.drop(columns=['fnc_ber_com_composite'])
    y_bf1 = bf1_df['fnc_ber_com_composite']

# Remove the columns listed in `cd_drop` before building the model matrices.
tv_df = tv_df.drop(cd_drop, axis=1)
test_df = test_df.drop(cd_drop, axis=1)

X_train, y_train = tv_df.drop(columns=['fnc_ber_com_composite']), tv_df['fnc_ber_com_composite']
X_test, y_test = test_df.drop(columns=['fnc_ber_com_composite']), test_df['fnc_ber_com_composite']

In [None]:
# `bf1_df` is only built in the z-score branch of the split cell, hence the guard.
if zscore:
    bf1_df.info()

In [None]:
# Feature columns that reach the model.
X_train.columns.tolist()

In [None]:
# Default-parameter estimator of the selected family; it is handed to `bayescv` below.
_regressor_by_name = {
    'extratree': ExtraTreesRegressor,
    'gradientboost': GradientBoostingRegressor,
}
if cls in _regressor_by_name:
    model = _regressor_by_name[cls]()

## BayesSearchCV or load best parameters

In [None]:
# Stored best parameters, keyed by estimator family and then by `ptau217_index`;
# used when the search is skipped (`bayes = False`).
stored_best_params = {
    'extratree': {
        0: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 150},
        1: {'max_depth': 6, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50},
        2: {'max_depth': 6, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100},
        3: {'max_depth': 12, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 64},
    },
}

if bayes:
    opt = bayescv(X_train, y_train, n_iter, model, random_state=random_state, cls=cls)
    best_param = dict(opt.best_params_)
    print(best_param)
else:
    # Fail here with a clear message instead of leaving `best_param` unbound
    # (which would surface as a NameError in a later cell).
    try:
        best_param = dict(stored_best_params[cls][ptau217_index])
    except KeyError:
        raise ValueError(
            "No stored parameters for cls={!r}, ptau217_index={!r}; "
            "set bayes=True to run the search.".format(cls, ptau217_index)
        ) from None

In [None]:
# Score the chosen parameters with the project helper `cv_scores` on the
# train/validation frame. The literal 5 is presumably the number of CV folds --
# TODO confirm against utils.cv_scores.
cv_scores_df = cv_scores(tv_df, 5, cls, best_param, thres, random_state)

In [None]:
# Summary statistics of whatever `cv_scores` returned.
cv_scores_df.describe()

## Train and Test the Optimal Model

In [None]:
# Rebuild the model matrices from the train/validation and test frames.
X_train, y_train = tv_df.drop(columns=['fnc_ber_com_composite']), tv_df['fnc_ber_com_composite']
X_test, y_test = test_df.drop(columns=['fnc_ber_com_composite']), test_df['fnc_ber_com_composite']

# Fit the selected estimator family with the chosen parameters and a fixed seed.
if cls == 'extratree':
    best_model = ExtraTreesRegressor(random_state=random_state, **best_param)
elif cls == 'gradientboost':
    best_model = GradientBoostingRegressor(random_state=random_state, **best_param)
best_model.fit(X_train, y_train)

## rs=42
# One-row table of the eight scores returned by `reg_all_scores`.
score_names = ['Train_R2', 'Test_R2', 'Test_R2_NEG', 'Test_R2_POS',
               'Train_MAPE', 'Test_MAPE', 'Test_MAPE_NEG', 'Test_MAPE_POS']
result_df = pd.DataFrame(
    reg_all_scores(best_model, X_train, y_train, X_test, y_test, thres),
    index=score_names,
).T
result_df

In [None]:
## bootstrap of test
# Draw bootstrap samples stratified on the side of the cut-point (target <= thres
# vs. > thres), mirroring the stratified train/test split. Passing the continuous
# target itself to `stratify` would make every distinct value its own class, so
# the draw would no longer be a bootstrap of the test set.
n_bootstrap = 100
bootstrap_size = 80

bstp_scores = []
bf1_scores = []
test_strata = y_test <= thres
if zscore:
    bf1_strata = y_bf1 <= thres

for i in tqdm(range(n_bootstrap)):
    # The iteration number doubles as the seed, so every draw is reproducible.
    X_testt, y_testt = resample(X_test, y_test, n_samples=bootstrap_size, replace=True,
                                stratify=test_strata, random_state=i)
    bstp_scores.append([i] + list(reg_all_scores(best_model, X_train, y_train, X_testt, y_testt, thres)))
    if zscore:
        X_bf11, y_bf11 = resample(X_bf1, y_bf1, n_samples=bootstrap_size, replace=True,
                                  stratify=bf1_strata, random_state=i)
        bf1_scores.append([i] + list(reg_all_scores(best_model, X_train, y_train, X_bf11, y_bf11, thres)))

In [None]:
## bootstrap boxchart of BF2
score_columns = ['iteration', 'Train_R2', 'Test_R2', 'Test_R2_NEG', 'Test_R2_POS',
                 'Train_MAPE', 'Test_MAPE', 'Test_MAPE_NEG', 'Test_MAPE_POS']
bstp_scores_df = pd.DataFrame(bstp_scores, columns=score_columns)
bstp_scores_df['data'] = 'BF2'
sns.set_theme(style="whitegrid", palette=sns.color_palette("tab10"), font_scale=0.95)

fig, ax = plt.subplots(1, 2, figsize=(8, 5), gridspec_kw={'width_ratios': [1, 3]})

# Left panel: bootstrap distribution of the test R squared.
a_df = bstp_scores_df[['Test_R2']].set_axis(['Aβ'], axis='columns')
sns.boxplot(a_df, width=0.5, whis=10, linewidth=1.5, ax=ax[0])
ax[0].set_yticks([0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1])
ax[0].set_ylabel('R squared')

# The palette is shifted by one colour before the second panel is drawn.
sns.set_theme(style="white", palette=sns.color_palette("tab10")[1:])

# Right panel: test MAPE overall and on either side of the cut-point.
b_df = bstp_scores_df[['Test_MAPE', 'Test_MAPE_NEG', 'Test_MAPE_POS']].set_axis(
    ['Aβ', 'Aβ-', 'Aβ+'], axis='columns')
sns.boxplot(b_df, width=0.5, whis=30, linewidth=1.5, ax=ax[1])
ax[1].set_yticks([0,0.02,0.04,0.06,0.08,0.10,0.12,0.14,0.16])
ax[1].set_ylabel('MAPE')
# ax[1].yaxis.tick_right()
ax[1].yaxis.set_label_position("left")
for panel in ax:
    panel.tick_params(direction="in", length=0)

plt.suptitle('Model Performance on Evaluation Metrics ({0})'.format(name[ptau217_index]),
             y=0.95, fontsize=13)
plt.ylim()
plt.show()

In [None]:
## bootstrap boxchart of BF2 and BF1
def _scores_to_long(scores_df, metric_labels, cohort):
    """Stack selected score columns into one long table for plotting.

    Rows of `scores_df` that contain any NaN are dropped first. For every
    ``column -> label`` pair in `metric_labels`, the column's values are stacked
    under `values` with `label` in `metrics`; `cohort` fills the `data` column.
    Returns a DataFrame with columns ``values``, ``metrics``, ``data``.
    """
    complete = scores_df.dropna(how='any')
    parts = [pd.DataFrame({'values': complete[column].to_numpy(), 'metrics': label})
             for column, label in metric_labels.items()]
    long_df = pd.concat(parts, axis=0).reset_index(drop=True)
    long_df['data'] = cohort
    return long_df


if zscore:
    score_columns = ['iteration', 'Train_R2', 'Test_R2', 'Test_R2_NEG', 'Test_R2_POS',
                     'Train_MAPE', 'Test_MAPE', 'Test_MAPE_NEG', 'Test_MAPE_POS']
    bf1_scores_df = pd.DataFrame(bf1_scores, columns=score_columns)
    bf1_scores_df['data'] = 'BF1'
    sns.set_theme(style="whitegrid", palette=sns.color_palette("tab10"), font_scale=0.95)
    fig, ax = plt.subplots(1, 2, figsize=(10, 6), gridspec_kw={'width_ratios': [1, 3]})

    # The same reshaping is applied to both cohorts (previously duplicated inline).
    r2_labels = {'Test_R2': 'Aβ'}
    mape_labels = {'Test_MAPE': 'Aβ', 'Test_MAPE_NEG': 'Aβ-', 'Test_MAPE_POS': 'Aβ+'}
    r2bf2_df = _scores_to_long(bstp_scores_df, r2_labels, 'BF2')
    vbf2_df = _scores_to_long(bstp_scores_df, mape_labels, 'BF2')
    r2bf1_df = _scores_to_long(bf1_scores_df, r2_labels, 'BF1')
    vbf1_df = _scores_to_long(bf1_scores_df, mape_labels, 'BF1')
    r2bf_df = pd.concat([r2bf2_df, r2bf1_df]).reset_index(drop=True)
    vbf_df = pd.concat([vbf2_df, vbf1_df]).reset_index(drop=True)

    # One colour per cohort, shared by both panels.
    cohort_palette = [sns.color_palette("tab10", 8)[0], sns.color_palette("tab10", 8)[3]]
    sns.boxplot(data=r2bf_df, x='metrics', y='values', width=0.6, hue='data', whis=30,
                linewidth=1.5, palette=cohort_palette, ax=ax[0])
    sns.boxplot(data=vbf_df, x='metrics', y='values', width=0.6, hue='data', whis=30,
                linewidth=1.5, palette=cohort_palette, ax=ax[1])
    ax[0].set_yticks([0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1])
    ax[1].set_yticks([0,0.02,0.04,0.06,0.08,0.10,0.12,0.14,0.16,0.18])
    ax[0].set_ylabel('R squared')
    ax[1].set_ylabel('MAPE')
    ax[0].set_xlabel('')
    ax[1].set_xlabel('')
    # ax[0].legend(loc='upper right')
    plt.suptitle('Model Performance on Evaluation Metrics ({0} with Z-Score)'.format(name[ptau217_index]), y=0.95)
    plt.show()

In [None]:
# `vbf2_df` is only built when `zscore` is enabled; evaluate to None otherwise so
# a Restart & Run All with zscore=False does not stop on a NameError.
vbf2_df.groupby('metrics').describe() if zscore else None

In [None]:
# `vbf1_df` is only built when `zscore` is enabled; evaluate to None otherwise so
# a Restart & Run All with zscore=False does not stop on a NameError.
vbf1_df.groupby('metrics').describe() if zscore else None

In [None]:
## Observation vs. Prediction
sns.set_theme(style='whitegrid', palette=sns.color_palette('deep'))
train_color = sns.color_palette('deep')[0]
test_color = sns.color_palette('deep')[3]

# Predict once per partition and reuse the arrays in every plot call below.
y_pred = best_model.predict(X_test)
y_pred_train = best_model.predict(X_train)
abs_err_train = np.abs(y_pred_train - y_train)
abs_err_test = np.abs(y_pred - y_test)

fig, ax = plt.subplots(1,2,figsize=(14,5))
sns.scatterplot(x=y_train, y=y_pred_train, ax=ax[0], color=train_color)
sns.scatterplot(x=y_test, y=y_pred, ax=ax[0], marker='s', color=test_color)
sns.regplot(x=y_train, y=y_pred_train, ax=ax[0], line_kws={'color':'darkblue'}, scatter=False)
sns.regplot(x=y_test, y=y_pred, ax=ax[0], line_kws={'color':'darkred'}, scatter=False)
# Identity line spanning the range of the training observations and predictions.
l = (min(y_train.min(), y_pred_train.min()), max(y_train.max(), y_pred_train.max()))
ax[0].plot(l, l, '--', c='black', linewidth=0.8)
# The cut-point marker follows the configured `thres` instead of a repeated literal.
ax[0].axvline(thres, color='black', linewidth=1.5, linestyle='--')
ax[0].text(thres + 0.02, 2.2, 'cutpoint={}'.format(thres), rotation=0)
ax[0].set_title("Observation vs. Prediction")
ax[0].set_xlabel('Observation [SUVR]')
ax[0].set_ylabel('Prediction [SUVR]')
# `result_df` has a single row; read it by position.
ax[0].annotate("Train R2={:.3f}".format(result_df['Train_R2'].iloc[0]), (1.8, 0.9))
ax[0].annotate("Test R2={:.3f}".format(result_df['Test_R2'].iloc[0]), (1.8, 0.8))
ax[0].legend(['Train', 'Test'], loc='upper right')

## Observation vs. Residual
sns.scatterplot(x=y_train, y=abs_err_train, ax=ax[1], color=train_color)
sns.scatterplot(x=y_test, y=abs_err_test, ax=ax[1], marker='s', color=test_color)
sns.regplot(x=y_train, y=abs_err_train, order=3, ax=ax[1], line_kws={'color':'darkblue'}, scatter=False)
sns.regplot(x=y_test, y=abs_err_test, order=3, ax=ax[1], line_kws={'color':'darkred'}, scatter=False)
ax[1].axvline(thres, color='black', linewidth=1.5, linestyle='--')
ax[1].text(thres + 0.02, 0.85, 'cutpoint={}'.format(thres), rotation=0)
ax[1].set_xlabel('Observation [SUVR]')
ax[1].set_ylabel('Residual [SUVR]')
ax[1].set_title('Observation vs. Residual')
ax[1].legend(['Train', 'Test'], loc='upper right')
plt.show()

In [None]:
fig, ax = plt.subplots(1,2,figsize=(14,5))
sns.set_theme(style='whitegrid', palette=[sns.color_palette('deep')[3],sns.color_palette('deep')[0]])
# Predictions and observations are overlaid per partition: training left, test right.
sns.histplot([best_model.predict(X_train), y_train], bins=100, alpha=0.8, ax=ax[0])
sns.histplot([y_pred, y_test], bins=100, alpha=0.8, ax=ax[1])
# NOTE(review): legend labels are matched to artists by draw order -- confirm that
# 'Obs.' / 'Pred.' land on the intended series for the seaborn version in use.
ax[0].legend(['Obs.','Pred.'])
ax[1].legend(['Obs.','Pred.'])
ax[0].set_title('Histogram of Amyloid SUVR on Training Set')
ax[1].set_title('Histogram of Amyloid SUVR on Test Set')
# The cut-point marker follows the configured `thres`; the second value of each
# pair is the hand-picked height of the text label in that panel.
for panel, label_height in zip(ax, (140, 36)):
    panel.axvline(thres, color='black', linewidth=1.5, linestyle='--')
    panel.text(thres + 0.02, label_height, 'cutpoint={}'.format(thres), rotation=0)
    panel.set_xlabel('SUVR')
plt.show()

In [None]:
## Feature importance from estimator
sns.set_theme(style='whitegrid')
# One row per model input, sorted so the largest importance ends up at the top of the bar chart.
fea_imp = pd.DataFrame({'AVG_Importance': best_model.feature_importances_},
                       index=list(X_train.columns))
fea_imp = fea_imp.sort_values(by='AVG_Importance', ascending=True)

# NOTE(review): these keys look like raw variable names; with the display-style
# column names used in this notebook the rename matches nothing -- confirm whether
# it is still needed.
row_names = {'PL_pT217T217percentmean_WashU_2023':'Plasma %p-tau217',
             'CSF_Ab42_Ab40_ratio_imputed_Elecsys_2020_2022':'CSF AB42/AB40',
             'age':'Age',
             'apoe_genotype_baseline_variable':'APOE'}
fea_imp = fea_imp.rename(index=row_names)

# Draw on a single explicit figure (a separate plt.figure() call would leave an
# empty figure behind) and name the estimator that was actually fitted.
model_titles = {'extratree': 'Extra Trees Regressor',
                'gradientboost': 'Gradient Boosting Regressor'}
fig, ax = plt.subplots(figsize=(10, 6))
fea_imp.plot(kind='barh', color=['r'], ax=ax)
# bar_datalabel(ax)
ax.set_xlabel('Weight')
ax.set_xlim(0, fea_imp['AVG_Importance'].max() * 1.1)  # expand xlim to make labels easier to read
ax.set_title('Feature Importance Derived from the {}'.format(model_titles.get(cls, cls)), fontsize=13)
plt.show()

## SHAP (Not for Z-score)

In [None]:
# All complete cases (not only the training rows), ordered by the target.
shap_df = (
    select_df
    .drop(columns=cd_drop)
    .sort_values(by='fnc_ber_com_composite')
    .reset_index(drop=True)
)
# shap_df = tv_df.sort_values(by = 'fnc_ber_com_composite').reset_index(drop=True)
X_shap = shap_df.drop(columns=['fnc_ber_com_composite'])
y_shap = shap_df['fnc_ber_com_composite']
# Predictions of the fitted model for the same rows.
y_pred_shap = best_model.predict(X_shap)

In [None]:
# Load SHAP's JavaScript support for notebook rendering.
shap.initjs()
# Explain the fitted tree ensemble on every row of X_shap.
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_shap)
# The title is set through pyplot before SHAP draws its summary plot.
plt.title('SHAP values')
shap.summary_plot(shap_values, X_shap, max_display=10, show=True, cmap='plasma')
# savefig_name = crop[8:-7] + 'RF_SHAP_impact.png'
# plt.savefig(savefig_name,format='png')

In [None]:
# shap_exp = explainer(X_shap)
# output_df = pd.DataFrame()
# for idx in range(len(X_shap)):
#     details = pd.DataFrame({
#         'row_id':idx,
#         'feature': X_shap.columns,
#         'feature_value': X_shap.iloc[idx,:].values,
#         'base_value': shap_exp[idx].base_values,
#         'shap_values': shap_exp[idx].values,
#         'prediction': y_pred_shap[idx],
#         'observation': y_shap[idx],
#     })
#     output_df = pd.concat([output_df, details])

# impact = []
# for i in range(len(shap_df)):
#     v = np.abs(output_df[output_df['row_id'] == i]['shap_values'])
#     imp = list(v/np.sum(v))
#     impact = impact + imp
    
# output_df['shap_impacts'] = impact

# shap_impacts = []
# shap_values_plot = []
# for chosen_feature in range(len(X_shap.columns)):
#     shap_impacts.append(output_df[output_df['feature']==X_shap.columns[chosen_feature]].reset_index(drop=True)['shap_impacts'])
#     shap_values_plot.append(output_df[output_df['feature']==X_shap.columns[chosen_feature]].reset_index(drop=True)['shap_values'])

In [None]:
# ## Feature contribution
# sns.set_theme(style='white', palette=sns.color_palette('deep'))
# fig, ax = plt.subplots(1,1,figsize=(10,7))
# i = 0
# for impacts in shap_impacts:
#     sns.scatterplot(x=y_shap, y=impacts)
#     sns.regplot(x=y_shap, y=impacts, order=2, scatter=False, ci=95, ax=ax, label=X_shap.columns[i])
#     i += 1
# plt.title('Feature Contribution at Different Amyloid PET SUVR')
# ax.legend(loc='upper right')
# ax.set_xlabel('Amyloid PET SUVR')
# ax.set_ylabel('Contribution')

# plt.show()

## Probability Plot

In [None]:
# Observed target ('obs') next to the model prediction ('pre'), row-aligned with shap_df.
obs_pred_100_df = pd.DataFrame({'obs': y_shap.values, 'pre': y_pred_shap})
obs_pred_100_df

In [None]:
# Quick look at prediction against observation for all rows.
sns.scatterplot(data=obs_pred_100_df, x='obs', y='pre')
plt.show()

In [None]:
# Upper edges of the observation groups. Group k holds rows with
# previous_edge < obs <= edge; the first group has no lower edge. The last edge
# caps the top group at 3.0, so any observation above it is left out -- the
# coverage printed below shows whether that happens.
upper_edges = [thres, 1.1, 1.25, 1.45, 1.7, 3.0]

obs = obs_pred_100_df['obs']
x = []   # predictions of the rows in each group
py = []  # number of rows in each group
lower = None
for upper in upper_edges:
    in_group = obs <= upper
    if lower is not None:
        in_group = in_group & (obs > lower)
    x.append(obs_pred_100_df.loc[in_group, 'pre'])
    py.append(int(in_group.sum()))
    lower = upper
nr_groups = len(upper_edges)

# Share of rows that fall into some group, relative to the actual number of rows
# rather than a hard-coded sample count.
print(np.sum(py)/len(obs_pred_100_df), nr_groups)
# Turn the counts into group proportions.
py = py/np.sum(py)
print(py)

In [None]:
# One KDE of the predictions per observation group.
fig, ax = plt.subplots(1,1,figsize=(10,6))
for i in range(nr_groups):
    sns.kdeplot(x[i],ax=ax)
# Exactly one label per group curve (the list used to carry a stale seventh entry).
plt.legend(['obs<=1.03','1.03<obs<=1.1','1.1<obs<=1.25','1.25<obs<=1.45','1.45<obs<=1.7',
            '1.7<obs'], loc='upper right')
plt.title('P(prediction|observation)')
plt.xlabel('prediction')
plt.show()

In [None]:
# Probability-scaled KDE curve of the predictions per observation group. A later
# cell reads these curves back through `ax.get_lines()`, so the axes must hold
# one line per group, in group order.
fig, ax = plt.subplots(1,1,figsize=(10,6))
for i in range(nr_groups):
    sns.histplot(x[i],ax=ax, stat='probability', kde=True, kde_kws={'cut': 3})
    # Drop the bars that were just drawn; only the KDE line is kept.
    ax.containers[0].remove()
# Labels follow the group intervals actually used when the groups were built.
plt.legend(['obs<=1.03','1.03<obs<=1.1','1.1<obs<=1.25','1.25<obs<=1.45','1.45<obs<=1.7',
            '1.7<obs'], loc='upper right')
plt.title('P(prediction|observation)')
plt.xlabel('prediction')
plt.show()

In [None]:
# Empirical cumulative distribution of the observed values.
fig, bx = plt.subplots(1, 1, figsize=(10, 6))
sns.ecdfplot(data=obs_pred_100_df, x='obs', ax=bx)
bx.set_title('P(observation belongs to group x)')
plt.show()

In [None]:
# Re-plot each group's curve from the previous figure (`ax`), scaled by the
# group's proportion `py[i]`.
fig, cx= plt.subplots(1,1,figsize=(10,6))
for i in range(nr_groups):
    fit = ax.get_lines()[i].get_data() # Getting the data from the plotted line
    xfit, yfit = fit[0], fit[1]*py[i]
    cx.plot(xfit, yfit)
# Labels follow the group intervals actually used when the groups were built.
plt.legend(['obs<=1.03','1.03<obs<=1.1','1.1<obs<=1.25','1.25<obs<=1.45','1.45<obs<=1.7',
            '1.7<obs'], loc='upper right')
plt.title('P(observation | prediction)')
plt.ylim([0,0.062])
plt.xlabel('prediction')
plt.show()

## Plot a histogram

In [None]:
# Grid of predicted values at which the group curves are sampled:
# 91 points from 0.90 to 1.80, rounded to two decimals. (An earlier six-value
# list was assigned here and overwritten immediately; it has been removed.)
nr_cutpoints = 91
cut_points = list(np.around(np.linspace(0.9,1.8,nr_cutpoints),2))
l = len(cut_points)
# One row per observation group, one column per cut point.
pyx = np.zeros([nr_groups,l])

In [None]:
# For every group curve on `cx`, take the curve value at the plotted x position
# closest to each cut point.
grid = np.asarray(cut_points)
for i in range(nr_groups):
    xfit, yfit = cx.get_lines()[i].get_data()  # data of the i-th plotted line
    nearest = np.abs(xfit[:, None] - grid[None, :]).argmin(axis=0)
    pyx[i] = yfit[nearest]

In [None]:
# Normalise over groups at each cut point, then put cut points on the rows.
px = pyx.sum(axis=0)
prob = pyx / px
prob_df = pd.DataFrame(prob, columns=cut_points).T

In [None]:
# Position of the most probable group for every cut point (row of prob_df).
maxindex = np.asarray([prob_df.iloc[row, :].argmax() for row in range(prob_df.shape[0])])

In [None]:
fig, ax = plt.subplots(1,1,figsize=(40,15))
ax = prob_df.plot(kind='bar', stacked=True, ax=ax, rot=0, width=1, alpha=0.85)

# Highlight colour per group. Patches are visited in blocks of `nr_cutpoints`
# (block number = group, offset = cut point); a bar is recoloured only where its
# group is the most probable one at that cut point.
highlight_colors = [
    (0.043, 0.471, 0.941),
    (0.969, 0.565, 0.027),
    'green',
    (0.929, 0.078, 0.078),
    (0.424, 0.059, 0.89),
    (0.169, 0.125, 0.125),
]
for patch_no, bar in enumerate(ax.patches):
    group, cut = divmod(patch_no, nr_cutpoints)
    if group < len(highlight_colors) and maxindex[cut] == group:
        bar.set_color(highlight_colors[group])

plt.legend(['obs<=1.03','1.03<obs<=1.1','1.1<obs<=1.25','1.25<obs<=1.45','1.45<obs<=1.7',
            '1.7<obs',], loc='upper right')
plt.xlabel('Predicted Amyloid SUVR')
plt.ylabel('Probability')
plt.title('The probabilities of the predicted value located in different observation groups')
plt.show()

In [None]:
sns.set_theme(style="whitegrid", palette=sns.color_palette("deep"))
fig = plt.figure(figsize=(10,6))
# One x position per cut point (row of prob_df).
X_axis = np.arange(len(prob_df))
bar_width = 0.1
# Iterate over the group columns of prob_df. Looping over the number of cut
# points instead raises a KeyError as soon as the counter passes the last group.
for i in prob_df.columns:
    plt.bar(X_axis - 0.4 + bar_width*i, prob_df[i], bar_width)
    for index, value in enumerate(prob_df[i]):
        # Only label bars that are tall enough to be visible.
        if value>0.01:
            plt.text(index- 0.45 + 0.1*i, value+0.01,
                    str(float("{:.2f}".format(value))))
plt.legend(['obs<=1.03','1.03<obs<=1.1','1.1<obs<=1.25','1.25<obs<=1.45','1.45<obs<=1.7',
            '1.7<obs',], loc='upper right')
plt.xticks(X_axis, prob_df.index)
plt.xlabel('Predicted Amyloid SUVR')
plt.ylabel('Probability')
plt.title('Probabilities of Predicted Values Located in Different Observation Groups')
plt.show()