In [None]:
# !pip install numpy==1.16.1
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import f1_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import jaccard_score


In [None]:
# `n` is interpolated into the run name below.
n = 5
# Name of the model run; combined with `identifier` to locate the results folder.
save_name = f'clinical_history_{n}_observations'
# Suffix of the results folder (looks like a run timestamp — TODO confirm).
identifier = '20231026_084723_925028'



In [None]:
# Every artefact of one run lives in the same results folder; build that folder
# path once so the four file paths below cannot drift apart.
results_dir = f'/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_history/temporal_model/results/{save_name}_{identifier}'
path_predictions = f'{results_dir}/predictions.npz'
path_auc = f'{results_dir}/auroc_score.npz'
path_truths = f'{results_dir}/truths.npz'
path_losses = f'{results_dir}/losses.npz'

### plot losses

In [None]:
## Consists of 5 x 2 lists. Each list has 50 values; one list for training and one for validation.
# NOTE(review): allow_pickle=True unpickles objects on load — only open files you trust.
losses = np.load(path_losses, allow_pickle=True)
keys = losses.keys()
# Print the name and shape of every array stored in the archive.
for key in keys:
    print(f"Key: {key}")
    
    print(losses[key].shape)
    # print(losses[key])


In [None]:
df = pd.DataFrame(losses['losses'],columns=['Loss','valloss'])
# pd.concat([df['Loss'].apply(pd.Series), df['valloss']], axis = 1)
loss_df = df['Loss'].apply(pd.Series).T
loss_df = loss_df.reset_index()
loss_df.columns = ['Epoch','fold1','fold2','fold3','fold4','fold5']
loss_melted = pd.melt(loss_df, id_vars=['Epoch'],
        value_vars=['fold1','fold2','fold3','fold4','fold5'],
       var_name='Fold', value_name='Training Loss')

val_loss_df = df['valloss'].apply(pd.Series).T
val_loss_df = val_loss_df.reset_index()
val_loss_df.columns = ['Epoch','fold1','fold2','fold3','fold4','fold5']
# display(loss_df)
val_loss_melted = pd.melt(val_loss_df, id_vars=['Epoch'],
        value_vars=['fold1','fold2','fold3','fold4','fold5'],
       var_name='Fold', value_name='Validation Loss')
# display(loss_melted)  

%matplotlib inline
%config InlineBackend.figure_format = 'svg'
sns.set(style="ticks", font_scale=2.5)

fig, ax = plt.subplots(figsize=(6, 8))
sns.lineplot(data=loss_melted, x="Epoch", y="Training Loss", palette = 'blue', label="Training Loss")
sns.lineplot(data=val_loss_melted, x="Epoch", y="Validation Loss", palette = 'steelblue', label="Validation Loss")
# sns.barplot(x=full_report_melted.index, y="value", hue="metric", data=full_report_melted, ax=ax,palette = 'Blues')
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0)
ax.set_xlabel("Epoch",labelpad=10)
ax.set_ylabel("Loss",labelpad=10)

plt.suptitle(None)
# ax.set_ylim(0,1)
ax.spines["right"].set_color("none")
ax.spines["top"].set_color("none")
sns.despine(offset=10, trim=False)
plt.legend(loc='upper right')
output_path_png = f'/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_history/temporal_model/results/{save_name}_{identifier}/loss_{identifier}.png'
output_path_pdf = f'/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_history/temporal_model/results/{save_name}_{identifier}/loss_{identifier}.pdf'
fig.savefig(output_path_png,bbox_inches="tight",dpi=600) 
fig.savefig(output_path_pdf,bbox_inches="tight",dpi=600) 
plt.show()
plt.close()

#### AUROC
TODO: compute the AUROC score per diagnosis, if that is possible.

In [None]:
# NOTE(review): allow_pickle=True unpickles objects on load — only open files you trust.
auroc = np.load(path_auc, allow_pickle=True)
keys = auroc.keys()
# Print name, shape and content of every array stored in the archive.
for key in keys:
    print(f"Key: {key}")
    print(auroc[key].shape)
    print(auroc[key])

# Collect entry 2 of every fold, however many folds the file holds
# (presumably the test-split score, after train and validation — TODO confirm).
averages = [fold_scores[2] for fold_scores in auroc['auc_score_list_all']]
print(averages)
average_auroc = np.average(averages)
print(average_auroc)

#### PREDICTIONS
The predictions file holds one array per fold. Each fold contains the results for the train, validation, and test splits.


In [None]:
## For every fold the archive holds a list of predictions for train, validation and test.
## The predictions are an array with one row per donor;
## each row has one column per diagnosis listed in `wanted`.
## Combining the test predictions of all folds yields one prediction per donor.
donorindex = pd.read_excel("/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_history/temporal_model/data/clinical_history_5_observations/donorindexes.xlsx" )
wanted = ['CON', 'AD', 'PD', 'VD', 'FTD','DLB','AD,DLB','ATAXIA', 'MND', 'PSP', 'MS','MSA']

######## ACCESSING THE PREDICTIONS AND ADDING DONORID
pred_dict = []
# NOTE(review): allow_pickle=True unpickles objects on load — only open files you trust.
preds = np.load(path_predictions, allow_pickle=True)
keys = preds.keys()
for key in keys:
    # Read the archive entry once instead of re-loading it on every access.
    fold_predictions = preds[key]
    print(fold_predictions.shape)
    for i in range(len(fold_predictions)):
        # Entry 2 is used as the test split (train, validation, test order — TODO confirm).
        preds_df = pd.DataFrame(fold_predictions[i][2],columns = wanted)
        preds_df['fold'] = i

        # Assumes the rows are in the same order as the donors of fold `i`
        # in donorindex — TODO confirm against the training script.
        folddonors = list(donorindex[donorindex['foldinfo']==i]['DonorID'])
        preds_df['DonorID'] = folddonors
        pred_dict.append(preds_df)

preds_df_all = pd.concat(pred_dict, ignore_index=True)
# Predicted label = diagnosis column with the highest score; columns are selected
# by name so this keeps working if the number of diagnoses changes.
preds_df_all['pred'] = preds_df_all[wanted].idxmax(axis=1)
preds_df_all = preds_df_all[['DonorID','pred']]

######## ACCESSING THE TRUTHS AND ADDING DONORID
truth_dict = []
truths = np.load(path_truths, allow_pickle=True)
keys = truths.keys()
for key in keys:
    fold_truths = truths[key]
    for j in range(len(fold_truths)):
        truth_df = pd.DataFrame(fold_truths[j][2],columns = wanted)
        truth_df['fold'] = j

        folddonors = list(donorindex[donorindex['foldinfo']==j]['DonorID'])
        testdiag = list(donorindex[donorindex['foldinfo']==j]['simplified_diagnosis'])
        truth_df['DonorID'] = folddonors
        truth_df['testdiag'] = testdiag
        truth_dict.append(truth_df)

truths_df_all = pd.concat(truth_dict, ignore_index=True)
# True label = diagnosis column with the highest value in the truth array.
truths_df_all['neuropathological_diagnosis'] = truths_df_all[wanted].idxmax(axis=1)
truths_df_all = truths_df_all[['DonorID','neuropathological_diagnosis']]

# One row per donor with the predicted and the true label; the combined
# diagnosis is written 'AD-DLB' from here on.
prediction_evaluation = pd.merge(preds_df_all, truths_df_all, on='DonorID', how='inner')
prediction_evaluation['neuropathological_diagnosis'] = prediction_evaluation['neuropathological_diagnosis'].replace('AD,DLB', 'AD-DLB')
prediction_evaluation['pred'] = prediction_evaluation['pred'].replace('AD,DLB', 'AD-DLB')
display(prediction_evaluation)


### confusion matrix

In [None]:
cf = prediction_evaluation.copy()

# Fixed display order of the diagnoses on both axes.
wantedx = ['CON','AD', 'PD', 'VD', 'FTD', 'DLB', 'AD-DLB', 'ATAXIA', 'MND', 'PSP', 'MS', 'MSA']

# Number of donors per true diagnosis, in display order (NaN where a diagnosis does not occur).
total_true_diagnoses = (
    cf['neuropathological_diagnosis']
    .value_counts()
    .rename('counts')
    .rename_axis('neuropathological_diagnosis')
    .to_frame()
    .reindex(wantedx)
)
print(total_true_diagnoses)

# Raw counts: rows = neuropathological diagnosis, columns = GRU-D prediction.
confusion_matrix = pd.crosstab(cf['neuropathological_diagnosis'], cf['pred'])
display(confusion_matrix)

# Give the count matrix the full, ordered set of rows AND columns so that it has
# exactly the same shape as the percentage matrix; diagnoses that never occur
# become all-zero rows/columns instead of being missing on one matrix only.
confusion_matrix = confusion_matrix.reindex(index=wantedx, columns=wantedx, fill_value=0)

# Percentage of each true diagnosis (row-normalised). Rows without any donor
# would be 0/NaN, so they are set to 0 to keep round() below from failing.
confusion_matrix_percentage = confusion_matrix.div(total_true_diagnoses['counts'], axis=0).fillna(0) * 100
display(confusion_matrix_percentage)

plt.figure(figsize=(12, 10))
plt.imshow(confusion_matrix_percentage, cmap=plt.cm.Blues, aspect='auto',interpolation='nearest',vmin=0, vmax=100)
plt.colorbar()
plt.xlabel('GRU-D prediction')
plt.ylabel('Neuropathological Diagnosis')
plt.title('Observations, colored by %clindiag ')
tick_labels_x = confusion_matrix_percentage.columns
tick_labels_y = confusion_matrix_percentage.index
plt.xticks(range(len(tick_labels_x)), tick_labels_x, rotation=90)
plt.yticks(range(len(tick_labels_y)), tick_labels_y)
# Write the absolute count into every non-empty cell; use white text on dark cells.
for i in range(len(tick_labels_y)):
    for j in range(len(tick_labels_x)):
        value = round(confusion_matrix.iloc[i, j])
        value_p = round(confusion_matrix_percentage.iloc[i, j])
        if value > 0:
            text_color = 'white' if value_p >= 50 else 'black'
            plt.text(j, i, f'{value}', ha='center', va='center', color=text_color, fontsize=14)
plt.savefig('/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_analysis/figures/grud_confusion_matric.pdf', format='pdf', dpi=600)
plt.show()

### compare with clinical diagnosis

In [None]:
# Load the two clinical-diagnosis spreadsheets that later cells merge with the GRU-D predictions.
clinical = pd.read_excel("/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_analysis/data/grud_clin_subset_overview.xlsx" )
clinical_full = pd.read_excel('/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_analysis/data/grud_clin_subset_with_clinical_diagnoses.xlsx' )
# display(clinical['neuropathological_diagnosis'].value_counts())
# Number of donors per neuropathological diagnosis in the prediction table.
display(prediction_evaluation['neuropathological_diagnosis'].value_counts())
clinical_full.head()

## clinical confusion matrix

In [None]:
# Show the clinical overview table loaded in the previous cell.
clinical

In [None]:
cf = clinical.copy()
# A donor can carry several comma-separated clinical diagnoses; give each its own row.
cf['parsed_clinical_diagnosis'] = cf['parsed_clinical_diagnosis'].str.split(',')
cf = cf.explode('parsed_clinical_diagnosis')

# Fixed display order: clinical diagnoses on the x-axis, neuropathological on the y-axis.
wantedy = ['dementia','other','AD', 'PD', 'VD', 'FTD', 'DLB', 'AD-DLB', 'ATAXIA', 'MND', 'PSP', 'MS', 'MSA']
wantedx = ['CON','AD', 'PD', 'VD', 'FTD', 'DLB', 'AD-DLB', 'ATAXIA', 'MND', 'PSP', 'MS', 'MSA']

# Raw counts: rows = neuropathological diagnosis, columns = clinical diagnosis.
confusion_matrix = pd.crosstab(cf['neuropathological_diagnosis'], cf['parsed_clinical_diagnosis'])

# Order the columns as in `wantedy`; clinical labels not listed there are kept at
# the end under their own name instead of collapsing into an unlabeled NaN column.
known_columns = [c for c in wantedy if c in confusion_matrix.columns]
other_columns = [c for c in confusion_matrix.columns if c not in wantedy]
# Use the full, ordered set of rows so counts and percentages always have the same shape.
confusion_matrix = confusion_matrix.reindex(index=wantedx, columns=known_columns + other_columns, fill_value=0)

### percentages of true NP
# Row totals of the exploded table (a donor with several clinical diagnoses counts once per diagnosis).
total_true_diagnoses = (
    cf['neuropathological_diagnosis']
    .value_counts()
    .rename('counts')
    .rename_axis('neuropathological_diagnosis')
    .to_frame()
    .reindex(wantedx)
)
print(total_true_diagnoses)
# Rows without any donor would be 0/NaN; set them to 0 so round() below cannot fail.
confusion_matrix_percentage = confusion_matrix.div(total_true_diagnoses['counts'], axis=0).fillna(0) * 100
display(confusion_matrix_percentage)
display(confusion_matrix)

plt.figure(figsize=(12, 10))
plt.imshow(confusion_matrix_percentage, cmap=plt.cm.Oranges, aspect='auto',interpolation='nearest',vmin=0, vmax=100)
plt.colorbar()
plt.xlabel('Clinical Diagnosis')
plt.ylabel('Neuropathological Diagnosis')
plt.title('observations')
tick_labels_x = confusion_matrix.columns
tick_labels_y = confusion_matrix.index
plt.xticks(range(len(tick_labels_x)), tick_labels_x, rotation=90)
plt.yticks(range(len(tick_labels_y)), tick_labels_y)
# Write the absolute count into every non-empty cell; use white text on dark cells.
for i in range(len(tick_labels_y)):
    for j in range(len(tick_labels_x)):
        value = round(confusion_matrix.iloc[i, j])
        valuep = round(confusion_matrix_percentage.iloc[i, j])
        if value > 0:
            text_color = 'white' if valuep >= 50 else 'black'
            plt.text(j, i, f'{value}', ha='center', va='center', color=text_color, fontsize=14)

plt.savefig('/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_analysis/figures/clinical_confusion_matric.pdf', format='pdf', dpi=600)
plt.show()



In [None]:
# Keep only donors present in both the clinical table and the GRU-D predictions;
# the inner merge also requires the neuropathological diagnosis to agree between the two tables.
both = pd.merge(clinical, prediction_evaluation, on=['DonorID','neuropathological_diagnosis'], how='inner')
both_full = pd.merge(clinical_full, prediction_evaluation, on=['DonorID','neuropathological_diagnosis'], how='inner')
# display(both['neuropathological_diagnosis'].value_counts())
display(both)

In [None]:

# 

### venn cd vs nd

In [None]:
# Show the merged clinical + prediction table.
display(both)

In [None]:
# Working copy of `both` with a clinical-coherence flag. Every donor starts as
# 'unknown/coherent'; the Venn cells below overwrite it with 'non_coherent' where applicable.
new_both = both.copy()
new_both['clin_coherence'] = 'unknown/coherent'

In [None]:
from matplotlib_venn import venn3
from matplotlib_venn import venn2
# Diagnoses for which a clinical label of plain 'dementia' is accepted as a lenient match.
venndiagnoses = ['AD','FTD','VD','DLB']
# venndiagnoses = ['AD']
for i in venndiagnoses:
    display(i)
    ## STRICT: the clinical diagnosis string must equal the diagnosis exactly
    venndf = both[(both['neuropathological_diagnosis'] == i) | (both['parsed_clinical_diagnosis'] == i)]
    # display(venndf)
    # Neuropathological diagnosis is `i`, clinical diagnosis is something else.
    nponly_s = venndf[(venndf['neuropathological_diagnosis'] == i) & \
                        (venndf['parsed_clinical_diagnosis'] != i)]
    # display(nponly_s)
    # display(len(nponly_s))
    # Clinical diagnosis is `i`, neuropathological diagnosis is something else.
    clinonly_s = venndf[(venndf['neuropathological_diagnosis'] != i) & \
                          ((venndf['parsed_clinical_diagnosis'] == i))]
    # display(clinonly_s)
    # Both diagnoses are `i`.
    clinndset_s = venndf[(venndf['neuropathological_diagnosis'] == i) &\
                         ((venndf['parsed_clinical_diagnosis'] == i))]
    # display(clinndset_s)
    # Jaccard index = intersection / union of the two sets.
    print(f" jaccard strict: {len(clinndset_s)/(len(nponly_s)+len(clinonly_s)+len(clinndset_s))}")

    ### LENIENT: the clinical string contains `i`, or is exactly 'dementia'
    # NOTE(review): str.contains matches substrings, so 'AD' and 'DLB' also match
    # 'AD-DLB' — confirm this is intended.
    venndf = both[(both['neuropathological_diagnosis'] == i) | (both['parsed_clinical_diagnosis'].str.contains(i)) | \
                   (both['parsed_clinical_diagnosis'] =='dementia') ]
    print(f"All donors lenient: {len(venndf)}")
    nponly = venndf[(venndf['neuropathological_diagnosis'] == i) & \
                        ((~venndf['parsed_clinical_diagnosis'].str.contains(i)) & (venndf['parsed_clinical_diagnosis']!='dementia'))]
    # display(nponly)
    clinonly = venndf[(venndf['neuropathological_diagnosis'] != i) & \
                          ((venndf['parsed_clinical_diagnosis'].str.contains(i)) | (venndf['parsed_clinical_diagnosis']=='dementia'))]
    # display(clinonly)
    clinndset = venndf[(venndf['neuropathological_diagnosis'] == i) &\
                         ((venndf['parsed_clinical_diagnosis'].str.contains(i)) | (venndf['parsed_clinical_diagnosis']=='dementia'))]
    # display(clinndset)
    # Donors whose neuropathological diagnosis was missed even under the lenient rule.
    new_both.loc[new_both['DonorID'].isin(nponly['DonorID']), 'clin_coherence'] = 'non_coherent'
    # new_both.loc[new_both['DonorID'].isin(clinndset_s['DonorID']), 'clin_coherence'] = 'perfect'
    # display(clinonly['parsed_clinical_diagnosis'].value_counts())
    # x = clinonly.merge(clinonly_s[['DonorID']], on='DonorID', how='left', indicator=True)
    # display(x[x['_merge'] == 'left_only'].drop(columns=['_merge']))
    print(f"ND only strict {len(nponly_s)}")
    print(f"ND only lenient {len(nponly)}")
    print(f"clin only strict {len(clinonly_s)}")
    print(f"clin only lenient {len(clinonly)-len(clinonly_s)}")
    print(f"clin only total {len(clinonly)}")
    print(f"clin + ND strict {len(clinndset_s)}")
    print(f"clin + ND lenient {len(clinndset)-len(clinndset_s)}")
    print(f"clin + ND total {len(clinndset)}")
    print(len(nponly)+len(clinonly)+len(clinndset))
    print(len(nponly_s)+len(clinonly_s)+len(clinndset_s))
    print(f" jaccard lenient: {len(clinndset)/(len(nponly)+len(clinonly)+len(clinndset))}")
    
    # Share of the overlap within all ND donors, and within all CD donors.
    print(f" % ND {round(100*len(clinndset)/(len(clinndset)+len(nponly)))} ")
    print(f" % CD {round(100*len(clinndset)/(len(clinndset)+len(clinonly)))} ")

    # Two-set Venn diagram of the lenient counts: (ND only, CD only, overlap).
    venn2(subsets = (len(nponly), len(clinonly), len(clinndset)), set_labels = ('ND', 'CD'))
    plt.show()
    print("\n")

display(new_both[40:80])   
# display(new_both['clin_coherence'].value_counts())

In [None]:
# This cell draws two-set diagrams, so import venn2 here; the cell then no longer
# depends on an earlier cell having imported it.
from matplotlib_venn import venn2
# Diagnoses that are matched on exact clinical labels only (no 'dementia' fallback).
venndiagnoses = ['PD','ATAXIA', 'MND', 'PSP', 'MSA','MS']
for i in venndiagnoses:
    display(i)
    ## STRICT: the clinical diagnosis string must equal the diagnosis exactly
    venndf = both[(both['neuropathological_diagnosis'] == i) | (both['parsed_clinical_diagnosis'] == i)]
    display(f"length strict: {len(venndf)}")
    # Neuropathological diagnosis is `i`, clinical diagnosis is something else.
    nponly_s = venndf[(venndf['neuropathological_diagnosis'] == i) &
                      (venndf['parsed_clinical_diagnosis'] != i)]
    # Clinical diagnosis is `i`, neuropathological diagnosis is something else.
    clinonly_s = venndf[(venndf['neuropathological_diagnosis'] != i) &
                        (venndf['parsed_clinical_diagnosis'] == i)]
    # Both diagnoses are `i`.
    clinndset_s = venndf[(venndf['neuropathological_diagnosis'] == i) &
                         (venndf['parsed_clinical_diagnosis'] == i)]
    # Jaccard index = intersection / union of the two sets.
    print(f" jaccard strict: {len(clinndset_s)/(len(nponly_s)+len(clinonly_s)+len(clinndset_s))}")

    ## LENIENT: `i` must be one of the comma-separated clinical diagnoses.
    ## Whole labels are compared so that e.g. 'MS' does not match 'MSA'.
    venndf = both[(both['neuropathological_diagnosis'] == i) | both['parsed_clinical_diagnosis'].apply(lambda x: i in x.split(','))]
    display(f"length lenient (in figure): {len(venndf)}")
    nponly = venndf[(venndf['neuropathological_diagnosis'] == i) &
                    (~venndf['parsed_clinical_diagnosis'].apply(lambda x: i in x.split(',')))]
    clinonly = venndf[(venndf['neuropathological_diagnosis'] != i) &
                      (venndf['parsed_clinical_diagnosis'].apply(lambda x: i in x.split(',')))]
    clinndset = venndf[(venndf['neuropathological_diagnosis'] == i) &
                       (venndf['parsed_clinical_diagnosis'].apply(lambda x: i in x.split(',')))]
    # Donors whose neuropathological diagnosis was missed even under the lenient rule.
    new_both.loc[new_both['DonorID'].isin(nponly['DonorID']), 'clin_coherence'] = 'non_coherent'
    print(f"ND only strict {len(nponly_s)}")
    print(f"ND only lenient (in figure) {len(nponly)}")
    print(f"clin only strict (in figure parentheses) {len(clinonly_s)}")
    print(f"clin only total (in figure) {len(clinonly)}")
    print(f"clin + ND strict (in figure parentheses) {len(clinndset_s)}")
    print(f"clin + ND total (in figure) {len(clinndset)}")
    print(f" jaccard lenient (in figure): {len(clinndset)/(len(nponly)+len(clinonly)+len(clinndset))}")

    # Share of the overlap within all ND donors, and within all CD donors.
    print(f" % ND {round(100*len(clinndset)/(len(clinndset)+len(nponly)))} ")
    print(f" % CD {round(100*len(clinndset)/(len(clinndset)+len(clinonly)))} ")
    # Two-set Venn diagram of the lenient counts: (ND only, CD only, overlap).
    venn2(subsets = (len(nponly), len(clinonly), len(clinndset)), set_labels = ('ND', 'CD'))
    plt.show()
    print("\n")



In [None]:
# This cell draws a two-set diagram, so import venn2 here; the cell then no longer
# depends on an earlier cell having imported it.
from matplotlib_venn import venn2
venndiagnoses = ['CON']
for i in venndiagnoses:
    display(i)
    ## Controls: a neuropathological 'CON' is compared with a clinical diagnosis of exactly 'other'.
    venndf = both[(both['neuropathological_diagnosis'] == i) | (both['parsed_clinical_diagnosis'] == 'other')]
    display(f"length strict: {len(venndf)}")
    # Neuropathological control whose clinical diagnosis is not 'other'.
    nponly_s = venndf[(venndf['neuropathological_diagnosis'] == i) &
                      (venndf['parsed_clinical_diagnosis'] != 'other')]
    display(nponly_s)
    # Clinical 'other' that is not a neuropathological control.
    clinonly_s = venndf[(venndf['neuropathological_diagnosis'] != i) &
                        (venndf['parsed_clinical_diagnosis'] == 'other')]
    # Neuropathological control with clinical diagnosis 'other'.
    clinndset_s = venndf[(venndf['neuropathological_diagnosis'] == i) &
                         (venndf['parsed_clinical_diagnosis'] == 'other')]
    new_both.loc[new_both['DonorID'].isin(nponly_s['DonorID']), 'clin_coherence'] = 'non_coherent'
    # Jaccard index = intersection / union of the two sets.
    print(f" jaccard strict: {len(clinndset_s)/(len(nponly_s)+len(clinonly_s)+len(clinndset_s))}")

    print(f"ND only strict {len(nponly_s)}")
    print(f"clin only strict {len(clinonly_s)}")
    print(f"clin + ND strict {len(clinndset_s)}")
    print(len(nponly_s)+len(clinonly_s)+len(clinndset_s))
    # Share of the overlap within all ND donors, and within all CD donors.
    print(f" % ND {round(100*len(clinndset_s)/(len(clinndset_s)+len(nponly_s)))} ")
    print(f" % CD {round(100*len(clinndset_s)/(len(clinndset_s)+len(clinonly_s)))} ")
    venn2(subsets = (len(nponly_s), len(clinonly_s), len(clinndset_s)), set_labels = ('ND', 'CD'))
    plt.show()
    print("\n")



In [None]:
# This cell draws a two-set diagram, so import venn2 here; the cell then no longer
# depends on an earlier cell having imported it.
from matplotlib_venn import venn2
venndiagnoses = ['AD-DLB']
for i in venndiagnoses:
    display(i)
    ## STRICT: the clinical diagnosis string must equal the diagnosis exactly
    venndf = both[(both['neuropathological_diagnosis'] == i) | (both['parsed_clinical_diagnosis'] == i)]
    display(f"length strict: {len(venndf)}")
    # Neuropathological diagnosis is `i`, clinical diagnosis is something else.
    nponly_s = venndf[(venndf['neuropathological_diagnosis'] == i) & \
                        (venndf['parsed_clinical_diagnosis'] != i)]
    # Clinical diagnosis is `i`, neuropathological diagnosis is something else.
    clinonly_s = venndf[(venndf['neuropathological_diagnosis'] != i) & \
                          ((venndf['parsed_clinical_diagnosis'] == i))]
    display(clinonly_s)
    # Both diagnoses are `i`.
    clinndset_s = venndf[(venndf['neuropathological_diagnosis'] == i) &\
                         ((venndf['parsed_clinical_diagnosis'] == i))]
    display(clinndset_s)
    # Jaccard index = intersection / union of the two sets.
    print(f" jaccard strict: {len(clinndset_s)/(len(nponly_s)+len(clinonly_s)+len(clinndset_s))}")

    ## LENIENT: a clinical label of `i`, plain 'dementia', or anything containing
    ## 'AD' or 'DLB' counts as a match for the mixed AD-DLB pathology.
    # NOTE(review): this selection requires the clinical string to be exactly
    # 'dementia', while the subsets below use str.contains('dementia') — confirm
    # the difference is intended.
    venndf = both[(both['neuropathological_diagnosis'] == i) | (both['parsed_clinical_diagnosis'].apply(lambda x: i in x.split(','))) |\
                  (both['parsed_clinical_diagnosis'] =='dementia') |\
                  (both['parsed_clinical_diagnosis'].str.contains('AD')) |\
                  (both['parsed_clinical_diagnosis'].str.contains('DLB'))]
    display(f"length lenient: {len(venndf)}")
    nponly = venndf[(venndf['neuropathological_diagnosis'] == i) & \
                        ((~venndf['parsed_clinical_diagnosis'].str.contains(i)) & \
                         (~venndf['parsed_clinical_diagnosis'].str.contains('AD')) & \
                         (~venndf['parsed_clinical_diagnosis'].str.contains('dementia')) & \
                         (~venndf['parsed_clinical_diagnosis'].str.contains('DLB')))]
    clinonly = venndf[(venndf['neuropathological_diagnosis'] != i) &\
                         ((venndf['parsed_clinical_diagnosis'].str.contains(i)) |\
                          (venndf['parsed_clinical_diagnosis'].str.contains('AD')) |\
                          (venndf['parsed_clinical_diagnosis'].str.contains('dementia')) |\
                          (venndf['parsed_clinical_diagnosis'].str.contains('DLB'))
                         )]
    clinndset = venndf[(venndf['neuropathological_diagnosis'] == i) &\
                         ((venndf['parsed_clinical_diagnosis'].str.contains(i)) |\
                          (venndf['parsed_clinical_diagnosis'].str.contains('AD')) |\
                          (venndf['parsed_clinical_diagnosis'].str.contains('dementia')) |\
                          (venndf['parsed_clinical_diagnosis'].str.contains('DLB'))
                         )]
    display(clinndset)
    # Donors whose neuropathological diagnosis was missed even under the lenient rule.
    new_both.loc[new_both['DonorID'].isin(nponly['DonorID']), 'clin_coherence'] = 'non_coherent'
    print(f"ND only strict {len(nponly_s)}")
    print(f"ND only lenient {len(nponly)}")
    print(f"clin only strict {len(clinonly_s)}")
    print(f"clin only total {len(clinonly)}")
    print(f"clin + ND strict {len(clinndset_s)}")
    print(f"clin + ND total {len(clinndset)}")
    print(len(nponly)+len(clinonly)+len(clinndset))
    print(len(nponly_s)+len(clinonly_s)+len(clinndset_s))
    print(f" jaccard lenient: {len(clinndset)/(len(nponly)+len(clinonly)+len(clinndset))}")

    # Share of the overlap within all ND donors, and within all CD donors.
    print(f" % ND {round(100*len(clinndset)/(len(clinndset)+len(nponly)))} ")
    print(f" % CD {round(100*len(clinndset)/(len(clinndset)+len(clinonly)))} ")
    # Two-set Venn diagram of the lenient counts: (ND only, CD only, overlap).
    venn2(subsets = (len(nponly), len(clinonly), len(clinndset)), set_labels = ('ND', 'CD'))
    plt.show()
    print("\n")



In [None]:
# Spot-check a slice of the table, the distribution of the coherence flag,
# and the number of distinct donors.
display(new_both[100:120])
display(new_both['clin_coherence'].value_counts())
display(new_both['DonorID'].nunique())
# new_both.to_excel('/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_analysis/data/grud_clin_subset_overview_newboth.xlsx', index=False)

In [None]:
# Rows that the Venn cells above flagged as 'non_coherent'.
new_both[new_both['clin_coherence']=='non_coherent']

### overview plot

In [None]:
def set_diagnosis_info(row):
    """Classify how a predicted diagnosis relates to the neuropathological one.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide string values under 'pred' and 'neuropathological_diagnosis'.

    Returns
    -------
    str
        'coherent'     - prediction equals the neuropathological diagnosis.
        'ambiguous'    - partial agreement: for a neuropathological 'AD-DLB' the
                         prediction is 'AD' or 'DLB'; otherwise the neuropathological
                         label is one component of a combined prediction such as 'AD-DLB'.
        'non-coherent' - no agreement.
    """
    pred = row['pred']
    truth = row['neuropathological_diagnosis']

    if pred == truth:
        return 'coherent'

    if 'AD-DLB' in truth:
        # Mixed pathology: predicting either component counts as partial agreement.
        if any(diagnosis in ["AD", "DLB"] for diagnosis in pred.split(',')):
            return 'ambiguous'
        return 'non-coherent'

    # Compare whole diagnosis labels. Iterating over the truth string itself would
    # compare single characters, which wrongly treats e.g. truth 'MS' as contained
    # in prediction 'MSA'.
    pred_components = pred.replace('-', ',').split(',')
    if all(component in pred_components for component in truth.split(',')):
        return 'ambiguous'
    return 'non-coherent'
    
# Label every donor row with the agreement between GRU-D prediction and neuropathology.
both['pred_info'] = both.apply(set_diagnosis_info, axis=1)
both_full['pred_info'] = both_full.apply(set_diagnosis_info, axis=1)
# NOTE(review): this bare expression is not the last line of the cell, so its value is not shown.
both['DonorID'].nunique()
display(both['pred_info'].value_counts())

In [None]:
overview_df = both.copy()
# Count donors per (neuropathological diagnosis, agreement category).
grouped_df = overview_df.groupby(['neuropathological_diagnosis', 'pred_info']).size().unstack(fill_value=0)
# Make sure all three categories exist as columns even if one never occurs,
# so the lookups below cannot raise a KeyError.
grouped_df = grouped_df.reindex(columns=['ambiguous', 'coherent', 'non-coherent'], fill_value=0)

grouped_df['total_count'] = grouped_df.sum(axis=1)
display(grouped_df)
display(grouped_df['coherent'].sum())
display(grouped_df['non-coherent'].sum())
display(grouped_df['ambiguous'].sum())
display(both['DonorID'].nunique())
# Percentage of each category within a diagnosis.
for category in ['coherent', 'non-coherent', 'ambiguous']:
    grouped_df[f'{category}_percentage'] = grouped_df[category] / grouped_df['total_count'] * 100

# Define custom colors for the plot
colors = ['#004c6d', '#aa7f0e', '#ff7f0e']
display(grouped_df)
# Desired order of the rows (plotted bottom to top).
wanted_order = ['MSA', 'MS', 'PSP', 'MND','ATAXIA','AD-DLB', 'DLB', 'FTD', 'VD', 'PD', 'AD','CON']

# Reindex to the desired order. Diagnoses without any donor become NaN rows;
# fill them with 0 (as the per-prediction overview does) so round() in the
# annotation loop cannot fail on NaN.
grouped_df = grouped_df.reindex(wanted_order)
grouped_df = grouped_df.fillna(0)

# Plot the stacked horizontal bar plot
ax = grouped_df[['coherent_percentage', 'ambiguous_percentage', 'non-coherent_percentage']].plot(
    kind='barh', stacked=True, figsize=(10, 6), color=colors
)

# Set labels and title
plt.xlabel('Percentage')
plt.ylabel('NP')
plt.title('gru-d vs neuropath')

# Set custom legend labels
legend_labels = ['Coherent', 'Ambiguous', 'Non-Coherent']
ax.legend(legend_labels, title='', loc='center left', bbox_to_anchor=(1.1, 0.5), fontsize=10)

# Annotate every bar with its donor count (right margin) and the three percentages,
# each centred inside its own segment of the stacked bar.
for i, (idx, row) in enumerate(grouped_df.iterrows()):
    total_count = row['total_count']
    coherent_percentage = row['coherent_percentage']
    ambiguous_percentage = row['ambiguous_percentage']
    non_coherent_percentage = row['non-coherent_percentage']

    ax.annotate(f"{round(total_count)}", (107, i), xytext=(-10, 0), textcoords='offset points', va='center', ha='center', fontsize=10)
    ax.annotate(f"{round(coherent_percentage)}%", (coherent_percentage / 2, i),
                va='center', ha='center', fontsize=10, color='white')

    ax.annotate(f"{round(ambiguous_percentage)}%",
                (coherent_percentage + ambiguous_percentage / 2, i),
                va='center', ha='center', fontsize=10, color='white')

    ax.annotate(f"{round(non_coherent_percentage)}%",
                (coherent_percentage + ambiguous_percentage + non_coherent_percentage / 2, i),
                va='center', ha='center', fontsize=10, color='black')
# Leave room to the right of 100% for the donor counts.
plt.xlim(0,110)

# Show the plot
plt.tight_layout()
plt.savefig('/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_analysis/figures/gru-d_overview.pdf', format='pdf', dpi=600)
plt.show()


In [None]:
overview_df = both.copy()
# Donors predicted as AD whose neuropathological diagnosis does not fully agree.
display(overview_df[(overview_df['pred'] == 'AD') & (overview_df['pred_info'] != 'coherent')])
# Count donors per (predicted diagnosis, agreement category).
grouped_df = overview_df.groupby(['pred', 'pred_info']).size().unstack(fill_value=0)
# Make sure all three categories exist as columns even if one never occurs,
# so the lookups below cannot raise a KeyError.
grouped_df = grouped_df.reindex(columns=['ambiguous', 'coherent', 'non-coherent'], fill_value=0)

grouped_df['total_count'] = grouped_df.sum(axis=1)
# Percentage of each category within a predicted diagnosis.
for category in ['coherent', 'non-coherent', 'ambiguous']:
    grouped_df[f'{category}_percentage'] = grouped_df[category] / grouped_df['total_count'] * 100

# Define custom colors for the plot
colors = ['#004c6d', '#aa7f0e', '#ff7f0e']
# Desired order of the rows (plotted bottom to top).
wanted_order = ['MSA', 'MS', 'PSP', 'MND','ATAXIA','AD-DLB', 'DLB', 'FTD', 'VD', 'PD', 'AD','CON']

# Reindex to the desired order; diagnoses that were never predicted become all-zero rows.
grouped_df = grouped_df.reindex(wanted_order)
grouped_df = grouped_df.fillna(0)
display(grouped_df)
# Plot the stacked horizontal bar plot
ax = grouped_df[['coherent_percentage', 'ambiguous_percentage', 'non-coherent_percentage']].plot(
    kind='barh', stacked=True, figsize=(10, 6), color=colors
)

# Set labels and title
plt.xlabel('Percentage')
plt.ylabel('gru-d')
plt.title('gru-d vs neuropath')

# Set custom legend labels
legend_labels = ['Coherent', 'Ambiguous', 'Non-Coherent']
ax.legend(legend_labels, title='', loc='center left', bbox_to_anchor=(1.1, 0.5), fontsize=10)

# Annotate every bar with its donor count (right margin) and the three percentages,
# each centred inside its own segment of the stacked bar.
for i, (idx, row) in enumerate(grouped_df.iterrows()):
    total_count = row['total_count']
    coherent_percentage = row['coherent_percentage']
    ambiguous_percentage = row['ambiguous_percentage']
    non_coherent_percentage = row['non-coherent_percentage']

    ax.annotate(f"{round(total_count)}", (107, i), xytext=(-10, 0), textcoords='offset points', va='center', ha='center', fontsize=10)
    ax.annotate(f"{round(coherent_percentage)}%", (coherent_percentage / 2, i),
                va='center', ha='center', fontsize=10, color='white')

    ax.annotate(f"{round(ambiguous_percentage)}%",
                (coherent_percentage + ambiguous_percentage / 2, i),
                va='center', ha='center', fontsize=10, color='white')

    ax.annotate(f"{round(non_coherent_percentage)}%",
                (coherent_percentage + ambiguous_percentage + non_coherent_percentage / 2, i),
                va='center', ha='center', fontsize=10, color='black')
# Leave room to the right of 100% for the donor counts.
plt.xlim(0,110)

# Show the plot
plt.tight_layout()
plt.savefig('/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_analysis/figures/gru-d_overview_fromclin.pdf', format='pdf', dpi=600)
plt.show()


### venn diagram

In [None]:
# Working copy of `new_both` for the three-way comparison below.
new_3venn = new_both.copy()
display(new_3venn)
# Frequency of each (possibly comma-separated) clinical diagnosis string.
new_3venn['parsed_clinical_diagnosis'].value_counts()
# display(new_3venn[(new_3venn['parsed_clinical_diagnosis'].apply(lambda x: 'MS' in x.split(','))) & (new_3venn['neuropathological_diagnosis'] != 'MS')] )
# new_3venn[(new_3venn['pred'] == 'MS') & (new_3venn['neuropathological_diagnosis'] != 'MS')] 

In [None]:
from matplotlib_venn import venn3

# Diagnoses whose clinical label is the same token as the neuropathological label.
venndiagnoses = ['PD','ATAXIA', 'MND', 'PSP', 'MSA','MS']
for i in venndiagnoses:
    display(i)

    # Row masks over new_3venn, built once per diagnosis and reused by every section below
    # (the original re-ran the same `.apply` for each subset).
    #   strict  : the whole cell equals `i`
    #   lenient : `i` is one of the comma-separated entries of the cell
    is_nd = new_3venn['neuropathological_diagnosis'] == i
    cd_strict = new_3venn['parsed_clinical_diagnosis'] == i
    grud_strict = new_3venn['pred'] == i
    cd_lenient = new_3venn['parsed_clinical_diagnosis'].apply(lambda x: i in x.split(','))
    grud_lenient = new_3venn['pred'].apply(lambda x: i in x.split(','))

    # Every ratio below prints nan instead of raising ZeroDivisionError when a
    # diagnosis has no matching rows, so one empty class cannot abort the loop.

    ## STRICT CD JACCARD
    venndf = new_3venn[is_nd | cd_strict]
    display(f"length strict cd: {len(venndf)}")
    cd_strict_only_ND = new_3venn[is_nd & ~cd_strict]
    cd_strict_only_clinic = new_3venn[~is_nd & cd_strict]
    cd_strict_clinic_ND_set = new_3venn[is_nd & cd_strict]
    n_union = len(cd_strict_only_ND) + len(cd_strict_only_clinic) + len(cd_strict_clinic_ND_set)
    print(f" jaccard strict cd: {len(cd_strict_clinic_ND_set) / n_union if n_union else float('nan')}")

    ## STRICT GRU-D JACCARD
    venndf = new_3venn[is_nd | grud_strict]
    display(f"length strict grud: {len(venndf)}")
    grud_strict_only_ND = new_3venn[is_nd & ~grud_strict]
    grud_strict_only_clinic = new_3venn[~is_nd & grud_strict]
    grud_strict_clinic_ND_set = new_3venn[is_nd & grud_strict]
    n_union = len(grud_strict_only_ND) + len(grud_strict_only_clinic) + len(grud_strict_clinic_ND_set)
    print(f" jaccard strict gru-d: {len(grud_strict_clinic_ND_set) / n_union if n_union else float('nan')}")
    print("\n")

    ##  LENIENT cd jaccard
    venndf = new_3venn[is_nd | cd_lenient]
    display(f"length lenient cd (in figure 3): {len(venndf)}")
    cd_lenient_only_ND = new_3venn[is_nd & ~cd_lenient]
    cd_lenient_only_clinic = new_3venn[~is_nd & cd_lenient]
    cd_lenient_clinic_ND_set = new_3venn[is_nd & cd_lenient]
    n_union = len(cd_lenient_only_ND) + len(cd_lenient_only_clinic) + len(cd_lenient_clinic_ND_set)
    print(f" jaccard lenient cd (in figure 3,s7): {len(cd_lenient_clinic_ND_set) / n_union if n_union else float('nan')}")
    print(f"ND only lenient (in figure) {len(cd_lenient_only_ND)}")
    print(f"clin + ND total (in figure) {len(cd_lenient_clinic_ND_set)}")
    print(f"clin + ND strict (in figure parentheses) {len(cd_strict_clinic_ND_set)}")
    print(f"clin only total (in figure) {len(cd_lenient_only_clinic)}")
    print(f"clin only strict (in figure parentheses) {len(cd_strict_only_clinic)}")

    # Share of the overlap within all ND-positive rows and within all CD-positive rows.
    n_nd = len(cd_lenient_clinic_ND_set) + len(cd_lenient_only_ND)
    n_cd = len(cd_lenient_clinic_ND_set) + len(cd_lenient_only_clinic)
    print(f" % ND {round(100*len(cd_lenient_clinic_ND_set)/n_nd) if n_nd else float('nan')} ")
    print(f" % CD {round(100*len(cd_lenient_clinic_ND_set)/n_cd) if n_cd else float('nan')} ")
    print('\n')

    ##  LENIENT grud jaccard
    venndf = new_3venn[is_nd | grud_lenient]
    display(f"length lenient grud: {len(venndf)}")
    grud_lenient_only_ND = new_3venn[is_nd & ~grud_lenient]
    grud_lenient_only_clinic = new_3venn[~is_nd & grud_lenient]
    grud_lenient_clinic_ND_set = new_3venn[is_nd & grud_lenient]
    n_union = len(grud_lenient_only_ND) + len(grud_lenient_only_clinic) + len(grud_lenient_clinic_ND_set)
    print(f" jaccard lenient grud (in figure): {len(grud_lenient_clinic_ND_set) / n_union if n_union else float('nan')}")
    print("\n")

    ### SCORES WITH ALL 3
    # The seven regions are the mutually exclusive combinations of the three masks,
    # so their sizes add up to len(venndf).
    venndf = new_3venn[is_nd | grud_lenient | cd_lenient]
    display(f"length lenient all 3: {len(venndf)}")

    onlyND = new_3venn[is_nd & ~cd_lenient & ~grud_lenient]
    onlyGRUD = new_3venn[~is_nd & ~cd_lenient & grud_lenient]
    onlyCD = new_3venn[~is_nd & cd_lenient & ~grud_lenient]

    ND_CD = new_3venn[is_nd & cd_lenient & ~grud_lenient]
    ND_GRUD = new_3venn[is_nd & ~cd_lenient & grud_lenient]
    GRUD_CD = new_3venn[~is_nd & cd_lenient & grud_lenient]

    allthree = new_3venn[is_nd & cd_lenient & grud_lenient]
    print(f"only ND (strict) {len(onlyND)}")
    print(f"only GRU-D (lenient) {len(onlyGRUD)}")
    print(f"only CD (lenient) {len(onlyCD)}")

    print(f"ND_CD {len(ND_CD)}")
    print(f"ND_GRUD {len(ND_GRUD)}")
    print(f"GRUD_CD {len(GRUD_CD)}")

    print(f"allthree {len(allthree)}")

    # venn3 expects the subset sizes in the order (Abc, aBc, ABc, abC, AbC, aBC, ABC)
    # with A = neuropathology, B = clinical diagnosis, C = GRU-D prediction.
    venn3(subsets=(len(onlyND), len(onlyCD),len(ND_CD), len(onlyGRUD),len(ND_GRUD), len(GRUD_CD) , len(allthree)),
        set_labels=('Neuropathological Diagnosis', 'Clinical diagnosis', 'GRU-D prediction'),
        normalize_to=1
    )
    plt.show()
    print("\n")



In [None]:
from matplotlib_venn import venn3

# Dementia subtypes: a clinical diagnosis that is exactly 'dementia' is accepted
# as a lenient clinical match for each of them.
venndiagnoses = ['AD','FTD','VD','DLB']
for i in venndiagnoses:
    display(i)

    # Row masks over new_3venn, built once per diagnosis and reused by every section below.
    #   strict  : the whole cell equals `i`
    #   lenient : `i` is one of the comma-separated entries of the cell
    is_nd = new_3venn['neuropathological_diagnosis'] == i
    cd_strict = new_3venn['parsed_clinical_diagnosis'] == i
    grud_strict = new_3venn['pred'] == i
    grud_lenient = new_3venn['pred'].apply(lambda x: i in x.split(','))
    # Lenient clinical match = `i` listed, or the diagnosis is exactly 'dementia'.
    # NOTE(review): the original selected rows with `'dementia' in x.split(',')` but
    # assigned them to regions with `== 'dementia'`, so the printed lengths could include
    # rows (e.g. 'dementia,PD') that fall in no region. One mask is now used for both;
    # the equality form is kept because every region (and thus every figure count)
    # already used it. Confirm that exact-match 'dementia' is the intended rule.
    cd_lenient = (new_3venn['parsed_clinical_diagnosis'].apply(lambda x: i in x.split(','))
                  | (new_3venn['parsed_clinical_diagnosis'] == 'dementia'))

    # Every ratio below prints nan instead of raising ZeroDivisionError when a
    # diagnosis has no matching rows, so one empty class cannot abort the loop.

    ## STRICT CD JACCARD
    venndf = new_3venn[is_nd | cd_strict]
    display(f"length strict cd: {len(venndf)}")
    cd_strict_only_ND = new_3venn[is_nd & ~cd_strict]
    cd_strict_only_clinic = new_3venn[~is_nd & cd_strict]
    cd_strict_clinic_ND_set = new_3venn[is_nd & cd_strict]
    n_union = len(cd_strict_only_ND) + len(cd_strict_only_clinic) + len(cd_strict_clinic_ND_set)
    print(f" jaccard strict cd: {len(cd_strict_clinic_ND_set) / n_union if n_union else float('nan')}")

    ## STRICT GRU-D JACCARD
    venndf = new_3venn[is_nd | grud_strict]
    display(f"length strict grud: {len(venndf)}")
    grud_strict_only_ND = new_3venn[is_nd & ~grud_strict]
    grud_strict_only_clinic = new_3venn[~is_nd & grud_strict]
    grud_strict_clinic_ND_set = new_3venn[is_nd & grud_strict]
    n_union = len(grud_strict_only_ND) + len(grud_strict_only_clinic) + len(grud_strict_clinic_ND_set)
    print(f" jaccard strict gru-d: {len(grud_strict_clinic_ND_set) / n_union if n_union else float('nan')}")
    print("\n")

    ##  LENIENT cd jaccard
    venndf = new_3venn[is_nd | cd_lenient]
    display(f"length lenient cd (in figure 3): {len(venndf)}")
    cd_lenient_only_ND = new_3venn[is_nd & ~cd_lenient]
    cd_lenient_only_clinic = new_3venn[~is_nd & cd_lenient]
    cd_lenient_clinic_ND_set = new_3venn[is_nd & cd_lenient]
    n_union = len(cd_lenient_only_ND) + len(cd_lenient_only_clinic) + len(cd_lenient_clinic_ND_set)
    print(f" jaccard lenient cd (in figure 3,s7): {len(cd_lenient_clinic_ND_set) / n_union if n_union else float('nan')}")
    print(f"ND only lenient (in figure) {len(cd_lenient_only_ND)}")
    print(f"clin + ND total (in figure) {len(cd_lenient_clinic_ND_set)}")
    print(f"clin + ND strict (in figure parentheses) {len(cd_strict_clinic_ND_set)}")
    print(f"clin only total (in figure) {len(cd_lenient_only_clinic)}")
    print(f"clin only strict (in figure parentheses) {len(cd_strict_only_clinic)}")

    # Share of the overlap within all ND-positive rows and within all CD-positive rows.
    n_nd = len(cd_lenient_clinic_ND_set) + len(cd_lenient_only_ND)
    n_cd = len(cd_lenient_clinic_ND_set) + len(cd_lenient_only_clinic)
    print(f" % ND {round(100*len(cd_lenient_clinic_ND_set)/n_nd) if n_nd else float('nan')} ")
    print(f" % CD {round(100*len(cd_lenient_clinic_ND_set)/n_cd) if n_cd else float('nan')} ")

    ##  LENIENT grud jaccard
    venndf = new_3venn[is_nd | grud_lenient]
    display(f"length lenient grud: {len(venndf)}")
    grud_lenient_only_ND = new_3venn[is_nd & ~grud_lenient]
    grud_lenient_only_clinic = new_3venn[~is_nd & grud_lenient]
    grud_lenient_clinic_ND_set = new_3venn[is_nd & grud_lenient]
    n_union = len(grud_lenient_only_ND) + len(grud_lenient_only_clinic) + len(grud_lenient_clinic_ND_set)
    print(f" jaccard lenient grud (in figure): {len(grud_lenient_clinic_ND_set) / n_union if n_union else float('nan')}")
    print("\n")

    ### SCORES WITH ALL 3
    # The seven regions are the mutually exclusive combinations of the three masks,
    # so their sizes add up to len(venndf).
    venndf = new_3venn[is_nd | grud_lenient | cd_lenient]
    display(f"length lenient all 3: {len(venndf)}")

    onlyND = new_3venn[is_nd & ~cd_lenient & ~grud_lenient]
    onlyGRUD = new_3venn[~is_nd & ~cd_lenient & grud_lenient]
    onlyCD = new_3venn[~is_nd & cd_lenient & ~grud_lenient]

    ND_CD = new_3venn[is_nd & cd_lenient & ~grud_lenient]
    ND_GRUD = new_3venn[is_nd & ~cd_lenient & grud_lenient]
    GRUD_CD = new_3venn[~is_nd & cd_lenient & grud_lenient]

    allthree = new_3venn[is_nd & cd_lenient & grud_lenient]
    print(f"only ND (strict) {len(onlyND)}")
    print(f"only GRU-D (lenient) {len(onlyGRUD)}")
    print(f"only CD (lenient) {len(onlyCD)}")
    print(f"ND_CD {len(ND_CD)}")
    print(f"ND_GRUD {len(ND_GRUD)}")
    print(f"GRUD_CD {len(GRUD_CD)}")

    print(f"allthree {len(allthree)}")

    # venn3 expects the subset sizes in the order (Abc, aBc, ABc, abC, AbC, aBC, ABC)
    # with A = neuropathology, B = clinical diagnosis, C = GRU-D prediction.
    venn3(subsets=(len(onlyND), len(onlyCD),len(ND_CD), len(onlyGRUD),len(ND_GRUD), len(GRUD_CD) , len(allthree)),
        set_labels=('Neuropathological Diagnosis', 'Clinical diagnosis', 'GRU-D prediction'),
        normalize_to=1
    )
    plt.show()
    print("\n")



In [None]:
from matplotlib_venn import venn3

# Controls: neuropathology and GRU-D use the label 'CON', while the clinical
# diagnosis column is matched against the label 'other'.
venndiagnoses = ['CON']
for i in venndiagnoses:
    display(i)

    # Row masks over new_3venn, built once and reused by every section below.
    #   strict  : the whole cell equals the label
    #   lenient : the label is one of the comma-separated entries of the cell
    is_nd = new_3venn['neuropathological_diagnosis'] == i
    cd_strict = new_3venn['parsed_clinical_diagnosis'] == 'other'
    grud_strict = new_3venn['pred'] == i
    cd_lenient = new_3venn['parsed_clinical_diagnosis'].apply(lambda x: 'other' in x.split(','))
    grud_lenient = new_3venn['pred'].apply(lambda x: i in x.split(','))

    # Every ratio below prints nan instead of raising ZeroDivisionError when
    # no row matches, so an empty class cannot abort the cell.

    ## STRICT CD JACCARD
    venndf = new_3venn[is_nd | cd_strict]
    display(f"length strict cd: {len(venndf)}")
    cd_strict_only_ND = new_3venn[is_nd & ~cd_strict]
    cd_strict_only_clinic = new_3venn[~is_nd & cd_strict]
    cd_strict_clinic_ND_set = new_3venn[is_nd & cd_strict]
    n_union = len(cd_strict_only_ND) + len(cd_strict_only_clinic) + len(cd_strict_clinic_ND_set)
    print(f" jaccard strict cd: {len(cd_strict_clinic_ND_set) / n_union if n_union else float('nan')}")

    ## STRICT GRU-D JACCARD
    venndf = new_3venn[is_nd | grud_strict]
    display(f"length strict grud: {len(venndf)}")
    grud_strict_only_ND = new_3venn[is_nd & ~grud_strict]
    grud_strict_only_clinic = new_3venn[~is_nd & grud_strict]
    grud_strict_clinic_ND_set = new_3venn[is_nd & grud_strict]
    n_union = len(grud_strict_only_ND) + len(grud_strict_only_clinic) + len(grud_strict_clinic_ND_set)
    print(f" jaccard strict gru-d: {len(grud_strict_clinic_ND_set) / n_union if n_union else float('nan')}")
    print("\n")

    ##  LENIENT cd jaccard
    venndf = new_3venn[is_nd | cd_lenient]
    display(f"length lenient cd (in figure 3): {len(venndf)}")
    cd_lenient_only_ND = new_3venn[is_nd & ~cd_lenient]
    cd_lenient_only_clinic = new_3venn[~is_nd & cd_lenient]
    cd_lenient_clinic_ND_set = new_3venn[is_nd & cd_lenient]
    n_union = len(cd_lenient_only_ND) + len(cd_lenient_only_clinic) + len(cd_lenient_clinic_ND_set)
    print(f" jaccard lenient cd (in figure 3,s7): {len(cd_lenient_clinic_ND_set) / n_union if n_union else float('nan')}")
    print(f"ND only lenient (in figure) {len(cd_lenient_only_ND)}")
    print(f"clin + ND total (in figure) {len(cd_lenient_clinic_ND_set)}")
    print(f"clin + ND strict (in figure parentheses) {len(cd_strict_clinic_ND_set)}")
    print(f"clin only total (in figure) {len(cd_lenient_only_clinic)}")
    print(f"clin only strict (in figure parentheses) {len(cd_strict_only_clinic)}")

    # Share of the overlap within all ND-positive rows and within all CD-positive rows.
    n_nd = len(cd_lenient_clinic_ND_set) + len(cd_lenient_only_ND)
    n_cd = len(cd_lenient_clinic_ND_set) + len(cd_lenient_only_clinic)
    print(f" % ND {round(100*len(cd_lenient_clinic_ND_set)/n_nd) if n_nd else float('nan')} ")
    print(f" % CD {round(100*len(cd_lenient_clinic_ND_set)/n_cd) if n_cd else float('nan')} ")

    ##  LENIENT grud jaccard
    venndf = new_3venn[is_nd | grud_lenient]
    display(f"length lenient grud: {len(venndf)}")
    grud_lenient_only_ND = new_3venn[is_nd & ~grud_lenient]
    grud_lenient_only_clinic = new_3venn[~is_nd & grud_lenient]
    grud_lenient_clinic_ND_set = new_3venn[is_nd & grud_lenient]
    n_union = len(grud_lenient_only_ND) + len(grud_lenient_only_clinic) + len(grud_lenient_clinic_ND_set)
    print(f" jaccard lenient grud (in figure): {len(grud_lenient_clinic_ND_set) / n_union if n_union else float('nan')}")
    print("\n")

    ### SCORES WITH ALL 3
    # The seven regions are the mutually exclusive combinations of the three masks,
    # so their sizes add up to len(venndf).
    venndf = new_3venn[is_nd | grud_lenient | cd_lenient]
    display(f"length lenient all 3: {len(venndf)}")

    onlyND = new_3venn[is_nd & ~cd_lenient & ~grud_lenient]
    onlyGRUD = new_3venn[~is_nd & ~cd_lenient & grud_lenient]
    onlyCD = new_3venn[~is_nd & cd_lenient & ~grud_lenient]

    ND_CD = new_3venn[is_nd & cd_lenient & ~grud_lenient]
    ND_GRUD = new_3venn[is_nd & ~cd_lenient & grud_lenient]
    GRUD_CD = new_3venn[~is_nd & cd_lenient & grud_lenient]

    allthree = new_3venn[is_nd & cd_lenient & grud_lenient]

    print(f"only ND (strict) {len(onlyND)}")
    print(f"only GRU-D (lenient) {len(onlyGRUD)}")
    print(f"only CD (lenient) {len(onlyCD)}")

    print(f"ND_CD {len(ND_CD)}")
    print(f"ND_GRUD {len(ND_GRUD)}")
    print(f"GRUD_CD {len(GRUD_CD)}")

    print(f"allthree {len(allthree)}")

    # venn3 expects the subset sizes in the order (Abc, aBc, ABc, abC, AbC, aBC, ABC)
    # with A = neuropathology, B = clinical diagnosis, C = GRU-D prediction.
    venn3(subsets=(len(onlyND), len(onlyCD),len(ND_CD), len(onlyGRUD),len(ND_GRUD), len(GRUD_CD) , len(allthree)),
        set_labels=('Neuropathological Diagnosis', 'Clinical diagnosis', 'GRU-D prediction'),
        normalize_to=1
    )
    plt.show()
    print("\n")



In [None]:
from matplotlib_venn import venn3

# Mixed AD-DLB pathology. Lenient matching also accepts the component diagnoses
# 'AD' and 'DLB' (and, for the clinical diagnosis, an exact 'dementia').
# (The original first assigned ['AD','FTD','VD','DLB'] and overwrote it on the next line.)
venndiagnoses = ['AD-DLB']
for i in venndiagnoses:
    display(i)

    cd_col = new_3venn['parsed_clinical_diagnosis']
    pred_col = new_3venn['pred']

    # Row masks over new_3venn, built once and reused by every section below.
    is_nd = new_3venn['neuropathological_diagnosis'] == i
    cd_strict = cd_col == i
    grud_strict = pred_col == i
    cd_is_dementia = cd_col == 'dementia'
    # Lenient GRU-D match: `i`, 'AD' or 'DLB' is one of the comma-separated predictions.
    grud_lenient = pred_col.apply(lambda x: not {i, 'AD', 'DLB'}.isdisjoint(x.split(',')))
    # Lenient CD match: `i`, 'AD' or 'DLB' listed, or the diagnosis is exactly 'dementia'.
    # NOTE(review): the original selected rows with `'dementia' in x.split(',')` but
    # assigned them to regions with `== 'dementia'`; the equality form used by every
    # region is now used for the selection as well.
    cd_lenient = cd_col.apply(lambda x: not {i, 'AD', 'DLB'}.isdisjoint(x.split(','))) | cd_is_dementia
    # CD match used by the 3-way Venn, kept exactly as the original regions defined it:
    # `i` listed or exactly 'dementia' ('AD' / 'DLB' alone do NOT count here).
    # NOTE(review): this is narrower than cd_lenient above -- confirm this is intended.
    cd_venn = cd_col.apply(lambda x: i in x.split(',')) | cd_is_dementia

    # Every ratio below prints nan instead of raising ZeroDivisionError when
    # no row matches, so an empty class cannot abort the cell.

    ## STRICT CD JACCARD
    venndf = new_3venn[is_nd | cd_strict]
    display(f"length strict cd: {len(venndf)}")
    cd_strict_only_ND = new_3venn[is_nd & ~cd_strict]
    cd_strict_only_clinic = new_3venn[~is_nd & cd_strict]
    cd_strict_clinic_ND_set = new_3venn[is_nd & cd_strict]
    n_union = len(cd_strict_only_ND) + len(cd_strict_only_clinic) + len(cd_strict_clinic_ND_set)
    print(f" jaccard strict cd: {len(cd_strict_clinic_ND_set) / n_union if n_union else float('nan')}")

    ## STRICT GRU-D JACCARD
    venndf = new_3venn[is_nd | grud_strict]
    display(f"length strict grud: {len(venndf)}")
    grud_strict_only_ND = new_3venn[is_nd & ~grud_strict]
    grud_strict_only_clinic = new_3venn[~is_nd & grud_strict]
    grud_strict_clinic_ND_set = new_3venn[is_nd & grud_strict]
    n_union = len(grud_strict_only_ND) + len(grud_strict_only_clinic) + len(grud_strict_clinic_ND_set)
    print(f" jaccard strict gru-d: {len(grud_strict_clinic_ND_set) / n_union if n_union else float('nan')}")
    print("\n")

    ##  LENIENT cd jaccard
    venndf = new_3venn[is_nd | cd_lenient]
    display(f"length lenient cd (in figure 3): {len(venndf)}")
    cd_lenient_only_ND = new_3venn[is_nd & ~cd_lenient]
    cd_lenient_only_clinic = new_3venn[~is_nd & cd_lenient]
    cd_lenient_clinic_ND_set = new_3venn[is_nd & cd_lenient]
    n_union = len(cd_lenient_only_ND) + len(cd_lenient_only_clinic) + len(cd_lenient_clinic_ND_set)
    print(f" jaccard lenient cd (in figure 3,s7): {len(cd_lenient_clinic_ND_set) / n_union if n_union else float('nan')}")
    print(f"ND only lenient (in figure) {len(cd_lenient_only_ND)}")
    print(f"clin + ND total (in figure) {len(cd_lenient_clinic_ND_set)}")
    print(f"clin + ND strict (in figure parentheses) {len(cd_strict_clinic_ND_set)}")
    print(f"clin only total (in figure) {len(cd_lenient_only_clinic)}")
    print(f"clin only strict (in figure parentheses) {len(cd_strict_only_clinic)}")

    # Share of the overlap within all ND-positive rows and within all CD-positive rows.
    n_nd = len(cd_lenient_clinic_ND_set) + len(cd_lenient_only_ND)
    n_cd = len(cd_lenient_clinic_ND_set) + len(cd_lenient_only_clinic)
    print(f" % ND {round(100*len(cd_lenient_clinic_ND_set)/n_nd) if n_nd else float('nan')} ")
    print(f" % CD {round(100*len(cd_lenient_clinic_ND_set)/n_cd) if n_cd else float('nan')} ")

    ##  LENIENT grud jaccard
    venndf = new_3venn[is_nd | grud_lenient]
    display(f"length lenient grud: {len(venndf)}")
    grud_lenient_only_ND = new_3venn[is_nd & ~grud_lenient]
    grud_lenient_only_clinic = new_3venn[~is_nd & grud_lenient]
    grud_lenient_clinic_ND_set = new_3venn[is_nd & grud_lenient]
    n_union = len(grud_lenient_only_ND) + len(grud_lenient_only_clinic) + len(grud_lenient_clinic_ND_set)
    print(f" jaccard lenient grud (in figure): {len(grud_lenient_clinic_ND_set) / n_union if n_union else float('nan')}")
    print("\n")

    ### SCORES WITH ALL 3
    # Fix: the original excluded pred in {i, 'AD', 'DLB'} from the "not GRU-D" regions
    # but only accepted pred == i in the "GRU-D" regions, so e.g. a row with ND == i and
    # pred == 'AD' landed in no region at all. The same grud_lenient mask now decides
    # both sides, which makes the seven regions a true partition of venndf.
    # NOTE(review): this widens onlyGRUD / ND_GRUD / GRUD_CD / allthree to AD and DLB
    # predictions -- confirm that the widened rule (as in the lenient GRU-D section) is intended.
    venndf = new_3venn[is_nd | grud_lenient | cd_venn]
    display(f"length lenient all 3: {len(venndf)}")

    onlyND = new_3venn[is_nd & ~cd_venn & ~grud_lenient]
    onlyGRUD = new_3venn[~is_nd & ~cd_venn & grud_lenient]
    onlyCD = new_3venn[~is_nd & cd_venn & ~grud_lenient]

    ND_CD = new_3venn[is_nd & cd_venn & ~grud_lenient]
    ND_GRUD = new_3venn[is_nd & ~cd_venn & grud_lenient]
    GRUD_CD = new_3venn[~is_nd & cd_venn & grud_lenient]

    allthree = new_3venn[is_nd & cd_venn & grud_lenient]
    print(f"only ND (strict) {len(onlyND)}")
    print(f"only GRU-D (lenient) {len(onlyGRUD)}")
    print(f"only CD (lenient) {len(onlyCD)}")
    display(onlyCD)
    print(f"ND_CD {len(ND_CD)}")
    print(f"ND_GRUD {len(ND_GRUD)}")
    print(f"GRUD_CD {len(GRUD_CD)}")

    print(f"allthree {len(allthree)}")

    # venn3 expects the subset sizes in the order (Abc, aBc, ABc, abC, AbC, aBC, ABC)
    # with A = neuropathology, B = clinical diagnosis, C = GRU-D prediction.
    venn3(subsets=(len(onlyND), len(onlyCD),len(ND_CD), len(onlyGRUD),len(ND_GRUD), len(GRUD_CD) , len(allthree)),
        set_labels=('Neuropathological Diagnosis', 'Clinical diagnosis', 'GRU-D prediction'),
        normalize_to=1
    )
    plt.show()
    print("\n")



#### OLD

In [None]:
from matplotlib_venn import venn3

# Legacy three-way Venn diagrams (neuropathology vs. clinic vs. GRU-D) for the
# dementia diagnoses. A generic clinical label of 'dementia' counts as a
# clinical hit for every diagnosis in the list.
# NOTE(review): the "pred" criterion is not uniform across regions. Five regions
# test exact equality with the label, while `notpred` / `allthree` use a
# substring test. A donor whose pred only *contains* the label can therefore be
# selected into `venndf` yet land in none of the seven regions -- confirm
# whether that is intended (the printed total is there to compare).
venndiagnoses = ['AD','FTD','VD','DLB']
for i in venndiagnoses:
    # Donors for whom at least one of the three sources points at this diagnosis.
    in_scope = (
        (both['neuropathological_diagnosis'] == i)
        | both['parsed_clinical_diagnosis'].str.contains(i)
        | both['pred'].str.contains(i)
        | (both['parsed_clinical_diagnosis'] == 'dementia')
    )
    venndf = both[in_scope]
    display(len(venndf))

    # Reusable membership masks, all aligned with `venndf`.
    nd_hit = venndf['neuropathological_diagnosis'] == i
    clin_hit = (
        venndf['parsed_clinical_diagnosis'].str.contains(i)
        | (venndf['parsed_clinical_diagnosis'] == 'dementia')
    )
    pred_exact = venndf['pred'] == i
    pred_substr = venndf['pred'].str.contains(i)

    nponly = venndf[nd_hit & ~clin_hit & ~pred_exact]
    predonly = venndf[~nd_hit & ~clin_hit & pred_exact]
    clinonly = venndf[~nd_hit & clin_hit & ~pred_exact]
    notnp = venndf[~nd_hit & clin_hit & pred_exact]
    notpred = venndf[nd_hit & clin_hit & ~pred_substr]
    notclin = venndf[nd_hit & ~clin_hit & pred_exact]
    allthree = venndf[nd_hit & clin_hit & pred_substr]

    region_report = (
        ("ND only", nponly),
        ("pred only", predonly),
        ("clin only", clinonly),
        ("clin+pred", notnp),
        ("clin + ND", notpred),
        ("pred+ND", notclin),
        ("all three", allthree),
    )
    for caption, region in region_report:
        print(f"{caption} {len(region)}")
    # Sum over the seven regions, to compare against len(venndf) shown above.
    print(sum(len(region) for _, region in region_report))

    # venn3 expects the sizes in the order (A, B, AB, C, AC, BC, ABC).
    venn = venn3(
        subsets=(len(nponly), len(clinonly), len(notpred), len(predonly),
                 len(notclin), len(notnp), len(allthree)),
        set_labels=('Neuropathological Diagnosis', 'Clinical diagnosis', 'GRU-D prediction'),
        normalize_to=1
    )
    for text in venn.set_labels:
        text.set_fontsize(12)
    for text in venn.subset_labels:
        # Regions that are not drawn have no label object.
        if text is not None:
            text.set_fontsize(14)
    plt.title(i)
    plt.show()


In [None]:
from matplotlib_venn import venn3

# Legacy three-way Venn diagram for the mixed diagnosis 'AD-DLB'.
# Two clinical criteria and two prediction criteria are in play:
#   * narrow: the label itself (or, for the clinic, the generic 'dementia');
#   * wide:   additionally accepts the component labels 'AD' and 'DLB'.
# NOTE(review): `predonly` and `clinonly` use the narrow clinical criterion
# while every other region uses the wide one, and `predonly` requires an exact
# 'AD-DLB' prediction. The regions are therefore not guaranteed to cover
# `venndf` -- confirm whether this asymmetry is intended.
venndiagnoses = ['AD-DLB']
for i in venndiagnoses:
    in_scope = (
        (both['neuropathological_diagnosis'] == i)
        | both['parsed_clinical_diagnosis'].str.contains(i)
        | both['pred'].str.contains(i)
        | (both['parsed_clinical_diagnosis'] == 'dementia')
    )
    venndf = both[in_scope]
    display(venndf)
    display(len(venndf))

    clin_text = venndf['parsed_clinical_diagnosis']
    model_label = venndf['pred']

    nd_hit = venndf['neuropathological_diagnosis'] == i
    clin_narrow = clin_text.str.contains(i) | (clin_text == 'dementia')
    clin_wide = (
        clin_text.str.contains(i)
        | clin_text.str.contains('AD')
        | clin_text.str.contains('DLB')
        | (clin_text == 'dementia')
    )
    pred_exact = model_label == i
    pred_wide = pred_exact | (model_label == 'AD') | (model_label == 'DLB')

    nponly = venndf[nd_hit & ~clin_wide & ~pred_wide]
    predonly = venndf[~nd_hit & ~clin_narrow & pred_exact]
    clinonly = venndf[~nd_hit & clin_narrow & ~pred_wide]
    notnp = venndf[~nd_hit & clin_wide & pred_wide]
    notpred = venndf[nd_hit & clin_wide & ~pred_wide]
    notclin = venndf[nd_hit & ~clin_wide & pred_wide]
    allthree = venndf[nd_hit & clin_wide & pred_wide]

    display(notnp)
    region_report = (
        ("ND only", nponly),
        ("pred only", predonly),
        ("clin only", clinonly),
        ("clin+pred", notnp),
        ("clin + ND", notpred),
        ("pred+ND", notclin),
        ("all three", allthree),
    )
    for caption, region in region_report:
        print(f"{caption} {len(region)}")
    # Sum over the seven regions, to compare against len(venndf) shown above.
    print(sum(len(region) for _, region in region_report))

    # venn3 expects the sizes in the order (A, B, AB, C, AC, BC, ABC).
    venn = venn3(
        subsets=(len(nponly), len(clinonly), len(notpred), len(predonly),
                 len(notclin), len(notnp), len(allthree)),
        set_labels=('Neuropathological Diagnosis', 'Clinical diagnosis', 'GRU-D prediction'),
        normalize_to=1
    )
    for text in venn.set_labels:
        text.set_fontsize(12)
    for text in venn.subset_labels:
        # Regions that are not drawn have no label object.
        if text is not None:
            text.set_fontsize(14)
    plt.title(i)
    plt.show()


In [None]:
# Legacy three-way Venn diagram for the 'CON' group. On the clinical side a
# parsed diagnosis of exactly 'other' is what counts as a hit for this label.
# NOTE(review): `notpred` / `allthree` test pred with a substring match while
# the remaining regions test exact equality -- confirm whether intended.
venndiagnoses = ['CON']
for i in venndiagnoses:
    in_scope = (
        (both['neuropathological_diagnosis'] == i)
        | (both['parsed_clinical_diagnosis'] == 'other')
        | both['pred'].str.contains(i)
    )
    venndf = both[in_scope]
    display(len(venndf))
    display(venndf)

    # Reusable membership masks, all aligned with `venndf`.
    nd_hit = venndf['neuropathological_diagnosis'] == i
    clin_other = venndf['parsed_clinical_diagnosis'] == 'other'
    pred_exact = venndf['pred'] == i
    pred_substr = venndf['pred'].str.contains(i)

    nponly = venndf[nd_hit & ~clin_other & ~pred_exact]
    predonly = venndf[~nd_hit & ~clin_other & pred_exact]
    clinonly = venndf[~nd_hit & clin_other & ~pred_exact]
    notnp = venndf[~nd_hit & clin_other & pred_exact]
    notpred = venndf[nd_hit & clin_other & ~pred_substr]
    notclin = venndf[nd_hit & ~clin_other & pred_exact]
    allthree = venndf[nd_hit & clin_other & pred_substr]

    display(predonly)
    region_report = (
        ("ND only", nponly),
        ("pred only", predonly),
        ("clin only", clinonly),
        ("clin+pred", notnp),
        ("clin + ND", notpred),
        ("pred+ND", notclin),
        ("all three", allthree),
    )
    for caption, region in region_report:
        print(f"{caption} {len(region)}")
    # Sum over the seven regions, to compare against len(venndf) shown above.
    print(sum(len(region) for _, region in region_report))

    # venn3 expects the sizes in the order (A, B, AB, C, AC, BC, ABC).
    venn = venn3(
        subsets=(len(nponly), len(clinonly), len(notpred), len(predonly),
                 len(notclin), len(notnp), len(allthree)),
        set_labels=('Neuropathological Diagnosis', 'Clinical diagnosis', 'GRU-D prediction'),
        normalize_to=1
    )
    for text in venn.set_labels:
        text.set_fontsize(12)
    for text in venn.subset_labels:
        # Regions that are not drawn have no label object.
        if text is not None:
            text.set_fontsize(14)
    plt.title(i)
    plt.show()


In [None]:
# Legacy three-way Venn diagrams (neuropathology vs. clinic vs. GRU-D) for the
# non-dementia diagnoses.
#
# Membership is decided by exact comma-separated token, not by substring:
# `str.contains('MS')` also matches 'MSA', which made MSA donors count as MS
# hits. One uniform prediction mask is used for every region, so the seven
# regions partition `venndf` exactly and the printed total equals len(venndf).
# Assumes 'parsed_clinical_diagnosis' and 'pred' hold comma-separated label
# strings, as the newer Venn cell in this notebook also assumes.
venndiagnoses = ['PD','ATAXIA', 'MND', 'PSP', 'MS', 'MSA']
for i in venndiagnoses:
    nd_hit = both['neuropathological_diagnosis'] == i
    clin_hit = both['parsed_clinical_diagnosis'].apply(
        lambda x: i in [token.strip() for token in x.split(',')]
    ).astype(bool)
    pred_hit = both['pred'].apply(
        lambda x: i in [token.strip() for token in x.split(',')]
    ).astype(bool)

    # Donors for whom at least one of the three sources names this diagnosis.
    venndf = both[nd_hit | clin_hit | pred_hit]
    display(len(venndf))
    display(venndf)

    # Every region mask implies membership of `venndf`, so selecting from
    # `both` yields the same rows as selecting from `venndf`.
    nponly = both[nd_hit & ~clin_hit & ~pred_hit]
    predonly = both[~nd_hit & ~clin_hit & pred_hit]
    display(predonly)
    clinonly = both[~nd_hit & clin_hit & ~pred_hit]
    notnp = both[~nd_hit & clin_hit & pred_hit]
    notpred = both[nd_hit & clin_hit & ~pred_hit]
    notclin = both[nd_hit & ~clin_hit & pred_hit]
    allthree = both[nd_hit & clin_hit & pred_hit]

    print(f"ND only {len(nponly)}")
    print(f"pred only {len(predonly)}")
    print(f"clin only {len(clinonly)}")
    print(f"clin+pred {len(notnp)}")
    print(f"clin + ND {len(notpred)}")
    print(f"pred+ND {len(notclin)}")
    print(f"all three {len(allthree)}")
    # Sanity check: should equal len(venndf) displayed above.
    print(len(nponly)+len(predonly)+len(clinonly)+len(notnp)+len(notpred)+len(notclin)+len(allthree))

    # venn3 expects the sizes in the order (A, B, AB, C, AC, BC, ABC).
    venn = venn3(
        subsets=(len(nponly), len(clinonly),len(notpred), len(predonly),len(notclin), len(notnp) , len(allthree)),
        set_labels=('Neuropathological Diagnosis', 'Clinical diagnosis', 'GRU-D prediction'),
        normalize_to=1
    )
    for text in venn.set_labels:
        text.set_fontsize(12)
    for text in venn.subset_labels:
        # Regions that are not drawn have no label object.
        if text is not None:
            text.set_fontsize(14)
    plt.title(i)
    plt.show()


### Jaccard index: clinical diagnosis and GRU-D prediction vs. neuropathological diagnosis

In [None]:
# Jaccard index (|A ∩ ND| / |A ∪ ND|) per non-dementia diagnosis, once with
# A = clinical diagnosis and once with A = GRU-D prediction.
jacdf = both[['neuropathological_diagnosis','DonorID','parsed_clinical_diagnosis','pred']]

venndiagnoses = ['PD','ATAXIA', 'MND', 'PSP', 'MS', 'MSA']
for i in venndiagnoses:
    print(i)

    nd_hit = jacdf['neuropathological_diagnosis'] == i
    # Exact comma-separated token match: a substring test would let 'MS' match
    # 'MSA' and inflate the clinical set for MS. Assumes comma-separated label
    # strings, as the newer Venn cell in this notebook also assumes.
    clin_hit = jacdf['parsed_clinical_diagnosis'].apply(
        lambda x: i in [token.strip() for token in x.split(',')]
    ).astype(bool)
    # The model side is strict: the prediction must be exactly this label.
    pred_hit = jacdf['pred'] == i

    # Donors touched by this diagnosis in any source (kept for inspection).
    venndf = jacdf[nd_hit | clin_hit | pred_hit]

    clin_ND_set = jacdf[nd_hit & clin_hit]
    clinonly = jacdf[~nd_hit & clin_hit]
    nponly_clin = jacdf[nd_hit & ~clin_hit]

    grud_ND_set = jacdf[nd_hit & pred_hit]
    grudonly = jacdf[~nd_hit & pred_hit]
    nponly_grud = jacdf[nd_hit & ~pred_hit]

    clin_union = len(clinonly) + len(nponly_clin) + len(clin_ND_set)
    grud_union = len(grudonly) + len(nponly_grud) + len(grud_ND_set)
    # An empty union means the diagnosis does not occur at all; report NaN
    # instead of raising ZeroDivisionError.
    clin_jaccard = len(clin_ND_set) / clin_union if clin_union else float('nan')
    grud_jaccard = len(grud_ND_set) / grud_union if grud_union else float('nan')
    print(round(clin_jaccard,3))
    print(round(grud_jaccard,3))

In [None]:
# Jaccard index (|A ∩ ND| / |A ∪ ND|) per dementia diagnosis, once with
# A = clinical diagnosis (lenient: a generic 'dementia' label counts as a hit)
# and once with A = GRU-D prediction (strict: exact label).
jacdf = both[['neuropathological_diagnosis','DonorID','parsed_clinical_diagnosis','pred']]

venndiagnoses = ['AD','FTD','VD','DLB']
for i in venndiagnoses:
    print(i)

    nd_hit = jacdf['neuropathological_diagnosis'] == i
    # NOTE(review): substring match, so a clinical label such as 'AD-DLB'
    # (if it occurs) also counts for 'AD' and 'DLB' -- confirm this is wanted.
    clin_hit = (
        jacdf['parsed_clinical_diagnosis'].str.contains(i)
        | (jacdf['parsed_clinical_diagnosis'] == 'dementia')
    )
    pred_hit = jacdf['pred'] == i

    # The sets are taken from the full table. Previously they were taken from a
    # prefilter that did not include the 'dementia' rule, so a 'dementia' donor
    # with another neuropathological diagnosis only entered `clinonly` when the
    # model's prediction happened to contain the label -- the clinical Jaccard
    # then depended on the model output.
    venndf = jacdf[nd_hit | clin_hit | pred_hit]

    clin_ND_set = jacdf[nd_hit & clin_hit]
    clinonly = jacdf[~nd_hit & clin_hit]
    nponly_clin = jacdf[nd_hit & ~clin_hit]

    grud_ND_set = jacdf[nd_hit & pred_hit]
    grudonly = jacdf[~nd_hit & pred_hit]
    nponly_grud = jacdf[nd_hit & ~pred_hit]

    display(len(grud_ND_set))
    display(len(grudonly))
    display(len(nponly_grud))

    clin_union = len(clinonly) + len(nponly_clin) + len(clin_ND_set)
    grud_union = len(grudonly) + len(nponly_grud) + len(grud_ND_set)
    # An empty union means the diagnosis does not occur at all; report NaN
    # instead of raising ZeroDivisionError.
    clin_jaccard = len(clin_ND_set) / clin_union if clin_union else float('nan')
    grud_jaccard = len(grud_ND_set) / grud_union if grud_union else float('nan')
    print(round(clin_jaccard,3))
    print(round(grud_jaccard,3))

### scatterplot

In [None]:
# Scatter plot of clinical coherence (x) against model coherence (y), one dot
# per neuropathological diagnosis, drawn once under the strict definition
# ('coherent' only) and once under the lenient one ('coherent' or 'ambiguous').
# Dot area encodes how often clinic and model agree for the same donor.
scatter = both[['DonorID','neuropathological_diagnosis','diagnosis_info','pred_info']].copy()

import matplotlib.lines as mlines

# Per-diagnosis overlap between clinic and model, in percent.
#   strict : both sources are 'coherent'
#   lenient: neither source is 'non-coherent'
# `.mean()` of the boolean mask equals sum/len and yields NaN (without a
# divide-by-zero warning) when a diagnosis has no donors.
overlap_percentages = {}
overlap_percentages_lenient = {}
for diagnosis in wantedx:
    # Named `diagnosis_rows` so the loop does not overwrite the global
    # `diagnosis_df` that the overview / bar-chart cells build and consume.
    diagnosis_rows = scatter[scatter['neuropathological_diagnosis'] == diagnosis]
    strict_mask = (diagnosis_rows['diagnosis_info'] == 'coherent') & (diagnosis_rows['pred_info'] == 'coherent')
    lenient_mask = (diagnosis_rows['diagnosis_info'] != 'non-coherent') & (diagnosis_rows['pred_info'] != 'non-coherent')
    overlap_percentages[diagnosis] = strict_mask.mean() * 100
    overlap_percentages_lenient[diagnosis] = lenient_mask.mean() * 100

overlap_df = pd.DataFrame.from_dict(overlap_percentages, orient='index', columns=['overlap_strict'])
overlap_df_lenient = pd.DataFrame.from_dict(overlap_percentages_lenient, orient='index', columns=['overlap_lenient'])

# Percentage of 'coherent' donors per diagnosis, for the clinic and the model.
scatterpc = scatter.pivot_table(
    values=['diagnosis_info', 'pred_info'],
    index='neuropathological_diagnosis',
    aggfunc=lambda x: (x == 'coherent').mean() * 100
)

# Lenient variants: 'ambiguous' is counted alongside 'coherent'.
scatterpc['diagnosis_info_lenient'] = scatter.pivot_table(
    values='diagnosis_info',
    index='neuropathological_diagnosis',
    aggfunc=lambda x: ((x == 'coherent') | (x == 'ambiguous')).mean() * 100
)

scatterpc['pred_info_lenient'] = scatter.pivot_table(
    values='pred_info',
    index='neuropathological_diagnosis',
    aggfunc=lambda x: ((x == 'coherent') | (x == 'ambiguous')).mean() * 100
)
scatterpc = scatterpc.merge(overlap_df, left_on='neuropathological_diagnosis', right_index=True)
scatterpc = scatterpc.merge(overlap_df_lenient, left_on='neuropathological_diagnosis', right_index=True)

# Map 0-100 % overlap linearly onto marker areas between the two bounds.
min_dot_size = 100
max_dot_size = 1000
scatterpc['dot_strict'] = min_dot_size + (max_dot_size - min_dot_size) * (scatterpc['overlap_strict'] / 100)
scatterpc['dot_lenient'] = min_dot_size + (max_dot_size - min_dot_size) * (scatterpc['overlap_lenient'] / 100)

print( min_dot_size + (max_dot_size - min_dot_size))
display(scatterpc)

plt.figure(figsize=(10,10))

plt.scatter(
    scatterpc['diagnosis_info'],
    scatterpc['pred_info'],
    c='#004c6d',
    label='strict',
    marker='o',
    s=scatterpc['dot_strict']
)

plt.scatter(
    scatterpc['diagnosis_info_lenient'],
    scatterpc['pred_info_lenient'],
    c='#aa7f0e',
    label='lenient',
    marker='o',
    s=scatterpc['dot_lenient']
)

# Two black reference dots showing the smallest and the largest marker area.
plt.scatter(
    10,
    10,
    c='black',
    s = min_dot_size
)
plt.scatter(
    20,
    20,
    c='black',
    s = min_dot_size + (max_dot_size - min_dot_size)
)

# Label every dot with its diagnosis. Rows are read by label via iterrows():
# integer keys on a label-indexed Series (`series[i]`) rely on a deprecated
# positional fallback.
for diagnosis, row in scatterpc.iterrows():
    plt.text(row['diagnosis_info'] + 2, row['pred_info'] + 2, diagnosis, fontsize=8, color='#004c6d')
    plt.text(row['diagnosis_info_lenient'] + 1, row['pred_info_lenient'] + 2, diagnosis, fontsize=8, color='#aa7f0e')
plt.ylim(0,100)

plt.xlabel('diagnosis_info (%)')
plt.ylabel('pred_info (%)')
plt.title('Scatter Plot of diagnosis_info vs. pred_info')

plt.grid()
plt.savefig('/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_analysis/figures/scatter.pdf', format='pdf', dpi=600)
plt.show()

# '#004c6d', '#aa7f0e',

In [None]:
def add_model_vs_clinic_column(df):
    """Add a 'model_vs_clinic' column that labels each row by how the model
    and the clinic compare.

    The label is derived from the pair ('pred_info', 'diagnosis_info'), each of
    which is expected to be 'coherent', 'ambiguous' or 'non-coherent'. Rows
    whose pair matches none of the nine combinations get None.

    The frame is modified in place; nothing is returned.
    """
    # (pred_info, diagnosis_info) -> label; first match wins in np.select.
    outcome_table = [
        ('coherent',     'coherent',     'both_coherent'),
        ('non-coherent', 'coherent',     'clin_coh_model_non_coh'),
        ('ambiguous',    'coherent',     'clin_coh_model_ambiguous'),
        ('non-coherent', 'ambiguous',    'clin_amb_model_non_coh'),
        ('coherent',     'non-coherent', 'model_coh_clin_non_coh'),
        ('coherent',     'ambiguous',    'model_coh_clin_ambiguous'),
        ('ambiguous',    'non-coherent', 'model_amb_clin_non_coh'),
        ('non-coherent', 'non-coherent', 'both_non_coh'),
        ('ambiguous',    'ambiguous',    'both_amb'),
    ]

    model_state = df['pred_info']
    clinic_state = df['diagnosis_info']

    conditions = []
    values = []
    for model_value, clinic_value, label in outcome_table:
        conditions.append((model_state == model_value) & (clinic_state == clinic_value))
        values.append(label)

    df['model_vs_clinic'] = np.select(conditions, values, default=None)

# Label both overview frames (the function writes the 'model_vs_clinic' column
# in place), preview their last rows, then export each frame to Excel.
for overview_frame in (both, both_full):
    add_model_vs_clinic_column(overview_frame)
display(both.tail(5), both_full.tail(5))
both.to_excel('/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_analysis/data/grud_clin_subset_overview_both.xlsx', index=False)
both_full.to_excel('/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_analysis/data/grud_clin_subset_full_overview_both.xlsx', index=False)

In [None]:
# Quick tallies, shown in this order:
#   1. neuropathological diagnoses of donors whose clinical diagnosis is non-coherent;
#   2. model-vs-clinic labels among the AD-DLB donors;
#   3. model-vs-clinic labels over the whole table.
display(
    both.loc[both['diagnosis_info'] == 'non-coherent', 'neuropathological_diagnosis'].value_counts(),
    both.loc[both['neuropathological_diagnosis'] == 'AD-DLB', 'model_vs_clinic'].value_counts(),
    both['model_vs_clinic'].value_counts(),
)

In [None]:
# Per neuropathological diagnosis, count donors in four plotted groups. Each
# group pools one or more of the fine-grained 'model_vs_clinic' labels:
#   both_coherent             - clinic and model both coherent
#   clin_coh                  - clinic coherent, model ambiguous or non-coherent
#   ambiguous_or_non_coherent - neither source coherent
#   model_coh                 - model coherent, clinic ambiguous or non-coherent
label_groups = {
    'both_coherent': ['both_coherent'],
    'clin_coh': ['clin_coh_model_non_coh', 'clin_coh_model_ambiguous'],
    'ambiguous_or_non_coherent': ['clin_amb_model_non_coh', 'model_amb_clin_non_coh',
                                  'both_non_coh', 'both_amb'],
    'model_coh': ['model_coh_clin_non_coh', 'model_coh_clin_ambiguous'],
}

overview = []
for diagnosis in both['neuropathological_diagnosis'].unique():
    diagnosed_donors = both[both['neuropathological_diagnosis'] == diagnosis]
    group_sizes = [
        int(diagnosed_donors['model_vs_clinic'].isin(member_labels).sum())
        for member_labels in label_groups.values()
    ]
    overview.append([diagnosis, *group_sizes])

columns = ['diagnosis', *label_groups]
diagnosis_df = pd.DataFrame(overview, columns=columns).set_index('diagnosis')

# Put the rows in the notebook-wide diagnosis order, then reverse them so the
# first diagnosis ends up at the top of the horizontal bar chart.
diagnosis_df = diagnosis_df.reindex(wantedx).iloc[::-1]
display(diagnosis_df)


In [None]:

# Horizontal stacked bar chart: share of each model-vs-clinic group per
# diagnosis, with the absolute donor count written to the right of each bar.
total_counts = diagnosis_df.sum(axis=1)
diagnosis_df_percentage = diagnosis_df.div(total_counts, axis=0) * 100

# Colour per column name; keys without a matching column are not used.
colors = {
    'both_coherent': 'darkgreen',
    'clin_coh': '#ff7f0e',
    'clin_coh_model_non_coh': '#ff7f0e',
    'clin_coh_model_ambiguous': '#f2cc8f',
    'model_coh': '#004c6d',
    'model_coh_clin_non_coh': '#004c6d',
    'model_coh_clin_ambiguous': '#6699cc',
    'both_non_coh': '#404040',
    'both_ambiguous': 'pink',
    'clin_amb_model_non_coh':'#7f7f7f',
    'model_amb_clin_non_coh':'#bfbfbf',
    'ambiguous_or_non_coherent':'#bfbfbf'
}
ax = diagnosis_df_percentage.plot(kind='barh', stacked=True, figsize=(6, 8), color=colors)

ax.set_title('Diagnosis Distribution')
ax.set_xlabel('Percentage')
ax.set_ylabel('Diagnosis')
ax.legend(title='Category', bbox_to_anchor=(1.1, 1))

# x = 107 lies just past the 100 % end of the bars.
for bar_position, count in enumerate(total_counts):
    ax.text(107, bar_position, f"{count}", ha='left', va='center', color='black', fontsize=14)

plt.tight_layout()
plt.show()

## old

In [None]:
def make_confusion_matrix(cf,
                          group_names=None,
                          categories='auto',
                          count=True,
                          percent=True,
                          cbar=True,
                          xyticks=True,
                          xyplotlabels=True,
                          sum_stats=True,
                          figsize=None,
                          cmap='Blues',
                          title=None):
    '''
    Draw a confusion matrix as an annotated Seaborn heatmap on a new figure.

    Each cell annotation is built from up to three parts (group name, value,
    percentage of the grand total); the parts that are present are placed on
    separate lines.

    Arguments
    ---------
    cf:            confusion matrix to be passed in (2-D array-like).
    group_names:   None, or list of strings that represent the labels row by row to be shown in each square.
                   Ignored unless its length equals the number of cells.
    categories:    List of strings containing the categories to be displayed on the x,y axis. Default is 'auto'
    count:         If True, show the raw value of each cell (zeros are printed as '0'). Default is True.
    percent:       If True, show each cell as a percentage of the sum of all cells. Default is True.
    cbar:          If True, show the color bar. The cbar values are based off the values in the confusion matrix.
                   Default is True.
    xyticks:       If True, show x and y ticks. Default is True.
    xyplotlabels:  If True, show 'True label' and 'Predicted label' on the figure. Default is True.
    sum_stats:     If True, display summary statistics below the figure. Default is True.
    figsize:       Tuple representing the figure size. Default will be the matplotlib rcParams value.
    cmap:          Colormap of the values displayed from matplotlib.pyplot.cm. Default is 'Blues'
                   See http://matplotlib.org/examples/color/colormaps_reference.html

    title:         Title for the heatmap. Default is None.

    Returns
    -------
    None. The heatmap is drawn on a newly created matplotlib figure, which is
    left as the current figure so the caller can save or show it.
    '''
    cf = np.asarray(cf)
    cell_values = cf.flatten()
    blanks = [''] * cf.size

    # CODE TO GENERATE TEXT INSIDE EACH SQUARE
    if group_names is not None and len(group_names) == cf.size:
        group_labels = ["{}".format(name) for name in group_names]
    else:
        group_labels = blanks

    if count:
        # Print exact zeros as a bare '0' instead of e.g. '0.0'.
        group_counts = ['0' if value == 0 else f"{value}" for value in cell_values]
    else:
        group_counts = blanks

    if percent:
        group_percentages = ["{0:.2%}".format(value) for value in cell_values / np.sum(cf)]
    else:
        group_percentages = blanks

    # Join only the parts that are present, one per line. Plain concatenation
    # would glue the count and the percentage together ("5" + "50.00%").
    box_labels = ["\n".join(part for part in parts if part).strip()
                  for parts in zip(group_labels, group_counts, group_percentages)]
    box_labels = np.asarray(box_labels).reshape(cf.shape[0], cf.shape[1])

    # CODE TO GENERATE SUMMARY STATISTICS & TEXT FOR SUMMARY STATS
    if sum_stats:
        # Accuracy is sum of diagonal divided by total observations
        accuracy = np.trace(cf) / float(np.sum(cf))

        # If it is a binary confusion matrix, show some more stats
        if len(cf) == 2:
            precision = cf[1, 1] / sum(cf[:, 1])
            recall = cf[1, 1] / sum(cf[1, :])
            # Named `f1` so the imported sklearn `f1_score` is not shadowed.
            f1 = 2 * precision * recall / (precision + recall)
            stats_text = "\n\nAccuracy={:0.3f}\nPrecision={:0.3f}\nRecall={:0.3f}\nF1 Score={:0.3f}".format(
                accuracy, precision, recall, f1)
        else:
            stats_text = "\n\nAccuracy={:0.3f}".format(accuracy)
    else:
        stats_text = ""

    # SET FIGURE PARAMETERS ACCORDING TO OTHER ARGUMENTS
    if figsize is None:
        # Get default figure size if not set
        figsize = plt.rcParams.get('figure.figsize')

    if not xyticks:
        # Do not show categories if xyticks is False
        categories = False

    # MAKE THE HEATMAP VISUALIZATION
    plt.figure(figsize=figsize)
    sns.heatmap(cf, annot=box_labels, fmt="", cmap=cmap, cbar=cbar,
                xticklabels=categories,
                yticklabels=categories,
                annot_kws={"size": 16})
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    if xyplotlabels:
        plt.ylabel('True label', labelpad=0)
        plt.xlabel('Predicted label' + stats_text, labelpad=10)
    else:
        plt.xlabel(stats_text)
    if title:
        plt.title(title)

In [None]:
# Let pandas display up to 500 rows before truncating DataFrame output.
pd.set_option('display.max_rows', 500)

In [None]:
sns.set(font_scale=2.1)

# Class order shared by the prediction/truth columns, the confusion-matrix axes
# and the per-class report. Earlier experiments used other class sets; only
# this (final) assignment was ever in effect.
# NOTE(review): must match the column order the model was trained with -- confirm.
wanted = ['CON', 'AD', 'PD', 'VD', 'FTD','DLB','AD,DLB','ATAXIA', 'MND', 'PSP', 'MS','MSA'] #'AD,DLB'
new_items = ['accuracy', 'macro avg', 'weighted avg']
wanted_plus = wanted + new_items

# All outputs of this run go into the same results directory.
results_dir = f'/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_history/temporal_model/results/{save_name}_{identifier}'

list_of_dfs = []   # per-fold classification report (+ Jaccard) frames
full_preds = []    # per-fold frames with 'predictions' and 'truths' labels
averages_f1 = []   # per-fold micro F1
confusions = []    # per-fold confusion matrices (rows = truth, cols = prediction)
jaccards = []      # per-fold per-class Jaccard indices

# The ground truth file does not depend on the fold, so load it once.
truths = np.load(path_truths, allow_pickle=True)

for i in range(preds['pred_y_list_all'].shape[0]):
    print('fold: ',i)

    ## PREDICTIONS
    # Entry [2] of each fold is used; per the original comment this is the
    # test-set prediction -- TODO confirm against the training script.
    firstfold_preds = preds['pred_y_list_all'][i][2]
    pred_df = pd.DataFrame(firstfold_preds, columns=wanted)
    # Predicted class = column with the highest score in each row.
    predicted_labels = np.asarray(wanted)[pred_df.to_numpy().argmax(axis=1)]
    compare = pd.DataFrame({'predictions': predicted_labels})

    ## TRUTHS
    firstfold_truths = truths['true_y_list_all'][i][2]
    truth_df = pd.DataFrame(firstfold_truths, columns=wanted)
    compare['truths'] = truth_df.idxmax(1)
    full_preds.append(compare)

    ## F1-SCORE
    fold_f1 = f1_score(compare['truths'], compare['predictions'], average='micro')
    print('f1score')
    print(fold_f1)
    averages_f1.append(fold_f1)

    ## CONFUSION MATRIX
    cf_matrix = metrics.confusion_matrix(compare['truths'], compare['predictions'],
                                         labels=wanted)
    confusions.append(cf_matrix)

    ## JACCARD
    j_index = jaccard_score(y_true=compare['truths'], y_pred=compare['predictions'],
                            labels=wanted, average=None)
    j_index_df = pd.DataFrame(j_index, columns=['Jaccard'], index=wanted)
    jaccards.append(j_index)

    ## CLASSIFICATION REPORT
    report = metrics.classification_report(compare['truths'], compare['predictions'],
                                           digits=3, output_dict=True, zero_division=1)
    report_df = pd.DataFrame(report).transpose().round(3)
    report_df['fold'] = i
    report_df = pd.concat([report_df, j_index_df], axis=1)
    report_df = report_df.reindex(wanted_plus)
    list_of_dfs.append(report_df)

## F1-SCORE
average_f1 = np.average(averages_f1)

## CLASSIFICATION REPORT
# Concatenate every fold that was evaluated (not a hard-coded five).
full_report = pd.concat(list_of_dfs)

full_report2 = full_report.groupby(level=0).mean()
full_report2 = full_report2.drop('fold',axis=1)
full_report2 = full_report2.reindex(wanted)
display(full_report2)
output_path = f'{results_dir}/classification_report_{identifier}.xlsx'
full_report2.to_excel(output_path)

## CONFUSION MATRIX
average_confusion = np.array(confusions).mean(axis=0)
# Row-normalise by the matrix's own row totals (mean number of true samples per
# class). This keeps every non-empty row summing to 1 even when a class is
# missing from some fold, and leaves empty rows at 0 instead of dividing by zero.
vector = average_confusion.sum(axis=1)
perc_con = np.divide(average_confusion, vector[:, None],
                     out=np.zeros_like(average_confusion),
                     where=vector[:, None] > 0)
perc_con = np.round(perc_con, 2)
display(perc_con)

make_confusion_matrix(perc_con,
                      categories=wanted,
                      cmap='Blues',
                      group_names=None,
                      count=True,
                      percent=False,
                      cbar=False,
                      xyticks=True,
                      xyplotlabels=True,
                      sum_stats=False,
                      figsize=(12,9),
                      title=None)

output_path_png = f'{results_dir}/confusion_matrix_{identifier}.png'
output_path_pdf = f'{results_dir}/confusion_matrix_{identifier}.pdf'
plt.savefig(output_path_png,bbox_inches="tight",dpi=600)
plt.savefig(output_path_pdf,bbox_inches="tight",dpi=600)
plt.show()
plt.close()


In [None]:
# NOTE(review): this cell repeats the fold evaluation of the preceding cell and
# writes the same output files; consider keeping only one of the two.
sns.set(font_scale=2.1)

# Class order shared by the prediction/truth columns, the confusion-matrix axes
# and the per-class report. Earlier experiments used other class sets; only
# this (final) assignment was ever in effect.
# NOTE(review): must match the column order the model was trained with -- confirm.
wanted = ['CON', 'AD', 'PD', 'VD', 'FTD','DLB','AD,DLB','ATAXIA', 'MND', 'PSP', 'MS','MSA'] #'AD,DLB'
new_items = ['accuracy', 'macro avg', 'weighted avg']
wanted_plus = wanted + new_items

# All outputs of this run go into the same results directory.
results_dir = f'/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_history/temporal_model/results/{save_name}_{identifier}'

list_of_dfs = []   # per-fold classification report (+ Jaccard) frames
full_preds = []    # per-fold frames with 'predictions' and 'truths' labels
averages_f1 = []   # per-fold micro F1
confusions = []    # per-fold confusion matrices (rows = truth, cols = prediction)
jaccards = []      # per-fold per-class Jaccard indices

# The ground truth file does not depend on the fold, so load it once.
truths = np.load(path_truths, allow_pickle=True)

for i in range(preds['pred_y_list_all'].shape[0]):
    print('fold: ',i)

    ## PREDICTIONS
    # Entry [2] of each fold is used; per the original comment this is the
    # test-set prediction -- TODO confirm against the training script.
    firstfold_preds = preds['pred_y_list_all'][i][2]
    pred_df = pd.DataFrame(firstfold_preds, columns=wanted)
    # Predicted class = column with the highest score in each row.
    predicted_labels = np.asarray(wanted)[pred_df.to_numpy().argmax(axis=1)]
    compare = pd.DataFrame({'predictions': predicted_labels})

    ## TRUTHS
    firstfold_truths = truths['true_y_list_all'][i][2]
    truth_df = pd.DataFrame(firstfold_truths, columns=wanted)
    compare['truths'] = truth_df.idxmax(1)
    full_preds.append(compare)

    ## F1-SCORE
    fold_f1 = f1_score(compare['truths'], compare['predictions'], average='micro')
    print('f1score')
    print(fold_f1)
    averages_f1.append(fold_f1)

    ## CONFUSION MATRIX
    cf_matrix = metrics.confusion_matrix(compare['truths'], compare['predictions'],
                                         labels=wanted)
    confusions.append(cf_matrix)

    ## JACCARD
    j_index = jaccard_score(y_true=compare['truths'], y_pred=compare['predictions'],
                            labels=wanted, average=None)
    j_index_df = pd.DataFrame(j_index, columns=['Jaccard'], index=wanted)
    jaccards.append(j_index)

    ## CLASSIFICATION REPORT
    report = metrics.classification_report(compare['truths'], compare['predictions'],
                                           digits=3, output_dict=True, zero_division=1)
    report_df = pd.DataFrame(report).transpose().round(3)
    report_df['fold'] = i
    report_df = pd.concat([report_df, j_index_df], axis=1)
    report_df = report_df.reindex(wanted_plus)
    list_of_dfs.append(report_df)

## F1-SCORE
average_f1 = np.average(averages_f1)

## CLASSIFICATION REPORT
# Concatenate every fold that was evaluated (not a hard-coded five).
full_report = pd.concat(list_of_dfs)

full_report2 = full_report.groupby(level=0).mean()
full_report2 = full_report2.drop('fold',axis=1)
full_report2 = full_report2.reindex(wanted)
display(full_report2)
output_path = f'{results_dir}/classification_report_{identifier}.xlsx'
full_report2.to_excel(output_path)

## CONFUSION MATRIX
average_confusion = np.array(confusions).mean(axis=0)
# Row-normalise by the matrix's own row totals (mean number of true samples per
# class). This keeps every non-empty row summing to 1 even when a class is
# missing from some fold, and leaves empty rows at 0 instead of dividing by zero.
vector = average_confusion.sum(axis=1)
perc_con = np.divide(average_confusion, vector[:, None],
                     out=np.zeros_like(average_confusion),
                     where=vector[:, None] > 0)
perc_con = np.round(perc_con, 2)
display(perc_con)

make_confusion_matrix(perc_con,
                      categories=wanted,
                      cmap='Blues',
                      group_names=None,
                      count=True,
                      percent=False,
                      cbar=False,
                      xyticks=True,
                      xyplotlabels=True,
                      sum_stats=False,
                      figsize=(12,9),
                      title=None)

output_path_png = f'{results_dir}/confusion_matrix_{identifier}.png'
output_path_pdf = f'{results_dir}/confusion_matrix_{identifier}.pdf'
plt.savefig(output_path_png,bbox_inches="tight",dpi=600)
plt.savefig(output_path_pdf,bbox_inches="tight",dpi=600)
plt.show()
plt.close()


In [None]:
# full_preds
df = pd.concat(full_preds,axis=1)
# display(df)
output_path = '/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_history/temporal_model/results/60_clinical_history_5_years_1_observations_subset_{}/folds_compare.xlsx'.format(identifier)
df.to_excel(output_path)  

#### PLOTS

In [None]:
# Flatten the report index into a column and give the columns display names
# (assigned by position, in the order the report columns were built).
full_report3 = full_report.reset_index().set_axis(
    ['index', 'Precision', 'Recall', 'F1-score', 'support', 'fold', 'Jaccard'],
    axis=1,
)
full_report3

In [None]:
# Summary rows of the classification report that are not diagnosis classes.
a = ['accuracy','macro avg','weighted avg']

# Long format: one row per (class, fold, metric), without the summary rows,
# indexed by class label.
full_report_melted = (
    full_report3
    .melt(id_vars=['index','support','fold'],
          value_vars=['Precision','Recall','F1-score','Jaccard'],
          var_name='metric', value_name='value')
    .loc[lambda frame: ~frame['index'].isin(a)]
    .set_index('index')
)
display(full_report_melted)

In [None]:
# Say, "the default sans-serif font is COMIC SANS"
plt.rcParams['font.sans-serif'] = "Arial"
# Then, "ALWAYS use sans-serif fonts"
plt.rcParams['font.family'] = "sans-serif"

In [None]:
# Show the fold-averaged per-class classification report.
display(full_report2)

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
sns.set(style="ticks", font_scale=2.5)

fig, ax = plt.subplots(figsize=(25, 7))
sns.boxplot(x=full_report_melted.index, y="value", hue="metric", data=full_report_melted, ax=ax,palette = 'Blues')
# ax.set_xticklabels(ax.get_xticklabels(), rotation=40)#, horizontalalignment='right')
# for tick in ax.get_xticklabels():
#     tick.set_rotation(45)
# xticklabels = ax.get_xticklabels()
# ax.set_xticklabels(xticklabels, rotation = 45, ha="right")
# plt.xticks(rotation = 45)
# ax.set_xticklabels(ax.get_xticks(), rotation = 45)
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0)
plt.legend(bbox_to_anchor=(0.4, 1.2), loc="upper center", ncol=4, borderaxespad=0)
# plt.legend(loc="upper center", ncol=4)
ax.set_xlabel("Diagnosis",labelpad=15)
ax.set_ylabel("Prediction performance",labelpad=20)
if observation == False:
    plt.suptitle('{} Years with {} observations. \n Micro F1-score: {}, average AUROC: {}'.format(n,m,round(average_f1,3),round(average_auroc,3)), fontsize=12)
elif observation == True and unique == False:
    plt.suptitle('{} observations. \n Micro F1-score: {}, average AUROC: {}'.format(n,round(average_f1,3),round(average_auroc,3)), fontsize=12)
elif observation == True and unique == True:
    plt.suptitle('{} unique observations. \n Micro F1-score: {}, average AUROC: {}'.format(n,round(average_f1,3),round(average_auroc,3)), fontsize=12)

plt.suptitle(None)
ax.set_ylim(0,1)
ax.spines["right"].set_color("none")
ax.spines["top"].set_color("none")
sns.despine(offset=10, trim=False)

a=ax.get_xticks().tolist()
for i in range(len(a)):
#     print(i)
#     print(full_report2.index.tolist())
    print(round(full_report2['support'][i]))
    a[i]=full_report2.index.tolist()[i] + '\nn={}'.format(round(full_report2['support'][i]))
ax.set_xticklabels(a)
# labels = [item.get_text() for item in ax.get_xticklabels()]
# labels[1] = 'Testing'
# ax.set_xticklabels(labels)
print([item.get_text() for item in ax.get_xticklabels()])
# ax.spines.left.set_bounds((0, 1))

output_path_png = '/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_history/temporal_model/results/{}/classification_report_{}.png'.format(identifier,identifier)
output_path_pdf = '/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_history/temporal_model/results/{}/classification_report_{}.pdf'.format(identifier,identifier)
fig.savefig(output_path_png,bbox_inches="tight",dpi=600) 
fig.savefig(output_path_pdf,bbox_inches="tight",dpi=600) 
plt.show()
plt.close()

### old

In [None]:
# m = 1
# n = 5
# remove3 = True
# observation = False
# unique = False
# if remove3 == False and observation == False and unique == False:
#     save_name = 'clinical_history_{}_years_{}_observations'.format(str(n),str(m))
#     cols = ['AD', 'ATAXIA', 'BP', 'CON', 'FTD', 'MD', 'MND', 'MS',  'MSA', 'PD', 'PDD', 'PSP', 'SCHIZ', 'VD']
#     cols = ['AD', 'ATAXIA', 'BP', 'CON','DLB', 'FTD', 'MD', 'MND', 'MS',  'MSA', 'PD', 'PDD', 'PSP', 'SCHIZ', 'VD']
#     if m == 1 and n == 1:
#         identifier = '20220811_125749_549959' # Micro F1-score: 0.739, Macro F1-score: 0.504, average AUROC: 0.933
#         # without shuffle'20220811_091932_484522' 
#     elif m == 1 and n == 2:
#         identifier = '20220811_130315_572958' # Micro F1-score: 0.755, Macro F1-score: 0.531, average AUROC: 0.936
#         #without shuffle 20220811_092342_934865  
#     elif m == 1 and n == 3: ## WINNER
#         identifier = '20220811_130317_686429' # Micro F1-score: 0.765, Macro F1-score: 0.519, average AUROC: 0.942
#         #without shuffle 20220811_092456_948567 
#     elif m == 0 and n == 1:
#         identifier = '20220811_125912_299211' # Micro F1-score: 0.74, Macro F1-score: 0.51, average AUROC: 0.932
#         #without shuffle 20220811_092151_231509 
#     elif m == 0 and n == 2:
#         identifier = '20220811_125946_705098' # Micro F1-score: 0.745, Macro F1-score: 0.513, average AUROC: 0.935
#         #without shuffle 20220811_092753_431603
#     elif m == 0 and n == 3:
#         identifier = '20220811_130023_046260' # Micro F1-score: 0.761, Macro F1-score: 0.534, average AUROC: 0.932
#         #without shuffle 20220811_092848_136571
#     elif m == 0 and n == 4:
#         identifier = '20220811_144324_393381' # Micro F1-score: 0.75, Macro F1-score: 0.528, average AUROC: 0.933
#     elif m == 2 and n == 1:
#         identifier = '20220811_144525_941453' # Micro F1-score: 0.747, Macro F1-score: 0.509, average AUROC: 0.939
#     elif m == 2 and n == 2: 
#         identifier = '20220811_144530_960516' # Micro F1-score: 0.764, Macro F1-score: 0.54, average AUROC: 0.938

# if remove3==True and observation == False and unique == False:
#     save_name = 'clinical_history_{}_years_{}_observations_subset'.format(str(n),str(m))
#     cols = ['AD',  'BP', 'CON', 'FTD', 'MD', 'MS',  'MSA', 'PD', 'PDD', 'PSP', 'VD']
#     cols = ['AD', 'BP', 'CON','DLB', 'FTD', 'MD', 'MS',  'MSA', 'PD', 'PDD', 'PSP', 'VD']
#     if m == 1 and n == 1:
#         identifier = '20220811_135844_836666' # Micro F1-score: 0.758, Macro F1-score: 0.633, average AUROC: 0.94
#     elif m == 1 and n == 2:
#         identifier = '20220811_135926_067172' # Micro F1-score: 0.769, Macro F1-score: 0.647, average AUROC: 0.945
#     elif m == 1 and n == 3:
#         identifier = '20220811_135935_868183' # Micro F1-score: 0.769, Macro F1-score: 0.631, average AUROC: 0.947
#     elif m == 1 and n == 4: 
#         identifier = '20220811_144109_053395' # Micro F1-score: 0.787, Macro F1-score: 0.665, average AUROC: 0.95
#         # 80-10-10
#         identifier = '20220811_175226_704355' # Micro F1-score: 0.782, Macro F1-score: 0.656, average AUROC: 0.957
#     elif m == 1 and n == 5: 
#         identifier = '20220811_170556_266847' # Micro F1-score: 0.789, Macro F1-score: 0.654, average AUROC: 0.947
#         # 80-10-10 ## WINNER
#         identifier = '20220811_182451_264378' # Micro F1-score: 0.8, Macro F1-score: 0.674, average AUROC: 0.959
#     elif m == 0 and n == 1:
#         identifier = '20220811_135913_079098' # Micro F1-score: 0.753, Macro F1-score: 0.634, average AUROC: 0.941
#     elif m == 0 and n == 2:
#         identifier = '20220811_135936_892905' # Micro F1-score: 0.765, Macro F1-score: 0.645, average AUROC: 0.942
#     elif m == 0 and n == 3:
#         identifier = '20220811_135936_677540' # Micro F1-score: 0.77, Macro F1-score: 0.642, average AUROC: 0.943
#     elif m == 0 and n == 4:  
#         identifier = '20220811_144110_457439' # Micro F1-score: 0.765, Macro F1-score: 0.638, average AUROC: 0.944
#     elif m == 2 and n == 1:
#         identifier = '20220811_144915_412665' # Micro F1-score: 0.753, Macro F1-score: 0.62, average AUROC: 0.945
#     elif m == 2 and n == 2:
#         identifier = '20220811_144914_648664' # Micro F1-score: 0.771, Macro F1-score: 0.634, average AUROC: 0.945
#     elif m == 2 and n == 3:
#         identifier = '20220811_170446_921683' # Micro F1-score: 0.786, Macro F1-score: 0.654, average AUROC: 0.951

# if remove3==True and observation == True and unique == False:
#     save_name = 'clinical_history_{}_observations_subset'.format(str(n),str(m))
#     cols = ['AD',  'BP', 'CON', 'FTD', 'MD', 'MS',  'MSA', 'PD', 'PDD', 'PSP', 'VD']
#     if n == 2:
#         identifier = '20220811_160202_909428' # Micro F1-score: 0.755, Macro F1-score: 0.608, average AUROC: 0.942
#     elif n == 5:
#         identifier = '20220811_155640_151675' # Micro F1-score: 0.76, Macro F1-score: 0.623, average AUROC: 0.943
#     elif n == 7:
#         identifier = '20220811_160551_194857' # Micro F1-score: 0.772, Macro F1-score: 0.65, average AUROC: 0.946
#     elif n == 10:
#         identifier = '20220811_160250_666878' # Micro F1-score: 0.772, Macro F1-score: 0.644, average AUROC: 0.949
#     elif n == 15: 
# #         identifier = '20220811_161746_449357' # Micro F1-score: 0.787, Macro F1-score: 0.672, average AUROC: 0.953
#         #80-10-10 ## WINNER
#         identifier = '20220811_174255_533224' #Micro F1-score: 0.789, Macro F1-score: 0.677, average AUROC: 0.96
#     elif n == 20:
#         identifier = '20220811_162714_753150' # Micro F1-score: 0.78, Macro F1-score: 0.653, average AUROC: 0.95      

# if remove3==True and observation == True and unique == True:
#     save_name = 'clinical_history_{}_unique_observations_subset'.format(str(n),str(m))
#     cols = ['AD',  'BP', 'CON', 'FTD', 'MD', 'MS',  'MSA', 'PD', 'PDD', 'PSP', 'VD']
#     if n == 3:
#         # 60-20-20
#         identifier = '20220811_175922_954446' # Micro F1-score: 0.757, Macro F1-score: 0.632, average AUROC: 0.944
#         # 80-10-10
#         identifier = '20220811_182025_514685' # Micro F1-score: 0.774, Macro F1-score: 0.646, average AUROC: 0.954
#     elif n == 2:
#         # 80-10-10
#         identifier = '20220812_094756_293237' # 

#     elif n == 4:
#         # 80-10-10
#         identifier = '20220812_094152_479638' # 

#     elif n == 5:
#         identifier = '20220811_165554_100842' # Micro F1-score: 0.759, Macro F1-score: 0.626, average AUROC: 0.944
#     elif n == 7:
#         identifier = '20220811_171554_192040' # Micro F1-score: 0.772, Macro F1-score: 0.642, average AUROC: 0.949
#     elif n == 10:
#         identifier = '20220811_165412_414701' # Micro F1-score: 0.782, Macro F1-score: 0.669, average AUROC: 0.949
  
        
# print(save_name)
# print(identifier)

# identifier =  save_name + '_' + identifier
# print(identifier)
