<a href="https://colab.research.google.com/github/ccosmin97/idc-prostate-mri-analysis/blob/main/analysis_statistical_testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports


In [1]:
!pip install scikit_posthocs

Collecting scikit_posthocs
  Downloading scikit_posthocs-0.9.0-py3-none-any.whl (32 kB)
Installing collected packages: scikit_posthocs
Successfully installed scikit_posthocs-0.9.0


In [40]:
import os
import numpy as np
import glob
import pandas as pd
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import pylab
from scipy.stats import wilcoxon

In [3]:
import plotly.graph_objects as go
import plotly.express as px
import os
import plotly.io as pio
import scikit_posthocs as sp

In [4]:
from numpy.random import seed
from numpy.random import randn
from scipy.stats import shapiro
from scipy.stats import friedmanchisquare

In [5]:
# from IPython.display import display, HTML
# display(HTML("<style>.container { width:100% !important; }</style>"))

In [6]:
#colab
from google.colab import auth
auth.authenticate_user()
project_id = "idc-sandbox-003"
os.environ["GCP_PROJECT_ID"] = project_id
from google.cloud import bigquery
bq_client = bigquery.Client(os.environ["GCP_PROJECT_ID"])

# Import analysis data

In [None]:
# !wget https://github.com/nytimes/covid-19-data/archive/refs/heads/master.zip

In [24]:
selection_query = f"""
SELECT *
FROM `idc-sandbox-003.prostate_benchmarking_neurips_2024.prostate_benchmark_simple_table_all_table`
 """
selection_result = bq_client.query(selection_query)
data_df = selection_result.result().to_dataframe()
data_df["asd"] = data_df.asd.replace(-999,np.NaN)
data_df = data_df.astype({'expert_Value': 'float32', 'ai_Value': 'float32'})
data_df["expert_Value"] = data_df.expert_Value.apply(lambda x : x/1000)#change Volume of segment from mm3 to mL
data_df["ai_Value"] = data_df.ai_Value.apply(lambda x : x/1000 if x is not None else x)#change Volume of segment from mm3 to mL
#round AI and expert quantitative results to 0.1 and .2 decimals
for field_to_round in ["ai_Value", "expert_Value"]:
  na_mask = data_df[field_to_round].notnull()
  data_df.loc[na_mask, "ai_Value"] = data_df.loc[na_mask, field_to_round].astype(float).round(1)
data_df = data_df.round({'dsc': 2, 'hsdff': 2, 'hsdff_95': 2,
                    'asd': 2, 'prediction_time': 1})

# Analysis


## Seaborn parameters


In [8]:
sns.set(rc={'figure.figsize':(10,10)})
# Setting the font size by 2.5
sns.set(font_scale=1.5)

## Mean and STDs of quantitative region and distance based metrics


In [9]:
data_df.algorithmNameCollection.unique()

array(['prostate158-prostate_mri_us_biopsy-prostate',
       'nnunet_task024-prostate_mri_us_biopsy-prostate',
       'prostate158-qin_prostate_repeatability-prostate',
       'nnunet_task05-qin_prostate_repeatability-prostate',
       'nnunet_task024-qin_prostate_repeatability-prostate',
       'prostate158-prostatex-zonal', 'prostate158-prostatex-prostate',
       'nnunet_task05-prostatex-prostate',
       'nnunet_task05-prostatex-zonal',
       'nnunet_task024-prostatex-prostate'], dtype=object)

### ProstateX analysis


#### Whole prostate


In [25]:
prostate_prostatex_df = data_df[(data_df.collection_id == 'prostatex') & (data_df.segment == 'Prostate')]
print(len(prostate_prostatex_df))

198


In [26]:
nnnunet_task024_prostate_df = prostate_prostatex_df[prostate_prostatex_df.algorithmNameCollection == 'nnunet_task024-prostatex-prostate']
nnnunet_task05_prostate_df = prostate_prostatex_df[prostate_prostatex_df.algorithmNameCollection == 'nnunet_task05-prostatex-prostate']
monai_prostate158_prostate_df = prostate_prostatex_df[prostate_prostatex_df.algorithmNameCollection == 'prostate158-prostatex-prostate']

In [27]:
# pd.set_option('display.width', -1)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 150)

##### Quantitative metrics summary for ProstateX whole prostate segment


In [29]:
nnnunet_task024_prostate_df[['dsc', 'hsdff', 'hsdff_95', 'asd', 'ai_Value', 'expert_Value']].round(2).describe(include = 'all')

Unnamed: 0,dsc,hsdff,hsdff_95,asd,ai_Value,expert_Value
count,66.0,66.0,66.0,66.0,66.0,66.0
mean,0.922879,6.69803,0.915758,0.779242,59.024242,59.027273
std,0.025526,3.495229,0.62984,0.304304,29.637575,29.643197
min,0.85,3.32,0.0,0.35,17.6,17.6
25%,0.91,4.66,0.5,0.5225,38.825,38.815
50%,0.93,6.0,0.75,0.735,53.9,53.91
75%,0.94,6.93,1.35,0.9475,73.525,73.5625
max,0.96,24.23,3.97,1.65,161.9,161.94


In [30]:
nnnunet_task05_prostate_df[['dsc', 'hsdff', 'hsdff_95', 'asd', 'ai_Value', 'expert_Value']].round(2).describe(include = 'all')

Unnamed: 0,dsc,hsdff,hsdff_95,asd,ai_Value,expert_Value
count,66.0,66.0,66.0,66.0,66.0,66.0
mean,0.895606,11.689091,1.703182,1.228636,59.024242,59.027273
std,0.037953,13.187189,1.082287,0.650015,29.637575,29.643197
min,0.69,4.03,0.25,0.6,17.6,17.6
25%,0.87,6.1,1.01,0.865,38.825,38.815
50%,0.9,7.43,1.51,1.05,53.9,53.91
75%,0.92,9.01,2.0,1.3,73.525,73.5625
max,0.95,66.07,6.67,3.64,161.9,161.94


In [34]:
monai_prostate158_prostate_df[['dsc', 'hsdff', 'hsdff_95', 'asd', 'ai_Value', 'expert_Value']].round(2).describe(include = 'all')

Unnamed: 0,dsc,hsdff,hsdff_95,asd,ai_Value,expert_Value
count,66.0,66.0,66.0,66.0,66.0,66.0
mean,0.902424,8.286818,1.931515,1.490606,59.024242,59.027273
std,0.076421,13.58715,6.891053,5.033204,29.637575,29.643197
min,0.31,3.91,0.35,0.49,17.6,17.6
25%,0.9,5.91,0.75,0.73,38.825,38.815
50%,0.91,6.02,1.045,0.85,53.9,53.91
75%,0.9275,7.305,1.49,0.9975,73.525,73.5625
max,0.94,116.23,56.97,41.72,161.9,161.94


Normality tests


In [35]:
for df_val in [nnnunet_task024_prostate_df,nnnunet_task05_prostate_df,monai_prostate158_prostate_df]:
    # normality test
    stat, p = shapiro(df_val.dsc.values)
    print('Statistics=%.3f, p=%.3f' % (stat, p))
    # interpret
    alpha = 0.05
    if p > alpha:
        print('Sample looks Gaussian (fail to reject H0)')
    else:
        print('Sample does not look Gaussian (reject H0)')
    print("\n")

Statistics=0.906, p=0.000
Sample does not look Gaussian (reject H0)


Statistics=0.797, p=0.000
Sample does not look Gaussian (reject H0)


Statistics=0.285, p=0.000
Sample does not look Gaussian (reject H0)




Mean and STD deviations for prostatex whole prostate results


In [37]:
print(f"nnunet task024 model DSC mean and std for whole prostate: {round(np.mean(nnnunet_task024_prostate_df.dsc.values),2)}/{round(np.std(nnnunet_task024_prostate_df.dsc.values),2)}")
print(f"nnunet task05 model DSC mean and std for whole prostate: {round(np.mean(nnnunet_task05_prostate_df.dsc.values),2)}/{round(np.std(nnnunet_task05_prostate_df.dsc.values),2)}")
print(f"nnunet monai prostate158 DSC model mean and std for whole prostate: {round(np.mean(monai_prostate158_prostate_df.dsc.values),2)}/{round(np.std(monai_prostate158_prostate_df.dsc.values),2)}")

nnunet task024 model DSC mean and std for whole prostate: 0.92/0.03
nnunet task05 model DSC mean and std for whole prostate: 0.9/0.04
nnunet monai prostate158 DSC model mean and std for whole prostate: 0.9/0.08


##### Paired tests on mean DSC, same collection == Wilcoxon signed rank test with Bonferroni correction


The Wilcoxon signed-rank test tests the null hypothesis that two related paired samples come from the same distribution.

In particular, it tests whether the distribution of the differences x - y is symmetric about zero.

It is a non-parametric version of the paired T-test.


The p-value measures the probability of getting a more extreme value than the one you got from the experiment.

If the p-value is greater than alpha, you accept the null hypothesis.

If it is less than alpha, you reject the null hypothesis.


**Bonferroni correction : 3 paired Wilcoxon tests so alpha = 0.05/3 = 0.017**


Pair 1 : task024 dsc differences vs task05


In [45]:
diff = nnnunet_task024_prostate_df.sort_values(by=['refSerieUID']).dsc.values - nnnunet_task05_prostate_df.sort_values(by=['refSerieUID']).dsc.values

In [46]:
res = wilcoxon(diff)
res.statistic, res.pvalue

(20.0, 6.07257483705153e-11)

In [47]:
res = wilcoxon(diff, alternative='less')
res.statistic, res.pvalue

(1750.0, 0.9999999999696372)

This shows that the null hypothesis that the median of dsc differences is negative can be rejected at a confidence level of 5% in favor of the alternative that the median is greater than zero.


Pair 2 : task024 dsc differences vs monai


In [48]:
diff = nnnunet_task024_prostate_df.sort_values(by=['refSerieUID']).dsc.values - monai_prostate158_prostate_df.sort_values(by=['refSerieUID']).dsc.values
res = wilcoxon(diff)
res.statistic, res.pvalue

(325.0, 6.298514251178753e-05)

In [49]:
res = wilcoxon(diff, alternative='less')
res.statistic, res.pvalue

(1328.0, 0.9999685074287441)

Pair 3 : task05 dsc differences vs monai


In [50]:
diff = monai_prostate158_prostate_df.sort_values(by=['refSerieUID']).dsc.values - nnnunet_task05_prostate_df.sort_values(by=['refSerieUID']).dsc.values
res = wilcoxon(diff)
res.statistic, res.pvalue

(320.0, 0.0001498725787900786)

In [51]:
res = wilcoxon(diff, alternative='greater')
res.statistic, res.pvalue

(1220.0, 7.49362893950393e-05)

#### Peripheral Zone


In [69]:
pz_prostatex_df = data_df[(data_df.collection_id == 'prostatex') & (data_df.segment.str.contains('Peripheral'))]

In [70]:
pz_prostatex_df.algorithmNameCollection.unique()

array(['prostate158-prostatex-zonal', 'nnunet_task05-prostatex-zonal'],
      dtype=object)

In [71]:
nnunet_pz_prostatex_df =  pz_prostatex_df[pz_prostatex_df.algorithmNameCollection == 'nnunet_task05-prostatex-zonal']
monai_pz_prostatex_df = pz_prostatex_df[pz_prostatex_df.algorithmNameCollection == 'prostate158-prostatex-zonal']

In [72]:
nnunet_pz_prostatex_df = nnunet_pz_prostatex_df.drop_duplicates(subset='refSerieUID', keep="last")

##### Quantitative metrics summary for ProstateX peripheral zone segment

In [73]:
nnunet_pz_prostatex_df[['dsc', 'hsdff', 'hsdff_95', 'asd', 'ai_Value', 'expert_Value']].round(2).describe(include = 'all')

Unnamed: 0,dsc,hsdff,hsdff_95,asd,ai_Value,expert_Value
count,98.0,98.0,98.0,98.0,98.0,98.0
mean,0.667857,17.397755,5.005918,2.334184,19.77449,19.778776
std,0.155837,7.703612,3.670226,3.454579,6.978824,6.976135
min,0.0,7.43,1.5,0.61,6.5,6.55
25%,0.6325,11.5625,2.6825,1.0125,14.95,14.9125
50%,0.71,15.635,3.84,1.395,18.7,18.74
75%,0.76,20.9675,5.9625,2.3275,23.05,23.0625
max,0.84,46.13,21.4,27.67,42.1,42.05


In [74]:
monai_pz_prostatex_df[['dsc', 'hsdff', 'hsdff_95', 'asd', 'ai_Value', 'expert_Value']].round(2).describe(include = 'all')

Unnamed: 0,dsc,hsdff,hsdff_95,asd,ai_Value,expert_Value
count,98.0,98.0,98.0,98.0,98.0,98.0
mean,0.73051,13.146327,3.383571,1.145816,19.77449,19.778776
std,0.083213,5.526158,2.058936,0.561258,6.978824,6.976135
min,0.41,4.64,1.0,0.47,6.5,6.55
25%,0.7,9.3925,2.055,0.7625,14.95,14.9125
50%,0.75,11.97,2.71,1.04,18.7,18.74
75%,0.7875,16.16,4.19,1.3475,23.05,23.0625
max,0.9,33.98,15.04,4.14,42.1,42.05


In [75]:
print(f"nnunet task05 model DSC mean and std for pz: {round(np.mean(nnunet_pz_prostatex_df.dsc.values),2)}/{round(np.std(nnunet_pz_prostatex_df.dsc.values),2)}")
print(f"monai model DSC mean and std for pz: {round(np.mean(monai_pz_prostatex_df.dsc.values),2)}/{round(np.std(monai_pz_prostatex_df.dsc.values),2)}")

nnunet task05 model DSC mean and std for pz: 0.67/0.16
monai model DSC mean and std for pz: 0.73/0.08


##### Paired tests on mean DSC, same collection == Wilcoxon signed rank test


In [61]:
diff = monai_pz_prostatex_df.sort_values(by=['refSerieUID']).dsc.values - nnunet_pz_prostatex_df.sort_values(by=['refSerieUID']).dsc.values
res = wilcoxon(diff)
res.statistic, res.pvalue

(834.0, 2.1865286957299772e-07)

In [62]:
res = wilcoxon(diff, alternative='greater')
res.statistic, res.pvalue

(3537.0, 1.0932643478649886e-07)

#### Transition zone


In [63]:
tz_prostatex_df = data_df[(data_df.collection_id == 'prostatex') & (data_df.segment.str.contains('Transition'))]

In [64]:
nnunet_tz_prostatex_df =  tz_prostatex_df[tz_prostatex_df.algorithmNameCollection == 'nnunet_task05-prostatex-zonal']
monai_tz_prostatex_df = tz_prostatex_df[tz_prostatex_df.algorithmNameCollection == 'prostate158-prostatex-zonal']

In [65]:
nnunet_tz_prostatex_df = nnunet_tz_prostatex_df.drop_duplicates(subset='refSerieUID', keep="last")

##### Quantitative metrics summary for ProstateX Transition zone segment

In [76]:
nnunet_tz_prostatex_df[['dsc', 'hsdff', 'hsdff_95', 'asd', 'ai_Value', 'expert_Value']].round(2).describe(include = 'all')

Unnamed: 0,dsc,hsdff,hsdff_95,asd,ai_Value,expert_Value
count,98.0,98.0,98.0,98.0,98.0,98.0
mean,0.817857,15.785306,3.099286,2.05051,45.190816,45.192041
std,0.112555,14.497108,2.857834,1.443969,30.083136,30.085658
min,0.33,5.23,0.75,0.8,6.1,6.13
25%,0.78,9.2725,1.75,1.2825,22.4,22.385
50%,0.85,10.63,2.29,1.64,36.6,36.57
75%,0.89,13.46,3.5,2.1675,60.025,60.0325
max,0.93,67.77,21.8,10.95,144.1,144.14


In [77]:
monai_tz_prostatex_df[['dsc', 'hsdff', 'hsdff_95', 'asd', 'ai_Value', 'expert_Value']].round(2).describe(include = 'all')

Unnamed: 0,dsc,hsdff,hsdff_95,asd,ai_Value,expert_Value
count,98.0,98.0,98.0,98.0,98.0,98.0
mean,0.84551,11.436837,3.487245,2.571531,45.190816,45.192041
std,0.135503,12.741837,11.692521,10.154334,30.083136,30.085658
min,0.0,4.64,0.5,0.63,6.1,6.13
25%,0.83,7.6825,1.445,0.97,22.4,22.385
50%,0.88,9.39,1.75,1.15,36.6,36.57
75%,0.9,11.8775,2.3625,1.3625,60.025,60.0325
max,0.94,119.75,102.93,90.04,144.1,144.14


In [78]:
print(f"nnunet task05 model DSC mean and std for tz: {round(np.mean(nnunet_tz_prostatex_df.dsc.values),2)}/{round(np.std(nnunet_tz_prostatex_df.dsc.values),2)}")
print(f"monai model DSC mean and std for tz: {round(np.mean(monai_tz_prostatex_df.dsc.values),2)}/{round(np.std(monai_tz_prostatex_df.dsc.values),2)}")

nnunet task05 model DSC mean and std for tz: 0.82/0.11
monai model DSC mean and std for tz: 0.85/0.13


##### Paired tests on mean DSC, same collection == Wilcoxon signed rank test


In [79]:
diff = monai_tz_prostatex_df.sort_values(by=['refSerieUID']).dsc.values - nnunet_tz_prostatex_df.sort_values(by=['refSerieUID']).dsc.values
res = wilcoxon(diff)
res.statistic, res.pvalue

(474.5, 5.525883664815393e-11)

In [80]:
res = wilcoxon(diff, alternative='greater')
res.statistic, res.pvalue

(474.5, 0.9999999999723705)

### QIN collection


#### Whole prostate


In [81]:
prostate_qin_df = data_df[(data_df.collection_id == 'qin_prostate_repeatability') & (data_df.segment == 'Prostate')]
prostate_qin_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 90 entries, 1623 to 1771
Data columns (total 59 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   segmentationInstanceUID               90 non-null     object 
 1   StudyInstanceUID                      90 non-null     object 
 2   sourceSegmentedSeriesUID              90 non-null     object 
 3   expert_QuantityCodeMeaning            90 non-null     object 
 4   expert_QuantityCodeValue              90 non-null     object 
 5   expert_Value                          90 non-null     float64
 6   expert_UnitsCodeValue                 90 non-null     object 
 7   expert_UnitsCodeMeaning               90 non-null     object 
 8   expert_findingSiteCodeValue           90 non-null     object 
 9   expert_findingSiteCodeMeaning         90 non-null     object 
 10  ai_QuantityCodeMeaning                90 non-null     object 
 11  ai_QuantityCodeValue 

In [83]:
prostate_qin_df.algorithmNameCollection.unique()

array(['prostate158-qin_prostate_repeatability-prostate',
       'nnunet_task05-qin_prostate_repeatability-prostate',
       'nnunet_task024-qin_prostate_repeatability-prostate'], dtype=object)

In [84]:
nnnunet_task024_prostate_qin_df = prostate_qin_df[prostate_qin_df.algorithmNameCollection == 'nnunet_task024-qin_prostate_repeatability-prostate']
nnnunet_task05_prostate_qin_df = prostate_qin_df[prostate_qin_df.algorithmNameCollection == 'nnunet_task05-qin_prostate_repeatability-prostate']
monai_prostate158_prostate_qin_df = prostate_qin_df[prostate_qin_df.algorithmNameCollection == 'prostate158-qin_prostate_repeatability-prostate']

##### Quantitative metrics summary for QIN Whole prostate segment

In [85]:
print(f"nnunet task024 model DSC mean and std for whole prostate: {round(np.mean(nnnunet_task024_prostate_qin_df.dsc.values),2)}/{round(np.std(nnnunet_task024_prostate_qin_df.dsc.values),2)}")
print(f"nnunet task05 model DSC mean and std for whole prostate: {round(np.mean(nnnunet_task05_prostate_qin_df.dsc.values),2)}/{round(np.std(nnnunet_task05_prostate_qin_df.dsc.values),2)}")
print(f"nnunet monai prostate158 DSC model mean and std for whole prostate: {round(np.mean(monai_prostate158_prostate_qin_df.dsc.values),2)}/{round(np.std(monai_prostate158_prostate_qin_df.dsc.values),2)}")

nnunet task024 model DSC mean and std for whole prostate: 0.85/0.05
nnunet task05 model DSC mean and std for whole prostate: 0.67/0.16
nnunet monai prostate158 DSC model mean and std for whole prostate: 0.79/0.12


In [86]:
print(f"nnunet task024 model HSDFF95 mean and std for whole prostate: {round(np.mean(nnnunet_task024_prostate_qin_df.hsdff_95.values),2)}/{round(np.std(nnnunet_task024_prostate_qin_df.hsdff_95.values),2)}")
print(f"nnunet task05 model HSDFF95 mean and std for whole prostate: {round(np.mean(nnnunet_task05_prostate_qin_df.hsdff_95.values),2)}/{round(np.std(nnnunet_task05_prostate_qin_df.hsdff_95.values),2)}")
print(f"nnunet monai prostate158 DSC model mean and std for whole prostate: {round(np.mean(monai_prostate158_prostate_qin_df.hsdff_95.values),2)}/{round(np.std(monai_prostate158_prostate_qin_df.hsdff_95.values),2)}")

nnunet task024 model HSDFF95 mean and std for whole prostate: 2.72/2.02
nnunet task05 model HSDFF95 mean and std for whole prostate: 11.64/8.25
nnunet monai prostate158 DSC model mean and std for whole prostate: 5.61/7.34


In [87]:
nnnunet_task024_prostate_qin_df[['dsc', 'hsdff', 'hsdff_95', 'asd', 'ai_Value', 'expert_Value']].round(2).describe(include = 'all')

Unnamed: 0,dsc,hsdff,hsdff_95,asd,ai_Value,expert_Value
count,30.0,30.0,30.0,30.0,30.0,30.0
mean,0.852667,10.483,2.724333,2.038,46.45,46.449667
std,0.04849,4.300959,2.053337,0.921458,28.118539,28.112551
min,0.72,6.07,1.5,0.96,19.0,19.04
25%,0.8225,7.2875,1.66,1.585,25.025,24.995
50%,0.86,9.595,1.82,1.785,31.15,31.15
75%,0.89,10.6425,3.0375,2.0425,62.2,62.1525
max,0.92,24.0,9.76,4.66,115.9,115.87


In [88]:
nnnunet_task05_prostate_qin_df[['dsc', 'hsdff', 'hsdff_95', 'asd', 'ai_Value', 'expert_Value']].round(2).describe(include = 'all')

Unnamed: 0,dsc,hsdff,hsdff_95,asd,ai_Value,expert_Value
count,30.0,30.0,30.0,30.0,30.0,30.0
mean,0.673,30.489333,11.638667,5.681,46.45,46.449667
std,0.160262,14.721662,8.391738,3.664901,28.118539,28.112551
min,0.35,6.31,1.36,1.09,19.0,19.04
25%,0.5475,19.3275,3.25,2.47,25.025,24.995
50%,0.675,29.395,9.215,5.245,31.15,31.15
75%,0.815,43.1,20.36,8.2075,62.2,62.1525
max,0.9,59.41,26.23,15.36,115.9,115.87


In [89]:
monai_prostate158_prostate_qin_df[['dsc', 'hsdff', 'hsdff_95', 'asd', 'ai_Value', 'expert_Value']].round(2).describe(include = 'all')

Unnamed: 0,dsc,hsdff,hsdff_95,asd,ai_Value,expert_Value
count,30.0,30.0,30.0,30.0,30.0,30.0
mean,0.792333,18.473,5.612333,3.434,46.45,46.449667
std,0.125277,15.342885,7.465135,4.828262,28.118539,28.112551
min,0.23,6.64,1.57,1.17,19.0,19.04
25%,0.79,9.0225,2.04,1.7,25.025,24.995
50%,0.83,11.805,3.365,2.095,31.15,31.15
75%,0.85,20.2775,5.1975,2.56,62.2,62.1525
max,0.89,72.65,39.03,26.77,115.9,115.87


##### Paired tests on mean DSC, same collection == Wilcoxon signed rank test

Pair 1 : task024 dsc differences vs task05


In [93]:
diff = nnnunet_task024_prostate_qin_df.sort_values(by=['refSerieUID']).dsc.values - nnnunet_task05_prostate_qin_df.sort_values(by=['refSerieUID']).dsc.values
res = wilcoxon(diff)
res.statistic, res.pvalue



(0.0, 5.5887145476589365e-06)

In [94]:
res = wilcoxon(diff, alternative='greater')
res.statistic, res.pvalue

(378.0, 2.7943572738294682e-06)

Pair 2 : task024 dsc differences vs monai


In [97]:
diff = nnnunet_task024_prostate_qin_df.sort_values(by=['refSerieUID']).dsc.values - monai_prostate158_prostate_qin_df.sort_values(by=['refSerieUID']).dsc.values
res = wilcoxon(diff)
res.statistic, res.pvalue



(30.0, 4.876307514715551e-05)

In [98]:
res = wilcoxon(diff, alternative='greater')
res.statistic, res.pvalue

(405.0, 2.4381537573577754e-05)

Pair 3 : task05 dsc differences vs monai


In [99]:
diff = monai_prostate158_prostate_qin_df.sort_values(by=['refSerieUID']).dsc.values - nnnunet_task05_prostate_qin_df.sort_values(by=['refSerieUID']).dsc.values
res = wilcoxon(diff)
res.statistic, res.pvalue



(69.5, 0.001370044847525478)

In [100]:
res = wilcoxon(diff, alternative='less')
res.statistic, res.pvalue

(365.5, 0.9993149775762372)

#### Peripheral zone


##### Quantitative metrics summary for QIN Peripheral Zone segment

In [107]:
pz_qin_df = data_df[(data_df.collection_id == 'qin_prostate_repeatability') & (data_df.segment.str.contains('Peripheral'))]

In [110]:
pz_qin_df.algorithmNameCollection.unique()

array(['prostate158-qin_prostate_repeatability-prostate',
       'nnunet_task05-qin_prostate_repeatability-prostate'], dtype=object)

In [112]:
monai_prostate158_pz_qin_df = pz_qin_df[pz_qin_df.algorithmNameCollection == 'prostate158-qin_prostate_repeatability-prostate']
nnnunet_task05_pz_qin_df = pz_qin_df[pz_qin_df.algorithmNameCollection == 'nnunet_task05-qin_prostate_repeatability-prostate']

In [113]:
nnnunet_task05_pz_qin_df = nnnunet_task05_pz_qin_df.drop_duplicates(subset='refSerieUID', keep="last")
monai_prostate158_pz_qin_df = monai_prostate158_pz_qin_df.drop_duplicates(subset='refSerieUID', keep="last")

In [114]:
nnnunet_task05_pz_qin_df[['dsc', 'hsdff', 'hsdff_95', 'asd', 'ai_Value', 'expert_Value']].round(2).describe(include = 'all')

Unnamed: 0,dsc,hsdff,hsdff_95,asd,ai_Value,expert_Value
count,30.0,30.0,30.0,30.0,30.0,30.0
mean,0.489,18.677333,10.041333,5.182667,9.903333,9.896333
std,0.268166,13.402669,9.492406,5.766393,3.97531,3.982878
min,0.0,6.0,2.28,0.83,3.2,3.2
25%,0.225,9.1,3.0275,1.3625,6.7,6.6625
50%,0.595,11.315,4.55,1.845,9.65,9.655
75%,0.725,30.6475,17.895,8.51,13.0,12.9975
max,0.8,48.61,31.33,21.9,18.6,18.64


In [115]:
monai_prostate158_pz_qin_df[['dsc', 'hsdff', 'hsdff_95', 'asd', 'ai_Value', 'expert_Value']].round(2).describe(include = 'all')

Unnamed: 0,dsc,hsdff,hsdff_95,asd,ai_Value,expert_Value
count,30.0,30.0,30.0,30.0,30.0,30.0
mean,0.632667,13.561667,5.083333,2.076,9.903333,9.896333
std,0.159026,7.55138,4.430007,2.47489,3.97531,3.982878
min,0.16,5.57,1.85,0.66,3.2,3.2
25%,0.63,8.9625,2.8025,1.0425,6.7,6.6625
50%,0.68,11.885,3.445,1.27,9.65,9.655
75%,0.72,14.985,5.3025,1.915,13.0,12.9975
max,0.78,38.09,21.21,11.31,18.6,18.64


In [116]:
print(f"nnunet task05 model DSC mean and std for pz: {round(np.mean(nnnunet_task05_pz_qin_df.dsc.values),2)}/{round(np.std(nnnunet_task05_pz_qin_df.dsc.values),2)}")
print(f"nnunet monai prostate158 DSC model mean and std for pz: {round(np.mean(monai_prostate158_pz_qin_df.dsc.values),2)}/{round(np.std(monai_prostate158_pz_qin_df.dsc.values),2)}")

nnunet task05 model DSC mean and std for pz: 0.49/0.26
nnunet monai prostate158 DSC model mean and std for pz: 0.63/0.16


##### Paired tests on mean DSC, same collection == Wilcoxon signed rank test

In [117]:
diff = monai_prostate158_pz_qin_df.sort_values(by=['refSerieUID']).hsdff_95.values - nnnunet_task05_pz_qin_df.sort_values(by=['refSerieUID']).hsdff_95.values
res = wilcoxon(diff)
res.statistic, res.pvalue

(56.5, 0.00011058151721954346)

In [118]:
res = wilcoxon(diff, alternative='greater')
res.statistic, res.pvalue

(56.5, 0.9999504871666431)

### Prostate MRI US Biopsy


#### Whole prostate


In [119]:
prostate_mri_us_biopsy_df = data_df[(data_df.collection_id == 'prostate_mri_us_biopsy') & (data_df.segment == 'Prostate')]

In [120]:
prostate_mri_us_biopsy_df.algorithmNameCollection.unique()

array(['prostate158-prostate_mri_us_biopsy-prostate',
       'nnunet_task024-prostate_mri_us_biopsy-prostate'], dtype=object)

In [123]:
nnunet_task024_prostate_prostate_mri_us_biopsy_df = prostate_mri_us_biopsy_df[prostate_mri_us_biopsy_df.algorithmNameCollection \
    == 'nnunet_task024-prostate_mri_us_biopsy-prostate']
monai_prostate158_prostate_prostate_mri_us_biopsy_df = prostate_mri_us_biopsy_df[prostate_mri_us_biopsy_df.algorithmNameCollection \
    == 'prostate158-prostate_mri_us_biopsy-prostate']

##### Quantitative metrics summary for PROSTATE-MRI-US-Biopsy whole prostate segment

In [124]:
print(f"nnunet task024 model DSC mean and std for prostate: \
      {round(np.mean(nnunet_task024_prostate_prostate_mri_us_biopsy_df.dsc.values),2)}/{round(np.std(nnunet_task024_prostate_prostate_mri_us_biopsy_df.dsc.values),2)}")
print(f"nnunet monai prostate158 DSC model mean and std for prostate: \
      {round(np.mean(monai_prostate158_prostate_prostate_mri_us_biopsy_df.dsc.values),2)}/{round(np.std(monai_prostate158_prostate_prostate_mri_us_biopsy_df.dsc.values),2)}")

nnunet task024 model DSC mean and std for prostate:       0.86/0.09
nnunet monai prostate158 DSC model mean and std for prostate:       0.84/0.16


In [125]:
nnunet_task024_prostate_prostate_mri_us_biopsy_df[['dsc', 'hsdff', 'hsdff_95', 'asd', 'ai_Value', 'expert_Value']].round(2).describe(include = 'all')

Unnamed: 0,dsc,hsdff,hsdff_95,asd,ai_Value,expert_Value
count,1017.0,1017.0,1017.0,1006.0,1017.0,1017.0
mean,0.856893,16.565703,3.454444,6.564503,48.970403,48.969587
std,0.092948,23.927155,7.697978,6.581671,26.344915,26.345122
min,0.08,3.6,0.33,1.13,10.6,10.56
25%,0.84,6.36,1.08,3.3625,31.7,31.74
50%,0.88,8.09,1.53,4.99,41.5,41.49
75%,0.91,11.46,2.3,7.3425,59.8,59.84
max,0.95,135.46,58.03,57.96,222.4,222.41


In [126]:
monai_prostate158_prostate_prostate_mri_us_biopsy_df[['dsc', 'hsdff', 'hsdff_95', 'asd', 'ai_Value', 'expert_Value']].round(2).describe(include = 'all')

Unnamed: 0,dsc,hsdff,hsdff_95,asd,ai_Value,expert_Value
count,1017.0,1017.0,1017.0,1006.0,1017.0,1017.0
mean,0.840197,13.026912,4.598496,6.85507,48.970403,48.969587
std,0.157959,18.561501,10.96334,7.069255,26.344915,26.345122
min,0.0,3.58,0.33,1.07,10.6,10.56
25%,0.86,6.18,1.08,3.27,31.7,31.74
50%,0.89,7.68,1.5,4.8,41.5,41.49
75%,0.91,9.68,2.16,7.295,59.8,59.84
max,0.95,132.99,78.96,71.78,222.4,222.41


##### Paired tests on mean DSC, same collection == Wilcoxon signed rank test

In [127]:
diff = nnunet_task024_prostate_prostate_mri_us_biopsy_df.sort_values(by=['refSerieUID']).hsdff_95.values - \
    monai_prostate158_prostate_prostate_mri_us_biopsy_df.sort_values(by=['refSerieUID']).hsdff_95.values
res = wilcoxon(diff)
res.statistic, res.pvalue

(226905.0, 0.07506454031591767)

In [128]:
res = wilcoxon(diff, alternative='greater')
res.statistic, res.pvalue

(258700.0, 0.03753227015795883)