In [1]:
import numpy as np
import pandas as pd
import os
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from scipy.stats import f_oneway, chi2_contingency

In [2]:
excel = '/home/SMC_data_2101xx_EGFR_deeplearning/EGFR_deep_learning/3ch_img_minmax/result/final_patient_index.xlsx'

In [3]:
train = pd.read_excel(excel, sheet_name  = 'train')
val = pd.read_excel(excel, sheet_name  = 'val')
test = pd.read_excel(excel, sheet_name  = 'test')

In [4]:
ex_test_excel = '/home/SMC_data_2101xx_EGFR_deeplearning/EGFR_deep_learning/external_validationset/external_test/GGN_TKI_demo.xlsx'

In [5]:
ex_val_excel = '/home/SMC_data_2101xx_EGFR_deeplearning/EGFR_deep_learning/external_validationset/external_validation/external_validation_clinical.xlsx'

In [6]:
ex_test = pd.read_excel(ex_test_excel)
ex_val = pd.read_excel(ex_val_excel)

## AGE f_oneway test

In [7]:
train_age = train['Age'].to_numpy()
val_age = val['Age'].to_numpy()
test_age = test['Age'].to_numpy()

In [8]:
ex_test_age = ex_test['Age'].to_numpy()
ex_val_age = ex_val['Age'].to_numpy()

In [9]:
test_age = np.append(test_age, ex_val_age)

In [10]:
test_age.shape

(46,)

In [11]:
age_labels = ['train']*len(train_age) + ['val']*len(val_age) + ['test']*len(test_age) + ['ex_test']*len(ex_test_age)

In [12]:
train_mean = np.mean(train_age)
train_std = np.std(train_age)
val_mean = np.mean(val_age)
val_std = np.std(val_age)
test_mean = np.mean(test_age)
test_std = np.std(test_age)
ex_test_mean = np.mean(ex_test_age)
ex_test_std = np.std(ex_test_age)

In [13]:
print('train mean', train_mean)
print('train std', train_std)
print('val mean', val_mean)
print('val std', val_std)
print('test mean', test_mean)
print('test std', test_std)
print('ex test mean', ex_test_mean)
print('ex test std', ex_test_std)

train mean 58.775
train std 8.676080624337235
val mean 58.608695652173914
val std 8.590890678300058
test mean 57.82608695652174
test std 7.794380215832449
ex test mean 58.03125
ex test std 9.45874058411055


In [14]:
age = np.concatenate([train_age, val_age, test_age, ex_test_age])

In [15]:
tukey_results_age = pairwise_tukeyhsd(age, age_labels, 0.05)

In [16]:
print(tukey_results_age)

Multiple Comparison of Means - Tukey HSD, FWER=0.05
 group1 group2 meandiff p-adj  lower  upper  reject
---------------------------------------------------
ex_test   test  -0.2052   0.9 -4.6014 4.1911  False
ex_test  train   0.7437   0.9 -2.7766 4.2641  False
ex_test    val   0.5774   0.9 -4.9517 6.1066  False
   test  train   0.9489   0.9 -2.9951 4.8929  False
   test    val   0.7826   0.9 -5.0255 6.5907  False
  train    val  -0.1663   0.9 -5.3432 5.0105  False
---------------------------------------------------


In [17]:
age_test = f_oneway(train_age, val_age, test_age, ex_test_age, ex_val_age)

In [18]:
print('AGE f_oneway test : ', age_test)

AGE f_oneway test :  F_onewayResult(statistic=0.18977830970312257, pvalue=0.9436078289125895)


## SEX chi2_contigency test

In [19]:
train_sex = train['Sex'].to_numpy()
val_sex = val['Sex'].to_numpy()
test_sex = test['Sex'].to_numpy()

In [20]:
ex_test_sex = ex_test['Sex'].to_numpy()
ex_val_sex = ex_val['Sex'].to_numpy()

In [21]:
test_sex = np.append(test_sex, ex_val_sex)

In [22]:
train_f = len(np.where(train_sex == 0)[0])
train_m = len(np.where(train_sex == 1)[0])
train_sex_table = np.array([train_f, train_m]).reshape(1, 2)

In [23]:
val_f = len(np.where(val_sex == 0)[0])
val_m = len(np.where(val_sex == 1)[0])
val_sex_table = np.array([val_f, val_m]).reshape(1, 2)

In [24]:
test_f = len(np.where(test_sex == 0)[0])
test_m = len(np.where(test_sex == 1)[0])
test_sex_table = np.array([test_f, test_m]).reshape(1, 2)

In [25]:
ex_test_f = len(np.where(ex_test_sex == 0)[0])
ex_test_m = len(np.where(ex_test_sex == 1)[0])
ex_test_sex_table = np.array([ex_test_f, ex_test_m]).reshape(1, 2)

In [28]:
test_f

25

In [29]:
test_m

21

In [26]:
ex_test_f

41

In [27]:
ex_test_m

23

In [30]:
sex_table = np.concatenate([train_sex_table, val_sex_table, test_sex_table, ex_test_sex_table])

In [31]:
sex_table

array([[69, 51],
       [16,  7],
       [25, 21],
       [41, 23]])

In [32]:
g, p, dof, expctd = chi2_contingency(sex_table, lambda_="log-likelihood")

In [33]:
print('SEX chi2_contigency test : ', p)

SEX chi2_contigency test :  0.5207714036324778


## SMOKE chi2_contigency test

In [34]:
train_smoke = train['Smoking'].to_numpy()
val_smoke = val['Smoking'].to_numpy()
test_smoke = test['Smoking'].to_numpy()

In [35]:
ex_test_smoke = ex_test['Smoking'].to_numpy()
ex_val_smoke = ex_val['Smoking'].to_numpy()

In [36]:
train_ns = len(np.where(train_smoke == 0)[0])
train_s = len(np.where(train_smoke == 1)[0])
train_smoke_table = np.array([train_ns, train_s]).reshape(1, 2)

In [37]:
val_ns = len(np.where(val_smoke == 0)[0])
val_s = len(np.where(val_smoke == 1)[0])
val_smoke_table = np.array([val_ns, val_s]).reshape(1, 2)

In [38]:
test_ns = len(np.where(test_smoke == 0)[0])
test_s = len(np.where(test_smoke == 1)[0])
test_smoke_table = np.array([test_ns, test_s]).reshape(1, 2)

In [39]:
ex_test_ns = len(np.where(ex_test_smoke == 0)[0])
ex_test_s = len(np.where(ex_test_smoke == 1)[0])
ex_test_smoke_table = np.array([ex_test_ns, ex_test_s]).reshape(1, 2)

In [40]:
ex_val_ns = len(np.where(ex_val_smoke == 0)[0])
ex_val_s = len(np.where(ex_val_smoke == 1)[0])
ex_val_smoke_table = np.array([ex_val_ns, ex_val_s]).reshape(1, 2)

In [43]:
test_smoke_table = test_smoke_table + ex_val_smoke_table

In [44]:
test_smoke_table

array([[28, 18]])

In [45]:
smoke_table = np.concatenate([train_smoke_table, val_smoke_table, test_smoke_table, ex_test_smoke_table])

In [46]:
smoke_table

array([[76, 44],
       [17,  6],
       [28, 18],
       [47, 17]])

In [47]:
smoke_g, smoke_p, smoke_dof, smoke_expctd = chi2_contingency(smoke_table, lambda_="log-likelihood")

In [48]:
print('SMOKE chi2_contigency test : ', smoke_p)

SMOKE chi2_contigency test :  0.36178165558936465


## EGFR chi2_contigency test

In [49]:
train_label = train['EGFR'].to_numpy()
val_label = val['EGFR'].to_numpy()
test_label = test['EGFR'].to_numpy()

In [50]:
ex_val_label = ex_val['EGFR'].to_numpy()

In [51]:
train_n = len(np.where(train_label == 0)[0])
train_p = len(np.where(train_label == 1)[0])
train_label_table = np.array([train_n, train_p]).reshape(1, 2)

In [52]:
val_n = len(np.where(val_label == 0)[0])
val_p = len(np.where(val_label == 1)[0])
val_label_table = np.array([val_n, val_p]).reshape(1, 2)

In [53]:
test_n = len(np.where(test_label == 0)[0])
test_p = len(np.where(test_label == 1)[0])
test_label_table = np.array([test_n, test_p]).reshape(1, 2)

In [54]:
ex_val_n = len(np.where(ex_val_label == 0)[0])
ex_val_p = len(np.where(ex_val_label == 1)[0])
ex_val_label_table = np.array([ex_val_n, ex_val_p]).reshape(1, 2)

In [55]:
test_label_table = test_label_table + ex_val_label_table

In [57]:
label_table = np.concatenate([train_label_table, val_label_table, test_label_table])

In [58]:
label_g, label_p, label_dof, label_expctd = chi2_contingency(label_table, lambda_="log-likelihood")

In [59]:
label_table

array([[39, 81],
       [ 8, 15],
       [17, 29]])

In [60]:
print('LABEL chi2_contigency test : ', label_p)

LABEL chi2_contigency test :  0.85954851872517
