### Notebook to generate demographics tables for paper

In [None]:
import pandas as pd
import numpy as np
import os
import sys
#import meld_classifier.old_hdf5_io as hio
import matplotlib.pyplot as plt
import meld_classifier.paths as paths
from statsmodels.stats.proportion import proportion_confint
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from statsmodels.stats.multitest import multipletests
from meld_classifier.meld_cohort import MeldCohort, MeldSubject
import meld_classifier.paths as p

In [3]:
path = ''

In [4]:
demographics_file = os.path.join(path,'meld_data/Data/demographics_qc_allgroups_withH27H28H101.csv')
included_subs_file = os.path.join(path,'meld_data/Data/overview_cohort_6_withH101freesurfer.csv')


In [5]:
demographics = pd.read_csv(demographics_file,index_col=False)
included_subs = pd.read_csv(included_subs_file,index_col=False)

len(included_subs)

1185

In [7]:
# excluded for initial reasons (e.g by sites or failed freesurfer)
# print(np.sum(included_subs['excluded_because_other']))
print(f"excluded for initial reasons: {((included_subs['group']=='patient')&(included_subs['excluded_because_other'])).sum()} patients,{((included_subs['group']=='control')&(included_subs['excluded_because_other'])).sum()} controls")

#total patients prior to any exclusion
print(f"total subjects before QC and lesions exclusions: {((included_subs['group']=='patient')&(included_subs['excluded_because_other']==0)).sum()} patients,{((included_subs['group']=='control')&(included_subs['excluded_because_other']==0)).sum()} controls")

# no lesion mask
# print(np.sum(included_subs['excluded_because_nolesion']))
print(f"excluded because no lesion: {((included_subs['group']=='patient')&(included_subs['excluded_because_nolesion'])).sum()} patients,{((included_subs['group']=='control')&(included_subs['excluded_because_nolesion'])).sum()} controls")

#QC
# print(np.sum(included_subs['excluded_because_outlier']-included_subs['excluded_because_nolesion']>0))
print(f"excluded because of QC: {((included_subs['group']=='patient')&(included_subs['excluded_because_outlier'])&(included_subs['excluded_because_nolesion']==0)).sum()} patients,{((included_subs['group']=='control')&(included_subs['excluded_because_outlier'])&(included_subs['excluded_because_nolesion']==0)).sum()} controls")


#total remaining patients
print(f"total subjects remaining: {((included_subs['group']=='patient')&(included_subs['excluded_because_outlier']==0)&(included_subs['excluded_because_nolesion']==0) & (included_subs['excluded_because_other']==0)).sum()} patients,{((included_subs['group']=='control')&(included_subs['excluded_because_outlier']==0)&(included_subs['excluded_because_nolesion']==0)& (included_subs['excluded_because_other']==0)).sum()} controls")


excluded for initial reasons: 4 patients,3 controls
total subjects before QC and lesions exclusions: 699 patients,479 controls
excluded because no lesion: 37 patients,0 controls
excluded because of QC: 8 patients,5 controls
total subjects remaining: 654 patients,474 controls


In [8]:
#update exclusion and demographic

vec_exclude = included_subs[['excluded_because_nolesion','excluded_because_outlier','excluded_because_other']].values.any(axis=1).astype(bool)
excluded = included_subs[vec_exclude]
demographics=demographics[~vec_exclude]

In [9]:
#print demographics
print(f"total subjects remaining: {(demographics['group']=='patient').sum()} patients, {(demographics['group']=='control').sum()} controls")


print(np.nanmedian(demographics['Age at preoperative'][demographics.group=='patient']),
                  np.nanpercentile(demographics['Age at preoperative'][demographics.group=='patient'],25),
                   np.nanpercentile(demographics['Age at preoperative'][demographics.group=='patient'],75))
print(np.sum(demographics['Sex'][demographics.group=='patient']==0),
      np.sum(demographics['Sex'][demographics.group=='patient']))

print(np.nanmedian(demographics['Age at preoperative'][demographics.group=='control']),
                  np.nanpercentile(demographics['Age at preoperative'][demographics.group=='control'],25),
                   np.nanpercentile(demographics['Age at preoperative'][demographics.group=='control'],75))
print(np.sum(demographics['Sex'][demographics.group=='control']==0),
      np.sum(demographics['Sex'][demographics.group=='control']))

total subjects remaining: 654 patients, 474 controls
18.5 11.09 30.65
337 316.0
28.0 21.0 39.0
260 213.0


In [10]:
folder  = 'experiments/co-ripa1/iteration_21-09-15/ensemble_21-09-15/fold_all/results/'
res_file = os.path.join(folder,'test_results.csv')
test_df=pd.read_csv(res_file,index_col=False)
test_df.drop_duplicates(inplace=True,ignore_index=True)
test_df.groupby('group').mean()
fp = test_df['n_clusters']<1
test_df['any']=fp
test_df.border=test_df.border>100
test_df.detected+=test_df.border
test_df=test_df.dropna()
test_df.detected=test_df.detected.astype(int)

In [11]:
folder  = 'experiments/co-ripa1/iteration_21-09-17/ensemble_21-09-20/fold_all/results/'
res_file = os.path.join(folder,'test_results.csv')
n = pd.read_csv(res_file,index_col=False)
n.drop_duplicates(inplace=True,ignore_index=True)
n.dropna(inplace=True)

fp = n['n_clusters']<1
n['any']=fp
n.border=n.border>100
n.detected+=n.border
n=n.dropna()

In [12]:
subsets = np.zeros((len(demographics),4),dtype=bool)

group = np.ones(len(demographics),dtype=bool)
sites = np.zeros(len(demographics),dtype=object)
for i,d in enumerate(demographics.ID):
        if d in n.ID.values:
            subsets[i,0]=True
        if d in test_df.ID.values:
            subsets[i,1] = True
        if '_C_' in d:
            group[i] = False
        if 'H27' in d:
            subsets[i,2] = True
        if 'H28' in d:
            subsets[i,2] = True
        if 'H101' in d:
            subsets[i,2] = True
        sites[i]=d.split('_')[1]

In [None]:
demographics['group'] = group
demographics['Site'] = sites

In [14]:
rows=['Age of onset','Duration', 'Age at preoperative', 'Sex',
       'Ever reported MRI negative','Lesion area','Hemisphere','Surgery','Seizure free',
      'FLAIR','Scanner',
      ]

In [19]:
# get number of patients MRIneg and Histo confirm
len('number MRIneg patients & Histo confirm')
patients_mrineghisto = []
for i, dataset_type in enumerate(['train','test']):
    mrineg_histo = demographics[subsets[:,i]][(~demographics['Histology'].isna())&(demographics['Ever reported MRI negative'])]
    print(f'In {dataset_type}: {len(mrineg_histo)}')

#save list
mrineg_histo = demographics[(~demographics['Histology'].isna())&(demographics['Ever reported MRI negative'])]
# mrineg_histo['ID'].reset_index().to_csv(os.path.join(paths.BASE_PATH, 'list_patients_mrineg_histo.csv'))

# print number MRI positive , histo confirm
for i, dataset_type in enumerate(['train','test']):
    mrineg_histo = demographics[subsets[:,i]][(~demographics['Histology'].isna())&(demographics['Ever reported MRI negative']==0)]
    print(f'In {dataset_type}: {len(mrineg_histo)}')

In train: 77
In test: 58
In train: 116
In test: 113


  mrineg_histo = demographics[subsets[:,i]][(~demographics['Histology'].isna())&(demographics['Ever reported MRI negative'])]
  mrineg_histo = demographics[subsets[:,i]][(~demographics['Histology'].isna())&(demographics['Ever reported MRI negative']==0)]


In [15]:
#trainpat
train = demographics[subsets[:,0]]
train_pat = train[train.group]
h=np.zeros(len(train_pat))
for f, fi in enumerate(train_pat['Histology'].values):
    if isinstance(fi,str):
        h[f] = 1
n_s=np.sum(train_pat['Seizure free']==0)+np.sum(train_pat['Seizure free']==1)
#traincont
train_cont = train[~train.group]

#testpat
test = demographics[subsets[:,1]]
test_pat = test[test.group]
ht=np.zeros(len(test_pat))
for f, fi in enumerate(test_pat['Histology'].values):
    if isinstance(fi,str):
        ht[f] = 1
n_st=np.sum(test_pat['Seizure free']==0)+np.sum(test_pat['Seizure free']==1)
test_cont = test[~test.group]

new = demographics[subsets[:,2]]
new_pat = new[new.group]
htn=np.zeros(len(new_pat))
for f, fi in enumerate(new_pat['Histology'].values):
    if isinstance(fi,str):
        htn[f] = 1
n_stn=np.sum(new_pat['Seizure free']==0)+np.sum(new_pat['Seizure free']==1)
#test_cont = test[~test.group]
new_cont = new[~new.group]

new2 = demographics[subsets[:,3]]
new_pat2 = new2[new2.group]
htn2=np.zeros(len(new_pat2))
for f, fi in enumerate(new_pat2['Histology'].values):
    if isinstance(fi,str):
        htn2[f] = 1
n_stn2=np.sum(new_pat2['Seizure free']==0)+np.sum(new_pat2['Seizure free']==1)
#test_cont = test[~test.group]
new_cont = new[~new.group]

data = {'':  ['Age at preoperative scan (median,IQR)',
             'Sex (f:m)',
             'Age of epilepsy onset (median,IQR)',
             'Duration of epilepsy (median,IQR)',
             'Surgery',
             'Histology',
             'Seizure free','Follow up time', 
             'FLAIR available','Scanner (1.5T:3T)'],
        'Train cohort Patients (n= {})'.format(train_pat.group.sum()):['{} , ({} - {})'.format(np.round(train_pat.median()['Age at preoperative'],1),
                                                             np.round(np.nanpercentile(train_pat['Age at preoperative'],25),1),
                                                             np.round(np.nanpercentile(train_pat['Age at preoperative'],75),1)),
                                                 '{} : {}'.format((train_pat.Sex==0).sum(), (train_pat.Sex==1).sum()),
                                                '{} , ({} - {})'.format(np.round(train_pat.median()['Age of onset'],1),
                                                                        np.round(np.nanpercentile(train_pat['Age of onset'],25),1),
                                                             np.round(np.nanpercentile(train_pat['Age of onset'],75),1)),
                                                '{} , ({} - {})'.format(np.round(train_pat.median()['Duration'],1),
                                                                        np.round(np.nanpercentile(train_pat['Duration'],25),1),
                                                             np.round(np.nanpercentile(train_pat['Duration'],75),1)),
                                                '{} / {} ({}%)'.format(int(train_pat.Surgery.sum()), int(len(train_pat)),
                                                                       round(100*(train_pat.Surgery.sum()/len(train_pat)),0)),
                                                '{} / {} ({}%)'.format(int(h.sum()),int(train_pat.Surgery.sum()),
                                                                       round(100*(h.sum()/train_pat.Surgery.sum()),0)),
                                                '{} / {} ({}%)'.format((train_pat['Seizure free']==1).sum(),n_s,
                                                                       round(100*((train_pat['Seizure free']==1).sum()/n_s),0)),
                                                '{} , ({} - {})'.format(np.round(train_pat.median()['f/u'],1),
                                                                        np.round(np.nanpercentile(train_pat['f/u'],25),1),
                                                             np.round(np.nanpercentile(train_pat['f/u'],75),1)),
                                                '{} / {} ({}%)'.format((train_pat['FLAIR']==True).sum(),int(len(train_pat)),
                                                                       round(100*((train_pat['FLAIR']==1).sum()/int(len(train_pat))),0)),
                                                '{} : {}'.format((train_pat.Scanner=='15T').sum(), (train_pat.Scanner=='3T').sum()),],
        'Train cohort Controls (n= {})'.format(len(train_cont)):['{} , ({} - {})'.format(np.round(train_cont.median()['Age at preoperative'],1),
                                                             np.round(np.nanpercentile(train_cont['Age at preoperative'],25),1),
                                                             np.round(np.nanpercentile(train_cont['Age at preoperative'],75),1)),
                                                 '{} : {}'.format((train_cont.Sex==0).sum(), (train_cont.Sex==1).sum()),
                                                '','','','','','',
                                                '{} / {} ({}%)'.format((train_cont['FLAIR']==True).sum(),int(len(train_cont)),
                                                                       round(100*((train_cont['FLAIR']==1).sum()/int(len(train_cont))),0)),
                                                '{} : {}'.format((train_cont.Scanner=='15T').sum(), (train_cont.Scanner=='3T').sum()),],
        'Test cohort Patients (n= {})'.format(test_pat.group.sum()):['{} , ({} - {})'.format(np.round(test_pat.median()['Age at preoperative'],1),
                                                             np.round(np.nanpercentile(test_pat['Age at preoperative'],25),1),
                                                             np.round(np.nanpercentile(test_pat['Age at preoperative'],75),1)),
                                                 '{} : {}'.format((test_pat.Sex==0).sum(), (test_pat.Sex==1).sum()),
                                                '{} , ({} - {})'.format(np.round(test_pat.median()['Age of onset'],1),
                                                                        np.round(np.nanpercentile(test_pat['Age of onset'],25),1),
                                                             np.round(np.nanpercentile(test_pat['Age of onset'],75),1)),
                                                '{} , ({} - {})'.format(np.round(test_pat.median()['Duration'],2),
                                                                        np.round(np.nanpercentile(test_pat['Duration'],25),1),
                                                             np.round(np.nanpercentile(test_pat['Duration'],75),1)),
                                                '{} / {} ({}%)'.format(int(test_pat.Surgery.sum()), int(len(test_pat)),
                                                                       round(100*(test_pat.Surgery.sum()/len(test_pat)),0)),
                                                '{} / {} ({}%)'.format(int(ht.sum()),int(test_pat.Surgery.sum()),
                                                                       round(100*(ht.sum()/test_pat.Surgery.sum()),0)),
                                                '{} / {} ({}%)'.format((test_pat['Seizure free']==1.0).sum(),n_st,
                                                                       round(100*((test_pat['Seizure free']==1.0).sum()/n_st),0)),
                                                '{} , ({} - {})'.format(np.round(test_pat.median()['f/u'],1),
                                                                        np.round(np.nanpercentile(test_pat['f/u'],25),1),
                                                             np.round(np.nanpercentile(test_pat['f/u'],75),1)), 
                                                '{} / {} ({}%)'.format((test_pat['FLAIR']==True).sum(),int(len(test_pat)),
                                                                       round(100*((test_pat['FLAIR']==1).sum()/int(len(test_pat))),0)),
                                                 '{} : {}'.format((test_pat.Scanner=='15T').sum(), (test_pat.Scanner=='3T').sum()),],
        'Test cohort Controls (n= {})'.format(len(test_cont)):['{} , ({} - {})'.format(np.round(test_cont.median()['Age at preoperative'],1),
                                                             np.round(np.nanpercentile(test_cont['Age at preoperative'],25),1),
                                                             np.round(np.nanpercentile(test_cont['Age at preoperative'],75),1)),
                                                 '{} : {}'.format((test_cont.Sex==0).sum(), (test_cont.Sex==1).sum()),
                                                '','','','','','',
                                                '{} / {} ({}%)'.format((test_cont['FLAIR']==True).sum(),int(len(test_cont)),
                                                                       round(100*((test_cont['FLAIR']==1).sum()/int(len(test_cont))),0)),
                                                 '{} : {}'.format((test_cont.Scanner=='15T').sum(), (test_cont.Scanner=='3T').sum()),],
        
        'Independent test sites Patients (n= {})'.format(new_pat.group.sum()):['{} , ({} - {})'.format(np.round(new_pat.median()['Age at preoperative'],1),
                                                             np.round(np.nanpercentile(new_pat['Age at preoperative'],25),1),
                                                             np.round(np.nanpercentile(new_pat['Age at preoperative'],75),1)),
                                                 '{} : {}'.format((new_pat.Sex==0).sum(), (new_pat.Sex==1).sum()),
                                                '{} , ({} - {})'.format(np.round(new_pat.median()['Age of onset'],1),
                                                                        np.round(np.nanpercentile(new_pat['Age of onset'],25),1),
                                                             np.round(np.nanpercentile(new_pat['Age of onset'],75),1)),
                                                '{} , ({} - {})'.format(np.round(new_pat.median()['Duration'],2),
                                                                        np.round(np.nanpercentile(new_pat['Duration'],25),1),
                                                             np.round(np.nanpercentile(new_pat['Duration'],75),1)),
                                                '{} / {} ({}%)'.format(int(new_pat.Surgery.sum()), int(len(new_pat)),
                                                                       round(100*(new_pat.Surgery.sum()/len(new_pat)),0)),
                                                '{} / {} ({}%)'.format(int(htn.sum()),int(new_pat.Surgery.sum()),
                                                                       round(100*(htn.sum()/new_pat.Surgery.sum()),0)),
                                                '{} / {} ({}%)'.format((new_pat['Seizure free']==1).sum(),n_stn,
                                                                       round(100*((new_pat['Seizure free']==1).sum()/n_stn),0)),
                                                '{} , ({} - {})'.format(np.round(new_pat.median()['f/u'],1),
                                                                        np.round(np.nanpercentile(new_pat['f/u'],25),1),
                                                             np.round(np.nanpercentile(new_pat['f/u'],75),1)), 
                                                '{} / {} ({}%)'.format((new_pat['FLAIR']==True).sum(),int(len(new_pat)),
                                                                       round(100*((new_pat['FLAIR']==1).sum()/int(len(new_pat))),0)),
                                                '{} : {}'.format((new_pat.Scanner=='15T').sum(), (new_pat.Scanner=='3T').sum()),],
        'Independent test sites Controls (n= {})'.format(len(new_cont)):['{} , ({} - {})'.format(np.round(new_cont.median()['Age at preoperative'],1),
                                                             np.round(np.nanpercentile(new_cont['Age at preoperative'],25),1),
                                                             np.round(np.nanpercentile(new_cont['Age at preoperative'],75),1)),
                                                 '{} : {}'.format((new_cont.Sex==0).sum(), (new_cont.Sex==1).sum()),
                                                '','','','','','',
                                                '{} / {} ({}%)'.format((new_cont['FLAIR']==True).sum(),int(len(new_cont)),
                                                                       round(100*((new_cont['FLAIR']==1).sum()/int(len(new_cont))),0)),
                                                '{} : {}'.format((new_cont.Scanner=='15T').sum(), (new_cont.Scanner=='3T').sum()),],
        # 'Independent test site 2 Patients (n= {})'.format(new_pat2.group.sum()):['{} , ({} - {})'.format(np.round(new_pat2.median()['Age at preoperative'],1),
        #                                                      np.round(np.nanpercentile(new_pat2['Age at preoperative'],25),1),
        #                                                      np.round(np.nanpercentile(new_pat2['Age at preoperative'],75),1)),
        #                                          '{} : {}'.format((new_pat2.Sex==0).sum(), (new_pat2.Sex==1).sum()),
        #                                         '{} , ({} - {})'.format(np.round(new_pat2.median()['Age of onset'],1),
        #                                                                 np.round(np.nanpercentile(new_pat2['Age of onset'],25),1),
        #                                                      np.round(np.nanpercentile(new_pat2['Age of onset'],75),1)),
        #                                         '{} , ({} - {})'.format(np.round(new_pat2.median()['Duration'],2),
        #                                                                 np.round(np.nanpercentile(new_pat2['Duration'],25),1),
        #                                                      np.round(np.nanpercentile(new_pat2['Duration'],75),1)),
        #                                         '{} / {} ({}%)'.format(int(new_pat2.Surgery.sum()), int(len(new_pat2)),
        #                                                                round(100*(new_pat2.Surgery.sum()/len(new_pat2)),0)),
        #                                         '{} / {} ({}%)'.format(int(htn2.sum()),int(new_pat2.Surgery.sum()),
        #                                                                round(100*(htn2.sum()/new_pat2.Surgery.sum()),0)),
        #                                         '{} / {} ({}%)'.format((new_pat2['Seizure free']==1).sum(),n_stn2,
        #                                                                round(100*((new_pat2['Seizure free']==1).sum()/n_stn2),0)),
        #                                         '{} , ({} - {})'.format(np.round(new_pat2.median()['f/u'],1),
        #                                                                 np.round(np.nanpercentile(new_pat2['f/u'],25),1),
        #                                                      np.round(np.nanpercentile(new_pat2['f/u'],75),1)),
        #                                         '{} / {} ({}%)'.format((new_pat2['FLAIR']==True).sum(),int(len(new_pat2)),
        #                                                                round(100*((new_pat2['FLAIR']==1).sum()/int(len(new_pat2))),0)),
        #                                         '{} : {}'.format((new_pat2.Scanner=='15T').sum(), (new_pat2.Scanner=='3T').sum()),],
        }
df = pd.DataFrame (data)
df

  'Train cohort Patients (n= {})'.format(train_pat.group.sum()):['{} , ({} - {})'.format(np.round(train_pat.median()['Age at preoperative'],1),
  '{} , ({} - {})'.format(np.round(train_pat.median()['Age of onset'],1),
  '{} , ({} - {})'.format(np.round(train_pat.median()['Duration'],1),
  '{} , ({} - {})'.format(np.round(train_pat.median()['f/u'],1),
  'Train cohort Controls (n= {})'.format(len(train_cont)):['{} , ({} - {})'.format(np.round(train_cont.median()['Age at preoperative'],1),
  'Test cohort Patients (n= {})'.format(test_pat.group.sum()):['{} , ({} - {})'.format(np.round(test_pat.median()['Age at preoperative'],1),
  '{} , ({} - {})'.format(np.round(test_pat.median()['Age of onset'],1),
  '{} , ({} - {})'.format(np.round(test_pat.median()['Duration'],2),
  '{} , ({} - {})'.format(np.round(test_pat.median()['f/u'],1),
  'Test cohort Controls (n= {})'.format(len(test_cont)):['{} , ({} - {})'.format(np.round(test_cont.median()['Age at preoperative'],1),
  'Independent test sites

Unnamed: 0,Unnamed: 1,Train cohort Patients (n= 278),Train cohort Controls (n= 180),Test cohort Patients (n= 260),Test cohort Controls (n= 193),Independent test sites Patients (n= 116),Independent test sites Controls (n= 101)
0,"Age at preoperative scan (median,IQR)","19.5 , (11.0 - 32.4)","29.0 , (19.0 - 37.9)","18.0 , (10.8 - 29.0)","29.0 , (19.5 - 39.2)","22.5 , (13.1 - 27.5)","27.5 , (22.5 - 37.5)"
1,Sex (f:m),150 : 127,105 : 75,125 : 135,104 : 88,62 : 54,51 : 50
2,"Age of epilepsy onset (median,IQR)","6.0 , (2.5 - 12.0)",,"6.0 , (3.0 - 11.0)",,"2.8 , (0.8 - 5.5)",
3,"Duration of epilepsy (median,IQR)","10.0 , (4.3 - 18.3)",,"10.05 , (5.0 - 18.0)",,"2.65 , (1.2 - 7.2)",
4,Surgery,208 / 278 (75.0%),,190 / 260 (73.0%),,69 / 116 (59.0%),
5,Histology,193 / 208 (93.0%),,171 / 190 (90.0%),,68 / 69 (99.0%),
6,Seizure free,123 / 183 (67.0%),,106 / 157 (68.0%),,52 / 64 (81.0%),
7,Follow up time,"2.0 , (1.0 - 3.0)",,"2.0 , (1.0 - 3.4)",,"2.3 , (1.5 - 3.3)",
8,FLAIR available,132 / 278 (47.0%),28 / 180 (16.0%),110 / 260 (42.0%),28 / 193 (15.0%),33 / 116 (28.0%),18 / 101 (18.0%)
9,Scanner (1.5T:3T),41 : 237,18 : 162,56 : 204,15 : 178,0 : 116,0 : 101


In [46]:
# df.to_excel("demographics_final_table_graph.xlsx",)  

In [30]:
## test difference sex and age between controls and patients
from scipy import stats

for (dataset_name, dataset_pat, dataset_cont) in zip(['train','test'],
                                                   [train_pat, test_pat],
                                                   [train_cont, test_cont]):
    print(f'{dataset_name} dataset')
    
    # test age difference with T-test
    age_p = dataset_pat['Age at preoperative'].dropna()
    age_c = dataset_cont['Age at preoperative'].dropna().values
    stat, p = stats.ttest_ind(age_p,age_c)
    print(f'Mean age test patients {age_p.mean()} (n={len(age_p)})')
    print(f'Mean age test controls {age_c.mean()} (n={len(age_c)})')
    print(f'T-test: stat={round(stat, 2)}, p={"<0.01" if p<0.01 else round(p,2)}')
    # test sex difference with Chi-square

    from scipy import stats

    f_p, m_p = dataset_pat.groupby('Sex')['ID'].count().values
    f_c, m_c  = dataset_cont.groupby('Sex')['ID'].count().values 
    contigency_table = np.array([[f_p, f_c],
                                [m_p, m_c]])
    res = stats.chi2_contingency(contigency_table )
    stat = round(res.statistic, 2)
    pval ='<0.01' if res.pvalue < 0.01 else round(res.pvalue ,2)
    dof = res.dof
    N=f_p+f_c+m_p+m_c
    print(f'Chi-square test: X2({dof},{N})={stat},p={pval}')
    print('\n')

train dataset
Mean age test patients 22.467898550724637 (n=276)
Mean age test controls 28.634444444444444 (n=180)
T-test: stat=-4.65, p=<0.01
Chi-square test: X2(1,457)=0.61,p=0.43


test dataset
Mean age test patients 21.6815444015444 (n=259)
Mean age test controls 29.35647668393782 (n=193)
T-test: stat=-5.72, p=<0.01
Chi-square test: X2(1,452)=1.4,p=0.24


