In [1]:
import pandas

from datasets import (
    GitHubCOVIDDataset,
    BIMCVCOVIDDataset,
    ChestXray14Dataset,
    PadChestDataset,
    BIMCVNegativeDataset, 
    DomainConfoundedDataset
)
SEED = 42


def prepare_dataset_1(split_name):
    trainds = DomainConfoundedDataset(
            ChestXray14Dataset(fold='train', labels='chestx-ray14', random_state=SEED),
            GitHubCOVIDDataset(fold='train', labels='chestx-ray14', random_state=SEED)
            )

    valds = DomainConfoundedDataset(
            ChestXray14Dataset(fold='val', labels='chestx-ray14', random_state=SEED),
            GitHubCOVIDDataset(fold='val', labels='chestx-ray14', random_state=SEED)
            )
    
    split_dir = f"splits/{split_name}/dataset1"

    trainds.ds1.df = pandas.read_csv(f"{split_dir}/chestxray-train.csv")
    trainds.ds1.meta_df = pandas.read_csv(f"{split_dir}/chestxray-trainmeta.csv")

    valds.ds1.df = pandas.read_csv(f"{split_dir}/chestxray-val.csv")
    valds.ds1.meta_df = pandas.read_csv(f"{split_dir}/chestxray-valmeta.csv")

    trainds.ds2.df = pandas.read_csv(f"{split_dir}/padchest-train.csv")
    valds.ds2.df = pandas.read_csv(f"{split_dir}/padchest-val.csv")

    return trainds, valds

def prepare_dataset_2(split_name):
    trainds = DomainConfoundedDataset(
            PadChestDataset(fold='train', labels='chestx-ray14', random_state=SEED),
            BIMCVCOVIDDataset(fold='train', labels='chestx-ray14', random_state=SEED)
            )
    valds = DomainConfoundedDataset(
            PadChestDataset(fold='val', labels='chestx-ray14', random_state=SEED),
            BIMCVCOVIDDataset(fold='val', labels='chestx-ray14', random_state=SEED)
            )
    
    split_dir = f"splits/{split_name}/dataset2"

    trainds.ds1.df = pandas.read_csv(f"{split_dir}/padchest-train.csv")
    valds.ds1.df = pandas.read_csv(f"{split_dir}/padchest-val.csv")

    trainds.ds2.df = pandas.read_csv(f"{split_dir}/bimcv-train.csv")
    valds.ds2.df = pandas.read_csv(f"{split_dir}/bimcv-val.csv")

    return trainds, valds

def prepare_dataset_3(split_name):
    trainds = DomainConfoundedDataset(
            BIMCVNegativeDataset(fold='all', labels='chestx-ray14', random_state=SEED),
            BIMCVCOVIDDataset(fold='all', labels='chestx-ray14', random_state=SEED)
            )
    valds = DomainConfoundedDataset(
            BIMCVNegativeDataset(fold='all', labels='chestx-ray14', random_state=SEED),
            BIMCVCOVIDDataset(fold='all', labels='chestx-ray14', random_state=SEED)
            )
    
    split_dir = f"splits/{split_name}/dataset3"
    
    trainds.ds1.df = pandas.read_csv(f"{split_dir}/negative-train.csv")
    valds.ds1.df = pandas.read_csv(f"{split_dir}/negative-val.csv")

    trainds.ds2.df = pandas.read_csv(f"{split_dir}/positive-train.csv")
    valds.ds2.df = pandas.read_csv(f"{split_dir}/positive-val.csv")

    trainds.len1 = len(trainds.ds1)
    trainds.len2 = len(trainds.ds2)
    valds.len1 = len(valds.ds1)
    valds.len2 = len(valds.ds2)
    
    return trainds, valds

In [2]:
trainds, valds = prepare_dataset_3("42")

In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

def spurious_baseline_model(dataset, column):
    df = pd.concat([dataset.ds1.df, dataset.ds2.df])
    
    discriminator = df[column]

    print("Column:", column)
    for v in discriminator.unique():
        predictions = (df[column] == v).to_numpy(dtype=int)
        labels = dataset.get_all_labels()[:, -1]

        print(f"Discriminator {v}, {predictions.sum() / len(df)}%: auroc=", roc_auc_score(labels, predictions))

In [4]:
for column in ['projection', 'modality', 'sex', 'photometric_interpretation', ]:
    print("Train")
    spurious_baseline_model(trainds, column)
    print("Val")
    spurious_baseline_model(valds, column)

Train
Column: projection
Discriminator PA, 0.46474067333939945%: auroc= 0.4727786748450301
Discriminator AP, 0.5352593266606005%: auroc= 0.5272213251549699
Val
Column: projection
Discriminator AP, 0.610236220472441%: auroc= 0.42391304347826086
Discriminator PA, 0.38976377952755903%: auroc= 0.5760869565217391
Train
Column: modality
Discriminator CR, 0.5409463148316651%: auroc= 0.5515897958299016
Discriminator DX, 0.45905368516833484%: auroc= 0.44841020417009836
Val
Column: modality
Discriminator DX, 0.4094488188976378%: auroc= 0.4934782608695652
Discriminator CR, 0.5905511811023622%: auroc= 0.5065217391304347
Train
Column: sex
Discriminator F, 0.4870336669699727%: auroc= 0.4544839214241334
Discriminator M, 0.5129663330300273%: auroc= 0.5455160785758666
Val
Column: sex
Discriminator F, 0.5433070866141733%: auroc= 0.460248447204969
Discriminator M, 0.4566929133858268%: auroc= 0.5397515527950311
Train
Column: photometric_interpretation
Discriminator MONOCHROME2, 0.6894904458598726%: auroc=

In [5]:
trainds.ds1.df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'path', 'participant', 'projection',
       'modality', 'manufacturer', 'sex', 'photometric_interpretation',
       'window_center', 'window_width', 'study_date', 'study_time', 'age',
       'lut', 'lut_min', 'rescale_slope', 'rescale_intercept', 'bits_stored'],
      dtype='object')

In [6]:
trainds.ds1.df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,path,participant,projection,modality,manufacturer,sex,photometric_interpretation,window_center,window_width,study_date,study_time,age,lut,lut_min,rescale_slope,rescale_intercept,bits_stored
0,3,3,bimcv-/sub-S07337/ses-E12991/mod-rx/sub-S07337...,sub-S07337,PA,CR,SIEMENS,F,MONOCHROME2,1533.0,2888.0,20200329.0,121317.812,26.0,,,,,
1,6,6,bimcv-/sub-S07665/ses-E13559/mod-rx/sub-S07665...,sub-S07665,PA,CR,Philips Medical Systems,M,MONOCHROME2,2047.0,4095.0,20200404.0,144109.265,56.0,,,,,
2,9,9,bimcv-/sub-S06628/ses-E11817/mod-rx/sub-S06628...,sub-S06628,AP,DX,Carestream Health,M,MONOCHROME2,2048.0,4096.0,20200324.0,220622.895,2.0,,,,,
3,10,10,bimcv-/sub-S06081/ses-E11014/mod-rx/sub-S06081...,sub-S06081,PA,CR,Agfa,F,MONOCHROME2,2048.0,4096.0,20200316.0,184345.0,,,,,,
4,11,11,bimcv-/sub-S06051/ses-E10963/mod-rx/sub-S06051...,sub-S06051,AP,DX,Agfa,F,MONOCHROME1,,,20200316.0,130608.0,,"[1, 2, 3, 4, 4, 5, 6, 7, 8, 9, 9, 10, 11, 12, ...",8192.0,1.0,0.0,15.0


In [7]:
def dataset_to_dfs(dataset):
    important_columns = [
        'projection',
        'modality',
        'sex', 
        'photometric_interpretation',
        'manufacturer',
    ]
    df = pd.concat([dataset.ds1.df, dataset.ds2.df])
    cat_df = df[important_columns].astype("category")
    cat_df['age'] = df['age']
    labels = dataset.get_all_labels()[:, -1]

    return cat_df, pd.Series(labels)

In [8]:
from xgboost import XGBClassifier

X_train, y_train = dataset_to_dfs(trainds)
X_test, y_test = dataset_to_dfs(valds)
    
# create model instance
bst = XGBClassifier(objective='binary:logistic', enable_categorical=True)
# fit model
bst.fit(X_train, y_train)
# make predictions
preds = bst.predict(X_test)

In [9]:
roc_auc_score(y_test, preds)

np.float64(0.5813664596273291)

In [12]:
roc_auc_score(y_train, bst.predict(X_train))

np.float64(0.6749565614749953)

In [10]:
bst.get_booster().get_score(importance_type='gain')

{'projection': 1.0412590503692627,
 'modality': 1.442354440689087,
 'sex': 0.794089138507843,
 'photometric_interpretation': 1.2782044410705566,
 'manufacturer': 2.9244885444641113,
 'age': 0.9429316520690918}

In [11]:
import pandas as pd
pd.read_csv("/home/janfidor/Programming/Research/cxr_covid/splits/42/dataset3/negative-val.csv").path.iloc[5]


'bimcv-/sub-S05136/ses-E09718/mod-rx/sub-S05136_ses-E09718_run-1_bp-chest_vp-ap_cr.png'