In [None]:
import pandas

from datasets import (
    GitHubCOVIDDataset,
    BIMCVCOVIDDataset,
    ChestXray14Dataset,
    PadChestDataset,
    BIMCVNegativeDataset, 
    DomainConfoundedDataset
)
SEED = 42


def prepare_dataset_1(split_name):
    trainds = DomainConfoundedDataset(
            ChestXray14Dataset(fold='train', labels='chestx-ray14', random_state=SEED),
            GitHubCOVIDDataset(fold='train', labels='chestx-ray14', random_state=SEED)
            )

    valds = DomainConfoundedDataset(
            ChestXray14Dataset(fold='val', labels='chestx-ray14', random_state=SEED),
            GitHubCOVIDDataset(fold='val', labels='chestx-ray14', random_state=SEED)
            )
    
    split_dir = f"splits/{split_name}/dataset1"

    trainds.ds1.df = pandas.read_csv(f"{split_dir}/chestxray-train.csv")
    trainds.ds1.meta_df = pandas.read_csv(f"{split_dir}/chestxray-trainmeta.csv")

    valds.ds1.df = pandas.read_csv(f"{split_dir}/chestxray-val.csv")
    valds.ds1.meta_df = pandas.read_csv(f"{split_dir}/chestxray-valmeta.csv")

    trainds.ds2.df = pandas.read_csv(f"{split_dir}/padchest-train.csv")
    valds.ds2.df = pandas.read_csv(f"{split_dir}/padchest-val.csv")

    return trainds, valds

def prepare_dataset_2(split_name):
    trainds = DomainConfoundedDataset(
            PadChestDataset(fold='train', labels='chestx-ray14', random_state=SEED),
            BIMCVCOVIDDataset(fold='train', labels='chestx-ray14', random_state=SEED)
            )
    valds = DomainConfoundedDataset(
            PadChestDataset(fold='val', labels='chestx-ray14', random_state=SEED),
            BIMCVCOVIDDataset(fold='val', labels='chestx-ray14', random_state=SEED)
            )
    
    split_dir = f"splits/{split_name}/dataset2"

    trainds.ds1.df = pandas.read_csv(f"{split_dir}/padchest-train.csv")
    valds.ds1.df = pandas.read_csv(f"{split_dir}/padchest-val.csv")

    trainds.ds2.df = pandas.read_csv(f"{split_dir}/bimcv-train.csv")
    valds.ds2.df = pandas.read_csv(f"{split_dir}/bimcv-val.csv")

    return trainds, valds

def prepare_dataset_3(split_name):
    trainds = DomainConfoundedDataset(
            BIMCVNegativeDataset(fold='all', labels='chestx-ray14', random_state=SEED),
            BIMCVCOVIDDataset(fold='all', labels='chestx-ray14', random_state=SEED)
            )
    valds = DomainConfoundedDataset(
            BIMCVNegativeDataset(fold='all', labels='chestx-ray14', random_state=SEED),
            BIMCVCOVIDDataset(fold='all', labels='chestx-ray14', random_state=SEED)
            )
    
    split_dir = f"splits/{split_name}/dataset3"
    
    trainds.ds1.df = pandas.read_csv(f"{split_dir}/traindf1.csv")
    valds.ds1.df = pandas.read_csv(f"{split_dir}/valdf1.csv")

    trainds.ds2.df = pandas.read_csv(f"{split_dir}/traindf2.csv")
    valds.ds2.df = pandas.read_csv(f"{split_dir}/valdf2.csv")

    trainds.len1 = len(trainds.ds1)
    trainds.len2 = len(trainds.ds2)
    valds.len1 = len(valds.ds1)
    valds.len2 = len(valds.ds2)
    
    return trainds, valds

In [None]:
trainds, valds = prepare_dataset_3("42")

In [None]:
trainds.get_all_labels()[:, -1].shape

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

def spurious_baseline_model(dataset, column):
    df = pd.concat([dataset.ds1.df, dataset.ds2.df])
    
    discriminator = df[column]

    print("Column:", column)
    for v in discriminator.unique():
        predictions = (df[column] == v).to_numpy(dtype=int)
        labels = dataset.get_all_labels()[:, -1]

        print(f"Discriminator {v}, {predictions.sum() / len(df)}%: auroc=", roc_auc_score(labels, predictions))

In [None]:
trainds.ds1.df.columns

In [None]:
trainds.ds1.df["manufacturer"].unique()

In [None]:
trainds.ds1.df['photometric_interpretation'].unique()

In [43]:
for column in ['projection', 'modality', 'sex', 'photometric_interpretation', ]:
    print("Train")
    spurious_baseline_model(trainds, column)
    print("Val")
    spurious_baseline_model(valds, column)

Train
Column: projection
Discriminator PA, 0.46474067333939945%: auroc= 0.4727786748450301
Discriminator AP, 0.5352593266606005%: auroc= 0.5272213251549699
Val
Column: projection
Discriminator AP, 0.610236220472441%: auroc= 0.42391304347826086
Discriminator PA, 0.38976377952755903%: auroc= 0.5760869565217391
Train
Column: modality
Discriminator CR, 0.5409463148316651%: auroc= 0.5515897958299016
Discriminator DX, 0.45905368516833484%: auroc= 0.44841020417009836
Val
Column: modality
Discriminator DX, 0.4094488188976378%: auroc= 0.4934782608695652
Discriminator CR, 0.5905511811023622%: auroc= 0.5065217391304347
Train
Column: sex
Discriminator F, 0.4870336669699727%: auroc= 0.4544839214241334
Discriminator M, 0.5129663330300273%: auroc= 0.5455160785758666
Val
Column: sex
Discriminator F, 0.5433070866141733%: auroc= 0.460248447204969
Discriminator M, 0.4566929133858268%: auroc= 0.5397515527950311
Train
Column: photometric_interpretation
Discriminator MONOCHROME2, 0.6894904458598726%: auroc=