## df_sizes

In [None]:
# get all picture sizes in a list

import os

rootdir = "../data/CheXpert/"
filenames = []
i=0

# get all file names
for root, dirs, files in os.walk(rootdir):
    if len(files) != 0:
        for file in files:
            filenames.append(os.path.normpath(os.path.join(root,file)))
            print(i, end='\r')
            i+=1

filenames = filenames[3:]            
filenames

In [73]:
# get all that end in frontal.jpg (exclude lateral.jpg)

filenames_frontal = []
for name in filenames:
    if 'frontal' in name:
        filenames_frontal.append(name)
len(filenames_frontal)

191229

In [75]:
# get sizes

from PIL import Image

size_list = []

i=0
for path in filenames_frontal:
    img = Image.open(path)
    size_list.append(img.size)
    print(i, end='\r')
    i+=1

191228

In [78]:
# create dataframe from filenames_frontal and size_list

import pandas as pd

df_sizes = pd.DataFrame(list(zip(filenames_frontal,size_list)),columns=['path','size'])
df_sizes[['width','height']] = pd.DataFrame(df_sizes['size'].tolist(), index=df_sizes.index)
df_sizes

Unnamed: 0,path,size,width,height
0,../data/CheXpert/train/patient01435/study1/vie...,"(2828, 2320)",2828,2320
1,../data/CheXpert/train/patient01435/study2/vie...,"(2828, 2320)",2828,2320
2,../data/CheXpert/train/patient47480/study1/vie...,"(2828, 2320)",2828,2320
3,../data/CheXpert/train/patient04740/study1/vie...,"(2828, 2320)",2828,2320
4,../data/CheXpert/train/patient45051/study1/vie...,"(3408, 2800)",3408,2800
...,...,...,...,...
191224,../data/CheXpert/valid/patient64617/study1/vie...,"(2828, 2320)",2828,2320
191225,../data/CheXpert/valid/patient64715/study1/vie...,"(2828, 2320)",2828,2320
191226,../data/CheXpert/valid/patient64570/study1/vie...,"(2022, 2022)",2022,2022
191227,../data/CheXpert/valid/patient64735/study1/vie...,"(2828, 2320)",2828,2320


In [86]:
# remove root from path 17

# df_sizes['path'][0][17:]
df_sizes['path'] = df_sizes['path'].apply(lambda x: x[17:])
df_sizes

In [88]:
# save df_sizes to csv
df_sizes.to_csv('../data/df_sizes.csv', index=False)

##### Labels and their meanings:
- **Cardiomegaly**: 
    - vergrößertes Herz
- **Enlarged_Cardiomediastinum**: 
    - s. Cardiomegaly 
    - Mediastinum ist die Region, in der das Herz liegt, ergo folgt aus vergrößertem Kardiomediastinum eine Kardiomegalie (dieses Label ist offenbar eher unüblich als Begriff)
- **Enlarg_Cardiomediastinum**: 
    - s. Enlarged_Cardiomediastinum 
- **Edema**: 
    - Ödem, Schwellung 
    - Wassereinlagerung in der Lunge bzw. in Lungenbläschen
    - entsteht wenn das Herz nicht mehr genug Blut wegpumpen kann --> Rückstau 
    - Blutflüssigkeit tritt in Lungenbläschen über 
- **edema**: 
    - s. Edema
- **Consolidation**: 
    - Flüssigkeit (Transsudat, Exsudat) in den Lungenbläschen
    - Lungenbläschen sind entzündet
    - Gasaustausch kann nicht mehr stattfinden 
    - häufig im Zusammenhang mit Lungenentzündung
- **Pleural_Effusion**: 
    - Flüssigkeit im Brustkorb
    - Pleura ist Hülle der Lunge (Rippenfell) 
    - Wasser ist zwischen Lunge und Hülle
- **Atelectasis**: 
    - Lungenbläschen platzen, es wird nicht mehr belüftet
    - ganzer Lungenflügel kann kollabieren
- **Pneumonia**: 
    - Entzündung des Lungengewebes und/oder Lungenbläschen 
    - Streifen/Schnee im Röntgenbild (aufgehellt)
- **Pneumothorax**: 
    - Luft im Brustkorb außerhalb der Lunge (drückt auf die Lunge, kann sich nicht mehr voll ausdehnen)
    - Lungenflügel kann kollabieren
- **Fracture**: 
    - Bruch


##### Aggregated Labels:
- Cardiomegaly, Enlarged_Cardiomediastinum, Enlarg_Cardiomediastinum 
- Pleural_Effusion 
- Edema, edema 
- Atelectasis 
- Consolidation 
- Pneumonia
- Pneumothorax
- Fracture

##### Competition Tasks
- Cardiomegaly
- Pleural Effusion
- Edema
- Atelectasis
- Consolidation

## df_boxes

boxes: [xmin, ymin, xmax, ymax]

In [29]:
import pandas as pd
import json

In [30]:
# open bboxes file

with open('../data/CheXpert/BBoxes.json', 'r') as f:
    bboxes = json.load(f)

len(bboxes)

2348

In [31]:
# create a list with path strings to all 2076 frontal images that we have annotation for

path_list = []

for element in bboxes:
    if 'frontal' in element:
        path_list.append(element)

len(path_list)

2076

In [32]:
# aggregate labels, changes dict (bboxes)

labels_cardiomediastinum = ['Enlarged_Cardiomediastinum', 'Enlarg_Cardiomediastinum']
label_edema = 'edema'

for path in path_list:
    for label in labels_cardiomediastinum:
        if label in bboxes[path].keys():
            if 'Cardiomegaly' in bboxes[path].keys():
                temp_coordinates = bboxes[path]['Cardiomegaly']
                bboxes[path]['Cardiomegaly'] = bboxes[path].pop(label)
                bboxes[path]['Cardiomegaly'].append(temp_coordinates[0])
            else:
                new_kvp = {'Cardiomegaly':bboxes[path][label]}
                bboxes[path].update(new_kvp)
                bboxes[path].pop(label)
    if label_edema in bboxes[path].keys():
        if 'Edema' in bboxes[path].keys():
            temp_coordinates = bboxes[path]['Edema']
            bboxes[path]['Edema'] = bboxes[path].pop(label_edema)
            bboxes[path]['Edema'].append(temp_coordinates)
        else:
            new_kvp = {'Edema':bboxes[path][label_edema]}
            bboxes[path].update(new_kvp)
            bboxes[path].pop(label_edema)

In [33]:
# create key list: list of all labels in data to be used

key_list = []

for element in path_list:
    for key in bboxes[element].keys():
        if key not in key_list:
            key_list.append(key)

key_list

['Cardiomegaly',
 'Pleural_Effusion',
 'Edema',
 'Atelectasis',
 'Pneumonia',
 'Consolidation',
 'Pneumothorax',
 'Fracture']

In [34]:
# create dataframe linking path to content in bboxes file
df_boxes = pd.DataFrame(columns=key_list)
df_boxes.insert(0, 'path', path_list)

# fill dataframe with box coordinates
for path in path_list:
    keys = bboxes[path].keys()
    index = df_boxes[df_boxes['path'].str.match(path)].index[0]
    for key in keys:
        df_boxes.at[index,key] = bboxes[path][key]

df_boxes

Unnamed: 0,path,Cardiomegaly,Pleural_Effusion,Edema,Atelectasis,Pneumonia,Consolidation,Pneumothorax,Fracture
0,train/patient40093/study4/view1_frontal.jpg,"[[1417, 537, 2525, 1565]]","[[2540, 1452, 2748, 1785]]","[[879, 472, 1433, 1511], [1879, 544, 2669, 1539]]",,,,,
1,train/patient01844/study23/view1_frontal.jpg,"[[916, 373, 2221, 1538]]","[[324, 1154, 621, 1667], [2029, 1146, 2332, 14...",,"[[1940, 917, 2279, 1183]]",,,,
2,train/patient36973/study6/view1_frontal.jpg,,"[[862, 374, 1814, 1554], [418, 798, 738, 1590]]","[[550, 574, 1138, 1386], [1638, 654, 2042, 1478]]","[[1726, 1138, 2038, 1318]]",,,,
3,train/patient38533/study9/view1_frontal.jpg,,,,,"[[442, 412, 1245, 1530]]",,,
4,train/patient49106/study5/view1_frontal.jpg,,"[[2320, 1135, 2647, 1785]]","[[943, 385, 1288, 1585]]",,,,,
...,...,...,...,...,...,...,...,...,...
2071,train/patient04616/study1/view1_frontal.jpg,"[[1209, 470, 2257, 1894]]",,,,,,,
2072,train/patient10996/study1/view1_frontal.jpg,,"[[693, 1304, 981, 1654]]",,,,,,"[[2015, 983, 2140, 1137], [2017, 1403, 2193, 1..."
2073,train/patient25403/study1/view1_frontal.jpg,"[[1173, 284, 2335, 2008]]",,,,,,,"[[616, 1161, 778, 1342], [657, 1356, 781, 1499]]"
2074,train/patient45042/study3/view1_frontal.jpg,,,"[[450, 573, 1190, 1744], [1708, 915, 2340, 1825]]",,"[[355, 1167, 1108, 1702]]",,,


In [35]:
# take those out that have entries only in Pneumonia, Pneumothorax or Fracture

# get indices of rows with NaN in all competition tasks
nan_indices_ct = df_boxes.loc[df_boxes['Cardiomegaly'].isna()].loc[df_boxes['Pleural_Effusion'].isna()].loc[df_boxes['Edema'].isna()].loc[df_boxes['Atelectasis'].isna()].loc[df_boxes['Consolidation'].isna()].index.to_list()

# get indices of rows with NaN in all tasks
nan_indices_all = df_boxes.loc[df_boxes['Cardiomegaly'].isna()].loc[df_boxes['Pleural_Effusion'].isna()].loc[df_boxes['Edema'].isna()].loc[df_boxes['Atelectasis'].isna()].loc[df_boxes['Consolidation'].isna()].loc[df_boxes['Pneumothorax'].isna()].loc[df_boxes['Pneumonia'].isna()].loc[df_boxes['Fracture'].isna()].index.to_list()

# remove nan_indices_all from nan_indices_ct
[nan_indices_ct.remove(value) for value in nan_indices_all]

# remove df_boxes rows by index
df_boxes = df_boxes.drop(nan_indices_ct).reset_index(drop=True)
df_boxes

Unnamed: 0,path,Cardiomegaly,Pleural_Effusion,Edema,Atelectasis,Pneumonia,Consolidation,Pneumothorax,Fracture
0,train/patient40093/study4/view1_frontal.jpg,"[[1417, 537, 2525, 1565]]","[[2540, 1452, 2748, 1785]]","[[879, 472, 1433, 1511], [1879, 544, 2669, 1539]]",,,,,
1,train/patient01844/study23/view1_frontal.jpg,"[[916, 373, 2221, 1538]]","[[324, 1154, 621, 1667], [2029, 1146, 2332, 14...",,"[[1940, 917, 2279, 1183]]",,,,
2,train/patient36973/study6/view1_frontal.jpg,,"[[862, 374, 1814, 1554], [418, 798, 738, 1590]]","[[550, 574, 1138, 1386], [1638, 654, 2042, 1478]]","[[1726, 1138, 2038, 1318]]",,,,
3,train/patient49106/study5/view1_frontal.jpg,,"[[2320, 1135, 2647, 1785]]","[[943, 385, 1288, 1585]]",,,,,
4,train/patient29171/study4/view1_frontal.jpg,"[[991, 620, 2045, 1562]]",,,,,,,
...,...,...,...,...,...,...,...,...,...
1817,train/patient04616/study1/view1_frontal.jpg,"[[1209, 470, 2257, 1894]]",,,,,,,
1818,train/patient10996/study1/view1_frontal.jpg,,"[[693, 1304, 981, 1654]]",,,,,,"[[2015, 983, 2140, 1137], [2017, 1403, 2193, 1..."
1819,train/patient25403/study1/view1_frontal.jpg,"[[1173, 284, 2335, 2008]]",,,,,,,"[[616, 1161, 778, 1342], [657, 1356, 781, 1499]]"
1820,train/patient45042/study3/view1_frontal.jpg,,,"[[450, 573, 1190, 1744], [1708, 915, 2340, 1825]]",,"[[355, 1167, 1108, 1702]]",,,


In [36]:
# remove columns: Pneumonia, Pneumothorax, Fracture
df_boxes.drop(columns=['Pneumonia', 'Pneumothorax', 'Fracture'], inplace=True)

In [39]:
# save df_boxes to pkl
df_boxes.to_pickle('../data/test/df_boxes.pkl')

## df_test

In [42]:
df_boxes = pd.read_pickle('../data/test/df_boxes.pkl')

In [22]:
# convert df_boxes to labels (1/0)

df_test = df_boxes.loc[:, 'Cardiomegaly':'Consolidation'].notnull().astype('int')
df_test.insert(0, 'path', df_boxes['path'])
df_test

Unnamed: 0,path,Cardiomegaly,Pleural_Effusion,Edema,Atelectasis,Consolidation
0,train/patient40093/study4/view1_frontal.jpg,1,1,1,0,0
1,train/patient01844/study23/view1_frontal.jpg,1,1,0,1,0
2,train/patient36973/study6/view1_frontal.jpg,0,1,1,1,0
3,train/patient49106/study5/view1_frontal.jpg,0,1,1,0,0
4,train/patient29171/study4/view1_frontal.jpg,1,0,0,0,0
...,...,...,...,...,...,...
1817,train/patient04616/study1/view1_frontal.jpg,1,0,0,0,0
1818,train/patient10996/study1/view1_frontal.jpg,0,1,0,0,0
1819,train/patient25403/study1/view1_frontal.jpg,1,0,0,0,0
1820,train/patient45042/study3/view1_frontal.jpg,0,0,1,0,0


In [23]:
# add sizes to df_test using df_sizes.csv

df_sizes = pd.read_csv("../data/df_sizes.csv")

width_list = []
height_list = []

i=1

for path in df_test['path'].tolist():
    width_list.append(df_sizes.loc[df_sizes['path']==path]['width'].item())
    height_list.append(df_sizes.loc[df_sizes['path']==path]['height'].item())
    print(i, end='\r')
    i+=1

df_test['width'] = width_list
df_test['height'] = height_list

df_test

1822

Unnamed: 0,path,Cardiomegaly,Pleural_Effusion,Edema,Atelectasis,Consolidation,width,height
0,train/patient40093/study4/view1_frontal.jpg,1,1,1,0,0,2828,2320
1,train/patient01844/study23/view1_frontal.jpg,1,1,0,1,0,2828,2320
2,train/patient36973/study6/view1_frontal.jpg,0,1,1,1,0,2828,2320
3,train/patient49106/study5/view1_frontal.jpg,0,1,1,0,0,2828,2320
4,train/patient29171/study4/view1_frontal.jpg,1,0,0,0,0,2828,2320
...,...,...,...,...,...,...,...,...
1817,train/patient04616/study1/view1_frontal.jpg,1,0,0,0,0,2828,2320
1818,train/patient10996/study1/view1_frontal.jpg,0,1,0,0,0,2828,2320
1819,train/patient25403/study1/view1_frontal.jpg,1,0,0,0,0,2828,2320
1820,train/patient45042/study3/view1_frontal.jpg,0,0,1,0,0,2828,2320


In [25]:
# save df_test to csv
df_test.to_csv('../data/test/df_test.csv', index=False)

## df_train & df_valid

In [6]:
# get CheXpert metadata dataframes (all available data)

import pandas as pd

train = pd.read_csv('../data/CheXpert/train.csv')
valid = pd.read_csv('../data/CheXpert/valid.csv')

In [7]:
### dataframe transformation

# only frontal
valid = valid.loc[valid['Frontal/Lateral']=='Frontal']
train = train.loc[train['Frontal/Lateral']=='Frontal']

# cut off CheXpert-v1.0 from path
valid['Path'] = valid['Path'].map(lambda path: path[14:])
train['Path'] = train['Path'].map(lambda path: path[14:])

# drop metadata columns
valid.drop(['Sex', 'Age', 'Frontal/Lateral', 'AP/PA', 'No Finding', 'Support Devices'], axis=1, inplace=True)
train.drop(['Sex', 'Age', 'Frontal/Lateral', 'AP/PA', 'No Finding', 'Support Devices'], axis=1, inplace=True)

# select only competition tasks
valid = valid.loc[:, ['Path', 'Cardiomegaly', 'Pleural Effusion', 'Edema', 'Atelectasis', 'Consolidation']]
train = train.loc[:, ['Path', 'Cardiomegaly', 'Pleural Effusion', 'Edema', 'Atelectasis', 'Consolidation']]

# rename column
valid = valid.rename(columns={'Pleural Effusion': 'Pleural_Effusion', 'Path': 'path'})
train = train.rename(columns={'Pleural Effusion': 'Pleural_Effusion', 'Path': 'path'})

# uncertainty handling: uncertain labels are converted to 0
train = train.replace(-1,0).fillna(0)

# typecast all to integer
valid = valid.astype({'Cardiomegaly': 'int', 'Pleural_Effusion': 'int', 'Edema': 'int', 'Atelectasis': 'int', 'Consolidation': 'int'})
train = train.astype({'Cardiomegaly': 'int', 'Pleural_Effusion': 'int', 'Edema': 'int', 'Atelectasis': 'int', 'Consolidation': 'int'})

In [8]:
train

Unnamed: 0,path,Cardiomegaly,Pleural_Effusion,Edema,Atelectasis,Consolidation
0,train/patient00001/study1/view1_frontal.jpg,0,0,0,0,0
1,train/patient00002/study2/view1_frontal.jpg,0,0,0,0,0
2,train/patient00002/study1/view1_frontal.jpg,0,0,0,0,0
4,train/patient00003/study1/view1_frontal.jpg,0,0,1,0,0
5,train/patient00004/study1/view1_frontal.jpg,0,0,0,0,0
...,...,...,...,...,...,...
223409,train/patient64537/study2/view1_frontal.jpg,0,1,0,0,0
223410,train/patient64537/study1/view1_frontal.jpg,0,0,0,0,0
223411,train/patient64538/study1/view1_frontal.jpg,0,0,0,0,0
223412,train/patient64539/study1/view1_frontal.jpg,1,0,0,1,0


In [9]:
# add sizes for valid

df_sizes = pd.read_csv("../data/df_sizes.csv")

width_list = []
height_list = []

for path in valid['path'].tolist():
    width_list.append(df_sizes.loc[df_sizes['path']==path]['width'].item())
    height_list.append(df_sizes.loc[df_sizes['path']==path]['height'].item())

valid['width'] = width_list
valid['height'] = height_list

valid

Unnamed: 0,path,Cardiomegaly,Pleural_Effusion,Edema,Atelectasis,Consolidation,width,height
0,valid/patient64541/study1/view1_frontal.jpg,1,0,0,0,0,2828,2320
1,valid/patient64542/study1/view1_frontal.jpg,0,0,0,0,0,2021,2022
3,valid/patient64543/study1/view1_frontal.jpg,0,0,1,0,0,2828,2320
4,valid/patient64544/study1/view1_frontal.jpg,0,0,0,0,0,2828,2320
5,valid/patient64545/study1/view1_frontal.jpg,0,1,0,1,0,2828,2320
...,...,...,...,...,...,...,...,...
229,valid/patient64736/study1/view1_frontal.jpg,0,0,0,0,0,2828,2320
230,valid/patient64737/study1/view1_frontal.jpg,0,0,0,0,0,4240,3480
231,valid/patient64738/study1/view1_frontal.jpg,1,0,1,0,0,2828,2320
232,valid/patient64739/study1/view1_frontal.jpg,0,0,0,0,0,2828,2320


In [10]:
# add sizes for train

width_list = []
height_list = []

i=1

for path in train['path'].tolist():
    width_list.append(df_sizes.loc[df_sizes['path']==path]['width'].item())
    height_list.append(df_sizes.loc[df_sizes['path']==path]['height'].item())
    print(i, end='\r')
    i+=1

train['width'] = width_list
train['height'] = height_list

train

191027

Unnamed: 0,path,Cardiomegaly,Pleural_Effusion,Edema,Atelectasis,Consolidation,width,height
0,train/patient00001/study1/view1_frontal.jpg,0,0,0,0,0,3408,2800
1,train/patient00002/study2/view1_frontal.jpg,0,0,0,0,0,2828,2320
2,train/patient00002/study1/view1_frontal.jpg,0,0,0,0,0,2828,2320
4,train/patient00003/study1/view1_frontal.jpg,0,0,1,0,0,2828,2320
5,train/patient00004/study1/view1_frontal.jpg,0,0,0,0,0,2022,1741
...,...,...,...,...,...,...,...,...
223409,train/patient64537/study2/view1_frontal.jpg,0,1,0,0,0,2494,2048
223410,train/patient64537/study1/view1_frontal.jpg,0,0,0,0,0,4240,3480
223411,train/patient64538/study1/view1_frontal.jpg,0,0,0,0,0,2920,2320
223412,train/patient64539/study1/view1_frontal.jpg,1,0,0,1,0,2920,2320


In [57]:
# concatenate train and valid to one dataframe --> df_train
# check for labels included in df_test (1822 images) and delete entries from df_train

# concatenate dataframes
df_train = pd.concat([train, valid]).reset_index(drop=True)

# check for paths included df_test --> all are included, thus: delete 1822 entries from df_train
df_test = pd.read_csv('../data/test/df_test.csv')
paths_test = df_test['path'].tolist()

print(len(df_train))
print('-1822 (SOLL)')

for path in paths_test:
    df_train = df_train.drop(df_train[df_train['path']==path].index)

df_train = df_train.reset_index(drop=True)

df_train

191229
-1822 (SOLL)


Unnamed: 0,path,Cardiomegaly,Pleural_Effusion,Edema,Atelectasis,Consolidation,width,height
0,train/patient00001/study1/view1_frontal.jpg,0,0,0,0,0,3408,2800
1,train/patient00002/study2/view1_frontal.jpg,0,0,0,0,0,2828,2320
2,train/patient00002/study1/view1_frontal.jpg,0,0,0,0,0,2828,2320
3,train/patient00003/study1/view1_frontal.jpg,0,0,1,0,0,2828,2320
4,train/patient00004/study1/view1_frontal.jpg,0,0,0,0,0,2022,1741
...,...,...,...,...,...,...,...,...
189402,valid/patient64728/study1/view1_frontal.jpg,0,0,0,0,0,2828,2320
189403,valid/patient64730/study1/view1_frontal.jpg,0,0,0,0,0,2828,2320
189404,valid/patient64731/study1/view1_frontal.jpg,0,0,0,0,0,2828,2320
189405,valid/patient64736/study1/view1_frontal.jpg,0,0,0,0,0,2828,2320


In [58]:
# validation set
# 200 images sampled from df_train (resolution 2828x2320)

df_valid = df_train.loc[df_train['width']==2828].loc[df_train['height']==2320].sample(n=200).reset_index(drop=True)
df_valid

Unnamed: 0,path,Cardiomegaly,Pleural_Effusion,Edema,Atelectasis,Consolidation,width,height
0,train/patient41844/study3/view1_frontal.jpg,0,1,0,1,0,2828,2320
1,train/patient18701/study12/view1_frontal.jpg,0,1,1,0,0,2828,2320
2,train/patient31999/study1/view1_frontal.jpg,0,0,0,0,0,2828,2320
3,train/patient42907/study6/view1_frontal.jpg,0,1,1,0,0,2828,2320
4,train/patient50461/study1/view1_frontal.jpg,0,0,0,0,0,2828,2320
...,...,...,...,...,...,...,...,...
195,train/patient42293/study3/view1_frontal.jpg,0,0,0,1,0,2828,2320
196,train/patient28748/study4/view1_frontal.jpg,0,1,0,1,0,2828,2320
197,train/patient02201/study1/view1_frontal.jpg,0,0,0,1,0,2828,2320
198,train/patient45414/study2/view1_frontal.jpg,0,0,0,0,0,2828,2320


In [59]:
# remove validation images from df_train

pathlist = df_valid['path'].tolist()

print(len(df_train))

for path in pathlist:
    df_train = df_train.drop(df_train[df_train['path']==path].index)

df_train = df_train.reset_index(drop=True)

df_train

189407


Unnamed: 0,path,Cardiomegaly,Pleural_Effusion,Edema,Atelectasis,Consolidation,width,height
0,train/patient00001/study1/view1_frontal.jpg,0,0,0,0,0,3408,2800
1,train/patient00002/study2/view1_frontal.jpg,0,0,0,0,0,2828,2320
2,train/patient00002/study1/view1_frontal.jpg,0,0,0,0,0,2828,2320
3,train/patient00003/study1/view1_frontal.jpg,0,0,1,0,0,2828,2320
4,train/patient00004/study1/view1_frontal.jpg,0,0,0,0,0,2022,1741
...,...,...,...,...,...,...,...,...
189202,valid/patient64728/study1/view1_frontal.jpg,0,0,0,0,0,2828,2320
189203,valid/patient64730/study1/view1_frontal.jpg,0,0,0,0,0,2828,2320
189204,valid/patient64731/study1/view1_frontal.jpg,0,0,0,0,0,2828,2320
189205,valid/patient64736/study1/view1_frontal.jpg,0,0,0,0,0,2828,2320


In [60]:
# save df_valid and df_train (BASE group)

df_valid.to_csv('../data/train/df_valid.csv', index=False)
df_train.to_csv('../data/train/df_train.csv', index=False)

## FL Client Train Sets

In [1]:
import pandas as pd

In [45]:
# read in
df_train = pd.read_csv('../data/train/df_train.csv')
df_valid = pd.read_csv('../data/train/df_valid.csv')
print(len(df_train))
print(len(df_valid))

# concat
df_train_full = pd.concat([df_train, df_valid]).reset_index(drop=True)
print(len(df_train_full))

# subset for size
df_train_full = df_train_full.loc[df_train_full.width == 2828].loc[df_train_full['height'] == 2320]
print(len(df_train_full))

189207
200
189407
133299


In [46]:
# client datasets (subsets of df_train)
df_train_0 = df_train_full.loc[df_train_full['Cardiomegaly']==1].reset_index(drop=True)
df_train_1 = df_train_full.loc[df_train_full['Pleural_Effusion']==1].reset_index(drop=True)
df_train_2 = df_train_full.loc[df_train_full['Edema']==1].reset_index(drop=True)
df_train_3 = df_train_full.loc[df_train_full['Atelectasis']==1].reset_index(drop=True)
df_train_4 = df_train_full.loc[df_train_full['Consolidation']==1].reset_index(drop=True)

# validation sets
df_valid_0 = df_train_0.sample(n=200).reset_index(drop=True)
df_valid_1 = df_train_1.sample(n=200).reset_index(drop=True)
df_valid_2 = df_train_2.sample(n=200).reset_index(drop=True)
df_valid_3 = df_train_3.sample(n=200).reset_index(drop=True)
df_valid_4 = df_train_4.sample(n=200).reset_index(drop=True)

In [47]:
# remove valid sets from train sets

# 0
pathlist = df_valid_0['path'].tolist()
print(len(df_train_0))
for path in pathlist:
    df_train_0 = df_train_0.drop(df_train_0[df_train_0['path']==path].index)
df_train_0 = df_train_0.reset_index(drop=True)
print(len(df_train_0))

# 1
pathlist = df_valid_1['path'].tolist()
print(len(df_train_1))
for path in pathlist:
    df_train_1 = df_train_1.drop(df_train_1[df_train_1['path']==path].index)
df_train_1 = df_train_1.reset_index(drop=True)
print(len(df_train_1))

# 2
pathlist = df_valid_2['path'].tolist()
print(len(df_train_2))
for path in pathlist:
    df_train_2 = df_train_2.drop(df_train_2[df_train_2['path']==path].index)
df_train_2 = df_train_2.reset_index(drop=True)
print(len(df_train_2))

# 3
pathlist = df_valid_3['path'].tolist()
print(len(df_train_3))
for path in pathlist:
    df_train_3 = df_train_3.drop(df_train_3[df_train_3['path']==path].index)
df_train_3 = df_train_3.reset_index(drop=True)
print(len(df_train_3))

# 4
pathlist = df_valid_4['path'].tolist()
print(len(df_train_4))
for path in pathlist:
    df_train_4 = df_train_4.drop(df_train_4[df_train_4['path']==path].index)
df_train_4 = df_train_4.reset_index(drop=True)
print(len(df_train_4))

18061
17861
57228
57028
38837
38637
21754
21554
9460
9260


In [48]:
# util: xrays with no positive labels

temp1 = df_train_full[df_train_full['Cardiomegaly']==0]
temp2 = temp1[temp1['Pleural_Effusion']==0]
temp3 = temp2[temp2['Edema']==0]
temp4 = temp3[temp3['Atelectasis']==0]
df_train_none = temp4[temp4['Consolidation']==0]
df_train_none

Unnamed: 0,path,Cardiomegaly,Pleural_Effusion,Edema,Atelectasis,Consolidation,width,height
1,train/patient00002/study2/view1_frontal.jpg,0,0,0,0,0,2828,2320
2,train/patient00002/study1/view1_frontal.jpg,0,0,0,0,0,2828,2320
6,train/patient00005/study2/view1_frontal.jpg,0,0,0,0,0,2828,2320
7,train/patient00005/study2/view2_frontal.jpg,0,0,0,0,0,2828,2320
16,train/patient00011/study1/view1_frontal.jpg,0,0,0,0,0,2828,2320
...,...,...,...,...,...,...,...,...
189383,train/patient59064/study1/view1_frontal.jpg,0,0,0,0,0,2828,2320
189390,train/patient30420/study4/view1_frontal.jpg,0,0,0,0,0,2828,2320
189398,train/patient58756/study1/view2_frontal.jpg,0,0,0,0,0,2828,2320
189401,train/patient31224/study1/view1_frontal.jpg,0,0,0,0,0,2828,2320


In [49]:
# sim0 - uneven groups, no further preprocessing

# save clients
df_train_0.to_csv('../data/train/FL/sim0/train/df_train_0.csv', index=False)
df_train_1.to_csv('../data/train/FL/sim0/train/df_train_1.csv', index=False)
df_train_2.to_csv('../data/train/FL/sim0/train/df_train_2.csv', index=False)
df_train_3.to_csv('../data/train/FL/sim0/train/df_train_3.csv', index=False)
df_train_4.to_csv('../data/train/FL/sim0/train/df_train_4.csv', index=False)

df_valid_0.to_csv('../data/train/FL/sim0/valid/df_valid_0.csv', index=False)
df_valid_1.to_csv('../data/train/FL/sim0/valid/df_valid_1.csv', index=False)
df_valid_2.to_csv('../data/train/FL/sim0/valid/df_valid_2.csv', index=False)
df_valid_3.to_csv('../data/train/FL/sim0/valid/df_valid_3.csv', index=False)
df_valid_4.to_csv('../data/train/FL/sim0/valid/df_valid_4.csv', index=False)

In [50]:
# sim0 - CL: remove all in df_train_none from df_train_full (CL should not see data that FL does not see)

print(len(df_train_none))
pathlist = df_train_none['path'].tolist()
print(len(df_train_full))
for path in pathlist:
    df_train_sim0_cl = df_train_full.drop(df_train_full[df_train_full['path']==path].index)
df_train_sim0_cl = df_train_sim0_cl.reset_index(drop=True)
print(len(df_train_sim0_cl))

41668
133299
133298


In [51]:
# sim0: sample 200 validation instances and delete from training data

df_valid_sim0_cl = df_train_sim0_cl.sample(n=200).reset_index(drop=True)

pathlist = df_valid_sim0_cl['path'].tolist()
print(len(df_train_sim0_cl))
for path in pathlist:
    df_train_sim0_cl = df_train_sim0_cl.drop(df_train_sim0_cl[df_train_sim0_cl['path']==path].index)
df_train_full = df_train_sim0_cl.reset_index(drop=True)
print(len(df_train_sim0_cl))

133298
133098


In [52]:
# sim0: save cl datasets

df_train_sim0_cl.to_csv('../data/train/FL/sim0/train/df_train_cl.csv', index=False)
df_valid_sim0_cl.to_csv('../data/train/FL/sim0/valid/df_valid_cl.csv', index=False)

In [53]:
print(len(df_train_0))
print(len(df_train_1))
print(len(df_train_2))
print(len(df_train_3))
print(len(df_train_4))

17861
57028
38637
21554
9260


In [54]:
# sim1: 5 clients with same 9260 instances (smallest amount in groups)

df_train_0_sim1 = df_train_0.head(9260)
df_train_1_sim1 = df_train_1.head(9260)
df_train_2_sim1 = df_train_2.head(9260)
df_train_3_sim1 = df_train_3.head(9260)
df_train_4_sim1 = df_train_4

print(len(df_train_0_sim1))
print(len(df_train_1_sim1))
print(len(df_train_2_sim1))
print(len(df_train_3_sim1))
print(len(df_train_4_sim1))

# save clients
df_train_0_sim1.to_csv('../data/train/FL/sim1/train/df_train_0.csv', index=False)
df_train_1_sim1.to_csv('../data/train/FL/sim1/train/df_train_1.csv', index=False)
df_train_2_sim1.to_csv('../data/train/FL/sim1/train/df_train_2.csv', index=False)
df_train_3_sim1.to_csv('../data/train/FL/sim1/train/df_train_3.csv', index=False)
df_train_4_sim1.to_csv('../data/train/FL/sim1/train/df_train_4.csv', index=False)

df_valid_0.to_csv('../data/train/FL/sim1/valid/df_valid_0.csv', index=False)
df_valid_1.to_csv('../data/train/FL/sim1/valid/df_valid_1.csv', index=False)
df_valid_2.to_csv('../data/train/FL/sim1/valid/df_valid_2.csv', index=False)
df_valid_3.to_csv('../data/train/FL/sim1/valid/df_valid_3.csv', index=False)
df_valid_4.to_csv('../data/train/FL/sim1/valid/df_valid_4.csv', index=False)

9260
9260
9260
9260
9260


In [55]:
# sim1: cl datasets

# concat all train and valid sets from sim1
df_train_sim1_cl = pd.concat([df_train_0_sim1, df_train_1_sim1, df_train_2_sim1, df_train_3_sim1, df_train_4_sim1, df_valid_0, df_valid_1, df_valid_2, df_valid_3, df_valid_4]).reset_index(drop=True)
print(len(df_train_sim1_cl))

# drop duplicates
df_train_sim1_cl = df_train_sim1_cl.drop_duplicates()
print(len(df_train_sim1_cl))

# sample 200 valid instances
df_valid_sim1_cl = df_train_sim1_cl.sample(n=200).reset_index(drop=True)
print(len(df_valid_sim1_cl))

# remove them from train data
pathlist = df_valid_sim1_cl['path'].tolist()
print(len(df_train_sim1_cl))
for path in pathlist:
    df_train_sim1_cl = df_train_sim1_cl.drop(df_train_sim1_cl[df_train_sim1_cl['path']==path].index)
df_train_sim1_cl = df_train_sim1_cl.reset_index(drop=True)
print(len(df_train_sim1_cl))

# save clients
df_train_sim1_cl.to_csv('../data/train/FL/sim1/train/df_train_cl.csv', index=False)
df_valid_sim1_cl.to_csv('../data/train/FL/sim1/valid/df_valid_cl.csv', index=False)

47300
35609
200
35609
35409


In [56]:
# sim2: fill clients up with no-label instances

# samples
sim2_samples = df_train_none.sample(23150)

# add 4630 no-label instances to each client
print(len(df_train_0_sim1))
df_train_0_sim2 = pd.concat([df_train_0_sim1, sim2_samples.head(4630)]).reset_index(drop=True)
print(len(df_train_0_sim2))
df_train_1_sim2 = pd.concat([df_train_1_sim1, sim2_samples.head(9260).tail(4630)]).reset_index(drop=True)
print(len(df_train_1_sim2))
df_train_2_sim2 = pd.concat([df_train_2_sim1, sim2_samples.head(13890).tail(4630)]).reset_index(drop=True)
print(len(df_train_2_sim2))
df_train_3_sim2 = pd.concat([df_train_3_sim1, sim2_samples.head(18520).tail(4630)]).reset_index(drop=True)
print(len(df_train_3_sim2))
df_train_4_sim2 = pd.concat([df_train_4_sim1, sim2_samples.tail(4630)]).reset_index(drop=True)
print(len(df_train_4_sim2))

# save clients
df_train_0_sim2.to_csv('../data/train/FL/sim2/train/df_train_0.csv', index=False)
df_train_1_sim2.to_csv('../data/train/FL/sim2/train/df_train_1.csv', index=False)
df_train_2_sim2.to_csv('../data/train/FL/sim2/train/df_train_2.csv', index=False)
df_train_3_sim2.to_csv('../data/train/FL/sim2/train/df_train_3.csv', index=False)
df_train_4_sim2.to_csv('../data/train/FL/sim2/train/df_train_4.csv', index=False)

df_valid_0.to_csv('../data/train/FL/sim2/valid/df_valid_0.csv', index=False)
df_valid_1.to_csv('../data/train/FL/sim2/valid/df_valid_1.csv', index=False)
df_valid_2.to_csv('../data/train/FL/sim2/valid/df_valid_2.csv', index=False)
df_valid_3.to_csv('../data/train/FL/sim2/valid/df_valid_3.csv', index=False)
df_valid_4.to_csv('../data/train/FL/sim2/valid/df_valid_4.csv', index=False)

9260
13890
13890
13890
13890
13890


In [57]:
# sim 2: cl datasets (analogous to sim1)

# concat all train and valid sets from sim2
df_train_sim2_cl = pd.concat([df_train_0_sim2, df_train_1_sim2, df_train_2_sim2, df_train_3_sim2, df_train_4_sim2, df_valid_0, df_valid_1, df_valid_2, df_valid_3, df_valid_4]).reset_index(drop=True)
print(len(df_train_sim2_cl))

# drop duplicates
df_train_sim2_cl = df_train_sim2_cl.drop_duplicates()
print(len(df_train_sim2_cl))

# sample 200 valid instances
df_valid_sim2_cl = df_train_sim2_cl.sample(n=200).reset_index(drop=True)
print(len(df_valid_sim2_cl))

# remove them from train data
pathlist = df_valid_sim2_cl['path'].tolist()
print(len(df_train_sim2_cl))
for path in pathlist:
    df_train_sim2_cl = df_train_sim2_cl.drop(df_train_sim2_cl[df_train_sim2_cl['path']==path].index)
df_train_sim2_cl = df_train_sim2_cl.reset_index(drop=True)
print(len(df_train_sim2_cl))

# save clients
df_train_sim2_cl.to_csv('../data/train/FL/sim2/train/df_train_cl.csv', index=False)
df_valid_sim2_cl.to_csv('../data/train/FL/sim2/valid/df_valid_cl.csv', index=False)

70450
58759
200
58759
58559


In [58]:
# sim3: 6 clients: add a client with 0s
# take sim1 clients (9260 each), sample 9260 from df_train_none as 6th client, save clients

sim3_samples = df_train_none.sample(9460)

# train sets
df_train_0_sim3 = df_train_0_sim1
df_train_1_sim3 = df_train_1_sim1
df_train_2_sim3 = df_train_2_sim1
df_train_3_sim3 = df_train_3_sim1
df_train_4_sim3 = df_train_4_sim1
df_train_5_sim3 = sim3_samples.head(9260)
print(len(df_train_0_sim3))
print(len(df_train_1_sim3))
print(len(df_train_2_sim3))
print(len(df_train_3_sim3))
print(len(df_train_4_sim3))
print(len(df_train_5_sim3))

# extra valid set
df_valid_5 = sim3_samples.tail(200)
print(len(df_valid_5))

# save clients
df_train_0_sim3.to_csv('../data/train/FL/sim3/train/df_train_0.csv', index=False)
df_train_1_sim3.to_csv('../data/train/FL/sim3/train/df_train_1.csv', index=False)
df_train_2_sim3.to_csv('../data/train/FL/sim3/train/df_train_2.csv', index=False)
df_train_3_sim3.to_csv('../data/train/FL/sim3/train/df_train_3.csv', index=False)
df_train_4_sim3.to_csv('../data/train/FL/sim3/train/df_train_4.csv', index=False)
df_train_5_sim3.to_csv('../data/train/FL/sim3/train/df_train_5.csv', index=False)

df_valid_0.to_csv('../data/train/FL/sim3/valid/df_valid_0.csv', index=False)
df_valid_1.to_csv('../data/train/FL/sim3/valid/df_valid_1.csv', index=False)
df_valid_2.to_csv('../data/train/FL/sim3/valid/df_valid_2.csv', index=False)
df_valid_3.to_csv('../data/train/FL/sim3/valid/df_valid_3.csv', index=False)
df_valid_4.to_csv('../data/train/FL/sim3/valid/df_valid_4.csv', index=False)
df_valid_5.to_csv('../data/train/FL/sim3/valid/df_valid_5.csv', index=False)

9260
9260
9260
9260
9260
9260
200


In [59]:
# sim3: cl datasets (analogous to sim1, sim2)

# concat all train and valid sets from sim3
df_train_sim3_cl = pd.concat([df_train_0_sim3, df_train_1_sim3, df_train_2_sim3, df_train_3_sim3, df_train_4_sim3, df_train_5_sim3, df_valid_0, df_valid_1, df_valid_2, df_valid_3, df_valid_4, df_valid_5]).reset_index(drop=True)
print(len(df_train_sim3_cl))

# drop duplicates
df_train_sim3_cl = df_train_sim3_cl.drop_duplicates()
print(len(df_train_sim3_cl))

# sample 200 valid instances
df_valid_sim3_cl = df_train_sim3_cl.sample(n=200).reset_index(drop=True)
print(len(df_valid_sim3_cl))

# remove them from train data
pathlist = df_valid_sim3_cl['path'].tolist()
print(len(df_train_sim3_cl))
for path in pathlist:
    df_train_sim3_cl = df_train_sim3_cl.drop(df_train_sim3_cl[df_train_sim3_cl['path']==path].index)
df_train_sim3_cl = df_train_sim3_cl.reset_index(drop=True)
print(len(df_train_sim3_cl))

# save clients
df_train_sim3_cl.to_csv('../data/train/FL/sim3/train/df_train_cl.csv', index=False)
df_valid_sim3_cl.to_csv('../data/train/FL/sim3/valid/df_valid_cl.csv', index=False)

56760
45069
200
45069
44869
