### Briefly filterting the data

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [38]:
df = pd.read_csv("CheXpert_train.csv")

In [39]:
#Dataset is cleaned up, values that are NaN are set to 0, values that are -1 are set to 0 as well
#Rows with more than 1 postive diagnoses are removed

cols = ['No Finding', 'Enlarged Cardiomediastinum', 'Cardiomegaly', 'Lung Opacity', 'Lung Lesion', 'Edema', 
        'Consolidation', 'Pneumonia', 'Atelectasis', 'Pneumothorax', 'Pleural Effusion', 'Pleural Other', 
        'Fracture', 'Support Devices']

df[cols] = df[cols].apply(pd.to_numeric, errors='coerce').fillna(0)

df[cols] = df[cols].clip(lower=0)

df = df[df[cols].sum(axis=1) == 1.0]

In [40]:
#A column called Labels is created that contains the name of pathology that is present (who's columns contains the value 1)

def update_value(row):
    for col in df.columns:
        if row[col] == 1.0:
            return col
    return row

# Apply the update_value function to the DataFrame
df_updated = df.apply(update_value, axis=1)

# Create a new column with the updated value
df['Labels'] = df_updated

In [41]:
df

Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,Labels
4,CheXpert-v1.0/train/patient00003/study1/view1_...,Male,41,Frontal,AP,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Edema
5,CheXpert-v1.0/train/patient00004/study1/view1_...,Female,20,Frontal,PA,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,No Finding
6,CheXpert-v1.0/train/patient00004/study1/view2_...,Female,20,Lateral,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,No Finding
9,CheXpert-v1.0/train/patient00005/study2/view1_...,Male,33,Frontal,AP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,Pneumothorax
10,CheXpert-v1.0/train/patient00005/study2/view2_...,Male,33,Frontal,AP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,Pneumothorax
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223399,CheXpert-v1.0/train/patient64529/study1/view1_...,Male,81,Frontal,AP,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Lung Opacity
223405,CheXpert-v1.0/train/patient64534/study1/view1_...,Male,63,Frontal,AP,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Lung Opacity
223406,CheXpert-v1.0/train/patient64535/study1/view1_...,Male,60,Frontal,AP,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Lung Opacity
223409,CheXpert-v1.0/train/patient64537/study2/view1_...,Male,59,Frontal,AP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Pleural Effusion


In [42]:
#Scans that are not AP/PA are removed
df = df[df['AP/PA'].isin(['AP', 'PA'])]
#Pleural Other is dropped due to it's lack of data
df = df[df['Labels'] != 'Pleural Other']

In [43]:
#Total number of occurences for each label is calcualted 
place_holder_dataframe = {'Total Number': [], 'Label': []}
counting_labels = pd.DataFrame(place_holder_dataframe)

for i in df["Labels"].unique():
    new_row = {'Total Number': df[df["Labels"] == i]["Labels"].count(), 'Label': i}
    new_row_df = pd.DataFrame(new_row, index=[0])
    counting_labels = pd.concat([counting_labels, new_row_df], ignore_index=True)

In [44]:
counting_labels.sort_values(by='Total Number', ascending=False)

Unnamed: 0,Total Number,Label
1,9550.0,No Finding
8,8444.0,Lung Opacity
3,5278.0,Support Devices
6,3568.0,Pleural Effusion
0,3080.0,Edema
2,2274.0,Pneumothorax
7,1716.0,Atelectasis
10,1539.0,Cardiomegaly
5,1377.0,Fracture
9,1033.0,Consolidation


### Creating the filtered dataframe with shuffled rows

In [45]:
#The dataset is too large to be loaded into memory at once, so when the models are trained and tested a batch of data is loaded
#into memory. This batch of data has to contain all 13 classes at least once. A batch size of 500 was chosen based on 
#prior testing, this search was however not exhaustive.  
#The above calculated total number of labels will be used to caculte a value that will represent how many times each label
#should be sampled into a batch of size 500

In [46]:
#Dictionary containing these values is stored so that it can later be used in a loop to sample the original dataset to create
#a filtered dataset

value_dictionary = {}

#Some values are rounded up or down, this was done with trail and error depending on if certain labels ran out of values early
for index, row in counting_labels.iterrows():
    dictionary_value = row["Total Number"]/500
    if row["Label"] in "Lung Lesion, Cardiomegaly, Atelectasis, Pneumothorax, Enlarged Cardiomediastinum, Fracture, Consolidation":
        dictionary_value = dictionary_value//1
    else:
        dictionary_value = round(dictionary_value)
    value_dictionary[row["Label"]] = (dictionary_value)
    
value_dictionary

{'Edema': 6,
 'No Finding': 19,
 'Pneumothorax': 4.0,
 'Support Devices': 11,
 'Lung Lesion': 1.0,
 'Fracture': 2.0,
 'Pleural Effusion': 7,
 'Atelectasis': 3.0,
 'Lung Opacity': 17,
 'Consolidation': 2.0,
 'Cardiomegaly': 3.0,
 'Enlarged Cardiomediastinum': 2.0,
 'Pneumonia': 1}

In [47]:
columns_list = ['Edema', 'No Finding', 'Pneumothorax', 'Support Devices', 'Lung Lesion', 'Fracture', 'Pleural Effusion', 
                'Atelectasis', 'Lung Opacity', 'Consolidation', 
                'Cardiomegaly', 'Enlarged Cardiomediastinum', 'Pneumonia']

#column_list filters what columns values are checked
df['Integer Labels'] = df[columns_list].values.tolist()

#Data is now encoded in a list with a format of [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] depending which label has the value 1

In [48]:
#.append arguemnt was causing an error message to appear, this simply stated that .append would be removed in a future pandas
#upodate, this can be replaced with .concat but would need some slight modifications to the code below
import warnings
warnings.filterwarnings("ignore")


#The dataframe_dictionary will contain a filtered version of the df dataframe for each class, this will be used to check if 
#each class still has a row to append, as after a row is append to the updated dataframe it's removed from the dataframe in the
#dataframe_dictionary


dataframe_dictionary = {}


for value in columns_list:
    dataframe_dictionary[value] = df[df["Labels"] == value]
    
#updated_dataframe is the new fileterd dataframe where every 500 rows each class appears at least once

    
updated_dataframe = pd.DataFrame()

#The Pneumonia class is the first to run out of rows, once this runs out of rows the updated_dataframe stops appending new rows
while len(dataframe_dictionary["Pneumonia"]) != 0 :
    for value in value_dictionary:
        if len(dataframe_dictionary["Pneumonia"]) == 0:
            break
        i = 0
        while i < value_dictionary[value]:
            if len(dataframe_dictionary["Pneumonia"]) == 0:
                break
            try:
                #dataframe stored in the dataframe_dictionary is accessed for the label currently looping through the loop
                #The first occurence of that label is taken and appened to updated_dataframe and is dropped from the dataframe
                #it was taken from and the index is reset 
                index = dataframe_dictionary[value].index[0]
                row_to_append = dataframe_dictionary[value].iloc[0]
                updated_dataframe = updated_dataframe.append(row_to_append)
                dataframe_dictionary[value] = dataframe_dictionary[value].drop(index)
                dataframe_dictionary[value] = dataframe_dictionary[value].reset_index(drop=True)
                i += 1
            except IndexError:
                i += 1
                print(value)
    print(len(updated_dataframe))

    
warnings.filterwarnings("default")

78
156
234
312
390
468
546
624
702
780
858
936
1014
1092
1170
1248
1326
1404
1482
1560
1638
1716
1794
1872
1950
2028
2106
2184
2262
2340
2418
2496
2574
2652
2730
2808
2886
2964
3042
3120
3198
3276
3354
3432
3510
3588
3666
3744
3822
3900
3978
4056
4134
4212
4290
4368
4446
4524
4602
4680
4758
4836
4914
4992
5070
5148
5226
5304
5382
5460
5538
5616
5694
5772
5850
5928
6006
6084
6162
6240
6318
6396
6474
6552
6630
6708
6786
6864
6942
7020
7098
7176
7254
7332
7410
7488
7566
7644
7722
7800
7878
7956
8034
8112
8190
8268
8346
8424
8502
8580
8658
8736
8814
8892
8970
9048
9126
9204
9282
9360
9438
9516
9594
9672
9750
9828
9906
9984
10062
10140
10218
10296
10374
10452
10530
10608
10686
10764
10842
10920
10998
11076
11154
11232
11310
11388
11466
11544
11622
11700
11778
11856
11934
12012
12090
12168
12246
12324
12402
12480
12558
12636
12714
12792
12870
12948
13026
13104
13182
13260
13338
13416
13494
13572
13650
13728
13806
13884
13962
14040
14118
14196
14274
14352
14430
14508
14586
14664
14742
14820
1

In [49]:
updated_dataframe.reset_index(drop=True)
updated_dataframe['Path'] = updated_dataframe['Path'].str.replace('/', '\\')
updated_dataframe.to_csv("Filter_Data_13_Class_CheXpert")

In [50]:
updated_dataframe

Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,...,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,Labels,Integer Labels
4,CheXpert-v1.0\train\patient00003\study1\view1_...,Male,41,Frontal,AP,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Edema,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
0,CheXpert-v1.0\train\patient00015\study2\view1_...,Female,76,Frontal,AP,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Edema,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
0,CheXpert-v1.0\train\patient00098\study7\view1_...,Female,54,Frontal,AP,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Edema,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
0,CheXpert-v1.0\train\patient00100\study9\view1_...,Female,64,Frontal,AP,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Edema,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
0,CheXpert-v1.0\train\patient00114\study7\view1_...,Male,55,Frontal,AP,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Edema,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,CheXpert-v1.0\train\patient45154\study1\view1_...,Female,76,Frontal,AP,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiomegaly,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
0,CheXpert-v1.0\train\patient45220\study1\view1_...,Female,90,Frontal,AP,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiomegaly,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
0,CheXpert-v1.0\train\patient45378\study1\view1_...,Male,57,Frontal,AP,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Enlarged Cardiomediastinum,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
0,CheXpert-v1.0\train\patient45400\study3\view1_...,Male,74,Frontal,AP,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Enlarged Cardiomediastinum,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


### Calculating total number of labels now

In [51]:
place_holder_dataframe = {'Total Number': [], 'Label': []}
counting_labels = pd.DataFrame(place_holder_dataframe)

for i in updated_dataframe["Labels"].unique():
    new_row = {'Total Number': updated_dataframe[updated_dataframe["Labels"] == i]["Labels"].count(), 'Label': i}
    new_row_df = pd.DataFrame(new_row, index=[0])
    counting_labels = pd.concat([counting_labels, new_row_df], ignore_index=True)


df_sorted = counting_labels.sort_values(by='Total Number', ascending=False)

df_sorted

Unnamed: 0,Total Number,Label
1,8037.0,No Finding
8,7191.0,Lung Opacity
3,4653.0,Support Devices
6,2961.0,Pleural Effusion
0,2538.0,Edema
2,1692.0,Pneumothorax
7,1269.0,Atelectasis
10,1269.0,Cardiomegaly
5,846.0,Fracture
9,846.0,Consolidation
