In [1]:
import os
import shutil
from itertools import chain

import cv2
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit, train_test_split

In [2]:
# Load the per-image metadata table (one row per image) and preview it
df = pd.read_csv(filepath_or_buffer='./Data_Entry_2017_v2020.csv')
df.head(n=2)

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y]
0,00000001_000.png,Cardiomegaly,0,1,57,M,PA,2682,2749,0.143,0.143
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143


In [3]:
# Alphabetically sorted list of every distinct label that occurs in the
# pipe-separated 'Finding Labels' column
labels = sorted({
    finding
    for findings in df['Finding Labels']
    for finding in findings.split('|')
})

In [4]:
# Add one 0/1 indicator column per label, plus the number of labels per image.
# 'Finding Labels' is split on '|' once and each label is tested for membership
# in the resulting token list. Testing `label in <raw string>` would be a
# substring match and would wrongly flag a label contained in another label.
label_tokens = df['Finding Labels'].str.split('|')

for label in labels:
    # `label=label` binds the current label to the lambda explicitly
    df[label] = label_tokens.apply(lambda tokens, label=label: 1 if label in tokens else 0)

# Number of labels attached to each image
df['count_labels'] = label_tokens.apply(len)
df.head(5)

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,...,Fibrosis,Hernia,Infiltration,Mass,No Finding,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax,count_labels
0,00000001_000.png,Cardiomegaly,0,1,57,M,PA,2682,2749,0.143,...,0,0,0,0,0,0,0,0,0,1
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,...,0,0,0,0,0,0,0,0,0,2
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,...,0,0,0,0,0,0,0,0,0,2
3,00000002_000.png,No Finding,0,2,80,M,PA,2500,2048,0.171,...,0,0,0,0,1,0,0,0,0,1
4,00000003_001.png,Hernia,0,3,74,F,PA,2500,2048,0.168,...,0,1,0,0,0,0,0,0,0,1


In [5]:
# Keep only the identifying columns, the raw label string, the one-hot label
# columns and the per-image label count (in that order), then preview the result
id_columns = ['Patient ID', 'Follow-up #', 'Image Index', 'Finding Labels']
kept_columns = id_columns + labels + ['count_labels']
df = df[kept_columns]
df.head(2)


Unnamed: 0,Patient ID,Follow-up #,Image Index,Finding Labels,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,Mass,No Finding,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax,count_labels
0,1,0,00000001_000.png,Cardiomegaly,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,1,00000001_001.png,Cardiomegaly|Emphysema,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,2


In [6]:
# Show every image row that belongs to patient 1
df[df['Patient ID'].eq(1)]

Unnamed: 0,Patient ID,Follow-up #,Image Index,Finding Labels,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,Mass,No Finding,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax,count_labels
0,1,0,00000001_000.png,Cardiomegaly,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,1,00000001_001.png,Cardiomegaly|Emphysema,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,2
2,1,2,00000001_002.png,Cardiomegaly|Effusion,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,2


In [7]:
# Number of distinct patient IDs, followed by the largest and smallest ID
patient_ids = df['Patient ID']
for value in (len(patient_ids.unique()), patient_ids.max(), patient_ids.min()):
    print(value)


30805
30805
1


In [8]:
# For every label column print: name, number of positive images, fraction of all images.
# The label columns sit between the first four columns and the trailing 'count_labels'.
labels = df.columns[4:-1]
total_images = len(df)
for label in labels:
    positives = int((df[label] == 1).sum())
    print(label, positives, positives / total_images)


Atelectasis 11559 0.10309489832322512
Cardiomegaly 2776 0.024759186585800928
Consolidation 4667 0.0416250445950767
Edema 2303 0.020540492329646807
Effusion 13317 0.11877452729218695
Emphysema 2516 0.022440242597217268
Fibrosis 1686 0.015037459864430967
Hernia 227 0.0020246164823403494
Infiltration 19894 0.17743489118801284
Mass 5782 0.05156974669996432
No Finding 60361 0.5383606849803781
Nodule 6331 0.05646628612201213
Pleural_Thickening 3385 0.030190866928291118
Pneumonia 1431 0.012763110952550838
Pneumothorax 5302 0.04728861933642526


In [9]:
# The prevalence figures above show a strong imbalance between positive and
# negative cases for every disease. To reduce that imbalance (and shrink the
# training set), remove every image labelled 'No Finding'.
no_finding_rows = df.index[df["No Finding"] == 1]
df = df.drop(index=no_finding_rows)


In [10]:
# Fraction of positive images per label after the 'No Finding' rows were removed
labels = df.columns[4:-1]
remaining_images = len(df)
for label in labels:
    positives = int((df[label] == 1).sum())
    print(label, positives / remaining_images)

Atelectasis 0.2233234799744972
Cardiomegaly 0.05363318456693522
Consolidation 0.09016789350644333
Edema 0.04449467725419734
Effusion 0.2572885874920304
Emphysema 0.048609903591645895
Fibrosis 0.03257404509360691
Hernia 0.004385710697656447
Infiltration 0.3843582758554068
Mass 0.11171004076585715
No Finding 0.0
Nodule 0.12231689174829498
Pleural_Thickening 0.06539925423597828
Pneumonia 0.02764736567553469
Pneumothorax 0.10243629127301532


In [11]:
# After the 'No Finding' rows were removed, the 'No Finding' column is zero for
# every remaining image, so it carries no information and can be dropped.
# 'Follow-up #', 'Finding Labels' and 'count_labels' are no longer needed either.
unused_columns = ['No Finding', 'Follow-up #', 'Finding Labels', 'count_labels']
df = df.drop(columns=unused_columns)
df


Unnamed: 0,Patient ID,Image Index,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,Mass,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax
0,1,00000001_000.png,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,1,00000001_001.png,0,1,0,0,0,1,0,0,0,0,0,0,0,0
2,1,00000001_002.png,0,1,0,0,1,0,0,0,0,0,0,0,0,0
4,3,00000003_001.png,0,0,0,0,0,0,0,1,0,0,0,0,0,0
5,3,00000003_002.png,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112097,30786,00030786_006.png,0,0,1,0,0,0,0,0,0,0,0,0,0,0
112100,30789,00030789_000.png,0,0,0,0,0,0,0,0,1,0,0,0,0,0
112106,30793,00030793_000.png,0,0,0,0,0,0,0,0,0,1,1,0,0,0
112108,30795,00030795_000.png,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [12]:
# Split the dataframe into training and validation parts.
# The split is done per patient, not per image: one patient can have several
# images (follow-ups), and a row-level split would put images of the same
# patient in both sets, leaking information into validation.
# Note: 90% of the *patients* go to training, so the row ratio is only
# approximately 90/10.
splitter = GroupShuffleSplit(n_splits=1, train_size=0.9, random_state=42)
train_pos, valid_pos = next(splitter.split(df, groups=df['Patient ID']))

# Shuffle the rows of each part (seeded), so the saved files are not ordered
# by patient, as was the case with the row-level shuffled split.
train_df = df.iloc[train_pos].sample(frac=1, random_state=42)
valid_df = df.iloc[valid_pos].sample(frac=1, random_state=42)

# Sanity checks: no patient is shared between the parts and no row is lost
assert set(train_df['Patient ID']).isdisjoint(valid_df['Patient ID'])
assert len(train_df) + len(valid_df) == len(df)

In [13]:
# Write both parts to CSV files (without the index column) for the ML pipeline
for frame, csv_path in (
    (train_df, "../ML pipeline/train_csv.csv"),
    (valid_df, "../ML pipeline/valid_csv.csv"),
):
    frame.to_csv(csv_path, index=False)
