# This code is to creat folders and move the images masked as Train , Validation and Test and giving labels as Nomal and Cardiomegaly, according to the list provided in the data source 
https://nihcc.app.box.com/v/ChestXray-NIHCC

In [114]:
# General libraries
import os
import shutil
import numpy as np
import pandas as pd 
import random
import cv2
import matplotlib.pyplot as plt
%matplotlib inline
import shutil
from sklearn.model_selection import train_test_split

# Deep learning libraries
import keras.backend as K
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Flatten, Dropout, BatchNormalization
from keras.layers import Conv2D, SeparableConv2D, MaxPool2D, LeakyReLU, Activation
from keras.optimizers import Adam
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
import tensorflow as tf

# Setting seeds for reproducibility
seed = 232
np.random.seed(seed)
tf.random.set_seed(seed)

In [115]:
# Create directory, if directory exists remove and create one 

def create_directory(directory_path):
    if os.path.exists(directory_path):
        shutil.rmtree(directory_path) 
        os.mkdir(directory_path)
    else:
        os.mkdir(directory_path)

In [116]:
#moving files from one folder to other,     
def move_allfiles (src,dst):
    if os.path.exists(src):
        files = os.listdir(src) 
        for f in files:
            shutil.copyfile(src+f,dst+f)

In [117]:
#moving list of files from one folder to other,
def move_listfiles (src,dst,img_list):
    if os.path.exists(src):
        files = img_list
        for f in files:
            shutil.copyfile(src+f,dst+f)

In [118]:
#reading the data
df=pd.read_csv("Data_Entry_2017_v2020.csv") 

In [119]:
#renaming the columns names
df.rename(columns={'Image Index':'img_index','Finding Labels':'labels','Follow-up #':'follow_up','Patient ID':'pa_id',
                   'Patient Age':'pa_age','Patient Gender':'pa_gender','View Position':'view_position'}, inplace=True)

In [120]:
df.columns

Index(['img_index', 'labels', 'follow_up', 'pa_id', 'pa_age', 'pa_gender',
       'view_position', 'OriginalImage[Width', 'Height]',
       'OriginalImagePixelSpacing[x', 'y]'],
      dtype='object')

In [121]:
#convert the files in train_val_list.txt to a python list 

train_val_list= open('train_val_list.txt').read().split()
train_val_list[1:3]

['00000001_001.png', '00000001_002.png']

In [122]:
len(train_val_list)

86524

In the link provided the CXR, the list of images are provided to be considered as train and test, 
but not the validation list. I used the train_test plit method to randomly select validation data
as it is good to validate the data before testing a model.

In [123]:
np.random.shuffle(train_val_list)

In [124]:
train_img, valid_img = train_test_split(train_val_list, 
                                   test_size = 0.30)

In [125]:
train_img[0:3]

['00028086_001.png', '00023743_000.png', '00014691_001.png']

In [126]:
print(len(train_img),len(valid_img), len(train_val_list))

60566 25958 86524


In [127]:
#Selecting the images under train_img from Data_Entry_2017.csv file

df_train_img = df[df.img_index.isin(train_img)]

In [128]:
df_train_img.head()
#following are the images to be trained

Unnamed: 0,img_index,labels,follow_up,pa_id,pa_age,pa_gender,view_position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y]
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168
3,00000002_000.png,No Finding,0,2,80,M,PA,2500,2048,0.171,0.171
12,00000004_000.png,Mass|Nodule,0,4,82,M,AP,2500,2048,0.168,0.168
13,00000005_000.png,No Finding,0,5,69,F,PA,2048,2500,0.168,0.168


In [129]:
len(df_train_img)

60566

In [130]:
#checking images with various labels
print(df_train_img['labels'].unique()[0:5])

['Cardiomegaly|Emphysema' 'Cardiomegaly|Effusion' 'No Finding'
 'Mass|Nodule' 'Infiltration']


Here we consider the label "No Finding" as normal. Although in the the documentation it has been identified specifically that the terms like ‘It is hard to exclude ...’ is treated as uncertainty cases and then the image is labeled as ‘No finding’. We have considerd that as "no cardiomegaly" or "Normal". 

In [131]:
#assign 1 for 'No Finding' and 2 for 'Cardiomegaly'
df_train_img.loc[df_train_img.labels.str.contains('No Finding'), 'marker'] = '1'

df_train_img.loc[df_train_img.labels.str.contains('Cardiomegaly'), 'marker'] = '2'



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [132]:
df_train_img.head()

Unnamed: 0,img_index,labels,follow_up,pa_id,pa_age,pa_gender,view_position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],marker
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,2.0
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,2.0
3,00000002_000.png,No Finding,0,2,80,M,PA,2500,2048,0.171,0.171,1.0
12,00000004_000.png,Mass|Nodule,0,4,82,M,AP,2500,2048,0.168,0.168,
13,00000005_000.png,No Finding,0,5,69,F,PA,2048,2500,0.168,0.168,1.0


In [133]:
df_train_img.describe()

Unnamed: 0,follow_up,pa_id,pa_age,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y]
count,60566.0,60566.0,60566.0,60566.0,60566.0,60566.0,60566.0
mean,5.127448,13785.226612,46.59687,2640.63075,2503.839563,0.155418,0.155418
std,8.692182,8277.874008,16.66908,337.36858,403.249851,0.015624,0.015624
min,0.0,1.0,0.0,1301.0,966.0,0.115,0.115
25%,0.0,6738.0,34.0,2500.0,2048.0,0.143,0.143
50%,2.0,13298.0,48.0,2514.0,2544.0,0.143,0.143
75%,6.0,20202.0,59.0,2992.0,2991.0,0.168,0.168
max,108.0,30801.0,95.0,3550.0,3166.0,0.1988,0.1988


In [134]:
df_train_normal=df_train_img[df_train_img['marker'] == '1']

In [135]:
df_train_cardiomegaly=df_train_img[df_train_img['marker'] == '2']

In [136]:
df_train_normal.head()

Unnamed: 0,img_index,labels,follow_up,pa_id,pa_age,pa_gender,view_position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],marker
3,00000002_000.png,No Finding,0,2,80,M,PA,2500,2048,0.171,0.171,1
13,00000005_000.png,No Finding,0,5,69,F,PA,2048,2500,0.168,0.168,1
14,00000005_001.png,No Finding,1,5,69,F,AP,2500,2048,0.168,0.168,1
15,00000005_002.png,No Finding,2,5,69,F,AP,2500,2048,0.168,0.168,1
16,00000005_003.png,No Finding,3,5,69,F,PA,2992,2991,0.143,0.143,1


In [137]:
df_train_cardiomegaly.head()

Unnamed: 0,img_index,labels,follow_up,pa_id,pa_age,pa_gender,view_position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],marker
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,2
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,2
23,00000008_000.png,Cardiomegaly,0,8,68,F,PA,2048,2500,0.171,0.171,2
175,00000033_000.png,Atelectasis|Cardiomegaly|Fibrosis,0,33,72,F,PA,2992,2745,0.143,0.143,2
185,00000038_003.png,Cardiomegaly,3,38,76,M,AP,3056,2544,0.139,0.139,2


In [138]:
#number of images which do not come under normal or cardiomegaly
print('Number of images which do not come under normal or cardiomegaly:',len(df_train_img)-(len(df_train_normal)+len(df_train_cardiomegaly)))

Number of images which do not come under normal or cardiomegaly: 24021


In [139]:
#number of raw images being analysed
print('Number of raw images being analysed:',len(df_train_normal)+len(df_train_cardiomegaly))


Number of raw images being analysed: 36545


In [140]:
#making list of images under normal
train_normal_img_list = df_train_normal.img_index.tolist()
train_normal_img_list[1:3]

['00000005_000.png', '00000005_001.png']

In [141]:
#making list of images under cardiomegaly
train_cardiomegaly_img_list = df_train_cardiomegaly.img_index.tolist()
train_cardiomegaly_img_list[1:3]

['00000001_002.png', '00000008_000.png']

In [142]:
#create a folder train_img to collect all the images belonging to train_img.txt
create_directory('/Users/neeharikasinha/Documents/datascience-course/Capstones/Chestxray-cardiomegaly3/train_img/')

In [143]:
#move the images in train_img
move_listfiles('/Users/neeharikasinha/Documents/datascience-course/Capstones/Chestxray-cardiomegaly3/img_all/','/Users/neeharikasinha/Documents/datascience-course/Capstones/Chestxray-cardiomegaly3/train_img/',train_img)

In [144]:
#create folder train_normal_img
create_directory('/Users/neeharikasinha/Documents/datascience-course/Capstones/Chestxray-cardiomegaly3/train_normal_img/')

In [145]:
#moving images marked as normal from train_img to train_normal_img

move_listfiles('/Users/neeharikasinha/Documents/datascience-course/Capstones/Chestxray-cardiomegaly3/train_img/','/Users/neeharikasinha/Documents/datascience-course/Capstones/Chestxray-cardiomegaly3/train_normal_img/',train_normal_img_list)

In [146]:
#create folder train_cardiomegaly_img
create_directory('/Users/neeharikasinha/Documents/datascience-course/Capstones/Chestxray-cardiomegaly3/train_cardiomegaly_img/')

In [147]:
#moving images marked as cardiomegaly from train_img to train_cardiomegaly_img

move_listfiles('/Users/neeharikasinha/Documents/datascience-course/Capstones/Chestxray-cardiomegaly3/train_img/','/Users/neeharikasinha/Documents/datascience-course/Capstones/Chestxray-cardiomegaly3/train_cardiomegaly_img/',train_cardiomegaly_img_list)

In [148]:
#create a folder test_img to collect all the images belonging to test_list.txt
create_directory('/Users/neeharikasinha/Documents/datascience-course/Capstones/Chestxray-cardiomegaly3/test_img/')


In [149]:
#convert the files in test_list.txt to a python list 

test_list= open('/Users/neeharikasinha/Documents/datascience-course/Capstones/Chestxray-cardiomegaly3/test_list.txt').read().split()
test_list[1:3]

['00000003_001.png', '00000003_002.png']

In [150]:
len(test_list)

25596

In [151]:
#Selecting the images under test_list from Data_Entry_2017_v2020.csv file

df_test_img = df[df.img_index.isin(test_list)]

In [152]:
len(df_test_img)

25596

In [153]:
df_test_img.head()

Unnamed: 0,img_index,labels,follow_up,pa_id,pa_age,pa_gender,view_position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y]
4,00000003_001.png,Hernia,0,3,74,F,PA,2500,2048,0.168,0.168
5,00000003_002.png,Hernia,1,3,75,F,PA,2048,2500,0.168,0.168
6,00000003_003.png,Hernia|Infiltration,2,3,76,F,PA,2698,2991,0.143,0.143
7,00000003_004.png,Hernia,3,3,77,F,PA,2500,2048,0.168,0.168
8,00000003_005.png,Hernia,4,3,78,F,PA,2686,2991,0.143,0.143


In [154]:
#assign 1 for 'No Finding' and 2 for 'Cardiomegaly'
df_test_img.loc[df_test_img.labels.str.contains('No Finding'), 'marker'] = '1'

df_test_img.loc[df_test_img.labels.str.contains('Cardiomegaly'), 'marker'] = '2'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [155]:
df_test_normal=df_test_img[df_test_img['marker'] == '1']

In [156]:
df_test_cardiomegaly=df_test_img[df_test_img['marker'] == '2']

In [157]:
#number of raw images being tested
print('Number of raw images being tested:',len(df_test_normal)+len(df_test_cardiomegaly))


Number of raw images being tested: 10930


In [158]:
#making list of images under normal
test_normal_img_list = df_test_normal.img_index.tolist()
test_normal_img_list[1:3]

['00000013_000.png', '00000013_008.png']

In [159]:
#making list of images under cardiomegaly
test_cardiomegaly_img_list = df_test_cardiomegaly.img_index.tolist()
test_cardiomegaly_img_list[1:3]

['00000013_026.png', '00000013_027.png']

In [160]:
#create a folder test_img to collect all the images belonging to test_list.txt
create_directory('/Users/neeharikasinha/Documents/datascience-course/Capstones/Chestxray-cardiomegaly3/test_img/')

In [161]:
#move the images in test_img
move_listfiles('/Users/neeharikasinha/Documents/datascience-course/Capstones/Chestxray-cardiomegaly3/img_all/','/Users/neeharikasinha/Documents/datascience-course/Capstones/Chestxray-cardiomegaly3/test_img/',test_list)


In [162]:
#create folder test_normal_img
create_directory('/Users/neeharikasinha/Documents/datascience-course/Capstones/Chestxray-cardiomegaly3/test_normal_img/')

In [163]:
#moving images marked as normal from test_img to test_normal_img

move_listfiles('/Users/neeharikasinha/Documents/datascience-course/Capstones/Chestxray-cardiomegaly3/test_img/','/Users/neeharikasinha/Documents/datascience-course/Capstones/Chestxray-cardiomegaly3/test_normal_img/',test_normal_img_list)

In [164]:
#create folder test_cardiomegaly_img
create_directory('/Users/neeharikasinha/Documents/datascience-course/Capstones/Chestxray-cardiomegaly3/test_cardiomegaly_img/')

In [165]:
#moving images marked as cardiomegaly from test_img to test_cardiomegaly_img

move_listfiles('/Users/neeharikasinha/Documents/datascience-course/Capstones/Chestxray-cardiomegaly3/test_img/','/Users/neeharikasinha/Documents/datascience-course/Capstones/Chestxray-cardiomegaly3/test_cardiomegaly_img/',test_cardiomegaly_img_list)

Creating and moving images marked valid

In [211]:
#create a folder val_img to collect all the images belonging to valid_list.txt
create_directory('/Users/neeharikasinha/Documents/datascience-course/Capstones/Chestxray-cardiomegaly3/val_img/')

In [212]:
#create folder val_normal_img
create_directory('/Users/neeharikasinha/Documents/datascience-course/Capstones/Chestxray-cardiomegaly3/val_normal_img/')

In [213]:
#create folder val_cardiomegaly_img
create_directory('/Users/neeharikasinha/Documents/datascience-course/Capstones/Chestxray-cardiomegaly3/val_cardiomegaly_img/')

In [214]:
#move the images in valid_img
move_listfiles('/Users/neeharikasinha/Documents/datascience-course/Capstones/Chestxray-cardiomegaly3/img_all/','/Users/neeharikasinha/Documents/datascience-course/Capstones/Chestxray-cardiomegaly3/val_img/',valid_img)


In [215]:
#Selecting the images under val_img from Data_Entry_2017_v2020.csv file

df_val_img = df[df.img_index.isin(valid_img)]

In [216]:
len(df_val_img)

25958

In [217]:
df_val_img.head()

Unnamed: 0,img_index,labels,follow_up,pa_id,pa_age,pa_gender,view_position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y]
0,00000001_000.png,Cardiomegaly,0,1,57,M,PA,2682,2749,0.143,0.143
21,00000006_000.png,No Finding,0,6,81,M,PA,2500,2048,0.168,0.168
22,00000007_000.png,No Finding,0,7,82,M,PA,2500,2048,0.168,0.168
26,00000009_000.png,Emphysema,0,9,72,M,PA,2992,2991,0.143,0.143
28,00000011_000.png,Effusion,0,11,74,M,PA,2638,2449,0.143,0.143


In [218]:
#assign 1 for 'No Finding' and 2 for 'Cardiomegaly'
df_val_img.loc[df_val_img.labels.str.contains('No Finding'), 'marker'] = '1'

df_val_img.loc[df_val_img.labels.str.contains('Cardiomegaly'), 'marker'] = '2'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [219]:
df_val_normal=df_val_img[df_val_img['marker'] == '1']

In [220]:
df_val_cardiomegaly=df_val_img[df_val_img['marker'] == '2']

In [221]:
#number of raw images being validated
print('Number of raw images being validated:',len(df_val_normal)+len(df_val_cardiomegaly))


Number of raw images being validated: 15662


In [222]:
#making list of images under normal
val_normal_img_list = df_val_normal.img_index.tolist()
val_normal_img_list[1:3]

['00000007_000.png', '00000011_002.png']

In [223]:
#making list of images under cardiomegaly
val_cardiomegaly_img_list = df_val_cardiomegaly.img_index.tolist()
val_cardiomegaly_img_list[1:3]

['00000038_000.png', '00000045_000.png']

In [224]:
#moving images marked as normal from val_img to val_normal_img

move_listfiles('/Users/neeharikasinha/Documents/datascience-course/Capstones/Chestxray-cardiomegaly3/val_img/','/Users/neeharikasinha/Documents/datascience-course/Capstones/Chestxray-cardiomegaly3/val_normal_img/',val_normal_img_list)

In [225]:
#create folder val_cardiomegaly_img
create_directory('/Users/neeharikasinha/Documents/datascience-course/Capstones/Chestxray-cardiomegaly3/val_cardiomegaly_img/')

In [226]:
#moving images marked as cardiomegaly from val_img to val_cardiomegaly_img

move_listfiles('/Users/neeharikasinha/Documents/datascience-course/Capstones/Chestxray-cardiomegaly3/val_img/','/Users/neeharikasinha/Documents/datascience-course/Capstones/Chestxray-cardiomegaly3/val_cardiomegaly_img/',val_cardiomegaly_img_list)