In [1]:
import torch.optim as optim
import torch.nn.functional as F
import torch.nn as nn
import os
from torch.utils.data import Dataset, DataLoader
import torchvision
from torchvision import transforms as tf
from os import walk
import torch
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
from matplotlib import image
# Set ipython's max row display
pd.set_option('display.max_row', 1000)

# Set iPython's max column width to 50
pd.set_option('display.max_columns', 50)

In [2]:
train_list_df = pd.read_csv('train_val_list.txt', header=None, names=['Image Index'])
test_list_df = pd.read_csv('test_list.txt', header=None, names=['Image Index'])
xray_train_df = pd.read_csv('BBox_List_2017.csv')
xray_test_df = pd.read_csv('Data_entry_2017_v2020.csv')

In [3]:
train_list_df.shape, test_list_df.shape

((86524, 1), (25596, 1))

In [4]:
train_list_df.head(10)

Unnamed: 0,Image Index
0,00000001_000.png
1,00000001_001.png
2,00000001_002.png
3,00000002_000.png
4,00000004_000.png
5,00000005_000.png
6,00000005_001.png
7,00000005_002.png
8,00000005_003.png
9,00000005_004.png


In [5]:
test_list_df.sample(10)

Unnamed: 0,Image Index
5443,00008911_033.png
2148,00003029_007.png
10152,00013670_037.png
18295,00021377_012.png
3663,00005801_009.png
21555,00027479_006.png
21922,00028027_002.png
8056,00011832_002.png
15411,00018253_033.png
10001,00013625_045.png


In [6]:
xray_test_df

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y]
0,00000001_000.png,Cardiomegaly,0,1,57,M,PA,2682,2749,0.143,0.143
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168
3,00000002_000.png,No Finding,0,2,80,M,PA,2500,2048,0.171,0.171
4,00000003_001.png,Hernia,0,3,74,F,PA,2500,2048,0.168,0.168
...,...,...,...,...,...,...,...,...,...,...,...
112115,00030801_001.png,Mass|Pneumonia,1,30801,38,M,PA,2048,2500,0.168,0.168
112116,00030802_000.png,No Finding,0,30802,28,M,PA,2048,2500,0.168,0.168
112117,00030803_000.png,No Finding,0,30803,42,F,PA,2048,2500,0.168,0.168
112118,00030804_000.png,No Finding,0,30804,29,F,PA,2048,2500,0.168,0.168


In [7]:
xray_test_df.columns

Index(['Image Index', 'Finding Labels', 'Follow-up #', 'Patient ID',
       'Patient Age', 'Patient Gender', 'View Position', 'OriginalImage[Width',
       'Height]', 'OriginalImagePixelSpacing[x', 'y]'],
      dtype='object')

In [8]:
my_list = []
for each in xray_test_df["Finding Labels"].str.split('|'):
    my_list += each
my_unique_list = set(my_list)

In [9]:
# new = xray_test_df["Finding Labels"].str.split("|", n = 9, expand = True)
# new.sample(20)

In [10]:
# xray_test_df['Finding Labels'] = xray_test_df["Finding Labels"].str.split("|")

In [11]:
def search_me(df_series, term):
    return 1 if term in df_series.split('|') else 0

In [12]:
for each_disease in my_unique_list:
    xray_test_df[each_disease] = xray_test_df["Finding Labels"].apply(search_me, args=(each_disease,))
xray_test_df = xray_test_df.drop('Finding Labels', axis=1)

In [13]:
def my_hot_encoding(dataframe, feature_list_to_encode):
    encoding_dict = {}
    for each_feature in feature_list_to_encode:
        feature_dict = {}
        for index_type, each_type in enumerate(dataframe[each_feature].unique()):
            feature_dict[each_type] = index_type
        encoding_dict[each_feature] = feature_dict
    for each_feature in feature_list_to_encode:
        dataframe[each_feature] = dataframe[each_feature].map(lambda x: encoding_dict[each_feature][x])
    dataframe[each_feature].astype(np.int64)
    return encoding_dict  

In [14]:
gender = my_hot_encoding(xray_test_df, ['Patient Gender'])

In [15]:
view_position = my_hot_encoding(xray_test_df, ['View Position'])

In [16]:
xray_test_df.sample(20)

Unnamed: 0,Image Index,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Effusion,Pleural_Thickening,No Finding,Hernia,Pneumothorax,Pneumonia,Mass,Atelectasis,Cardiomegaly,Fibrosis,Edema,Consolidation,Emphysema,Infiltration,Nodule
49132,00012467_000.png,0,12467,25,1,0,2992,2991,0.143,0.143,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
87621,00021648_004.png,10,21648,22,1,1,3056,2544,0.139,0.139,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
95859,00025228_008.png,8,25228,56,1,0,2544,3056,0.139,0.139,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
37825,00009951_000.png,0,9951,57,1,0,2048,2500,0.168,0.168,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
5179,00001381_001.png,1,1381,45,0,0,2992,2991,0.143,0.143,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
92822,00023162_020.png,20,23162,13,1,1,3056,2544,0.139,0.139,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
20738,00005538_001.png,2,5538,59,1,1,2500,2048,0.171,0.171,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2186,00000573_007.png,12,573,55,0,0,2946,2948,0.143,0.143,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
90307,00022456_002.png,0,22456,42,0,0,2992,2991,0.143,0.143,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
40270,00010491_006.png,6,10491,55,1,0,2986,2991,0.143,0.143,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [17]:
# xray_test_other_view_df = pd.pivot_table(xray_test_df, index=['Patient ID', 'Follow-up #', 'Image Index'])
# xray_test_other_view_df.head(20)

In [18]:
xray_train_df.isna().count()

Image Index      984
Finding Label    984
Bbox [x          984
y                984
w                984
h]               984
Unnamed: 6       984
Unnamed: 7       984
Unnamed: 8       984
dtype: int64

In [19]:
xray_train_df = xray_train_df.dropna(axis=1)
xray_train_df.sample(10)

Unnamed: 0,Image Index,Finding Label,Bbox [x,y,w,h]
904,00023162_025.png,Pneumothorax,192.853333,79.966823,263.964444,257.137778
763,00011514_015.png,Pneumonia,552.391111,458.846823,196.835556,176.355556
205,00013062_002.png,Cardiomegaly,352.0,340.338983,398.101695,395.932203
934,00009166_004.png,Pneumothorax,156.038095,243.809524,83.437037,302.32381
952,00021896_003.png,Pneumothorax,238.391534,137.616931,226.471958,214.552381
204,00026338_003.png,Cardiomegaly,352.0,345.762712,439.322034,314.576271
635,00030323_015.png,Mass,615.483598,423.686772,225.38836,290.404233
413,00027817_001.png,Effusion,760.685714,390.095238,197.214815,367.339683
122,00014795_002.png,Atelectasis,289.320635,541.798942,148.45291,133.28254
114,00000457_004.png,Atelectasis,802.946032,281.73545,93.189418,105.108995


In [20]:
def search_me2(df_series, term):
    return 1 if term in df_series else 0

In [21]:
for each_disease in my_unique_list:
    xray_train_df[each_disease] = xray_train_df["Finding Label"].apply(search_me2, args=(each_disease,))
# xray_train_df = xray_train_df.drop('Finding Label', axis=1)

In [22]:
xray_train_df.sample(20)

Unnamed: 0,Image Index,Finding Label,Bbox [x,y,w,h],Effusion,Pleural_Thickening,No Finding,Hernia,Pneumothorax,Pneumonia,Mass,Atelectasis,Cardiomegaly,Fibrosis,Edema,Consolidation,Emphysema,Infiltration,Nodule
129,00012829_004.png,Atelectasis,169.04127,539.631746,167.957672,93.189418,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
187,00002435_005.png,Cardiomegaly,362.847458,480.271186,401.355932,298.305085,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
350,00016291_002.png,Effusion,617.244444,676.162378,326.542222,78.506667,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
296,00028607_000.png,Cardiomegaly,404.182011,302.32381,497.371429,443.191534,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
767,00015069_001.png,Pneumonia,201.955556,227.877934,649.671111,508.586667,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
376,00021967_000.png,Effusion,182.044444,92.10582,365.172487,586.226455,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
278,00019187_000.png,Cardiomegaly,396.596825,383.593651,464.863492,384.677249,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
332,00018366_029.png,Effusion,560.355556,471.362378,251.448889,65.991111,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
832,00004344_002.png,Pneumonia,254.645503,299.073016,171.208466,392.262434,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
411,00009218_015.png,Effusion,193.964021,218.886772,221.053968,592.728042,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [23]:
xray_test_df.columns

Index(['Image Index', 'Follow-up #', 'Patient ID', 'Patient Age',
       'Patient Gender', 'View Position', 'OriginalImage[Width', 'Height]',
       'OriginalImagePixelSpacing[x', 'y]', 'Effusion', 'Pleural_Thickening',
       'No Finding', 'Hernia', 'Pneumothorax', 'Pneumonia', 'Mass',
       'Atelectasis', 'Cardiomegaly', 'Fibrosis', 'Edema', 'Consolidation',
       'Emphysema', 'Infiltration', 'Nodule'],
      dtype='object')

In [24]:
xray_train_df.columns

Index(['Image Index', 'Finding Label', 'Bbox [x', 'y', 'w', 'h]', 'Effusion',
       'Pleural_Thickening', 'No Finding', 'Hernia', 'Pneumothorax',
       'Pneumonia', 'Mass', 'Atelectasis', 'Cardiomegaly', 'Fibrosis', 'Edema',
       'Consolidation', 'Emphysema', 'Infiltration', 'Nodule'],
      dtype='object')

In [25]:
xray_test_df = xray_test_df[~(xray_test_df['No Finding'] == 1)]
xray_train_df = xray_train_df[~(xray_train_df['No Finding'] == 1)]
xray_test_df.drop(['No Finding'], axis=1)
xray_train_df.drop(['No Finding'], axis=1)

Unnamed: 0,Image Index,Finding Label,Bbox [x,y,w,h],Effusion,Pleural_Thickening,Hernia,Pneumothorax,Pneumonia,Mass,Atelectasis,Cardiomegaly,Fibrosis,Edema,Consolidation,Emphysema,Infiltration,Nodule
0,00013118_008.png,Atelectasis,225.084746,547.019217,86.779661,79.186441,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,00014716_007.png,Atelectasis,686.101695,131.543498,185.491525,313.491525,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,00029817_009.png,Atelectasis,221.830508,317.053115,155.118644,216.949153,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,00014687_001.png,Atelectasis,726.237288,494.95142,141.016949,55.322034,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,00017877_001.png,Atelectasis,660.067797,569.780787,200.677966,78.101695,0,0,0,0,0,0,1,0,0,0,0,0,0,0
5,00003148_004.png,Atelectasis,596.067797,505.780787,56.40678,180.067797,0,0,0,0,0,0,1,0,0,0,0,0,0,0
6,00012515_002.png,Atelectasis,289.084746,638.137861,83.525424,56.40678,0,0,0,0,0,0,1,0,0,0,0,0,0,0
7,00022098_006.png,Atelectasis,494.101695,577.392098,271.186441,154.033898,0,0,0,0,0,0,1,0,0,0,0,0,0,0
8,00014198_000.png,Atelectasis,676.338983,512.307352,98.711864,193.084746,0,0,0,0,0,0,1,0,0,0,0,0,0,0
9,00021007_000.png,Atelectasis,344.40678,468.917522,105.220339,101.966102,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [26]:
xray_test_2df = xray_test_df[['Image Index','OriginalImage[Width', 'Height]', 'OriginalImagePixelSpacing[x', 'y]', 'Mass', 'Fibrosis', 'Pneumonia',
       'Edema', 'Nodule', 'Pleural_Thickening', 'Consolidation', 'Effusion', 'Cardiomegaly', 'Emphysema', 'Hernia', 
       'Infiltration', 'Pneumothorax', 'Atelectasis']]

In [27]:
xray_train_2df = xray_train_df[['Image Index', 'Bbox [x', 'y', 'w', 'h]', 'Mass', 'Fibrosis',
       'Pneumonia', 'Edema', 'Nodule', 'Pleural_Thickening', 'Consolidation',
       'Effusion', 'Cardiomegaly', 'Emphysema', 'Hernia', 'Infiltration', 'Pneumothorax', 'Atelectasis']]

In [28]:
xray_train_2df = xray_train_df.rename(columns={'Bbox [x': 'x', 'h]': 'h'})

In [29]:
xray_train_2df.sample(10)

Unnamed: 0,Image Index,Finding Label,x,y,w,h,Effusion,Pleural_Thickening,No Finding,Hernia,Pneumothorax,Pneumonia,Mass,Atelectasis,Cardiomegaly,Fibrosis,Edema,Consolidation,Emphysema,Infiltration,Nodule
125,00019271_065.png,Atelectasis,211.301587,499.538624,141.951323,110.526984,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
845,00027758_004.png,Pneumonia,223.221164,500.622222,239.475132,123.530159,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
644,00019373_058.png,Mass,325.079365,330.497354,123.530159,157.121693,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
462,00012021_081.png,Infiltrate,655.928889,442.917934,192.284444,197.973333,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
941,00012622_016.png,Pneumothorax,763.936508,167.957672,81.269841,126.780952,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
861,00012834_113.png,Pneumonia,568.888889,260.063492,266.565079,423.686772,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
494,00013391_005.png,Infiltrate,90.453333,365.549045,303.786667,318.577778,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,00012515_002.png,Atelectasis,289.084746,638.137861,83.525424,56.40678,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
948,00028628_008.png,Pneumothorax,606.814815,98.607407,211.301587,322.912169,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
284,00004344_013.png,Cardiomegaly,350.002116,346.751323,525.544974,422.603175,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [30]:
xray_test_2df = xray_test_2df.rename(columns={'OriginalImage[Width': 'w', 'Height]': 'h', 'OriginalImagePixelSpacing[x': 'x', 'y]':'y'})

In [31]:
xray_test_2df.sample(10)

Unnamed: 0,Image Index,w,h,x,y,Mass,Fibrosis,Pneumonia,Edema,Nodule,Pleural_Thickening,Consolidation,Effusion,Cardiomegaly,Emphysema,Hernia,Infiltration,Pneumothorax,Atelectasis
105501,00028357_027.png,3056,2544,0.139,0.139,0,0,0,1,1,0,0,1,0,0,0,1,0,0
36193,00009554_000.png,2500,2048,0.168,0.168,0,0,0,0,1,0,0,0,0,0,0,1,0,0
71461,00017611_008.png,2992,2991,0.143,0.143,0,0,0,0,0,1,0,0,0,0,0,0,0,0
61861,00015286_003.png,2754,2833,0.143,0.143,0,0,0,0,1,0,0,0,1,0,0,0,0,0
7344,00001941_007.png,2048,2500,0.168,0.168,0,0,0,0,0,0,0,0,0,0,0,0,0,1
100766,00026759_000.png,2544,3056,0.139,0.139,0,0,0,0,0,0,0,0,0,0,0,1,0,1
40780,00010582_000.png,2992,2991,0.143,0.143,0,0,0,0,0,0,0,0,0,0,0,0,0,1
17267,00004630_000.png,2500,2048,0.171,0.171,0,0,0,0,0,0,0,0,1,0,0,0,0,0
56248,00014018_005.png,2500,2048,0.168,0.168,1,0,0,0,0,0,0,0,0,0,0,0,0,0
85354,00021035_010.png,3056,2544,0.139,0.139,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [32]:
xray_test_2df['Image Index'].isin(train_list_df['Image Index']).value_counts()

True     36024
False    15735
Name: Image Index, dtype: int64

In [33]:
xray_train_2df['Image Index'].isin(train_list_df['Image Index']).value_counts()

False    984
Name: Image Index, dtype: int64

In [34]:
xray_test_2df['Image Index'].isin(test_list_df['Image Index']).value_counts()

False    36024
True     15735
Name: Image Index, dtype: int64

In [35]:
xray_train_2df['Image Index'].isin(test_list_df['Image Index']).value_counts()

True    984
Name: Image Index, dtype: int64

In [36]:
train_df = xray_test_2df[xray_test_2df['Image Index'].isin(train_list_df['Image Index'])].copy()

In [37]:
test_df = xray_test_2df[xray_test_2df['Image Index'].isin(test_list_df['Image Index'])].copy()
test_df.sample(10)

Unnamed: 0,Image Index,w,h,x,y,Mass,Fibrosis,Pneumonia,Edema,Nodule,Pleural_Thickening,Consolidation,Effusion,Cardiomegaly,Emphysema,Hernia,Infiltration,Pneumothorax,Atelectasis
50669,00012834_074.png,2048,2500,0.168,0.168,0,0,0,0,0,1,1,1,0,0,0,1,0,0
37284,00009845_019.png,2500,2048,0.168,0.168,0,0,0,0,0,1,0,1,0,0,0,0,0,0
125,00000032_011.png,2500,2048,0.168,0.168,0,0,0,0,0,0,0,0,0,0,0,1,0,0
60672,00014976_002.png,2992,2991,0.143,0.143,0,0,0,0,0,0,0,0,1,0,0,0,0,0
98634,00026078_006.png,2992,2991,0.143,0.143,0,0,0,0,0,0,0,0,0,0,0,0,1,0
42085,00010828_028.png,2500,2048,0.168,0.168,0,0,0,0,1,0,1,1,0,0,0,0,0,1
42024,00010815_017.png,2500,2048,0.168,0.168,0,0,0,0,0,0,0,1,0,0,0,0,0,0
90182,00022416_053.png,3056,2544,0.139,0.139,0,0,0,0,0,0,0,1,0,0,0,0,0,0
56979,00014149_030.png,2500,2048,0.168,0.168,0,0,0,0,0,0,0,1,0,0,0,1,0,0
79420,00019508_005.png,3056,2456,0.139,0.139,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [38]:
test_df = test_df.append(xray_train_2df[xray_train_2df['Image Index'].isin(test_list_df['Image Index'])])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [39]:
test_df = test_df.drop(['No Finding', 'Finding Label'], axis=1)

In [40]:
test_df.to_csv('test_df.csv', index=False)

In [41]:
train_df.to_csv('train_df.csv', index=False)

In [42]:
test_df.shape

(16719, 19)

In [43]:
test_df.sample(10)

Unnamed: 0,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Image Index,Infiltration,Mass,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax,h,w,x,y
34922,1,0,0,0,1,0,0,0,00009218_032.png,0,0,0,0,0,0,2048.0,2500.0,0.168,0.168
81611,0,0,1,0,0,0,0,0,00020065_009.png,0,0,0,0,0,0,2544.0,3056.0,0.139,0.139
8779,0,0,0,0,0,0,0,0,00002316_008.png,0,0,0,0,0,1,2991.0,2992.0,0.143,0.143
97283,1,0,1,0,0,0,0,0,00025664_004.png,0,0,0,0,0,0,2544.0,3056.0,0.139,0.139
86087,1,0,1,0,1,1,0,0,00021222_013.png,1,1,0,0,0,0,2544.0,2920.0,0.139,0.139
75930,0,0,0,0,0,0,0,0,00018623_000.png,0,1,1,0,0,0,2991.0,2992.0,0.143,0.143
47717,0,0,0,0,0,0,0,0,00012094_034.png,1,0,0,0,0,0,2048.0,2500.0,0.168,0.168
947,0,0,0,0,0,0,0,0,00007471_003.png,0,0,0,0,0,1,483.284656,333.748148,540.715344,106.192593
109568,1,0,0,0,1,0,0,0,00029813_018.png,1,0,0,0,0,0,2021.0,2021.0,0.194311,0.194311
17938,0,1,0,0,0,0,0,0,00004822_051.png,0,0,0,0,0,0,2544.0,3056.0,0.139,0.139
