In [33]:

import os
import sys
import sklearn
import pip
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from pathlib import Path
plt.rc('font', size=14)
plt.rc('axes', labelsize=14, titlesize=14)
plt.rc('legend', fontsize=14)
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)

# Requirements taken from https://www.tensorflow.org/install/pip#linux
python_version = sys.version
pip_version = pip.__version__
nvidia_smi_version = os.popen('nvidia-smi --query-gpu=driver_version --format=csv,noheader').read().strip()
cuda_version = os.popen('nvcc --version').read().split('\n')[3].split(',')[1].strip()
tensorflow_version = tf.__version__
physical_devices = tf.config.list_physical_devices('GPU')
num_gpus = len(physical_devices)
gpu_model = os.popen('nvidia-smi --query-gpu=name --format=csv,noheader').read().strip()

req_df = pd.DataFrame({
    'Package': ['Python', 'pip', 'nvidia-smi', 'cuda', 'tensorflow', 'GPUs'],
    'Required': ['3.7', '20.2', '450.51.06', '11.0', '2.3.0', '1'],
    'Installed': [python_version, pip_version, nvidia_smi_version, cuda_version, tensorflow_version, num_gpus]
})
               
print(f'All requiremetns met. Using TF with GPU: {gpu_model} ({num_gpus}x)')

pd.options.plotting.backend = "plotly"

req_df


All requiremetns met. Using TF with GPU: NVIDIA GeForce GTX 1660 (1x)


Unnamed: 0,Package,Required,Installed
0,Python,3.7,"3.9.17 (main, Jul 5 2023, 20:41:20) \n[GCC 11..."
1,pip,20.2,23.2.1
2,nvidia-smi,450.51.06,535.54.03
3,cuda,11.0,release 12.2
4,tensorflow,2.3.0,2.15.0
5,GPUs,1,1


In [34]:
# https://github.com/ageron/handson-ml3/blob/main/10_neural_nets_with_keras.ipynb
from pathlib import Path

IMAGES_PATH = Path() / "images" / "chest_xray_14"
IMAGES_PATH.mkdir(parents=True, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = IMAGES_PATH / f"{fig_id}.{fig_extension}"
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [35]:

sample_labels =  pd.read_csv('sample/sample_labels.csv')
sample_labels.head()

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImageWidth,OriginalImageHeight,OriginalImagePixelSpacing_x,OriginalImagePixelSpacing_y
0,00000013_005.png,Emphysema|Infiltration|Pleural_Thickening|Pneu...,5,13,060Y,M,AP,3056,2544,0.139,0.139
1,00000013_026.png,Cardiomegaly|Emphysema,26,13,057Y,M,AP,2500,2048,0.168,0.168
2,00000017_001.png,No Finding,1,17,077Y,M,AP,2500,2048,0.168,0.168
3,00000030_001.png,Atelectasis,1,30,079Y,M,PA,2992,2991,0.143,0.143
4,00000032_001.png,Cardiomegaly|Edema|Effusion,1,32,055Y,F,AP,2500,2048,0.168,0.168


In [36]:
sample_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5606 entries, 0 to 5605
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Image Index                  5606 non-null   object 
 1   Finding Labels               5606 non-null   object 
 2   Follow-up #                  5606 non-null   int64  
 3   Patient ID                   5606 non-null   int64  
 4   Patient Age                  5606 non-null   object 
 5   Patient Gender               5606 non-null   object 
 6   View Position                5606 non-null   object 
 7   OriginalImageWidth           5606 non-null   int64  
 8   OriginalImageHeight          5606 non-null   int64  
 9   OriginalImagePixelSpacing_x  5606 non-null   float64
 10  OriginalImagePixelSpacing_y  5606 non-null   float64
dtypes: float64(2), int64(4), object(5)
memory usage: 481.9+ KB


In [37]:
sample_labels['Patient Age'].apply(lambda x : int(x[1:3])).plot.box()


In [38]:
#drop unused columns

sample_labels = sample_labels.drop(columns=['Follow-up #', 'Patient ID', 'Patient Age', 'Patient Gender', 'View Position', 'OriginalImageWidth', 'OriginalImageHeight', 'OriginalImagePixelSpacing_x', 'OriginalImagePixelSpacing_y'])

sample_labels.head(5)

Unnamed: 0,Image Index,Finding Labels
0,00000013_005.png,Emphysema|Infiltration|Pleural_Thickening|Pneu...
1,00000013_026.png,Cardiomegaly|Emphysema
2,00000017_001.png,No Finding
3,00000030_001.png,Atelectasis
4,00000032_001.png,Cardiomegaly|Edema|Effusion


In [39]:


sample_labels['Finding Labels'].unique()[:15]

array(['Emphysema|Infiltration|Pleural_Thickening|Pneumothorax',
       'Cardiomegaly|Emphysema', 'No Finding', 'Atelectasis',
       'Cardiomegaly|Edema|Effusion', 'Consolidation|Mass', 'Effusion',
       'Consolidation|Effusion|Infiltration|Nodule', 'Mass',
       'Mass|Pneumothorax', 'Cardiomegaly|Consolidation', 'Consolidation',
       'Pneumothorax', 'Consolidation|Pleural_Thickening',
       'Infiltration|Nodule'], dtype=object)

In [40]:
sample_labels['Finding Labels'] = sample_labels['Finding Labels'].map(lambda x: x.replace('No Finding', 'No_Finding'))

# find truly unique labels by splitting the text
unique_labels = np.unique(np.concatenate(sample_labels['Finding Labels'].map(lambda x: x.split('|')).values)).tolist()

print(f'Unique labels: {len(unique_labels)}, {unique_labels}')


Unique labels: 15, ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'No_Finding', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']


In [41]:
# update the csv file with the new labels
for label in unique_labels:
    if len(label) > 1:
        sample_labels[label] = sample_labels['Finding Labels'].map(lambda finding: 1.0 if label in finding else 0)
        
sample_labels['No Findings'] = sample_labels['Finding Labels'].apply(lambda x: 1 if 'No Finding' in x else 0)
        
sample_labels.head()

Unnamed: 0,Image Index,Finding Labels,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,Mass,No_Finding,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax,No Findings
0,00000013_005.png,Emphysema|Infiltration|Pleural_Thickening|Pneu...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0
1,00000013_026.png,Cardiomegaly|Emphysema,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,00000017_001.png,No_Finding,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
3,00000030_001.png,Atelectasis,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,00000032_001.png,Cardiomegaly|Edema|Effusion,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [42]:
sample_labels.iloc[0]

Image Index                                            00000013_005.png
Finding Labels        Emphysema|Infiltration|Pleural_Thickening|Pneu...
Atelectasis                                                         0.0
Cardiomegaly                                                        0.0
Consolidation                                                       0.0
Edema                                                               0.0
Effusion                                                            0.0
Emphysema                                                           1.0
Fibrosis                                                            0.0
Hernia                                                              0.0
Infiltration                                                        1.0
Mass                                                                0.0
No_Finding                                                          0.0
Nodule                                                          

In [43]:
# get the count of each label
label_counts = sample_labels[unique_labels].sum().sort_values(ascending=True)
label_counts.plot.barh()

In [44]:
# lets plot the same but without the 'No Finding' label
label_counts[:14].plot.barh()

In [45]:
# now the mean of each label
label_means = sample_labels[unique_labels].mean().sort_values(ascending=True)
label_means[:14].plot.barh()

In [46]:
sample_labels.head(8)


Unnamed: 0,Image Index,Finding Labels,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,Mass,No_Finding,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax,No Findings
0,00000013_005.png,Emphysema|Infiltration|Pleural_Thickening|Pneu...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0
1,00000013_026.png,Cardiomegaly|Emphysema,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,00000017_001.png,No_Finding,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
3,00000030_001.png,Atelectasis,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,00000032_001.png,Cardiomegaly|Edema|Effusion,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
5,00000040_003.png,Consolidation|Mass,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
6,00000042_002.png,No_Finding,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
7,00000057_001.png,No_Finding,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0


In [47]:
print(f'Num examples: {sample_labels.shape[0]}')
print(f'Num labels: {len(unique_labels)}')
print(f'Labels: {unique_labels}')


Num examples: 5606
Num labels: 15
Labels: ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'No_Finding', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']


In [48]:
sample_labels['classes_vector'] = sample_labels.apply(lambda x: [x[unique_labels].values], 1).map(lambda x: x[0])


In [49]:
sample_labels['classes_vector'].head(5)

0    [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ...
1    [0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...
2    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3    [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4    [0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...
Name: classes_vector, dtype: object

In [50]:
sample_labels['Image Index'] = sample_labels['Image Index'].map(lambda x: '/home/kayaba_attribution/Documents/UoL/FINAL_PROJECT/Code/ChestXray-14/sample/images/' + x)

# change the name of the column to 'image_path'
sample_labels = sample_labels.rename(columns={
    'Image Index': 'image_path',
    'Finding Labels': 'labels'})

In [51]:
rare = ['Hernia']
sample_labels['labels'] = sample_labels['labels'].apply(lambda s: [l for l in str(s).split('|') if l not in rare])

In [52]:
sample_labels.head(10)

Unnamed: 0,image_path,labels,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,Mass,No_Finding,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax,No Findings,classes_vector
0,/home/kayaba_attribution/Documents/UoL/FINAL_P...,"[Emphysema, Infiltration, Pleural_Thickening, ...",0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ..."
1,/home/kayaba_attribution/Documents/UoL/FINAL_P...,"[Cardiomegaly, Emphysema]",0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,"[0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
2,/home/kayaba_attribution/Documents/UoL/FINAL_P...,[No_Finding],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,/home/kayaba_attribution/Documents/UoL/FINAL_P...,[Atelectasis],1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,/home/kayaba_attribution/Documents/UoL/FINAL_P...,"[Cardiomegaly, Edema, Effusion]",0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,"[0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
5,/home/kayaba_attribution/Documents/UoL/FINAL_P...,"[Consolidation, Mass]",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,/home/kayaba_attribution/Documents/UoL/FINAL_P...,[No_Finding],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7,/home/kayaba_attribution/Documents/UoL/FINAL_P...,[No_Finding],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8,/home/kayaba_attribution/Documents/UoL/FINAL_P...,[Effusion],0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
9,/home/kayaba_attribution/Documents/UoL/FINAL_P...,[No_Finding],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [53]:
sample_labels.to_csv('sample/sample_labels_clean.csv', index=False)