# Importaciones y carga de los datos

In [2]:
import pandas as pd 
import numpy as np 
from itertools import chain
import matplotlib.pyplot as plt
import seaborn as sns


# Raise the pandas display limits so wide/long frames are not truncated early.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 500)


In [3]:
# Load the raw metadata CSV and remove the column that carries no information.
xray_df = pd.read_csv('../../raw_data/full-dataset/Data_Entry_2017.csv')

# The shape is captured before the drop, so the reported column count still
# includes 'Unnamed: 11'.
nRow = len(xray_df.index)
nCol = len(xray_df.columns)

del xray_df['Unnamed: 11']
print(f'There are {nRow} rows and {nCol} columns')

There are 112120 rows and 12 columns


In [4]:
# Display the loaded frame (the recorded output shows a truncated head/tail view).
xray_df

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y]
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143
...,...,...,...,...,...,...,...,...,...,...,...
112115,00030801_001.png,Mass|Pneumonia,1,30801,39,M,PA,2048,2500,0.168,0.168
112116,00030802_000.png,No Finding,0,30802,29,M,PA,2048,2500,0.168,0.168
112117,00030803_000.png,No Finding,0,30803,42,F,PA,2048,2500,0.168,0.168
112118,00030804_000.png,No Finding,0,30804,30,F,PA,2048,2500,0.168,0.168


# Encoding  y Unificar clases iguales 
1. Columna de conteo de enfermedades por placa ('Count_diseases') y columna que indica si está enfermo ('Enfermo')
2. Se hace un encoding por cada enfermedad reportada en la columna 'Finding Labels'
3. Ejemplo: **'Infiltration|Effusion'** y **'Effusion|Infiltration'** --> se volverán una misma clase y se genera una nueva columna ('Fixed_Labels')


In [5]:
# Per-image disease count.
# 'count_diseases' is the raw number of '|'-separated labels (so 'No Finding' counts as 1);
# 'Count_diseases' is the same value with 'No Finding' rows forced to 0.
xray_df['count_diseases'] = xray_df['Finding Labels'].apply(
    lambda labels: len(labels.split('|'))
)
xray_df['Count_diseases'] = np.where(
    xray_df['Finding Labels'].eq('No Finding'),
    0,
    xray_df['count_diseases'],
)

In [6]:
# Binary sick / not-sick flag: 1 = sick (at least one disease counted), 0 = not sick.
xray_df['Enfermo'] = np.where(xray_df['Count_diseases'] != 0, 1, 0)

# The lowercase helper column 'count_diseases' is no longer needed.
del xray_df['count_diseases']


In [7]:
# Preview the first 10 rows, which now include the 'Count_diseases' and 'Enfermo' columns.
xray_df.head(10)

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Count_diseases,Enfermo
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,1,1
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,2,1
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,2,1
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,0,0
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,1,1
5,00000003_001.png,Hernia,1,3,74,F,PA,2500,2048,0.168,0.168,1,1
6,00000003_002.png,Hernia,2,3,75,F,PA,2048,2500,0.168,0.168,1,1
7,00000003_003.png,Hernia|Infiltration,3,3,76,F,PA,2698,2991,0.143,0.143,2,1
8,00000003_004.png,Hernia,4,3,77,F,PA,2500,2048,0.168,0.168,1,1
9,00000003_005.png,Hernia,5,3,78,F,PA,2686,2991,0.143,0.143,1,1


In [10]:
# Preview 'Finding Labels' split on '|' into one list of labels per image.
xray_df['Finding Labels'].apply(lambda labels: labels.split('|'))

0                    [Cardiomegaly]
1         [Cardiomegaly, Emphysema]
2          [Cardiomegaly, Effusion]
3                      [No Finding]
4                          [Hernia]
                    ...            
112115            [Mass, Pneumonia]
112116                 [No Finding]
112117                 [No Finding]
112118                 [No Finding]
112119                 [No Finding]
Name: Finding Labels, Length: 112120, dtype: object

In [7]:
# Multi-hot encoding: one indicator column per distinct label found in 'Finding Labels'.
finding_lists = xray_df['Finding Labels'].map(lambda x: x.split('|'))

# Distinct, non-empty labels in sorted order (plain Python strings).
all_labels = sorted({label for label in chain.from_iterable(finding_lists) if label})

print('All Labels ({}): {}'.format(len(all_labels), all_labels))

for c_label in all_labels:
    # Test membership against the split label list, not the raw string: a substring
    # test on the raw string would also flag any label that happens to be contained
    # in a longer label. The columns stay float (1.0 / 0.0), as before.
    xray_df[c_label] = finding_lists.map(lambda labels: c_label in labels).astype(float)

xray_df.sample(3)

All Labels (15): ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'No Finding', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']


Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Count_diseases,Enfermo,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,Mass,No Finding,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax
59262,00014663_016.png,No Finding,16,14663,63,M,AP,2500,2048,0.168,0.168,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
19989,00005348_000.png,Emphysema,0,5348,50,M,PA,2960,2975,0.143,0.143,1,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98106,00025895_005.png,Atelectasis,5,25895,66,F,AP,3056,2544,0.139,0.139,1,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Cast the label indicator columns from float to integer.
cols = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion',
        'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'No Finding',
        'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']

# astype is the vectorised cast; DataFrame.applymap is deprecated in pandas >= 2.1.
xray_df[cols] = xray_df[cols].astype(np.int64)

In [10]:
# Number of distinct raw label strings (after unifying, fewer than 836 should remain).
num_clases = len(xray_df['Finding Labels'].dropna().unique())
print('Existen', num_clases, 'clases en el dataset')

Existen 836 clases en el dataset


In [11]:
# Store each row's indicator vector (in the order of `cols`) as a list column.
xray_df['Combined'] = xray_df[cols].to_numpy().tolist()

# Rebuild the label string following the fixed order of `cols`, so the same set of
# labels always yields the same string regardless of the order in the raw data.
xray_df['Fixed_Labels'] = xray_df['Combined'].map(
    lambda flags: '|'.join(label for label, flag in zip(cols, flags) if flag == 1)
)

num_clases = len(xray_df['Fixed_Labels'].dropna().unique())
print('Quedaron', num_clases, 'clases en el dataset')

Quedaron 801 clases en el dataset


In [12]:
# Remove the helper column 'Combined' and the raw 'Finding Labels' column.
columnas_obsoletas = ['Combined', 'Finding Labels']
xray_df = xray_df.drop(columns=columnas_obsoletas)

# Eliminar registros con edad sin sentido (Patient Age > 100)

In [13]:
# Row count before filtering by age.
print('Rows en el dataset:', len(xray_df))

Rows en el dataset: 112120


In [14]:
# Drop every row whose 'Patient Age' is above 100
# (16 rows in the recorded run: 112120 -> 112104).
index_mayor100 = xray_df.index[xray_df['Patient Age'] > 100].tolist()
xray_df = xray_df.drop(index=index_mayor100)
print('Rows en el dataset:', len(xray_df))

Rows en el dataset: 112104


# Elegir clases con las que vamos a Trabajar 

In [15]:
# Rows flagged as sick (Enfermo == 1).
enfermos = xray_df.loc[xray_df['Enfermo'].eq(1)]

In [16]:
# Select the multi-label classes (two or more findings) that appear in more than 1000 rows.
# The threshold is applied to the value_counts() Series *before* converting to a frame:
# Series.to_frame() names the column 'Fixed_Labels' on pandas < 2.0 but 'count' on
# pandas >= 2.0, so indexing the frame by 'Fixed_Labels' raises KeyError on newer versions.
enfermos_mult = enfermos[enfermos['Count_diseases'] >= 2]
conteo_mult = enfermos_mult['Fixed_Labels'].value_counts()
enfermos_mult = conteo_mult[conteo_mult > 1000].to_frame()
mult_clases = list(enfermos_mult.index)
print(mult_clases)
enfermos_mult

['Effusion|Infiltration', 'Atelectasis|Infiltration', 'Atelectasis|Effusion']


Unnamed: 0,Fixed_Labels
Effusion|Infiltration,1604
Atelectasis|Infiltration,1350
Atelectasis|Effusion,1167


In [17]:
# Select the single-label classes whose share among the rows with fewer than two
# findings is above 1%.
# The threshold is applied to the value_counts() Series *before* converting to a frame:
# with normalize=True, Series.to_frame() names the column 'Fixed_Labels' on pandas < 2.0
# but 'proportion' on pandas >= 2.0, so indexing the frame by 'Fixed_Labels' raises
# KeyError on newer versions.
enfermos_mono = enfermos[enfermos['Count_diseases'] < 2]
proporcion_mono = enfermos_mono['Fixed_Labels'].value_counts(normalize=True)
enfermos_mono = proporcion_mono[proporcion_mono > 0.01].to_frame()
mono_clases = list(enfermos_mono.index)
mono_clases

['Infiltration',
 'Atelectasis',
 'Effusion',
 'Nodule',
 'Pneumothorax',
 'Mass',
 'Consolidation',
 'Pleural_Thickening',
 'Cardiomegaly',
 'Emphysema',
 'Fibrosis',
 'Edema',
 'Pneumonia']

In [18]:
# Final class list: single-label classes first, then the selected multi-label classes.
clases_enfermedades = [*mono_clases, *mult_clases]
clases_enfermedades

['Infiltration',
 'Atelectasis',
 'Effusion',
 'Nodule',
 'Pneumothorax',
 'Mass',
 'Consolidation',
 'Pleural_Thickening',
 'Cardiomegaly',
 'Emphysema',
 'Fibrosis',
 'Edema',
 'Pneumonia',
 'Effusion|Infiltration',
 'Atelectasis|Infiltration',
 'Atelectasis|Effusion']

# Cantidad de Datos Finales 

In [19]:
# Number of sick rows that fall outside the selected classes and would be discarded.
eliminados = len(enfermos) - int(xray_df['Fixed_Labels'].isin(clases_enfermedades).sum())
print('Se eliminarian', eliminados, 'del dataframe')

Se eliminarian 16781 del dataframe


In [54]:
# Final dataset size: rows in the selected disease classes plus all not-sick rows.
no_enfermos = xray_df.loc[xray_df['Enfermo'].eq(0)]
enfermos_noenfermos = len(xray_df[xray_df['Fixed_Labels'].isin(clases_enfermedades)]) + len(no_enfermos)
print('Total imagenes:', enfermos_noenfermos)

Total imagenes: 95323
