# Importaciones y carga de los datos

In [1]:
import pandas as pd 
import numpy as np 
from itertools import chain
import matplotlib.pyplot as plt
import seaborn as sns


# Raise pandas' display limits to 500 columns and 500 rows
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 500)


In [2]:
# Load the CSV and capture its shape exactly as read from disk
xray_df = pd.read_csv('./Data_Entry_2017.csv')
nRow, nCol = xray_df.shape

# Drop the column that carries no information.
# NOTE: nRow/nCol were taken before this drop, so the message below
# reports the column count of the file as read, not of the trimmed frame.
xray_df.drop('Unnamed: 11', axis=1, inplace=True)
print(f'There are {nRow} rows and {nCol} columns')

There are 112120 rows and 12 columns


# Encoding  y Unificar clases iguales 
1. Columna de conteo de enfermedades por placa ('Count_diseases') y columna que indica si el paciente está enfermo ('Enfermo')
2. Se hace un encoding por cada enfermedad reportada en la columna 'Finding Labels'
3. Ejemplo: **('Infiltration|Effusion')** y **('Effusion|Infiltration')** --> se volverán una misma clase y se genera una nueva columna ('Fixed_Labels')


In [3]:
# Per-image disease count.
# First count the '|'-separated entries in 'Finding Labels' ...
labels_per_image = xray_df['Finding Labels'].map(lambda labels: len(labels.split('|')))
xray_df['count_diseases'] = labels_per_image

# ... then force the count to 0 for rows whose only label is 'No Finding'
is_no_finding = xray_df['Finding Labels'] == 'No Finding'
xray_df['Count_diseases'] = np.where(is_no_finding, 0, xray_df['count_diseases'])

In [4]:
# Sick / not-sick flag: 0 when Count_diseases is 0, 1 otherwise
no_disease = xray_df['Count_diseases'] == 0
xray_df['Enfermo'] = np.where(no_disease, 0, 1)

# The lower-case helper column is no longer needed once Count_diseases exists
xray_df.drop('count_diseases', axis=1, inplace=True)


In [5]:
# Multi-label encoding: one indicator column per distinct finding.
# Split every 'Finding Labels' string once and reuse the resulting lists.
label_lists = xray_df['Finding Labels'].map(lambda x: x.split('|'))

all_labels = np.unique(list(chain.from_iterable(label_lists)))
all_labels = [x for x in all_labels if len(x) > 0]  # leave out empty labels

print('All Labels ({}): {}'.format(len(all_labels), all_labels))

for c_label in all_labels:
    # Test membership against the split label list, not the raw string:
    # a substring test would also flag any label that merely contains c_label.
    xray_df[c_label] = label_lists.map(
        lambda labels, label=c_label: 1.0 if label in labels else 0.0)

xray_df.sample(3)

All Labels (15): ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'No Finding', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']


Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Count_diseases,Enfermo,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,Mass,No Finding,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax
105691,00028439_000.png,Infiltration,0,28439,35,F,PA,2544,3056,0.139,0.139,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
34684,00009138_028.png,Consolidation|Edema|Infiltration,28,9138,69,M,AP,2500,2048,0.168,0.168,3,1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
91361,00022807_000.png,Infiltration,0,22807,65,F,PA,2544,3056,0.139,0.139,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Convert the indicator columns from float to integer
cols = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 
        'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'No Finding', 
        'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']

# astype casts whole columns at once; DataFrame.applymap is deprecated
# (pandas >= 2.1) and would call np.int64 once per cell.
xray_df[cols] = xray_df[cols].astype(np.int64)

In [8]:
# Number of distinct raw label strings (the unified labels should end up below 836)
raw_labels = xray_df['Finding Labels']
num_clases = raw_labels.nunique()
print('Existen', num_clases, 'clases en el dataset')

Existen 836 clases en el dataset


In [9]:
# Store each row's indicator values as a list, in the order given by `cols`
xray_df['Combined'] = xray_df[cols].values.tolist()

# Rebuild the label string from the indicators so that the same set of
# diseases always yields the same string, whatever order it was written in
xray_df['Fixed_Labels'] = xray_df['Combined'].apply(
    lambda flags: '|'.join(name for name, flag in zip(cols, flags) if flag == 1))

num_clases = xray_df['Fixed_Labels'].nunique()
print('Quedaron', num_clases, 'clases en el dataset')

Quedaron 801 clases en el dataset


In [10]:
# Remove the temporary 'Combined' list column together with the original
# 'Finding Labels' column
columns_to_remove = ['Combined', 'Finding Labels']
xray_df.drop(columns_to_remove, axis=1, inplace=True)

# Eliminar registros con edad sin sentido (Patient Age > 100)

In [11]:
# Row count before the age filter
print('Rows en el dataset:', len(xray_df))

Rows en el dataset: 112120


In [12]:
# Drop every row whose 'Patient Age' is above 100 (16 rows in the recorded run)
age_over_100 = xray_df['Patient Age'] > 100
index_mayor100 = xray_df.index[age_over_100].tolist()
xray_df = xray_df.drop(index=index_mayor100)
print('Rows en el dataset:', len(xray_df))

Rows en el dataset: 112104


# Elegir clases con las que vamos a Trabajar 

In [13]:
# Keep only the rows flagged as sick (Enfermo == 1)
enfermos = xray_df.loc[xray_df['Enfermo'] == 1]

In [28]:
# Pick the multi-disease classes that have more than 1000 images
enfermos_mult = enfermos[enfermos['Count_diseases'] >= 2]
mult_counts = enfermos_mult['Fixed_Labels'].value_counts()

# Filter on the Series itself: since pandas 2.0 value_counts() names its
# result 'count', so looking up a 'Fixed_Labels' column after a bare
# to_frame() raises KeyError.
mult_counts = mult_counts[mult_counts > 1000]

# Name the column explicitly so the frame looks the same on every pandas version
enfermos_mult = mult_counts.to_frame(name='Fixed_Labels')
mult_clases = list(enfermos_mult.index)
print(mult_clases)
enfermos_mult

['Effusion|Infiltration', 'Atelectasis|Infiltration', 'Atelectasis|Effusion']


Unnamed: 0,Fixed_Labels
Effusion|Infiltration,1604
Atelectasis|Infiltration,1350
Atelectasis|Effusion,1167


In [29]:
# Pick the single-disease classes that account for more than 1% of the
# single-disease images
enfermos_mono = enfermos[enfermos['Count_diseases'] < 2]
mono_share = enfermos_mono['Fixed_Labels'].value_counts(normalize=True)

# Filter on the Series itself: since pandas 2.0 the normalized result is
# named 'proportion', so looking up a 'Fixed_Labels' column after a bare
# to_frame() raises KeyError.
mono_share = mono_share[mono_share > 0.01]

# Name the column explicitly so the frame looks the same on every pandas version
enfermos_mono = mono_share.to_frame(name='Fixed_Labels')
mono_clases = list(enfermos_mono.index)
mono_clases

['Infiltration',
 'Atelectasis',
 'Effusion',
 'Nodule',
 'Pneumothorax',
 'Mass',
 'Consolidation',
 'Pleural_Thickening',
 'Cardiomegaly',
 'Emphysema',
 'Fibrosis',
 'Edema',
 'Pneumonia']

In [32]:
# Final class list: single-disease classes first, then the multi-disease ones
clases_enfermedades = [*mono_clases, *mult_clases]
clases_enfermedades

['Infiltration',
 'Atelectasis',
 'Effusion',
 'Nodule',
 'Pneumothorax',
 'Mass',
 'Consolidation',
 'Pleural_Thickening',
 'Cardiomegaly',
 'Emphysema',
 'Fibrosis',
 'Edema',
 'Pneumonia',
 'Effusion|Infiltration',
 'Atelectasis|Infiltration',
 'Atelectasis|Effusion']