# ChestX-ray-8__Data Preprocessing

### 1. import modules

In [1]:
import os
import numpy as np
from PIL import Image
import pandas as pd
import re

### 2. locate the file

In [2]:
# Directory that holds the chest X-ray image files; passed as `path` to the loader functions
images_path = "F:/Data/Chest_Xray_Dataset/CXR-8-full-data/images"

### 3. define the function that converts image files into a list of np arrays

In [None]:
def get_images(path, max_images=11):
    """
    Load the first few image files of a directory as numpy arrays.

    Parameters:
        1. path as string----directory that contains the image files
        2. max_images as integer: 11 by default, which is the number of
           images the previous hard-coded counter let through

    Output: images as list----one float np array per image that was read

    """
    # os.listdir gives no ordering guarantee, so sort the names to read the
    # same sample on every run.
    image_paths = [os.path.join(path, f) for f in sorted(os.listdir(path))]
    images = []

    # A negative limit would make the slice count from the end; treat it as 0
    limit = max(max_images, 0)

    for image_path in image_paths[:limit]:
        # The context manager releases the file handle once the pixels
        # have been copied into the array
        with Image.open(image_path) as image_pil:
            # Convert the image format into numpy array
            images.append(np.array(image_pil, dtype='float'))

    # return the images list
    return images

#### converts the list of np arrays into a np array

In [None]:
# Read the sample images and stack the resulting list into one np array
images_acquired = np.asarray(get_images(images_path))

#### the shape of the new np array should be (number_of_images, 1024, 1024)

In [None]:
# Display the shape of the stacked image array
images_acquired.shape

### 4. Now we are going to convert the csv file into a dataframe/np array

In [3]:
# CSV file with one metadata row per image; read into a dataframe with pd.read_csv
labels_path = "F:/Data/Chest_Xray_Dataset/CXR-8-full-data/Data_Entry_2017.csv"

In [4]:
# Load the metadata CSV into a dataframe and display it
df = pd.read_csv(labels_path)
df

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImageWidth,OriginalImageHeight,OriginalImagePixelSpacing_x,OriginalImagePixelSpacing_y
0,00000001_000.png,Cardiomegaly,0,1,058Y,M,PA,2682,2749,0.143000,0.143000
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,058Y,M,PA,2894,2729,0.143000,0.143000
2,00000001_002.png,Cardiomegaly|Effusion,2,1,058Y,M,PA,2500,2048,0.168000,0.168000
3,00000002_000.png,No Finding,0,2,081Y,M,PA,2500,2048,0.171000,0.171000
4,00000003_000.png,Hernia,0,3,081Y,F,PA,2582,2991,0.143000,0.143000
5,00000003_001.png,Hernia,1,3,074Y,F,PA,2500,2048,0.168000,0.168000
6,00000003_002.png,Hernia,2,3,075Y,F,PA,2048,2500,0.168000,0.168000
7,00000003_003.png,Hernia|Infiltration,3,3,076Y,F,PA,2698,2991,0.143000,0.143000
8,00000003_004.png,Hernia,4,3,077Y,F,PA,2500,2048,0.168000,0.168000
9,00000003_005.png,Hernia,5,3,078Y,F,PA,2686,2991,0.143000,0.143000


#### below is the dataframe we need:

In [5]:
# Keep only the image file name and its findings string
wanted_columns = ['Image Index', 'Finding Labels']
labels = df[wanted_columns]
labels

Unnamed: 0,Image Index,Finding Labels
0,00000001_000.png,Cardiomegaly
1,00000001_001.png,Cardiomegaly|Emphysema
2,00000001_002.png,Cardiomegaly|Effusion
3,00000002_000.png,No Finding
4,00000003_000.png,Hernia
5,00000003_001.png,Hernia
6,00000003_002.png,Hernia
7,00000003_003.png,Hernia|Infiltration
8,00000003_004.png,Hernia
9,00000003_005.png,Hernia


#### the way we locate the row we need:

In [6]:
# Boolean mask that selects the row(s) belonging to one image file
matches_image = labels['Image Index'] == '00000001_000.png'
d2 = labels.loc[matches_image]
d2

Unnamed: 0,Image Index,Finding Labels
0,00000001_000.png,Cardiomegaly


#### the way we locate the findings:

In [7]:
# First matching row, second column (the findings string). A single
# positional lookup avoids indexing the intermediate row Series with an
# integer, which pandas deprecates for label-indexed Series.
d2.iloc[0, 1]

'Cardiomegaly'

#### test the regular expression:

In [8]:
# re.search returns None when the pattern does not occur in the string
re.search('baz', 'foobarrrr') is not None

False

### 5. Define the function to process the finding labels

In [9]:
# Disease names that each get one slot of the encoded label vector
existing_annotations = np.asarray([
    'Atelectasis',
    'Cardiomegaly',
    'Effusion',
    'Infiltration',
    'Mass',
    'Nodule',
    'Pneumonia',
    'Pneumothorax',
])
len(existing_annotations)

8

In [10]:
def process_findings(findings, annotations):
    """
    Encode the findings of one image as a vector of 0/1 flags.

    Parameters: findings as string ('|'-separated labels, for example
                'Cardiomegaly|Effusion'), annotations as np array (any
                sequence of label names works)

    Output: onehot as np array of shape (len(annotations),)----1 where the
            annotation is one of the findings, 0 elsewhere

    """
    # Compare whole labels instead of running every annotation as a regular
    # expression over the raw string: a regex search also fires on partial
    # names and misreads metacharacters contained in a label.
    present = {label.strip() for label in findings.split("|")}

    onehot = np.zeros(len(annotations))
    for i, disease in enumerate(annotations):
        # str() so that numpy string scalars compare like plain strings
        if str(disease) in present:
            onehot[i] = 1
    return onehot

### 6. Upgrade the original function into: get_images_and_labels

In [None]:
def get_images(path, labels, batch_id, batch_size = 30):
    """
    Load one batch of images together with their encoded finding labels.

    Parameters:
        1. path as string----directory that contains the image files
        2. (two-column) labels as dataframe----'Image Index' column holding
           the file names, findings string in the second column
        3. batch-id as integer: 0-based
        4. batch_size as integer: 30 by default

    Outputs:
        1. image_list as list----containing one float np array per image
           (the shape of the images is not checked here)
        2. label_list as list----containing the onehot np arrays of shape (len(annotations),)

    Raises IndexError when an image file has no row in labels.

    """
    # Sort the listing: os.listdir gives no ordering guarantee, and batches
    # are only disjoint if every call sees the files in the same order.
    image_paths = [os.path.join(path, f) for f in sorted(os.listdir(path))]
    image_list = []
    label_list = []

    print("The total number of images is " + str(len(image_paths)))
    # Batch 0 starts at the very first file (a former "+ 1" skipped it).
    batch_head = batch_id * batch_size
    batch = image_paths[batch_head : batch_head + batch_size]

    for offset, image_path in enumerate(batch, start=1):
        # Read the image and release the file handle once it is converted
        with Image.open(image_path) as image_pil:
            # Convert the image format into numpy array
            image_list.append(np.array(image_pil, dtype = 'float'))

        # File name without the directory, independent of the path separator
        index = os.path.basename(image_path)
        # Look the findings up in the dataframe that was passed in, not in a
        # global one; row 0 / column 1 of the matching rows
        findings = labels.loc[labels['Image Index'] == index].iloc[0, 1]
        label_list.append(process_findings(findings, existing_annotations))

        # 1-based position of the image that was just processed
        count = batch_head + offset
        if count % 10 == 0:
            print("Preprocessing images: " + str(count) + " /" + str(len(image_paths)))

    # return the images list and labels list
    return image_list,label_list

In [17]:
def get_images_pro(path, labels, begin_id, error_count_in, batch_size = 30):
    """
    Load a batch of well-shaped images and their encoded finding labels.

    Files are read in sorted order starting at begin_id. Images whose array
    shape is not (1024, 1024) are skipped and counted as errors, and the
    batch is extended by one file for each of them, so the batch holds
    batch_size images unless the directory runs out of files first.

    Parameters:
        1. path as string----directory that contains the image files
        2. (two-column) labels as dataframe----'Image Index' column holding
           the file names, findings string in the second column
        3. begin_id as integer: 0-based index of the first file to read
        4. error_count_in as integer: number of badly shaped images counted
           so far (carried over from earlier batches)
        5. batch_size as integer: 30 by default

    Outputs:
        1. image_list as list----containing the arrays of shape (1024, 1024)
        2. label_list as list----containing the onehot np arrays of shape (len(annotations),)
        3. end_index as integer----index of the last file that was read
           (begin_id - 1 when nothing was read); the next batch starts at
           end_index + 1
        4. error_count as integer----error_count_in plus the badly shaped
           images met in this batch

    Raises IndexError when a well-shaped image has no row in labels.

    """
    # Sort the listing: os.listdir gives no ordering guarantee, and chaining
    # batches through end_index needs the same order on every call.
    image_paths = [os.path.join(path, f) for f in sorted(os.listdir(path))]
    image_list = []
    label_list = []

    total = len(image_paths)
    print("The total number of images is " + str(total))
    count = begin_id
    ceiling = begin_id + batch_size - 1
    error_count = error_count_in

    # Stop at the end of the listing as well, so the last (short) batch or
    # an empty directory no longer raises IndexError.
    while count <= ceiling and count < total:
        image_path = image_paths[count]
        # Read the image and release the file handle once it is converted
        with Image.open(image_path) as image_pil:
            # Convert the image format into numpy array
            image = np.array(image_pil, dtype = 'float')

        if image.shape == (1024, 1024):
            image_list.append(image)
            # File name without the directory, independent of the separator
            index = os.path.basename(image_path)
            # Look the findings up in the dataframe that was passed in, not
            # in a global one; row 0 / column 1 of the matching rows
            findings = labels.loc[labels['Image Index'] == index].iloc[0, 1]
            label_list.append(process_findings(findings, existing_annotations))
        else:
            # Skip the image and read one more file to keep the batch full
            ceiling = ceiling + 1
            error_count = error_count + 1

        count = count + 1

        if count % 10 == 0:
            print("Preprocessing images: " + str(count) + " /" + str(total))

    # Equals ceiling whenever the batch was filled completely
    end_index = count - 1
    print("Up to now " + str(count) + " images processed： " + str(error_count) + " pictures are not well shaped.")

    # return the images list, the labels list and the bookkeeping values
    return image_list, label_list, end_index, error_count

In [18]:
# First batch: start at file 0 with no errors counted yet, 40 images per batch
image_list, label_list, end_index, error_count = get_images_pro(
    images_path,
    labels,
    begin_id=0,
    error_count_in=0,
    batch_size=40,
)

The total number of images is 112120
Preprocessing images: 10 /112120
Preprocessing images: 20 /112120
Preprocessing images: 30 /112120
Preprocessing images: 40 /112120
Up to now 41 images processed： 1 pictures are not well shaped.


In [13]:
# Show the end index returned by get_images_pro
end_index

40

In [14]:
# Show the list of image arrays returned by get_images_pro
image_list

[array([[ 202.,  199.,  195., ...,    5.,    2.,    0.],
        [ 199.,  196.,  195., ...,    5.,    2.,    0.],
        [ 196.,  194.,  193., ...,    5.,    2.,    0.],
        ..., 
        [ 255.,  255.,  255., ...,    0.,    0.,    0.],
        [ 255.,  255.,  254., ...,    0.,    0.,    0.],
        [ 255.,  255.,  255., ...,    0.,    0.,    0.]]),
 array([[ 208.,  205.,  206., ...,  204.,  215.,  139.],
        [ 209.,  203.,  205., ...,  202.,  210.,  134.],
        [ 206.,  204.,  202., ...,  202.,  212.,  136.],
        ..., 
        [ 101.,   86.,   73., ...,    0.,    0.,    0.],
        [ 101.,   88.,   72., ...,    0.,    0.,    0.],
        [  99.,   86.,   70., ...,    0.,    0.,    0.]]),
 array([[  7.,  10.,   9., ...,  15.,  19.,  16.],
        [ 13.,  21.,  18., ...,  29.,  33.,  23.],
        [ 12.,  19.,  17., ...,  26.,  28.,  17.],
        ..., 
        [ 18.,  30.,  26., ...,  28.,  31.,  19.],
        [ 21.,  34.,  30., ...,  29.,  31.,  19.],
        [ 11., 

In [15]:
# Stack the per-image label vectors into a single np array
label_array = np.asarray(label_list)

In [16]:
# Stack the per-image pixel arrays into a single np array
image_array = np.asarray(image_list)

In [None]:
# np.save does not create missing folders, so make sure the target exists
processed_dir = "F:/Data/Chest_Xray_Dataset/CXR-8-full-data/processed"
os.makedirs(processed_dir, exist_ok=True)
# np.save appends the ".npy" extension to the file name
np.save(processed_dir + "/label_array", label_array)

In [None]:
# Scratch check: range() is evaluated once, so growing b inside the loop
# does not add iterations
b = 30
for a in range(b):
    if not a % 10:
        b += 1
        print("Hahaha")
    print(a)

### things to do next:

1. solve the picture shape issue (1024, 1024, 4) --- done

2. make sure each batch is exactly 30 (by passing the end of each batch into the next batch)   --- done

3. create something that can supervise the conversion of the pictures (e.g. the rate of usable pictures or something similar)

4. figure out the structure of the array of shape(1024, 1024, 4)

   do not forget the fact that we may give up the last batch (because it may contain fewer than 30 pictures)

5. create a big loop to control the r/w of the pictures

6. then derive the pictures



