# ChestX-ray-8__Data Preprocessing__Phase-2

### 1. import modules

In [15]:
import os
import numpy as np
from PIL import Image
import pandas as pd
import re

### 2. locate the file

In [16]:
# Folder containing the image files; its entries are listed later to build image_paths.
images_path = "F:/Data/Chest_Xray_Dataset/CXR-8-full-data/images"

### 4. Now we convert the CSV file into a DataFrame / NumPy array

In [17]:
# CSV file that is loaded into the DataFrame `df` in the next cell.
labels_path = "F:/Data/Chest_Xray_Dataset/CXR-8-full-data/Data_Entry_2017.csv"

In [18]:
# Read the whole CSV; the columns 'Image Index' and 'Finding Labels' are selected from it below.
df = pd.read_csv(labels_path)

#### Below is the DataFrame we need:

In [19]:
# Keep only the two columns used to look up the findings of an image by its 'Image Index'.
labels = df[['Image Index', 'Finding Labels']]

### 5. Define the function to process the finding labels

In [20]:
# Finding classes used as targets; their order here is the position of each
# class in the vectors produced by process_findings.
existing_annotations = np.asarray([
    'Atelectasis',
    'Cardiomegaly',
    'Effusion',
    'Infiltration',
    'Mass',
    'Nodule',
    'Pneumonia',
    'Pneumothorax',
])
len(existing_annotations)

8

In [21]:
def process_findings(findings, annotations):
    """
    Encode the findings of one image as a 0/1 vector over `annotations`.

    Parameters:
        findings as string: the finding labels of one image, joined by '|'
            (the format of the 'Finding Labels' column, e.g. "Cardiomegaly|Effusion")
        annotations as np array: the class names to encode, in output order

    Output: onehot as np array of floats with shape (len(annotations),);
        entry i is 1 when annotations[i] is one of the labels in `findings`, else 0.
        Several entries may be 1 when an image has several findings.

    """
    # Compare whole labels instead of running each annotation as a regular
    # expression over the raw string: that avoids substring hits (a longer
    # label that merely contains "Mass") and regex metacharacter surprises.
    present = {label.strip() for label in findings.split('|')}

    onehot = np.zeros(len(annotations))
    for i, disease in enumerate(annotations):
        if disease in present:
            onehot[i] = 1
    return onehot

### 6. Upgrade the original function into `get_images_pro` (returns both images and labels)

In [22]:
# os.listdir returns entries in arbitrary order; sort them so that the batch
# indices (begin_id / end_index) refer to the same images on every run.
image_paths = [os.path.join(images_path, f) for f in sorted(os.listdir(images_path))]

In [23]:
def get_images_pro(paths, labels, begin_id, error_count_in, batch_size = 30):
    """
    Load one batch of images together with their encoded finding labels.

    Paths are read in order starting at `begin_id`. An image whose array shape
    is not (1024, 1024) is skipped and counted as an error; each skipped image
    extends the batch by one path, so a full batch still holds `batch_size`
    images. Reading stops early when `paths` is exhausted, in which case the
    batch is shorter than `batch_size` (possibly empty).

    Relies on the module-level `process_findings` and `existing_annotations`.

    Parameters:
        1. paths as list: a list which saves all the paths of pictures in "images" folder
        2. (two-column) labels as dataframe: the labels for the images, with the
           columns 'Image Index' (image file name) and 'Finding Labels'
        3. begin_id as integer: the index of the first image in this batch (the batch to be processed)
        4. error_count_in: the number of images with wrong shape in all the batches before
        5. batch_size as integer: 30 by default

    Outputs:
        1. image_list as list: containing the float arrays of shape (1024, 1024)
        2. label_list as list: containing the np arrays of shape (len(existing_annotations),)
        3. end_index as integer: the index of the last path read in this batch
           (begin_id - 1 if nothing was read); pass end_index + 1 as the next begin_id
        4. error_count: the number of images with wrong shape in all the batches processed, to be passed into next iteration

    """
    image_paths = paths
    total = len(image_paths)
    # Accepted images and their label vectors, kept index-aligned
    image_list = []
    label_list = []

    print("The total number of images is " + str(total))
    count = begin_id
    ceiling = begin_id + batch_size - 1
    error_count = error_count_in

    # Stop at the end of the batch or at the end of the path list, whichever
    # comes first, instead of indexing past the last path.
    while count <= ceiling and count < total:
        image_path = image_paths[count]
        # Close the file handle as soon as the pixels are copied into the array
        with Image.open(image_path) as image_pil:
            image = np.array(image_pil, dtype = 'float')

        if image.shape == (1024, 1024):
            # File name without directory, whichever path separator was used
            index = os.path.basename(image_path)
            # Look the findings up in the frame that was passed in
            findings = labels.loc[labels['Image Index'] == index, 'Finding Labels'].iloc[0]
            image_list.append(image)
            label_list.append(process_findings(findings, existing_annotations))
        else:
            # Skipped image: read one more path so the batch can still fill up
            ceiling = ceiling + 1
            error_count = error_count + 1

        count = count + 1

        if count % 50 == 0:
            print("Preprocessing images: " + str(count) + " /" + str(len(image_paths)))

    # Equals `ceiling` for a complete batch; smaller when the paths ran out
    end_index = count - 1
    print("Up to now " + str(count) + " images processed： " + str(error_count) + " pictures are not well shaped.")

    # return the images list and labels list
    return image_list, label_list, end_index, error_count

In [24]:
# Trial run: one batch of 40 images starting at the first path, with no earlier errors.
image_list, label_list, end_index, error_count = get_images_pro(
    image_paths,
    labels,
    begin_id=0,
    error_count_in=0,
    batch_size=40,
)

The total number of images is 112120
Up to now 41 images processed： 1 pictures are not well shaped.


In [25]:
end_index

40

In [26]:
# Convert the list of per-image label vectors returned above into one array.
label_array = np.asarray(label_list)

In [27]:
# Convert the list of accepted images returned above into one array.
image_array = np.asarray(image_list)

In [14]:
# np.save("F:/Data/Chest_Xray_Dataset/CXR-8-full-data/processed/label_array", label_array)

### 7. The loop

Demonstration:

In [None]:
# Settings for the demonstration run
n_batches = 100
batch_size = 128
processed_dir = "F:/Data/Chest_Xray_Dataset/CXR-8-full-data/processed"

# np.save does not create missing folders
os.makedirs(processed_dir, exist_ok=True)

begin_id = 0
error_count_in = 0

for batch_id in range(n_batches):
    # Nothing left to read: stop instead of indexing past the last path
    if begin_id >= len(image_paths):
        break

    image_list, label_list, end_index, error_count = get_images_pro(image_paths, labels, begin_id, error_count_in, batch_size)
    # Carry the running error count and the read position into the next batch
    error_count_in = error_count
    begin_id = end_index + 1

    label_array = np.asarray(label_list)
    image_array = np.asarray(image_list)
    # np.save appends the ".npy" extension to these names
    np.save(processed_dir + "/label_array_batch_" + str(batch_id), label_array)
    np.save(processed_dir + "/image_array_batch_" + str(batch_id), image_array)
    print("No. of batches saved: " + str(batch_id + 1))


The total number of images is 112120
Preprocessing images: 50 /112120
Preprocessing images: 100 /112120
Up to now 129 images processed： 1 pictures are not well shaped.
No. of batches saved: 1
The total number of images is 112120
Preprocessing images: 150 /112120
Preprocessing images: 200 /112120
Preprocessing images: 250 /112120
Up to now 257 images processed： 1 pictures are not well shaped.
No. of batches saved: 2
The total number of images is 112120
Preprocessing images: 300 /112120
Preprocessing images: 350 /112120
Up to now 392 images processed： 8 pictures are not well shaped.
No. of batches saved: 3
The total number of images is 112120
Preprocessing images: 400 /112120
Preprocessing images: 450 /112120
Preprocessing images: 500 /112120
Up to now 523 images processed： 11 pictures are not well shaped.
No. of batches saved: 4
The total number of images is 112120
Preprocessing images: 550 /112120
Preprocessing images: 600 /112120
Preprocessing images: 650 /112120
Up to now 656 images 

No. of batches saved: 38
The total number of images is 112120
Preprocessing images: 4950 /112120
Preprocessing images: 5000 /112120
Preprocessing images: 5050 /112120
Up to now 5063 images processed： 71 pictures are not well shaped.
No. of batches saved: 39
The total number of images is 112120
Preprocessing images: 5100 /112120
Preprocessing images: 5150 /112120
Up to now 5192 images processed： 72 pictures are not well shaped.
No. of batches saved: 40
The total number of images is 112120
Preprocessing images: 5200 /112120
Preprocessing images: 5250 /112120
Preprocessing images: 5300 /112120
Up to now 5322 images processed： 74 pictures are not well shaped.
No. of batches saved: 41
The total number of images is 112120
Preprocessing images: 5350 /112120
Preprocessing images: 5400 /112120
Preprocessing images: 5450 /112120
Up to now 5453 images processed： 77 pictures are not well shaped.
No. of batches saved: 42
The total number of images is 112120
Preprocessing images: 5500 /112120
Prepro

Preprocessing images: 9800 /112120
Up to now 9837 images processed： 109 pictures are not well shaped.
No. of batches saved: 76
The total number of images is 112120
Preprocessing images: 9850 /112120
Preprocessing images: 9900 /112120
Preprocessing images: 9950 /112120
Up to now 9966 images processed： 110 pictures are not well shaped.
No. of batches saved: 77
The total number of images is 112120
Preprocessing images: 10000 /112120
Preprocessing images: 10050 /112120
Up to now 10096 images processed： 112 pictures are not well shaped.
No. of batches saved: 78
The total number of images is 112120
Preprocessing images: 10100 /112120
Preprocessing images: 10150 /112120
Preprocessing images: 10200 /112120
Up to now 10226 images processed： 114 pictures are not well shaped.
No. of batches saved: 79
The total number of images is 112120
Preprocessing images: 10250 /112120
Preprocessing images: 10300 /112120
Preprocessing images: 10350 /112120
Up to now 10354 images processed： 114 pictures are not

### Things to do:

1. Try to save the pictures as integer arrays...

2. 