In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import PIL.Image
import os
import tensorflow as tf
from keras.preprocessing.image import ImageDataGenerator, img_to_array
from keras.preprocessing import image
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

In [11]:
train_df = pd.read_csv('/Users/kos/Desktop/test data/ChestRay/Chest_xray_Corona_Metadata.csv')
train_df.dropna(how='all')
train_df.fillna('unknown', inplace=True)
train_data = train_df[train_df['Dataset_type'] == 'TRAIN']
test_data = train_df[train_df['Dataset_type'] == 'TEST']
assert train_data.shape[0] + test_data.shape[0] == train_df.shape[0]
print(f"Shape of train data : {train_data.shape}")
print(f"Shape of test data : {test_data.shape}")

Shape of train data : (5286, 6)
Shape of test data : (624, 6)


In [12]:
test_img_dir = '/Users/kos/Desktop/test data/ChestRay/Coronahack-Chest-XRay-Dataset' \
               '/Coronahack-Chest-XRay-Dataset/test'
train_img_dir = '/Users/kos/Desktop/test data/ChestRay/Coronahack-Chest-XRay-Dataset' \
                '/Coronahack-Chest-XRay-Dataset/train'

sample_train_images = list(os.walk(train_img_dir))[0][2][:8]
sample_train_images = list(map(lambda x: os.path.join(train_img_dir, x), sample_train_images))

sample_test_images = list(os.walk(test_img_dir))[0][2][:8]
sample_test_images = list(map(lambda x: os.path.join(test_img_dir, x), sample_test_images))

In [13]:
# remove Pnuemonia with unknown value
final_train_data = train_data[(train_data['Label'] == 'Normal') |
                              ((train_data['Label'] == 'Pnemonia') &
                               (train_data['Label_2_Virus_category'] == 'COVID-19'))]

# add a target and class feature
final_train_data['class'] = final_train_data.Label.apply(lambda x: 'negative' if x == 'Normal' else 'positive')
test_data['class'] = test_data.Label.apply(lambda x: 'negative' if x == 'Normal' else 'positive')

final_train_data['target'] = final_train_data.Label.apply(lambda x: 0 if x=='Normal' else 1)
test_data['target'] = test_data.Label.apply(lambda x: 0 if x == 'Normal' else 1)
# get the important features
final_train_data = final_train_data[['X_ray_image_name', 'class', 'target', 'Label_2_Virus_category']]
final_test_data = test_data[['X_ray_image_name', 'class', 'target']]

In [14]:
datagen = ImageDataGenerator(
  shear_range=0.2,
  zoom_range=0.2,
)

def read_img(filename, size, path):
    img = image.load_img(os.path.join(path, filename), target_size=size)
    img = img_to_array(img) / 255
    return img


corona_df = final_train_data[final_train_data['Label_2_Virus_category'] == 'COVID-19']
with_corona_augmented = []

# create a function for augmentation
def augment(name):
    img = read_img(name, (255,255), train_img_dir)
    i = 0
    for batch in tqdm(datagen.flow(tf.expand_dims(img, 0), batch_size=32)):
        with_corona_augmented.append(tf.squeeze(batch).numpy())
        if i == 20:
            break
        i = i + 1


# apply the function
print(corona_df['X_ray_image_name'].apply(augment))

20it [00:00, 156.62it/s]             
20it [00:00, 193.11it/s]             
20it [00:00, 177.12it/s]             
20it [00:00, 166.74it/s]             
20it [00:00, 181.26it/s]             
20it [00:00, 188.79it/s]             
20it [00:00, 176.88it/s]             
20it [00:00, 175.53it/s]             
20it [00:00, 179.92it/s]             
20it [00:00, 188.74it/s]             
20it [00:00, 186.26it/s]             
20it [00:00, 181.37it/s]             
20it [00:00, 193.73it/s]             
20it [00:00, 181.93it/s]             
20it [00:00, 181.57it/s]             
20it [00:00, 191.33it/s]             
20it [00:00, 188.71it/s]             
20it [00:00, 195.55it/s]             
20it [00:00, 192.05it/s]             
20it [00:00, 188.00it/s]             
20it [00:00, 185.16it/s]             
20it [00:00, 187.35it/s]             
20it [00:00, 184.34it/s]             
20it [00:00, 179.33it/s]             
20it [00:00, 190.91it/s]             
20it [00:00, 178.45it/s]             
20it [00:00,

5221    None
5222    None
5223    None
5224    None
5225    None
5226    None
5227    None
5228    None
5229    None
5230    None
5237    None
5238    None
5239    None
5240    None
5242    None
5243    None
5244    None
5245    None
5246    None
5247    None
5248    None
5249    None
5250    None
5251    None
5252    None
5253    None
5254    None
5255    None
5256    None
5257    None
5258    None
5259    None
5260    None
5261    None
5262    None
5263    None
5264    None
5265    None
5266    None
5267    None
5268    None
5269    None
5270    None
5271    None
5272    None
5273    None
5274    None
5275    None
5276    None
5277    None
5278    None
5279    None
5280    None
5281    None
5282    None
5283    None
5284    None
5285    None
Name: X_ray_image_name, dtype: object





In [15]:
train_arrays = []
final_train_data['X_ray_image_name'].apply(lambda x: train_arrays.append(read_img(x, (255, 255), train_img_dir)))
test_arrays = []
final_test_data['X_ray_image_name'].apply(lambda x: test_arrays.append(read_img(x, (255, 255), test_img_dir)))

5286    None
5287    None
5288    None
5289    None
5290    None
        ... 
5905    None
5906    None
5907    None
5908    None
5909    None
Name: X_ray_image_name, Length: 624, dtype: object

In [16]:
# concatenate the training data labels and the labels for augmented images
y_train = np.concatenate((np.int64(final_train_data['target'].values),
                          np.ones(len(with_corona_augmented), dtype=np.int64)))