# COVID-19 X-Ray Classification --- Creating the Dataset

### `Importing libraries & creating the dataset`

In [None]:
import pandas as pd
import numpy as np

import os
import shutil
import random

# Show every column when a DataFrame is displayed. The option is addressed by
# its full name: pandas resolves option names as regex patterns, and the short
# form 'max.columns' is ambiguous in pandas >= 1.4 (it also matches
# 'styler.render.max_columns'), which raises OptionError.
pd.set_option('display.max_columns', None)

# Seed Python's RNG so the random.shuffle calls further down produce the same
# order on every run (for a given input ordering).
random.seed(42)

import warnings
# Blanket filter: silences all warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

### `Loading metadata file`

In [None]:
# Read the metadata table of the covid-chestxray-dataset checkout on Drive.
metadata_csv_path = '/content/drive/MyDrive/COVID-19 Project/covid-chestxray-dataset/covid-chestxray-dataset-master/metadata.csv'
df = pd.read_csv(metadata_csv_path)

# Peek at two randomly drawn rows to get a feel for the columns.
df.sample(n=2)

Unnamed: 0,patientid,offset,sex,age,finding,RT_PCR_positive,survival,intubated,intubation_present,went_icu,in_icu,needed_supplemental_O2,extubated,temperature,pO2_saturation,leukocyte_count,neutrophil_count,lymphocyte_count,view,modality,date,location,folder,filename,doi,url,license,clinical_notes,other_notes,Unnamed: 29
543,289,3.0,,,Pneumonia/Viral/COVID-19,Unclear,,,,,,,,,,,,,AP,X-ray,,,images,68_2020_1364_Fig1_HTML-b.png,10.1007/s00068-020-01364-7,https://link.springer.com/article/10.1007/s000...,,Chest X-Ray of a symptomatic patient on hospit...,,
199,105,,M,,Pneumonia/Fungal/Pneumocystis,,,,,,,,,,,,,,PA,X-ray,2010.0,,images,pneumocystis-carinii-pneumonia-1-PA.jpg,,https://radiopaedia.org/cases/pneumocystis-car...,CC BY-NC-SA,There are diffuse bilaterally symmetric inters...,"Case courtesy of Radswiki, Radiopaedia.org, rI...",


In [None]:
# Count the COVID-19 positive rows taken in posteroanterior (PA) view.
# na=False makes rows with a missing 'finding' count as non-matches explicitly
# instead of relying on how pandas treats NaN inside a boolean mask.
len(df[df['finding'].str.contains('COVID-19', na=False) & (df['view'] == 'PA')])

196

In [None]:
# Keep only the COVID-19 positive rows taken in posteroanterior (PA) view.
# na=False: a missing 'finding' is treated as "not COVID-19" explicitly rather
# than leaving NaN values in the mask.
is_covid = df['finding'].str.contains('COVID-19', na=False)
is_pa_view = df['view'] == 'PA'
df = df[is_covid & is_pa_view]

### `COVID-19 X-ray Images`

In [None]:
# Drive locations for the COVID-19 images: the folder of the dataset checkout
# that holds them (source) and the folder they are copied into (destination).
# Both hang off the same project folder.
drive_project_dir = '/content/drive/MyDrive/COVID-19 Project'
covid_images_path = drive_project_dir + '/covid-chestxray-dataset/covid-chestxray-dataset-master/images'
target_path = drive_project_dir + "/Dataset/COVID"

In [None]:
# Create the COVID destination folder the first time the cell runs; when the
# path already exists nothing happens and nothing is printed.
covid_dir_exists = os.path.exists(target_path)
if not covid_dir_exists:
  os.makedirs(target_path)
  print('COVID-19 folder created')

COVID-19 folder created


In [None]:
# Copy every selected COVID-19 image from the dataset checkout into the
# destination folder. shutil.copy2 leaves the source file where it is.
for i, row in df.iterrows():
  filename = row['filename']
  img_path = os.path.join(covid_images_path, filename)
  img_copy_path = os.path.join(target_path, filename)
  shutil.copy2(img_path, img_copy_path)
  # The file is copied, not moved, so the progress line says so.
  # `i` is the row's index label in the metadata table, not a running counter.
  print('Copying image', i)

Moving image 0
Moving image 1
Moving image 2
Moving image 3
Moving image 4
Moving image 5
Moving image 7
Moving image 8
Moving image 20
Moving image 21
Moving image 22
Moving image 24
Moving image 26
Moving image 27
Moving image 29
Moving image 30
Moving image 31
Moving image 32
Moving image 33
Moving image 39
Moving image 48
Moving image 51
Moving image 54
Moving image 72
Moving image 81
Moving image 82
Moving image 88
Moving image 90
Moving image 91
Moving image 92
Moving image 93
Moving image 95
Moving image 96
Moving image 98
Moving image 99
Moving image 101
Moving image 103
Moving image 107
Moving image 110
Moving image 111
Moving image 115
Moving image 116
Moving image 117
Moving image 118
Moving image 129
Moving image 130
Moving image 131
Moving image 132
Moving image 133
Moving image 134
Moving image 136
Moving image 142
Moving image 143
Moving image 146
Moving image 147
Moving image 149
Moving image 151
Moving image 152
Moving image 156
Moving image 158
Moving image 176
Moving

In [None]:
# Sanity check: how many entries the COVID destination folder holds now.
covid_folder_entries = os.listdir(target_path)
len(covid_folder_entries)

196

### `Normal X-ray Images`

In [None]:
# Drive locations for the normal images: the folder they are sampled from
# (source) and the folder the sample is copied into (destination).
# Both hang off the same project folder.
drive_project_dir = '/content/drive/MyDrive/COVID-19 Project'
normal_images_path = drive_project_dir + '/normal/chest_xray/train/NORMAL'
target_path_normal = drive_project_dir + "/Dataset/NORMAL"

In [None]:
# Create the NORMAL destination folder the first time the cell runs; when the
# path already exists nothing happens and nothing is printed.
normal_dir_exists = os.path.exists(target_path_normal)
if not normal_dir_exists:
  os.makedirs(target_path_normal)
  print('NORMAL folder created')

NORMAL folder created


In [None]:
# os.listdir returns names in arbitrary order. Sorting gives the shuffle in the
# next cell a stable starting sequence, so a seeded RNG selects the same files
# on every run and every machine.
img_names = sorted(os.listdir(normal_images_path))

In [None]:
# Randomise the order of the normal image names in place; the copy loop below
# takes names from the front of this list, so this makes its pick random.
random.shuffle(img_names)

In [None]:
# Copy as many normal images as the COVID folder holds, so both classes end up
# the same size.
n_covid_images = len(os.listdir(target_path))

# Fail before touching any file instead of with an IndexError halfway through
# the copy when there are not enough normal images to match.
if len(img_names) < n_covid_images:
  raise ValueError(
      f'Need {n_covid_images} normal images but only {len(img_names)} are available'
  )

# img_names was shuffled beforehand, so its first entries are a random sample.
for i, img_name in enumerate(img_names[:n_covid_images]):
  img_path = os.path.join(normal_images_path, img_name)
  trgt_path = os.path.join(target_path_normal, img_name)
  shutil.copy2(img_path, trgt_path)
  print('Copying image', i)

Copying image 0
Copying image 1
Copying image 2
Copying image 3
Copying image 4
Copying image 5
Copying image 6
Copying image 7
Copying image 8
Copying image 9
Copying image 10
Copying image 11
Copying image 12
Copying image 13
Copying image 14
Copying image 15
Copying image 16
Copying image 17
Copying image 18
Copying image 19
Copying image 20
Copying image 21
Copying image 22
Copying image 23
Copying image 24
Copying image 25
Copying image 26
Copying image 27
Copying image 28
Copying image 29
Copying image 30
Copying image 31
Copying image 32
Copying image 33
Copying image 34
Copying image 35
Copying image 36
Copying image 37
Copying image 38
Copying image 39
Copying image 40
Copying image 41
Copying image 42
Copying image 43
Copying image 44
Copying image 45
Copying image 46
Copying image 47
Copying image 48
Copying image 49
Copying image 50
Copying image 51
Copying image 52
Copying image 53
Copying image 54
Copying image 55
Copying image 56
Copying image 57
Copying image 58
Copying

In [None]:
# Sanity check: how many entries the NORMAL destination folder holds now.
normal_folder_entries = os.listdir(target_path_normal)
len(normal_folder_entries)

196

### `Creating Train folder`

In [None]:
# Destination folders of the training split, one per class.
train_covid_path = '/content/drive/MyDrive/COVID-19 Project/Dataset/train/COVID'
train_normal_path = '/content/drive/MyDrive/COVID-19 Project/Dataset/train/NORMAL'

# Class folders the split takes its images from.
source_COVID = '/content/drive/MyDrive/COVID-19 Project/Dataset/COVID'
source_NORMAL = '/content/drive/MyDrive/COVID-19 Project/Dataset/NORMAL'

# Create whichever training folder does not exist yet and report it;
# an existing folder is skipped silently.
for train_dir, created_msg in (
    (train_covid_path, 'Train COVID folder created'),
    (train_normal_path, 'Train NORMAL folder created'),
):
  if os.path.exists(train_dir):
    continue
  os.makedirs(train_dir)
  print(created_msg)

Train COVID folder created
Train NORMAL folder created


In [None]:
# List both class folders. os.listdir gives no ordering guarantee, so sort
# first: the shuffles then start from the same sequence everywhere and a seeded
# RNG reproduces the same train/test split.
covid_img_names = sorted(os.listdir(source_COVID))
normal_img_names = sorted(os.listdir(source_NORMAL))

# Shuffle in place; the leading entries of each list become the training split.
random.shuffle(covid_img_names)
random.shuffle(normal_img_names)

In [None]:
# 75 % of each class goes to training. The size is derived from the folders
# actually listed instead of a hard-coded 196; the smaller class sets the limit
# because the images are moved one COVID/NORMAL pair at a time.
train_size = int(0.75 * min(len(covid_img_names), len(normal_img_names)))

In [None]:
# Move the first train_size shuffled images of each class into the training
# folders. shutil.move takes the files out of the source folders, so what stays
# behind there is what the test split picks up afterwards.
train_pairs = zip(covid_img_names[:train_size], normal_img_names[:train_size])
for i, (covid_img, normal_img) in enumerate(train_pairs):
  covid_img_path = os.path.join(source_COVID, covid_img)
  covid_trgt_path = os.path.join(train_covid_path, covid_img)
  shutil.move(covid_img_path, covid_trgt_path)

  normal_img_path = os.path.join(source_NORMAL, normal_img)
  normal_trgt_path = os.path.join(train_normal_path, normal_img)
  shutil.move(normal_img_path, normal_trgt_path)

  # The files are moved, not copied, so the progress line says so.
  print('Moving COVID & NORMAL image', i)

Copying COVID & NORMAL image 0
Copying COVID & NORMAL image 1
Copying COVID & NORMAL image 2
Copying COVID & NORMAL image 3
Copying COVID & NORMAL image 4
Copying COVID & NORMAL image 5
Copying COVID & NORMAL image 6
Copying COVID & NORMAL image 7
Copying COVID & NORMAL image 8
Copying COVID & NORMAL image 9
Copying COVID & NORMAL image 10
Copying COVID & NORMAL image 11
Copying COVID & NORMAL image 12
Copying COVID & NORMAL image 13
Copying COVID & NORMAL image 14
Copying COVID & NORMAL image 15
Copying COVID & NORMAL image 16
Copying COVID & NORMAL image 17
Copying COVID & NORMAL image 18
Copying COVID & NORMAL image 19
Copying COVID & NORMAL image 20
Copying COVID & NORMAL image 21
Copying COVID & NORMAL image 22
Copying COVID & NORMAL image 23
Copying COVID & NORMAL image 24
Copying COVID & NORMAL image 25
Copying COVID & NORMAL image 26
Copying COVID & NORMAL image 27
Copying COVID & NORMAL image 28
Copying COVID & NORMAL image 29
Copying COVID & NORMAL image 30
Copying COVID & NO

### `Creating Test folder`

In [None]:
# Destination folders of the test split, one per class.
test_covid_path = '/content/drive/MyDrive/COVID-19 Project/Dataset/test/COVID'
test_normal_path = '/content/drive/MyDrive/COVID-19 Project/Dataset/test/NORMAL'

# Class folders the split takes its images from.
source_COVID = '/content/drive/MyDrive/COVID-19 Project/Dataset/COVID'
source_NORMAL = '/content/drive/MyDrive/COVID-19 Project/Dataset/NORMAL'

# Create whichever test folder does not exist yet and report it;
# an existing folder is skipped silently.
for test_dir, created_msg in (
    (test_covid_path, 'Test COVID folder created'),
    (test_normal_path, 'Test NORMAL folder created'),
):
  if os.path.exists(test_dir):
    continue
  os.makedirs(test_dir)
  print(created_msg)

Test COVID folder created
Test NORMAL folder created


In [None]:
# Re-list both class folders to pick up the files that are in them now.
covid_img_names, normal_img_names = (
    os.listdir(source_COVID),
    os.listdir(source_NORMAL),
)

In [None]:
# Move every image still listed for the class folders into the test split.
# Each class is bounds-checked on its own: a class with fewer files no longer
# raises IndexError, and a class with more files is not left behind (leftovers
# would make the os.rmdir of the source folders at the end fail).
n_remaining = max(len(covid_img_names), len(normal_img_names))
for i in range(n_remaining):
  if i < len(covid_img_names):
    covid_img = covid_img_names[i]
    covid_img_path = os.path.join(source_COVID, covid_img)
    covid_trgt_path = os.path.join(test_covid_path, covid_img)
    shutil.move(covid_img_path, covid_trgt_path)

  if i < len(normal_img_names):
    normal_img = normal_img_names[i]
    normal_img_path = os.path.join(source_NORMAL, normal_img)
    normal_trgt_path = os.path.join(test_normal_path, normal_img)
    shutil.move(normal_img_path, normal_trgt_path)

  print('Moving COVID & NORMAL image', i)

Moving COVID & NORMAL image 0
Moving COVID & NORMAL image 1
Moving COVID & NORMAL image 2
Moving COVID & NORMAL image 3
Moving COVID & NORMAL image 4
Moving COVID & NORMAL image 5
Moving COVID & NORMAL image 6
Moving COVID & NORMAL image 7
Moving COVID & NORMAL image 8
Moving COVID & NORMAL image 9
Moving COVID & NORMAL image 10
Moving COVID & NORMAL image 11
Moving COVID & NORMAL image 12
Moving COVID & NORMAL image 13
Moving COVID & NORMAL image 14
Moving COVID & NORMAL image 15
Moving COVID & NORMAL image 16
Moving COVID & NORMAL image 17
Moving COVID & NORMAL image 18
Moving COVID & NORMAL image 19
Moving COVID & NORMAL image 20
Moving COVID & NORMAL image 21
Moving COVID & NORMAL image 22
Moving COVID & NORMAL image 23
Moving COVID & NORMAL image 24
Moving COVID & NORMAL image 25
Moving COVID & NORMAL image 26
Moving COVID & NORMAL image 27
Moving COVID & NORMAL image 28
Moving COVID & NORMAL image 29
Moving COVID & NORMAL image 30
Moving COVID & NORMAL image 31
Moving COVID & NOR

In [None]:
# Remove the two class folders the splits were taken from. os.rmdir only
# deletes empty directories, so this raises if any file was left behind.
for emptied_dir in (source_COVID, source_NORMAL):
  os.rmdir(emptied_dir)