# Generate COVIDcxr Dataset

In [1]:
import numpy as np
import pandas as pd
import os
import random 
from shutil import copyfile
import pydicom as dicom
import cv2
from fastai2.vision.all import *

## 1. COVID-19 Images

##### Download covid-19 image data collection from: https://github.com/ieee8023/covid-chestxray-dataset

In [12]:
# view csv file
covid19_csvpath = '/home/jupyter/covid-chestxray-dataset/metadata.csv'
dfcovid = pd.read_csv(covid19_csvpath)
dfcovid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 29 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   patientid               930 non-null    object 
 1   offset                  684 non-null    float64
 2   sex                     850 non-null    object 
 3   age                     693 non-null    float64
 4   finding                 930 non-null    object 
 5   RT_PCR_positive         582 non-null    object 
 6   survival                358 non-null    object 
 7   intubated               243 non-null    object 
 8   intubation_present      246 non-null    object 
 9   went_icu                392 non-null    object 
 10  in_icu                  331 non-null    object 
 11  needed_supplemental_O2  88 non-null     object 
 12  extubated               37 non-null     object 
 13  temperature             74 non-null     float64
 14  pO2_saturation          119 non-null    fl

In [13]:
# clean the csv file 
## drop CT images
dfcovid = dfcovid[dfcovid.modality != 'CT']
## drop other findings such as ARDS & Lateral View
dfcovid = dfcovid[dfcovid.finding == 'COVID-19']
dfcovid = dfcovid[dfcovid.view != 'L'] #23
## drop unused columns
dfcovid = dfcovid.drop(["offset", "RT_PCR_positive", "survival", "intubated", "intubation_present", "went_icu", "in_icu", 
              "needed_supplemental_O2", "extubated","temperature", 
              "pO2_saturation", "leukocyte_count", "neutrophil_count", "lymphocyte_count", 
              "modality", "date", "location", "folder", "doi",
              "url", "license", "clinical_notes", "other_notes"], axis=1)
## drop NULL values
dfcovid = dfcovid.dropna(subset=['age']) 
dfcovid = dfcovid.dropna(subset=['sex']) 
dfcovid = dfcovid.dropna(subset=['view']) 
## re-name columns
dfcovid = dfcovid.rename(columns={"filename": "path"})
## re-order columns
dfcovid = dfcovid[['path', 'finding', 'age', 'sex', 'view', 'patientid']]
## save dfcovid as a new clean csv
dfcovid.to_csv( "/home/jupyter/CovidXrayNet/covid.csv", index=False, encoding='utf-8-sig') 

In [14]:
## view covid.csv
dfcovid.info(), dfcovid.isnull().sum() ,dfcovid['finding'].value_counts(), dfcovid['age'].value_counts(), dfcovid['sex'].value_counts(), dfcovid['view'].value_counts()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 320 entries, 0 to 919
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   path       320 non-null    object 
 1   finding    320 non-null    object 
 2   age        320 non-null    float64
 3   sex        320 non-null    object 
 4   view       320 non-null    object 
 5   patientid  320 non-null    object 
dtypes: float64(1), object(5)
memory usage: 17.5+ KB


(None,
 path         0
 finding      0
 age          0
 sex          0
 view         0
 patientid    0
 dtype: int64,
 COVID-19    320
 Name: finding, dtype: int64,
 65.0    20
 70.0    19
 55.0    18
 50.0    18
 61.0    12
         ..
 41.0     1
 31.0     1
 84.0     1
 33.0     1
 57.0     1
 Name: age, Length: 62, dtype: int64,
 M    213
 F    107
 Name: sex, dtype: int64,
 AP           119
 PA           117
 AP Supine     84
 Name: view, dtype: int64)

## 2. Pnumenia and Healthy Images

##### Download ChestX-ray14 dataset from: https://www.kaggle.com/nih-chest-xrays/data

In [23]:
# view chestxray14 csv file
chestxray14_csvpath = '/home/jupyter/Data_Entry_2017.csv'
dfpnumenia = pd.read_csv(chestxray14_csvpath)
dfpnumenia.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112120 entries, 0 to 112119
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Image Index                  112120 non-null  object 
 1   Finding Labels               112120 non-null  object 
 2   Follow-up #                  112120 non-null  int64  
 3   Patient ID                   112120 non-null  int64  
 4   Patient Age                  112120 non-null  int64  
 5   Patient Gender               112120 non-null  object 
 6   View Position                112120 non-null  object 
 7   OriginalImage[Width          112120 non-null  int64  
 8   Height]                      112120 non-null  int64  
 9   OriginalImagePixelSpacing[x  112120 non-null  float64
 10  y]                           112120 non-null  float64
 11  Unnamed: 11                  0 non-null       float64
dtypes: float64(3), int64(5), object(4)
memory usage: 10.3+ MB


In [24]:
# drop unused columns
dfpnumenia = dfpnumenia.drop(["Follow-up #", "OriginalImage[Width", "Height]", 
              "OriginalImagePixelSpacing[x","y]", "Unnamed: 11"], axis=1)
# arrange columns
dfpnumenia = dfpnumenia[['Image Index', 'Finding Labels', 'Patient Age', 'Patient Gender', 
         'View Position', 'Patient ID']]
# rename columns
dfpnumenia = dfpnumenia.rename(columns={"Image Index": "path", "Finding Labels": "finding", 
                        "Patient Age": "age", "Patient Gender": "sex",
                        "View Position": "view", "Patient ID": "patientid"})
# 1. pnumenia
dfpnumenia = dfpnumenia[dfpnumenia.finding == 'Pneumonia']
dfpnumenia = dfpnumenia.drop(dfpnumenia.index[320:])
dfpnumenia.to_csv( "/home/jupyter/CovidXrayNet/pneumonia.csv", index=False, encoding='utf-8-sig')

In [26]:
dfpnumenia.info(), dfpnumenia.isnull().sum(), dfpnumenia['finding'].value_counts(), dfpnumenia['age'].value_counts(), dfpnumenia['sex'].value_counts(), dfpnumenia['view'].value_counts()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 320 entries, 279 to 109877
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   path       320 non-null    object
 1   finding    320 non-null    object
 2   age        320 non-null    int64 
 3   sex        320 non-null    object
 4   view       320 non-null    object
 5   patientid  320 non-null    int64 
dtypes: int64(2), object(4)
memory usage: 17.5+ KB


(None,
 path         0
 finding      0
 age          0
 sex          0
 view         0
 patientid    0
 dtype: int64,
 Pneumonia    320
 Name: finding, dtype: int64,
 33    12
 46     9
 44     9
 50     9
 63     8
       ..
 75     1
 77     1
 79     1
 82     1
 3      1
 Name: age, Length: 78, dtype: int64,
 M    193
 F    127
 Name: sex, dtype: int64,
 PA    176
 AP    144
 Name: view, dtype: int64)

In [27]:
# read chestxray14 csv file
chestxray14_csvpath = '/home/jupyter/Data_Entry_2017.csv'
dfhealthy = pd.read_csv(chestxray14_csvpath)
# drop unused columns
dfhealthy = dfhealthy.drop(["Follow-up #", "OriginalImage[Width", "Height]", 
              "OriginalImagePixelSpacing[x","y]", "Unnamed: 11"], axis=1)
# arrange columns
dfhealthy = dfhealthy[['Image Index', 'Finding Labels', 'Patient Age', 'Patient Gender', 
         'View Position', 'Patient ID']]
# rename columns
dfhealthy = dfhealthy.rename(columns={"Image Index": "path", "Finding Labels": "finding", 
                        "Patient Age": "age", "Patient Gender": "sex",
                        "View Position": "view", "Patient ID": "patientid"})
# 2. healthy
dfhealthy = dfhealthy[dfhealthy.finding == 'No Finding']
dfhealthy = dfhealthy.drop(dfhealthy.index[320:])
dfhealthy.to_csv( "/home/jupyter/CovidXrayNet/healthy.csv", index=False, encoding='utf-8-sig')

In [28]:
dfhealthy.info(), dfhealthy.isnull().sum(), dfhealthy['finding'].value_counts(), dfhealthy['age'].value_counts(), dfhealthy['sex'].value_counts(), dfhealthy['view'].value_counts()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 320 entries, 3 to 787
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   path       320 non-null    object
 1   finding    320 non-null    object
 2   age        320 non-null    int64 
 3   sex        320 non-null    object
 4   view       320 non-null    object
 5   patientid  320 non-null    int64 
dtypes: int64(2), object(4)
memory usage: 17.5+ KB


(None,
 path         0
 finding      0
 age          0
 sex          0
 view         0
 patientid    0
 dtype: int64,
 No Finding    320
 Name: finding, dtype: int64,
 55    25
 50    23
 70    18
 71    15
 73    15
 67    13
 75    13
 64    12
 69    12
 61    10
 77     9
 68     9
 62     8
 52     8
 54     8
 56     7
 58     7
 60     7
 53     7
 63     7
 76     6
 65     6
 66     5
 81     5
 74     5
 46     4
 83     4
 59     4
 57     4
 78     4
 87     3
 92     3
 51     3
 48     2
 49     2
 42     2
 32     2
 31     2
 94     2
 72     2
 79     2
 82     2
 85     2
 80     1
 84     1
 47     1
 45     1
 89     1
 34     1
 33     1
 90     1
 91     1
 30     1
 25     1
 Name: age, dtype: int64,
 M    162
 F    158
 Name: sex, dtype: int64,
 PA    230
 AP     90
 Name: view, dtype: int64)

## 3. Combine Images  

In [2]:
# view all csv files
path = Path('/home/jupyter/CovidXrayNet')
os.chdir("/home/jupyter/CovidXrayNet") 
df_covid19 = pd.read_csv(path/'covid19.csv') 
df_pneumonia = pd.read_csv(path/'pneumonia.csv') 
df_healthy = pd.read_csv(path/'healthy.csv')

In [9]:
df_covid19.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 320 entries, 0 to 319
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   path       320 non-null    object 
 1   finding    320 non-null    object 
 2   age        320 non-null    float64
 3   sex        320 non-null    object 
 4   view       320 non-null    object 
 5   patientid  320 non-null    object 
dtypes: float64(1), object(5)
memory usage: 15.1+ KB


In [10]:
df_pneumonia.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 320 entries, 0 to 319
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   path       320 non-null    object
 1   finding    320 non-null    object
 2   age        320 non-null    int64 
 3   sex        320 non-null    object
 4   view       320 non-null    object
 5   patientid  320 non-null    int64 
dtypes: int64(2), object(4)
memory usage: 15.1+ KB


In [11]:
df_healthy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 320 entries, 0 to 319
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   path       320 non-null    object
 1   finding    320 non-null    object
 2   age        320 non-null    int64 
 3   sex        320 non-null    object
 4   view       320 non-null    object
 5   patientid  320 non-null    int64 
dtypes: int64(2), object(4)
memory usage: 15.1+ KB


In [14]:
# Merge csv files vertically (on top of each other)
frames = [df_covid19, df_pneumonia, df_healthy]
df_covidcxr = pd.concat(frames)
df_covidcxr.to_csv( "COVIDcxr.csv", index=False, encoding='utf-8-sig')
df_covidcxr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 960 entries, 0 to 319
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   path       960 non-null    object 
 1   finding    960 non-null    object 
 2   age        960 non-null    float64
 3   sex        960 non-null    object 
 4   view       960 non-null    object 
 5   patientid  960 non-null    object 
dtypes: float64(1), object(5)
memory usage: 52.5+ KB


In [15]:
df_covidcxr.isnull().sum(), df_covidcxr['finding'].value_counts(), df_covidcxr['age'].value_counts(), df_covidcxr['sex'].value_counts(), df_covidcxr['view'].value_counts()

(path         0
 finding      0
 age          0
 sex          0
 view         0
 patientid    0
 dtype: int64,
 COVID-19      320
 No Finding    320
 Pneumonia     320
 Name: finding, dtype: int64,
 55.0    51
 50.0    50
 70.0    37
 65.0    32
 73.0    29
         ..
 3.0      1
 17.0     1
 12.0     1
 90.0     1
 91.0     1
 Name: age, Length: 91, dtype: int64,
 M    568
 F    392
 Name: sex, dtype: int64,
 PA           523
 AP           353
 AP Supine     84
 Name: view, dtype: int64)

In [16]:
df_covidcxr['age'].describe()

count    960.000000
mean      54.669792
std       18.355041
min        3.000000
25%       44.000000
50%       56.000000
75%       69.000000
max       94.000000
Name: age, dtype: float64

In [18]:
## move covid-19 images to one folder (if the path exist in csv)
dir_src = '/home/jupyter/covid-chestxray-dataset/images'
dir_dst = '/home/jupyter/covidcxr'
for fileName in df_covid19['path']: 
    file_src = dir_src + "/" + fileName
    file_dst = dir_dst + "/" + fileName
    try:
        copyfile(file_src, file_dst)  
    except IOError as e:
        print('Unable to copy file {} to {}'
              .format(file_src, file_dst))
    except:
        print('When try copy file {} to {}, unexpected error: {}'
              .format(file_src, file_dst, sys.exc_info())) 

In [26]:
path = Path ('/home/jupyter/covidcxr')
num_files = len([f for f in os.listdir(path)if os.path.isfile(os.path.join(path, f))])
num_files

320

In [None]:
# Run the following lines into the terminal to combine all nih-chestxary images into one file
# mv /home/jupyter/images_001/images/* /home/jupyter/nih-chestxray
# mv /home/jupyter/images_002/images/* /home/jupyter/nih-chestxray
# mv /home/jupyter/images_003/images/* /home/jupyter/nih-chestxray
# mv /home/jupyter/images_004/images/* /home/jupyter/nih-chestxray
# mv /home/jupyter/images_005/images/* /home/jupyter/nih-chestxray
# mv /home/jupyter/images_006/images/* /home/jupyter/nih-chestxray
# mv /home/jupyter/images_007/images/* /home/jupyter/nih-chestxray
# mv /home/jupyter/images_008/images/* /home/jupyter/nih-chestxray
# mv /home/jupyter/images_009/images/* /home/jupyter/nih-chestxray
# mv /home/jupyter/images_010/images/* /home/jupyter/nih-chestxray
# mv /home/jupyter/images_011/images/* /home/jupyter/nih-chestxray
# mv /home/jupyter/images_012/images/* /home/jupyter/nih-chestxray

In [43]:
## move pneumonia cxr to one folder (if the path exist in csv)
dir_src = '/home/jupyter/nih-chestxray'
dir_dst = '/home/jupyter/covidcxr'
for fileName in df_pneumonia['path']: 
    file_src = dir_src + "/" + fileName
    file_dst = dir_dst + "/" + fileName
    try:
        copyfile(file_src, file_dst)   
    except IOError as e:
        print('Unable to copy file {} to {}'
              .format(file_src, file_dst))
    except:
        print('When try copy file {} to {}, unexpected error: {}'
              .format(file_src, file_dst, sys.exc_info()))     

In [44]:
path = Path ('/home/jupyter/covidcxr')
num_files = len([f for f in os.listdir(path)if os.path.isfile(os.path.join(path, f))])
num_files

640

In [45]:
## move healthy cxr to one folder (if the path exist in csv)
for fileName in df_healthy['path']: 
    file_src = dir_src + "/" + fileName
    file_dst = dir_dst + "/" + fileName
    try:
        copyfile(file_src, file_dst)    
    except IOError as e:
        print('Unable to copy file {} to {}'
              .format(file_src, file_dst))
    except:
        print('When try copy file {} to {}, unexpected error: {}'
              .format(file_src, file_dst, sys.exc_info())) 

In [46]:
path = Path ('/home/jupyter/covidcxr')
num_files = len([f for f in os.listdir(path)if os.path.isfile(os.path.join(path, f))])
num_files

960