# Pre-Process COVIDx Dataset

In [60]:
from fastai2.vision.all import *
import os.path
path = Path('/home/jupyter/covidx')

In [61]:
torch.cuda.empty_cache()

In [62]:
# fix result 
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
SEED = 42
seed_everything(SEED)

## 1. View COVIDx

In [69]:
path = Path('/home/jupyter/covidx/train')
folder_train = len([f for f in os.listdir(path)if os.path.isfile(os.path.join(path, f))])
folder_train

13917

In [3]:
path = Path('/home/jupyter/covidx/test')
folder_test = len([f for f in os.listdir(path)if os.path.isfile(os.path.join(path, f))])
folder_test

1580

In [67]:
path = Path('/home/jupyter/covidx')
df_train = pd.read_csv(path/'covidx_train.csv')
df_train

Unnamed: 0,3,SARS-10.1148rg.242035193-g04mr34g0-Fig8b-day5.jpeg,pneumonia,cohen
0,3,SARS-10.1148rg.242035193-g04mr34g0-Fig8c-day10.jpeg,pneumonia,cohen
1,7,SARS-10.1148rg.242035193-g04mr34g04a-Fig4a-day7.jpeg,pneumonia,cohen
2,7,SARS-10.1148rg.242035193-g04mr34g04b-Fig4b-day12.jpeg,pneumonia,cohen
3,9,SARS-10.1148rg.242035193-g04mr34g07a-Fig7a-day5.jpeg,pneumonia,cohen
4,9,SARS-10.1148rg.242035193-g04mr34g07b-Fig7b-day12.jpeg,pneumonia,cohen
...,...,...,...,...
13911,c1628c47-5ba3-42dd-8df3-7ad3abd57ad0,c1628c47-5ba3-42dd-8df3-7ad3abd57ad0.png,pneumonia,rsna
13912,c1cddf32-b957-4753-acaa-472ab1447e86,c1cddf32-b957-4753-acaa-472ab1447e86.png,pneumonia,rsna
13913,c1e73a4e-7afe-4ec5-8af6-ce8315d7a2f2,c1e73a4e-7afe-4ec5-8af6-ce8315d7a2f2.png,pneumonia,rsna
13914,c1ec14ff-f6d7-4b38-b0cb-fe07041cbdc8,c1ec14ff-f6d7-4b38-b0cb-fe07041cbdc8.png,pneumonia,rsna


In [66]:
path = Path('/home/jupyter/covidx')
df_test = pd.read_csv(path/'covidx_test.csv')
df_test

Unnamed: 0,8,SARS-10.1148rg.242035193-g04mr34g05x-Fig5-day9.jpeg,pneumonia,cohen
0,31,streptococcus-pneumoniae-pneumonia-temporal-evolution-1-day0.jpg,pneumonia,cohen
1,31,streptococcus-pneumoniae-pneumonia-temporal-evolution-1-day1.jpg,pneumonia,cohen
2,31,streptococcus-pneumoniae-pneumonia-temporal-evolution-1-day2.jpg,pneumonia,cohen
3,31,streptococcus-pneumoniae-pneumonia-temporal-evolution-1-day3.jpg,pneumonia,cohen
4,19,1-s2.0-S0929664620300449-gr2_lrg-a.jpg,COVID-19,cohen
...,...,...,...,...
1573,2c917d3a-95cb-4c11-802c-f83e28cb37bc,2c917d3a-95cb-4c11-802c-f83e28cb37bc.png,pneumonia,rsna
1574,3040d9d7-d895-453f-887c-616c10531960,3040d9d7-d895-453f-887c-616c10531960.png,pneumonia,rsna
1575,c07f52df-d481-434f-84c1-04263926ac40,c07f52df-d481-434f-84c1-04263926ac40.png,pneumonia,rsna
1576,c109061a-d815-4cae-8343-9230d8024adf,c109061a-d815-4cae-8343-9230d8024adf.png,pneumonia,rsna


## 2. Combine Images

In [71]:
# run this line in the terminal
# mv /home/jupyter/covidx/test/* /home/jupyter/covidx/images
path = Path('/home/jupyter/covidx/images') 
folder_images = len([f for f in os.listdir(path)if os.path.isfile(os.path.join(path, f))])
folder_images

15496

## 3. Join CSVs & Split COVIDx by is_valid Column

In [73]:
path = Path('/home/jupyter/covidx')
# add Headers
df_train = pd.read_csv(path/"covidx_train.csv", names=['patientid', 'path', 'finding', 'source'])
df_test = pd.read_csv(path/"covidx_test.csv", names=['patientid', 'path', 'finding', 'source'])
# add is_valid col
df_train['is_valid'] = True
df_test['is_valid'] = False

In [69]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13916 entries, 0 to 13915
Data columns (total 5 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   3                                                   13916 non-null  object
 1   SARS-10.1148rg.242035193-g04mr34g0-Fig8b-day5.jpeg  13916 non-null  object
 2   pneumonia                                           13916 non-null  object
 3   cohen                                               13916 non-null  object
 4   is_valid                                            13916 non-null  bool  
dtypes: bool(1), object(4)
memory usage: 448.6+ KB


In [74]:
# stack the data frames on top of each other
frames = [df_train, df_test] 
df_covidx = pd.concat(frames)
df_covidx.to_csv( path/"covidx.csv", index=False, encoding='utf-8-sig')
df_covidx.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15496 entries, 0 to 1578
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   patientid  15496 non-null  object
 1   path       15496 non-null  object
 2   finding    15496 non-null  object
 3   source     15496 non-null  object
 4   is_valid   15496 non-null  bool  
dtypes: bool(1), object(4)
memory usage: 620.4+ KB


In [75]:
df_covidx['is_valid'].value_counts()

True     13917
False     1579
Name: is_valid, dtype: int64