In [None]:
import os
from shutil import copy, move

import pandas as pd

In [4]:
# Read the merged label table, then shuffle every row once with a fixed seed
# and renumber the index from zero.
merged_labels = pd.read_csv(
    '/ocean/projects/cis230079p/shared/CapStone-VeyTel-2024/datasets/merged_labels.csv'
)
shuffled_df = (
    merged_labels
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)
shuffled_df.head()

Unnamed: 0,age,sex,label,dataset,dataset_ID,frontal/lateral,AP/PA,filename
0,66.0,F,2,bimcv_pos,sub-S318342_ses-E37955_run-1_bp-chest_vp-pa_dx...,Frontal,PA,20846_bimcv_pos.png
1,54.0,F,0,chestxray14,00019378_000.png,Frontal,PA,11624_chestxray14.png
2,49.0,M,1,chexpert,patient02427/study2/view1_frontal.jpg,Frontal,PA,8423_chexpert.png
3,55.0,F,0,chestxray14,00019363_030.png,Frontal,AP,12563_chestxray14.png
4,60.0,F,1,chestxray14,00019199_001.png,Frontal,PA,15258_chestxray14.png


In [7]:
# Show only the rows whose label is 1.
shuffled_df.loc[shuffled_df['label'].eq(1)]

Unnamed: 0,age,sex,label,dataset,dataset_ID,frontal/lateral,AP/PA,filename
2,49.0,M,1,chexpert,patient02427/study2/view1_frontal.jpg,Frontal,PA,8423_chexpert.png
4,60.0,F,1,chestxray14,00019199_001.png,Frontal,PA,15258_chestxray14.png
7,37.0,M,1,chexpert,patient37821/study3/view1_frontal.jpg,Frontal,AP,10544_chexpert.png
11,51.0,M,1,chexpert,patient03915/study1/view1_frontal.jpg,Frontal,PA,8504_chexpert.png
16,46.0,M,1,chexpert,patient27450/study1/view1_frontal.jpg,Frontal,PA,9874_chexpert.png
...,...,...,...,...,...,...,...,...
24466,67.0,F,1,padchest,216840111366964013076187734852011269135118336_...,Frontal,AP,4764_padchest.png
24468,22.0,F,1,chexpert,patient02811/study1/view2_lateral.jpg,Lateral,,8444_chexpert.png
24474,55.0,F,1,padchest,27849891022487966511612013793372705700_l842mp.png,Frontal,AP,3462_padchest.png
24475,56.0,F,1,chexpert,patient48405/study1/view1_frontal.jpg,Frontal,AP,10989_chexpert.png


In [5]:
# Print column dtypes and non-null counts for the shuffled label table.
shuffled_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24488 entries, 0 to 24487
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              24488 non-null  float64
 1   sex              24488 non-null  object 
 2   label            24488 non-null  int64  
 3   dataset          24488 non-null  object 
 4   dataset_ID       24488 non-null  object 
 5   frontal/lateral  24488 non-null  object 
 6   AP/PA            20105 non-null  object 
 7   filename         24488 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 1.5+ MB


In [6]:
def data_split(data, train_fraction, valid_fraction, test_fraction):
  """Slice ``data`` into consecutive train / validation / test frames.

  Rows are taken in their existing order (nothing is shuffled here):

    * train: the first ``int(train_fraction * len(data))`` rows
    * valid: the rows from there up to
      ``int((train_fraction + valid_fraction) * len(data))``
    * test:  every remaining row

  Args:
    data: DataFrame to split; it is only sliced, never modified.
    train_fraction: share of rows for the training frame.
    valid_fraction: share of rows for the validation frame.
    test_fraction: share of rows for the test frame.

  Returns:
    ``(train_df, valid_df, test_df)``, each with a fresh 0-based index.

  Raises:
    AssertionError: if a fraction is negative or the three fractions do not
      sum to 1 (within a small floating-point tolerance).
  """
  fractions = (train_fraction, valid_fraction, test_fraction)
  assert min(fractions) >= 0, f"split fractions must be non-negative, got {fractions}"
  total = sum(fractions)
  # Compare with a tolerance: exact float equality rejects valid splits such
  # as 0.7 + 0.2 + 0.1, which evaluates to 0.9999999999999999.
  assert abs(total - 1) <= 1e-9, f"split fractions must sum to 1, got {total}"

  num_of_sample = len(data)
  # Boundaries are truncated (not rounded) so that existing splits stay stable.
  train_end = int(train_fraction * num_of_sample)
  valid_end = int((train_fraction + valid_fraction) * num_of_sample)

  train_df = data.iloc[:train_end].reset_index(drop=True)
  valid_df = data.iloc[train_end:valid_end].reset_index(drop=True)
  test_df = data.iloc[valid_end:].reset_index(drop=True)
  return train_df, valid_df, test_df

def extract_index(train_df, valid_df, test_df):
  """Group each split's filenames by class label.

  Args:
    train_df, valid_df, test_df: frames with a 'label' and a "filename" column.

  Returns:
    ``(train_ids, valid_ids, test_ids)``. Each is a dict keyed by the labels
    0, 1 and 2 whose value is the list of "filename" entries, in row order,
    of the rows carrying that label.
  """
  def filenames_by_label(split_df):
    # One list per label, built in the fixed order 0, 1, 2.
    return {
        label: list(split_df.loc[split_df['label'] == label, "filename"])
        for label in (0, 1, 2)
    }

  train_ids = filenames_by_label(train_df)
  valid_ids = filenames_by_label(valid_df)
  test_ids = filenames_by_label(test_df)
  return train_ids, valid_ids, test_ids

In [7]:
# 80% train / 5% validation / 15% test, cut from the shuffled table in order.
train_df, valid_df, test_df = data_split(
    shuffled_df,
    train_fraction=0.8,
    valid_fraction=0.05,
    test_fraction=0.15,
)
train_ids, valid_ids, test_ids = extract_index(
    train_df=train_df, valid_df=valid_df, test_df=test_df
)

In [8]:
# Flatten the per-class filename lists (classes 0, 1, 2 in that order) into
# one list per split.
train_id = [name for cls in (0, 1, 2) for name in train_ids[cls]]
valid_id = [name for cls in (0, 1, 2) for name in valid_ids[cls]]
test_id = [name for cls in (0, 1, 2) for name in test_ids[cls]]

In [10]:
# Human-readable name for each integer class label (0, 1, 2).
class_dict = dict(enumerate(("Normal", "Non-Covid-Pne", "Covid-Pne")))

In [None]:
# Create the output root up front. exist_ok=True keeps the cell re-runnable,
# whereas a bare `mkdir` errors once the directory already exists.
os.makedirs('/ocean/projects/cis230079p/shared/CapStone-VeyTel-2024/datasets/data-clean', exist_ok=True)

def formulate_data(DATA_DIR, PARTITION:list, OUT_DIR:str='/ocean/projects/cis230079p/shared/CapStone-VeyTel-2024/datasets/data-clean'):
  """Copy every split image from the partition folders into the output tree.

  Each folder ``DATA_DIR/<partition>`` is listed (in sorted order) and every
  file is routed by the notebook-level split lists:

    * listed in ``train_id`` -> ``OUT_DIR/train/<cls>/<file>`` for each class
      ``cls`` in 0..2 whose ``train_ids[cls]`` contains it
    * else listed in ``valid_id`` -> ``OUT_DIR/dev/<cls>/<file>`` likewise,
      using ``valid_ids[cls]``
    * else listed in ``test_id`` -> ``OUT_DIR/test/<file>`` (no class folder)

  Files that appear in none of the lists are skipped. Source files are
  copied, not moved, so the partition folders are left untouched.

  Reads the globals ``train_id``, ``train_ids``, ``valid_id``, ``valid_ids``
  and ``test_id``; they must be defined before this function is called.

  Args:
    DATA_DIR: directory that contains one sub-folder per partition.
    PARTITION: names of the sub-folders of ``DATA_DIR`` to scan.
    OUT_DIR: root of the output tree; defaults to the project's data-clean
      folder.

  Returns:
    None. Prints the number of copies made.
  """
  try:
    from tqdm.auto import tqdm
  except ImportError:
    # The progress bar is purely cosmetic, so degrade to plain iteration.
    def tqdm(iterable):
      return iterable

  # Build hash-based lookups once: testing membership against the original
  # lists for every file makes the loop quadratic in the dataset size.
  train_all = set(train_id)
  valid_all = set(valid_id)
  test_all = set(test_id)
  train_by_cls = {cls: set(train_ids[cls]) for cls in range(3)}
  valid_by_cls = {cls: set(valid_ids[cls]) for cls in range(3)}

  def _copy_to(src, out_path):
    # Create the destination folder on demand, then copy the image into it.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    copy(src, out_path)

  cnt = 0
  for partition in PARTITION:
    img_file_path = os.path.join(DATA_DIR, partition)
    img_files = sorted(os.listdir(img_file_path))
    for file in tqdm(img_files):
      src = os.path.join(img_file_path, file)

      if file in train_all:
        for cls in range(3):
          if file in train_by_cls[cls]:
            _copy_to(src, f'{OUT_DIR}/train/{cls}/{file}')
            cnt += 1

      elif file in valid_all:
        for cls in range(3):
          if file in valid_by_cls[cls]:
            _copy_to(src, f'{OUT_DIR}/dev/{cls}/{file}')
            cnt += 1

      elif file in test_all:
        # Test images are stored flat, without a per-class sub-folder.
        _copy_to(src, f'{OUT_DIR}/test/{file}')
        cnt += 1
  print(f"Copied {cnt} images.")

In [None]:
# Root folder holding one sub-folder per partition; each is scanned in turn.
DATA_DIR = "/ocean/projects/cis230079p/shared/CapStone-VeyTel-2024/datasets"
PARTITION = [
    "neg",
    "pos_non_COVID",
    "pos_COVID",
]
formulate_data(DATA_DIR=DATA_DIR, PARTITION=PARTITION)

  0%|          | 0/3174 [00:00<?, ?it/s]

  0%|          | 0/1008 [00:00<?, ?it/s]

Moved 4182 images.
