In [None]:
# we need the following libraries, so let's install them
!pip install boto3
!pip install torchxrayvision
!pip install SimpleITK
!pip install radiomics
!pip install pyradiomics
!pip install torchvision

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# importing libraries
from time import time
import multiprocessing
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
from glob import glob
import matplotlib.pyplot as plt
# importing written classes and methods
import sys
from radiomics import featureextractor
from radiomics import firstorder, glcm, glrlm, glszm, shape, shape2D, ngtdm, gldm
import warnings
import logging

# Raise the level of the top-level "radiomics" logger to ERROR.
# To adjust a single feature class instead, fetch its own logger by name,
# e.g. logging.getLogger("radiomics.glcm"), and set the level on that.
logger = logging.getLogger("radiomics")
logger.setLevel(logging.ERROR)

# Do not show DeprecationWarning messages in the cell outputs.
warnings.filterwarnings(action="ignore", category=DeprecationWarning)


In [None]:
# Locations on disk: the dataset root, followed by one sub-directory each for
# the processed images, filters, segments, cleaned targets and features.
directory_ = '/content/sample_data/CheXpert/'
directory_processed_images = f'{directory_}processed_images'
directory_filterss = f'{directory_}filters'
directory_segments = f'{directory_}segments'
directory_cleaned_targets = f'{directory_}cleaned_targets'
directory_features = f'{directory_}features'

In [None]:
# Load the cleaned targets. The .npy file holds a pickled Python object, which
# is why allow_pickle=True is required; .item() extracts that single object.
targets_file = '/'.join([directory_cleaned_targets, 'new_targets.npy'])
target = np.load(targets_file, allow_pickle=True).item()

# Turn the targets into a table and remember its column names.
target_data_frame = pd.DataFrame(target)
name_of_targets = target_data_frame.columns.tolist()

# Preview the first ten rows.
target_data_frame.head(10)

Unnamed: 0,index,Path,Patient_id,Study_Number,Sex,Age,Race,Ethnicity,Frontal/Lateral,AP/PA,...,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
0,0,CheXpert-v1.0/train/patient00001/study1/view1_...,1,1,Female,68.0,Other,Non-Hispanic/Non-Latino,Frontal,AP,...,0,0,0,0,0,0,0,0,0,1
1,1,CheXpert-v1.0/train/patient00002/study2/view1_...,2,2,Female,87.0,"White, non-Hispanic",Non-Hispanic/Non-Latino,Frontal,AP,...,0,-1,-1,0,-1,0,-1,0,1,0
2,2,CheXpert-v1.0/train/patient00002/study1/view1_...,2,1,Female,83.0,"White, non-Hispanic",Non-Hispanic/Non-Latino,Frontal,AP,...,0,0,-1,0,0,0,0,0,1,0
3,4,CheXpert-v1.0/train/patient00003/study1/view1_...,3,1,Male,41.0,"White, non-Hispanic",Non-Hispanic/Non-Latino,Frontal,AP,...,0,1,0,0,0,0,0,0,0,0
4,5,CheXpert-v1.0/train/patient00004/study1/view1_...,4,1,Female,20.0,Black or African American,Non-Hispanic/Non-Latino,Frontal,PA,...,0,0,0,0,0,0,0,0,0,0
5,7,CheXpert-v1.0/train/patient00005/study1/view1_...,5,1,Male,33.0,White,Non-Hispanic/Non-Latino,Frontal,PA,...,0,0,0,0,0,0,0,0,0,1
6,9,CheXpert-v1.0/train/patient00005/study2/view1_...,5,2,Male,33.0,White,Non-Hispanic/Non-Latino,Frontal,AP,...,0,0,0,0,0,1,0,0,0,0
7,10,CheXpert-v1.0/train/patient00005/study2/view2_...,5,2,Male,33.0,White,Non-Hispanic/Non-Latino,Frontal,AP,...,0,0,0,0,0,1,0,0,0,0
8,11,CheXpert-v1.0/train/patient00006/study1/view1_...,6,1,Female,42.0,Native Hawaiian or Other Pacific Islander,Non-Hispanic/Non-Latino,Frontal,AP,...,0,0,0,0,0,0,0,0,0,0
9,12,CheXpert-v1.0/train/patient00007/study1/view1_...,7,1,Male,69.0,Other,Hispanic/Latino,Frontal,AP,...,0,0,0,0,1,1,0,0,0,1


In [None]:
# Size of the target table: number of rows (samples) and number of columns.
n_samples_target = target_data_frame.shape[0]
n_targets_types = target_data_frame.shape[1]
(n_samples_target, n_targets_types)

(191212, 24)

In [None]:
# Checking which distinct values occur in the 'Frontal/Lateral' and 'AP/PA' columns.
(
    target_data_frame['Frontal/Lateral'].unique(),
    target_data_frame['AP/PA'].unique(),
)

(array(['Frontal'], dtype=object), array(['AP', 'PA'], dtype=object))

In [None]:
# Count the distinct values in the 'Patient_id' column and report the total.
patient_ids = target_data_frame['Patient_id']
n_subj = len(patient_ids.unique())
subject_message = "The number of subjects are: {0}"
print(subject_message.format(n_subj))

The number of subjects are: 64734


In [None]:
# Collect the paths of all saved radiomics feature files (*.npy) and report
# how many were found.
feature_file_pattern = directory_features + '/*.npy'
radiomics_names = glob(feature_file_pattern)
sample_message = "The number of samples (images) are: {0}"
print(sample_message.format(len(radiomics_names)))

The number of samples (images) are: 191212


In [None]:
# Length of one feature vector, read from the first feature file found.
first_feature_vector = np.load(radiomics_names[0])
n_type_of_features = first_feature_vector.shape[0]
n_type_of_features

464

In [None]:
# Preallocate the final synced table, filled with NaN: one row per target
# sample, feature columns first and the target columns after them.
Features_names = [f'feature_{i}' for i in range(n_type_of_features)]
synced_columns = Features_names + name_of_targets
nan_block = np.full((n_samples_target, n_type_of_features + n_targets_types), np.nan)
Synced_data = pd.DataFrame(nan_block, columns=synced_columns)

In [None]:
# Syncing the data: pair every radiomics feature vector with its target row.
#
# For each feature file, the first four '__'-separated tokens of the file name
# are joined with '/', prefixed with 'CheXpert-v1.0/' and suffixed with '.jpg';
# the resulting key is looked up in the 'Path' column of the targets.
#
# The lookup is done once for all files through an index on 'Path' instead of
# scanning the whole target table for every file, and the feature width comes
# from `n_type_of_features` rather than a hard-coded 464.
feature_matrix = np.full((len(radiomics_names), n_type_of_features), np.nan)
path_keys = []
for j, feature_file in enumerate(tqdm(radiomics_names)):
    name_tokens = os.path.basename(feature_file).split('__')
    path_keys.append('CheXpert-v1.0/' + '/'.join(name_tokens[:4]) + '.jpg')
    feature_matrix[j] = np.load(feature_file)

# Index the targets by 'Path' (keeping the column itself). If a path occurs
# more than once, only its first row is used so that the lookup is unambiguous.
targets_by_path = (
    target_data_frame
    .drop_duplicates(subset='Path')
    .set_index('Path', drop=False)
)

# One target row per feature file, in the same order as `radiomics_names`;
# keys that are absent from the targets produce an all-NaN row.
matched_targets = targets_by_path.reindex(path_keys)

n_unmatched = int(matched_targets['Path'].isna().sum())
if n_unmatched:
    warnings.warn(
        "{0} feature file(s) have no matching 'Path' in the targets; "
        "their target columns are left as NaN.".format(n_unmatched)
    )

# Final table: feature columns first, then the target columns. It has one row
# per feature file.
Synced_data = pd.concat(
    [
        pd.DataFrame(feature_matrix, columns=Features_names),
        matched_targets.reset_index(drop=True),
    ],
    axis=1,
)

100%|██████████| 191212/191212 [23:39:53<00:00,  2.24it/s]


In [None]:
# Save the synced table to disk.
synced_data_loc = directory_ + 'synced_data'

# Create the 'synced_data' directory if it does not exist yet. An already
# existing directory is fine; any other failure (e.g. permissions) is raised
# instead of being silently ignored.
os.makedirs(synced_data_loc, exist_ok=True)

# Note: the CSV is written next to that directory (as 'synced_data.csv'),
# not inside it.
Synced_data.to_csv(synced_data_loc + '.csv')