In [2]:
from data_loaders import get_data_module
from utils.visualisation import showInRow

from transforms.pretraining import Moco2ChestTransforms
from transforms.finetuning import ChestTransforms

In [None]:
import torch
import numpy as np
from pathlib import Path
from PIL import Image
from torchvision import transforms
from torch.utils.data import Dataset
import pandas as pd

class CheXpertDataset(Dataset):
    """
    CheXpert chest X-ray dataset driven by the ``train.csv`` label table.

    Each item is a dict holding the (optionally transformed) RGB image, a float
    one-hot target tensor, a ``label`` entry and the "|"-joined text label.
    """

    def __init__(self, dataset_dir:Path, transform=None, part="full", binary=True, image_dir=None):
        """
        Initialise CheXpert dataset
        URL: https://stanfordmlgroup.github.io/competitions/chexpert/

        Args:
            dataset_dir (Path): path to dataset directory (must contain "train.csv")
            transform (callable, optional): Preprocessing transforms
            part (str): Type of partition
                       - "full" : Use the whole dataset
                       - "train_val": Use only train_val partition
                       - "test": Use only test partition
                       ("train" and "val" are treated as "train_val")
            binary (bool): If True, targets are 2-class (No Finding / Pathology),
                           otherwise multi-hot over ``self.labels``
            image_dir (Path, optional): Directory that the image paths stored in
                           the CSV are relative to. Defaults to ``dataset_dir.parent``
                           because the CSV paths start with the dataset folder
                           name (e.g. "CheXpert-v1.0/train/...").
        """
        dataset_dir = Path(dataset_dir)
        self.available_partitions = ["full", "train_val", "test"]
        self.transform = transform
        self.binary = binary

        # __getitem__ joins this directory with the path found in the first CSV column
        self.image_dir = Path(image_dir) if image_dir is not None else dataset_dir.parent

        # Read label data
        self.csv_data = pd.read_csv(dataset_dir / "train.csv")

        # Observation names, in the order used for one hot encoding
        self.labels = [ "No Finding", "Enlarged Cardiomediastinum", "Cardiomegaly",
                       "Lung Opacity","Lung Lesion","Edema", "Consolidation",
                       "Pneumonia", "Atelectasis", "Pneumothorax", "Pleural Effusion",
                       "Pleural Other", "Fracture", "Support Devices"]

        # Define label to idx mapping and its inverse
        self.label_to_idx = {label: i for i, label in enumerate(self.labels)}
        self.idx_to_label = {i: label for label, i in self.label_to_idx.items()}

        # Filter data based on defined partition type
        # NOTE(review): the "<part>_list.txt" split files and the 'Image Index'
        # column are not present in the CheXpert CSV shown in this notebook, so
        # only part="full" is expected to work until CheXpert splits are defined.
        if part in ["train", "val"]:
            part = "train_val"
        if part in ["train_val", "test"]:
            split_file = dataset_dir / (part + "_list.txt")
            with open(split_file, "r") as f:
                image_names = f.read().split("\n")
            self.csv_data = self.csv_data[self.csv_data['Image Index'].isin(image_names)]

        # True when every observation has its own column (CheXpert CSV layout)
        self._has_label_columns = set(self.labels).issubset(self.csv_data.columns)

    def _text_label(self, idx):
        """
        Build the "|"-joined text label of the row at position idx.

        With per-observation columns, an observation is listed only when its
        value equals 1; 0, -1 and missing values are left out.
        NOTE(review): -1 looks like an "uncertain" marker - confirm that
        treating it as absent is the intended policy.
        Without those columns the second CSV column is returned unchanged.
        """
        if self._has_label_columns:
            row = self.csv_data.iloc[idx]
            return "|".join(name for name in self.labels if row[name] == 1)
        return self.csv_data.iloc[idx, 1]

    def label_to_one_hot(self, label_string):
        """
        Convert string label ("A|B|...") to one hot array.
        An empty string gives an all-zero array; unknown names raise KeyError.
        """
        one_hot_label = np.zeros(len(self.label_to_idx))
        for label in label_string.split("|"):
            if label:
                one_hot_label[self.label_to_idx[label]] = 1
        return one_hot_label

    def label_to_one_hot_binary(self, label_string):
        """
        Convert string label to one hot array based on Pathology / No finding

        Returns:
            (one_hot_label, idx): idx is 0 when "No Finding" is one of the
            "|"-separated names and 1 otherwise (including an empty label).
        """
        # "No Finding" may be accompanied by other names (e.g. "Support Devices"),
        # so test membership instead of comparing the whole string.
        idx = 0 if "No Finding" in label_string.split("|") else 1
        one_hot_label = np.zeros(2)
        one_hot_label[idx] = 1
        return one_hot_label, idx

    def one_hot_to_label(self, one_hot_label):
        """
        Convert one hot array to string label.
        Returns "Undefined" when the array length does not match the label set.
        """
        if len(one_hot_label) != len(self.label_to_idx):
            return "Undefined"
        return "|".join(self.idx_to_label[i]
                        for i in range(len(one_hot_label))
                        if one_hot_label[i])

    def __len__(self):
        """
        Get the size of the dataset
        """
        return len(self.csv_data)

    def __getitem__(self, idx):
        """
        Get data item based on its index

        Returns:
            dict with keys 'image', 'one_hot_label' (float tensor), 'label'
            (class index in binary mode, text label otherwise) and 'text_label'.
        """
        # Convert torch tensors if given
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Read and preprocess image; the context manager closes the file handle
        image_name = self.image_dir / self.csv_data.iloc[idx, 0]
        with Image.open(image_name) as raw_image:
            image = raw_image.convert('RGB')
        if self.transform:
            image = self.transform(image)

        # Get label and its one hot encoding
        label = self._text_label(idx)
        if self.binary:
            one_hot_label, label_idx = self.label_to_one_hot_binary(label)
            # Form output
            sample = {'image': image,
                      'one_hot_label':  torch.tensor(one_hot_label).float(),
                      'label':label_idx,
                      'text_label':label}
        else:
            one_hot_label = self.label_to_one_hot(label)
            # Only tensors have .contiguous(); a PIL image is passed through as is
            if torch.is_tensor(image):
                image = image.contiguous()
            # Form output
            sample = {'image': image,
                      'one_hot_label':  torch.tensor(one_hot_label).float(),
                      'label':label,
                      'text_label':label}

        return sample

In [11]:
from pathlib import Path
import pandas as pd

In [4]:
# Root folder of the local CheXpert copy; later cells read train.csv and glob images below it.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
root = Path("/new_data/CheXpert/CheXpert-v1.0")

In [8]:
# Match .jpg files exactly three directory levels below train/.
# Path.glob returns a one-shot generator, so it can only be iterated once.
files = root.glob("train/*/*/*.jpg")

In [10]:
# Show the first 11 matched paths, then stop so the remaining matches are not printed.
i = 0
for i, file in enumerate(files, start=1):
    print(file)
    if i == 11:
        break

/new_data/CheXpert/CheXpert-v1.0/train/patient00002/study1/view1_frontal.jpg
/new_data/CheXpert/CheXpert-v1.0/train/patient00002/study1/view2_lateral.jpg
/new_data/CheXpert/CheXpert-v1.0/train/patient00002/study2/view1_frontal.jpg
/new_data/CheXpert/CheXpert-v1.0/train/patient00003/study1/view1_frontal.jpg
/new_data/CheXpert/CheXpert-v1.0/train/patient00005/study1/view1_frontal.jpg
/new_data/CheXpert/CheXpert-v1.0/train/patient00005/study1/view2_lateral.jpg
/new_data/CheXpert/CheXpert-v1.0/train/patient00005/study2/view1_frontal.jpg
/new_data/CheXpert/CheXpert-v1.0/train/patient00005/study2/view2_frontal.jpg
/new_data/CheXpert/CheXpert-v1.0/train/patient00006/study1/view1_frontal.jpg
/new_data/CheXpert/CheXpert-v1.0/train/patient00007/study1/view1_frontal.jpg
/new_data/CheXpert/CheXpert-v1.0/train/patient00007/study2/view1_frontal.jpg


In [25]:
# Load the label table stored directly under the dataset root.
csv_data = pd.read_csv(root / "train.csv")

In [17]:
# Display the first 10 rows to inspect the column layout.
csv_data.head(10)

Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
0,CheXpert-v1.0/train/patient00001/study1/view1_...,Female,68,Frontal,AP,1.0,,,,,,,,,0.0,,,,1.0
1,CheXpert-v1.0/train/patient00002/study2/view1_...,Female,87,Frontal,AP,,,-1.0,1.0,,-1.0,-1.0,,-1.0,,-1.0,,1.0,
2,CheXpert-v1.0/train/patient00002/study1/view1_...,Female,83,Frontal,AP,,,,1.0,,,-1.0,,,,,,1.0,
3,CheXpert-v1.0/train/patient00002/study1/view2_...,Female,83,Lateral,,,,,1.0,,,-1.0,,,,,,1.0,
4,CheXpert-v1.0/train/patient00003/study1/view1_...,Male,41,Frontal,AP,,,,,,1.0,,,,0.0,,,,
5,CheXpert-v1.0/train/patient00004/study1/view1_...,Female,20,Frontal,PA,1.0,0.0,,,,,0.0,,,,0.0,,,
6,CheXpert-v1.0/train/patient00004/study1/view2_...,Female,20,Lateral,,1.0,0.0,,,,,0.0,,,,0.0,,,
7,CheXpert-v1.0/train/patient00005/study1/view1_...,Male,33,Frontal,PA,1.0,,0.0,,,,0.0,,,,0.0,,,1.0
8,CheXpert-v1.0/train/patient00005/study1/view2_...,Male,33,Lateral,,1.0,,0.0,,,,0.0,,,,0.0,,,1.0
9,CheXpert-v1.0/train/patient00005/study2/view1_...,Male,33,Frontal,AP,,,,,,,,,,1.0,,,,


In [27]:
# Keep the rows whose Path contains "train" (str.contains treats the pattern as a regex by default).
csv_data[csv_data['Path'].str.contains("train")]

Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
0,CheXpert-v1.0/train/patient00001/study1/view1_...,Female,68,Frontal,AP,1.0,,,,,,,,,0.0,,,,1.0
1,CheXpert-v1.0/train/patient00002/study2/view1_...,Female,87,Frontal,AP,,,-1.0,1.0,,-1.0,-1.0,,-1.0,,-1.0,,1.0,
2,CheXpert-v1.0/train/patient00002/study1/view1_...,Female,83,Frontal,AP,,,,1.0,,,-1.0,,,,,,1.0,
3,CheXpert-v1.0/train/patient00002/study1/view2_...,Female,83,Lateral,,,,,1.0,,,-1.0,,,,,,1.0,
4,CheXpert-v1.0/train/patient00003/study1/view1_...,Male,41,Frontal,AP,,,,,,1.0,,,,0.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223409,CheXpert-v1.0/train/patient64537/study2/view1_...,Male,59,Frontal,AP,,,,-1.0,,,,,-1.0,0.0,1.0,,,
223410,CheXpert-v1.0/train/patient64537/study1/view1_...,Male,59,Frontal,AP,,,,-1.0,,,,0.0,-1.0,,-1.0,,,
223411,CheXpert-v1.0/train/patient64538/study1/view1_...,Female,0,Frontal,AP,,,,,,-1.0,,,,,,,,
223412,CheXpert-v1.0/train/patient64539/study1/view1_...,Female,0,Frontal,AP,,,1.0,1.0,,,,-1.0,1.0,0.0,,,,0.0


In [28]:
# Rebind files to the .jpg files three directory levels below valid/ (replaces the train generator).
files = root.glob("valid/*/*/*.jpg")

In [30]:
# Print every matched path; there is no cap here, so the output grows with the number of files.
for f in files:
    print(f)