In [19]:
import os
import numpy as np
import pydicom
import cv2
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
def resize_image(image_path, size=(224, 224)):
    """Load a DICOM file and return its pixel data resized to ``size``.

    Parameters
    ----------
    image_path : str or path-like
        Location of the DICOM file; passed straight to ``pydicom.dcmread``.
    size : sequence of two ints, optional
        Target ``(width, height)`` handed to ``cv2.resize``. Defaults to
        ``(224, 224)``, the value that was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        The resized pixel array, exactly as returned by ``cv2.resize``.
    """
    dicom = pydicom.dcmread(image_path)

    # Decode the stored pixel data into a NumPy array.
    image_array = dicom.pixel_array

    # cv2.resize takes the target size as (width, height) and needs a tuple.
    return cv2.resize(image_array, tuple(size))

In [3]:
# Root directory, laid out as <study_id>/<series_id>/<instance file>
main_directory = './small_train_images'

# Maps (study_id, series_id) -> 2D array holding that series' resized slices
# stacked vertically.
image_data = {}


def _instance_sort_key(filename):
    """Order numerically named files by value (``2.dcm`` before ``10.dcm``), others by name."""
    stem = os.path.splitext(filename)[0]
    return (0, int(stem), filename) if stem.isdecimal() else (1, 0, filename)


# os.listdir returns entries in arbitrary order, so every level is sorted to
# make both the dictionary order and the slice order inside each stack
# reproducible from run to run.
for study_id in sorted(os.listdir(main_directory)):
    study_path = os.path.join(main_directory, study_id)
    if not os.path.isdir(study_path):
        continue  # ignore stray files next to the study folders

    for series_id in sorted(os.listdir(study_path)):
        series_path = os.path.join(study_path, series_id)
        if not os.path.isdir(series_path):
            continue  # ignore stray files next to the series folders

        # Resize every instance of the series, in instance order.
        instances = sorted(os.listdir(series_path), key=_instance_sort_key)
        image_arrays = [
            resize_image(os.path.join(series_path, instance))
            for instance in instances
        ]
        if not image_arrays:
            # np.vstack raises on an empty list; an empty series has nothing to store.
            print(series_id, 'skipped: empty series folder')
            continue

        # np.vstack already returns a new ndarray, so no extra np.array copy is needed.
        np_array = np.vstack(image_arrays)
        print(series_id, np_array.shape)

        # Store the stack under its (study_id, series_id) key.
        image_data[(study_id, series_id)] = np_array

142859125 (8064, 224)
2073726394 (10304, 224)
2399638375 (4256, 224)
3491739931 (4256, 224)
1224932122 (10080, 224)
2231042680 (4032, 224)
3543553307 (4032, 224)
1212326388 (3360, 224)
1638921810 (3360, 224)
3800798510 (6048, 224)
403244853 (6272, 224)
1539051863 (3808, 224)
2500166693 (6048, 224)
2677627096 (3808, 224)
3687121182 (12992, 224)
3753885158 (4032, 224)
434280813 (4032, 224)
1679014482 (10080, 224)
226564374 (3808, 224)
2528347280 (3808, 224)
307069509 (5600, 224)
1152175603 (5152, 224)
1676821058 (5152, 224)
2261718442 (6720, 224)
231278500 (8960, 224)
1379151387 (3360, 224)
1847558962 (4928, 224)
758801267 (3360, 224)
1054713880 (3360, 224)
2448190387 (9632, 224)
702807833 (3360, 224)
3201256954 (12096, 224)
3486248476 (3808, 224)
3666319702 (3808, 224)
132939515 (3808, 224)
1951927562 (5152, 224)
3219733239 (3808, 224)
1570286759 (3360, 224)
2406919186 (4704, 224)
481125819 (3360, 224)


In [4]:
# Sanity check: look up one specific series and report the shape of its stack
example_study_id = '4003253'
example_series_id = '702807833'
lookup_key = (example_study_id, example_series_id)
if lookup_key not in image_data:
    print("No images found for the specified (study_id, series_id) tuple.")
else:
    image = image_data[lookup_key]
    print("Image shape:", image.shape)

Image shape: (3360, 224)


In [5]:
# Bring every stack to a common height by appending rows of zeros

# Shape of the stack with the most rows
max_shape = max((stack.shape for stack in image_data.values()), key=lambda shape: shape[0])

for key, np_array in image_data.items():
    padding = max_shape[0] - np_array.shape[0]
    if padding <= 0:
        continue  # already as tall as the tallest stack
    # Add `padding` zero-valued rows after the last row; columns are left untouched.
    # Replacing the value of an existing key is safe while iterating over the dict.
    image_data[key] = np.pad(np_array, ((0, padding), (0, 0)), mode='constant', constant_values=0)

# Report the resulting shape of every stack
for key, padded in image_data.items():
    print(key, padded.shape)

('10728036', '142859125') (12992, 224)
('10728036', '2073726394') (12992, 224)
('10728036', '2399638375') (12992, 224)
('10728036', '3491739931') (12992, 224)
('11340341', '1224932122') (12992, 224)
('11340341', '2231042680') (12992, 224)
('11340341', '3543553307') (12992, 224)
('11943292', '1212326388') (12992, 224)
('11943292', '1638921810') (12992, 224)
('11943292', '3800798510') (12992, 224)
('11943292', '403244853') (12992, 224)
('13317052', '1539051863') (12992, 224)
('13317052', '2500166693') (12992, 224)
('13317052', '2677627096') (12992, 224)
('22191399', '3687121182') (12992, 224)
('22191399', '3753885158') (12992, 224)
('22191399', '434280813') (12992, 224)
('26342422', '1679014482') (12992, 224)
('26342422', '226564374') (12992, 224)
('26342422', '2528347280') (12992, 224)
('26342422', '307069509') (12992, 224)
('29931867', '1152175603') (12992, 224)
('29931867', '1676821058') (12992, 224)
('29931867', '2261718442') (12992, 224)
('29931867', '231278500') (12992, 224)
('3373

In [6]:
# Build one 3D array per group of 2D arrays

# Collect the arrays that share the same first element of their key tuple
grouped_arrays = {}
for key, stack in image_data.items():
    grouped_arrays.setdefault(key[0], []).append(stack)

# Stack each group along a new leading axis
stacked_arrays = {
    group_key: np.stack(stacks, axis=0)
    for group_key, stacks in grouped_arrays.items()
}

# Report the shape of every stacked array
for key, value in stacked_arrays.items():
    print(f"Stacked arrays for key {key}: {value.shape}")

Stacked arrays for key 10728036: (4, 12992, 224)
Stacked arrays for key 11340341: (3, 12992, 224)
Stacked arrays for key 11943292: (4, 12992, 224)
Stacked arrays for key 13317052: (3, 12992, 224)
Stacked arrays for key 22191399: (3, 12992, 224)
Stacked arrays for key 26342422: (4, 12992, 224)
Stacked arrays for key 29931867: (4, 12992, 224)
Stacked arrays for key 33736057: (3, 12992, 224)
Stacked arrays for key 4003253: (3, 12992, 224)
Stacked arrays for key 4646740: (3, 12992, 224)
Stacked arrays for key 7143189: (3, 12992, 224)
Stacked arrays for key 8785691: (3, 12992, 224)


In [7]:
import torch
from torch.utils.data import Dataset, DataLoader


# Flatten the dictionary into a list of (key, array) pairs
data_list = list(stacked_arrays.items())

# Custom dataset class
class CustomDataset(Dataset):
    """Dataset over ``(key, data)`` pairs that yields ``(float32 tensor, key)``.

    Parameters
    ----------
    data_list : sequence of (key, data)
        ``data`` may be a NumPy array (or anything ``np.asarray`` accepts) or
        a ``torch.Tensor``.
    """

    def __init__(self, data_list):
        self.data_list = data_list

    def __len__(self):
        """Return the number of ``(key, data)`` pairs."""
        return len(self.data_list)

    def __getitem__(self, idx):
        """Return ``(tensor, key)`` for item ``idx``; the tensor is float32."""
        key, data = self.data_list[idx]
        if isinstance(data, torch.Tensor):
            # Cast directly: a round trip through NumPy fails for tensors that
            # require grad or live on a non-CPU device.
            return data.float(), key
        # Cast in NumPy first: torch.from_numpy rejects some integer dtypes
        # (e.g. uint16 on older PyTorch releases), while float32 is always accepted.
        data_np = np.asarray(data, dtype=np.float32)
        return torch.from_numpy(data_np), key

# Wrap the (key, array) pairs in a dataset and serve them one item per batch, in shuffled order
dataset = CustomDataset(data_list=data_list)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=1)

In [8]:
# Walk through the DataLoader once and show the shape of every batch tensor
for batch, _batch_key in dataloader:
    print(batch.shape)

torch.Size([1, 3, 12992, 224])
torch.Size([1, 3, 12992, 224])
torch.Size([1, 3, 12992, 224])
torch.Size([1, 3, 12992, 224])
torch.Size([1, 4, 12992, 224])
torch.Size([1, 3, 12992, 224])
torch.Size([1, 3, 12992, 224])
torch.Size([1, 3, 12992, 224])
torch.Size([1, 4, 12992, 224])
torch.Size([1, 3, 12992, 224])
torch.Size([1, 4, 12992, 224])
torch.Size([1, 4, 12992, 224])


In [16]:
# Example targets: 25 integer class labels drawn from {0, 1, 2}, shape (1, 25).
# A seeded generator keeps the example reproducible. The upper bound is
# exclusive, so it must be 3 for all three one-hot classes to be possible
# (an upper bound of 2 can only ever produce labels 0 and 1).
num_example_classes = 3
example_rng = np.random.default_rng(0)
original_targets = example_rng.integers(0, num_example_classes, size=(1, 25))

# One-hot encode: indexing the identity matrix with the flattened labels
# yields one row per label, i.e. shape (25, 3) -- no extra reshape is needed.
reshaped_targets = np.eye(num_example_classes)[original_targets.flatten()]

print("Original target shape:", original_targets.shape)
print("Reshaped target shape:", reshaped_targets.shape)

Original target shape: (1, 25)
Reshaped target shape: (25, 3)


In [33]:
# target_df = pd.read_csv('./train.csv')
# target_arr = np.array(target_df.iloc[0])

In [26]:
# Hand-written example targets: 26 labels arranged as a single row
target_arr = np.array(
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1]
).reshape(1, -1)
target_arr.shape

(1, 26)

In [29]:
def one_hot_encode(labels, num_classes):
    """One-hot encode 1-based integer class labels.

    Parameters
    ----------
    labels : array-like of int
        Class labels in ``1..num_classes``; any shape, flattened before encoding.
    num_classes : int
        Number of classes, i.e. the number of columns in the result.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_labels, num_classes)`` with a single 1 per row.

    Raises
    ------
    ValueError
        If any label lies outside ``1..num_classes``. Without this check a
        label of 0 would become index -1 and be silently encoded as the last
        class.
    """
    flat_labels = np.asarray(labels).flatten()
    if flat_labels.size == 0:
        return np.zeros((0, num_classes))
    if flat_labels.min() < 1 or flat_labels.max() > num_classes:
        raise ValueError(
            f"labels must be in 1..{num_classes}, "
            f"got values in {flat_labels.min()}..{flat_labels.max()}"
        )
    # Subtract 1 so that classes start from 0 and can index the identity matrix.
    return np.eye(num_classes)[flat_labels - 1]


# Original target array: a single row of 26 labels taking the values 1, 2 or 3
target_arr = np.array([[1, 1, 1, 1, 3, 1, 1, 1, 3, 2, 1, 1, 1, 2, 2, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 1]])

# Convert the target array to one-hot encoding with three classes
num_classes = 3
one_hot_targets = one_hot_encode(target_arr, num_classes)

# The encoding already has shape (26, 3), one row per label; the reshape only
# makes that layout explicit under the name used for the final targets.
reshaped_targets = one_hot_targets.reshape(-1, num_classes)

print("Original target shape:", target_arr.shape)
print("Reshaped target shape:", reshaped_targets.shape)

Original target shape: (1, 26)
Reshaped target shape: (26, 3)


In [30]:
# Display the one-hot encoded targets (the bare expression is rendered as the cell output)
reshaped_targets

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])