In [2]:
import os
import numpy as np
import pydicom
import cv2
import matplotlib.pyplot as plt
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
import torch.optim as optim

In [None]:
import os
import numpy as np
import pydicom
import cv2
import pandas as pd

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

# To resize images
def resize_image(image_path):
    """Read a DICOM file and return its pixel data resized to 224x224.

    Args:
        image_path: path of the ``.dcm`` file to read.

    Returns:
        The array produced by ``cv2.resize`` for the file's ``pixel_array``.
    """
    # pixel_array exposes the DICOM image as a NumPy array that OpenCV can resize.
    pixels = pydicom.dcmread(image_path).pixel_array
    return cv2.resize(pixels, (224, 224))

# To stack on z-axis and pad tensors
def pad_to_3d(unpad_dict, max_channels=6):
    """Stack the arrays of each group along a new leading axis and zero-pad it.

    Arrays are grouped by the first element of their key, stacked on axis 0,
    converted to tensors, and padded with zeros so that axis 0 has at least
    ``max_channels`` entries.

    Args:
        unpad_dict: mapping from an indexable key (``key[0]`` is the group id)
            to a NumPy array. Arrays of the same group must share one shape,
            otherwise ``np.stack`` raises ``ValueError``.
        max_channels: minimum size of the stacking axis after padding.

    Returns:
        dict mapping each group id to a tensor whose axis 0 has
        ``max(n_arrays, max_channels)`` entries. The zero slices are inserted
        *before* the stacked arrays. Groups that already hold more than
        ``max_channels`` arrays are returned unpadded and untruncated.
    """
    grouped_arrays = {}
    for key, value in unpad_dict.items():
        grouped_arrays.setdefault(key[0], []).append(value)

    stacked_tensors = {}
    for group_id, arrays in grouped_arrays.items():
        tensor = torch.from_numpy(np.stack(arrays, axis=0))

        # The stack axis is axis 0, so that is the size to compare against
        # max_channels (shape[1] is the image height, not the stack depth).
        pad_size = max_channels - tensor.shape[0]
        if pad_size > 0:
            # F.pad reads (before, after) pairs starting from the LAST axis;
            # leave every trailing axis untouched and pad in front of axis 0.
            pad_spec = (0, 0) * (tensor.dim() - 1) + (pad_size, 0)
            tensor = F.pad(tensor, pad_spec)

        stacked_tensors[group_id] = tensor

    return stacked_tensors



# To prepare for CNN
def prepare_for_cnn(root_directory, targets_df, study_id, study_id_dir, batch_size):
    """Build a DataLoader with the image tensor and one-hot targets of one study.

    Every series folder of the study is read, each DICOM image is resized by
    ``resize_image``, the images of a series are stacked vertically and
    zero-padded at the bottom to 43008 rows, and the series are then combined
    by ``pad_to_3d``.

    Args:
        root_directory: folder that contains one sub-folder per study.
        targets_df: DataFrame with a ``study_id`` column; every column after
            the first is read as a label ('Normal/Mild', 'Moderate' or
            'Severe').
        study_id: name of the study folder; must be convertible with ``int()``
            for the lookup in ``targets_df``.
        study_id_dir: iterable of series folder names inside the study folder.
        batch_size: batch size passed to the returned ``DataLoader``.

    Returns:
        An unshuffled ``DataLoader`` over a ``TensorDataset`` of
        (features, targets), both converted to float32.

    Raises:
        ValueError: if ``targets_df`` does not hold exactly one row for the
            study.
    """
    # Every series is padded to 43008 rows (192 images of 224 rows); width is 224.
    max_shape = (43008, 224)

    # Prepare features
    image_dict = {}
    for counter, series_id in enumerate(study_id_dir, start=1):
        series_path = f'{root_directory}/{study_id}/{series_id}'

        # Read the files that actually exist instead of assuming 1.dcm..N.dcm,
        # so gaps in the numbering do not raise FileNotFoundError.
        image_arrays = [
            resize_image(f'{series_path}/{file_name}')
            for file_name in _sorted_dicom_file_names(series_path)
        ]

        # Vertically stack DCM images (np.vstack already returns a new array)
        series_array = np.vstack(image_arrays)

        # Pad with 0 (black) rows at the bottom up to the common height.
        # NOTE(review): a series taller than max_shape[0] is left as is, which
        # makes np.stack in pad_to_3d fail when series heights differ.
        padding = max_shape[0] - series_array.shape[0]
        if padding > 0:
            series_array = np.pad(
                series_array, ((0, padding), (0, 0)), mode='constant', constant_values=0
            )

        # Store image arrays in the dictionary with (study_id, series_id) tuple as key
        image_dict[(study_id, series_id)] = series_array
        print(counter, series_id, series_array.shape)
    stacked_tensors = pad_to_3d(image_dict)

    # Prepare targets
    targets_tensors = {
        key: torch.tensor(_one_hot_targets(targets_df, key))
        for key in stacked_tensors
    }

    # Convert all tensors to float32
    feature_tensors = [tensor.float() for tensor in stacked_tensors.values()]
    target_tensors = [tensor.float() for tensor in targets_tensors.values()]

    # Stack the tensors
    X_train = torch.stack(feature_tensors)
    y_train = torch.stack(target_tensors)

    # Make torch DataLoader
    train_dataset = TensorDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

    return train_loader


def _sorted_dicom_file_names(series_path):
    """Return the ``.dcm`` file names found in series_path, ordered by numeric stem."""
    def sort_key(file_name):
        stem = file_name[:-len('.dcm')]
        # Numeric stems sort by value (10.dcm after 9.dcm); others go last, by name.
        return (0, int(stem), '') if stem.isdecimal() else (1, 0, stem)

    return sorted(
        (name for name in os.listdir(series_path) if name.endswith('.dcm')),
        key=sort_key,
    )


def _one_hot_targets(targets_df, study_key):
    """Return the one-hot rows for the labels of the single targets_df row of a study."""
    one_hot_by_label = {
        'Normal/Mild': [1, 0, 0],
        'Moderate': [0, 1, 0],
        'Severe': [0, 0, 1],
    }

    target = targets_df[targets_df['study_id'] == int(study_key)]
    if len(target) != 1:
        raise ValueError(
            f'expected exactly one targets_df row for study_id {study_key}, found {len(target)}'
        )

    # Skip the first column (study_id); labels outside the three known classes
    # (e.g. missing values) contribute no row.
    one_hot_rows = []
    for label in target.iloc[0, 1:]:
        if isinstance(label, str) and label in one_hot_by_label:
            one_hot_rows.append(one_hot_by_label[label])
    return one_hot_rows

In [None]:
# Root folder of the training images, given relative to the notebook's working directory.
root_directory = '../train_images'

In [None]:
# Every entry directly under root_directory is treated as a study ID by the loop below.
# os.listdir returns names in arbitrary order, so chunk composition may differ between runs.
study_id_list = os.listdir(root_directory)

In [None]:
# Walk the studies in chunks of chunk_size and build one DataLoader per study.
# NOTE(review): exclude_array and targets_df are not defined in the cells shown
# here; they must exist before this cell runs - confirm where they come from.
chunk_size = 10
# prepare_for_cnn builds a dataset from a single study, so one sample per batch
# is all there is to serve.
batch_size = 1
for i in range(0, len(study_id_list), chunk_size):
    study_id_chunk = study_id_list[i:i + chunk_size]
    for study_id in study_id_chunk:
        if study_id not in exclude_array:
            study_id_dir = os.listdir(f'{root_directory}/{study_id}')
            # Argument order follows the signature
            # (root_directory, targets_df, study_id, study_id_dir, batch_size);
            # the function returns a single DataLoader, not a (features, targets) pair.
            train_loader = prepare_for_cnn(
                root_directory, targets_df, study_id, study_id_dir, batch_size
            )
