# Need to change patient 0005 with fid = 0,1,1 and patient 0159 with fid = 1,1,2

# Resample

In [None]:
import os
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import SimpleITK as sitk

In [None]:
# Directory whose entries are listed and read as the input images to resample.
train_path = '/project2/msca/projects/ProstateMRI/data/PROSTATEx-team1/ProstateX/Train/DATAPREP/nrrd-train/'

# Directory the resampled images are written to (same file names as the inputs).
path_to_resampled = '/project2/msca/projects/ProstateMRI/data/PROSTATEx-team1/ProstateX/Train/DATAPREP/nrrd-resampled/'

def resample_image(desired_voxel_spacing, source_file_path):
    """Read an image from disk and resample it to the requested voxel spacing.

    Parameters
    ----------
    desired_voxel_spacing : sequence of float
        Target spacing, one value per image dimension.
    source_file_path : str or pathlib.Path
        Location of the image to read.

    Returns
    -------
    SimpleITK.Image
        The image as read if its spacing already equals the requested
        spacing, otherwise a linearly interpolated copy on the new grid
        (same origin and direction as the source image).

    Raises
    ------
    RuntimeError
        If the resample filter fails; the original error is chained.
    """
    # Accept both str and Path so that ``.parts`` below always works.
    source_file_path = Path(source_file_path)
    desired_voxel_spacing = tuple(float(s) for s in desired_voxel_spacing)

    image = sitk.ReadImage(str(source_file_path))
    original_image_spacing = image.GetSpacing()

    if original_image_spacing == desired_voxel_spacing:
        # Nothing to do: hand the image back untouched.
        return image

    ### HOW TO RESAMPLE SITK_IMAGE TO A NEW SPACING ###
    ### SOURCE: https://github.com/SimpleITK/SimpleITK/issues/561 ###

    # The physical extent (size * spacing) is preserved, so the number of
    # voxels per axis scales with original_spacing / desired_spacing.
    # Rounded up so the new grid is never smaller than the original extent.
    # The builtin ``int`` is used because ``np.int`` was removed from NumPy.
    original_size_array = np.array(image.GetSize(), dtype=int)
    original_spac_array = np.array(original_image_spacing)
    desired_spac_array = np.array(desired_voxel_spacing)
    scaled_size = original_size_array * (original_spac_array / desired_spac_array)
    # SimpleITK expects plain Python ints, not NumPy scalars.
    new_size = tuple(int(s) for s in np.ceil(scaled_size))

    # Create the resample filter
    resample = sitk.ResampleImageFilter()
    resample.SetInterpolator(sitk.sitkLinear)
    resample.SetSize(new_size)
    resample.SetOutputOrigin(image.GetOrigin())
    resample.SetOutputSpacing(desired_voxel_spacing)
    resample.SetOutputDirection(image.GetDirection())

    # Only the filter execution is guarded; a failure is re-raised with the
    # offending path instead of being swallowed (which previously led to an
    # UnboundLocalError on return).
    try:
        resampled_image = resample.Execute(image)
    except RuntimeError as err:
        raise RuntimeError(
            'Problem with resampling image: {}'.format(source_file_path)
        ) from err

    # Print the changes
    print('\n')
    print('Resampling:', "/".join(source_file_path.parts[-5:]))
    print('original spacing:', image.GetSpacing())
    print('desired spacing:', desired_voxel_spacing)
    print('resampled spacing:', resampled_image.GetSpacing())
    print('original size:', image.GetSize())
    print('resampled size:', resampled_image.GetSize())
    print('\n')

    return resampled_image
    
def write_resampled_image(image, path, counter):
    """Write ``image`` to ``path`` and return the incremented file counter.

    Parameters
    ----------
    image : SimpleITK.Image
        Image to write.
    path : str or pathlib.Path
        Destination file; converted to ``str`` for the writer.
    counter : int
        Running count of files handled so far.

    Returns
    -------
    int
        ``counter + 1``.
    """
    writer = sitk.ImageFileWriter()
    writer.SetFileName(str(path))
    writer.Execute(image)
    # Report the file that was actually written rather than a module-level
    # directory name, so the log is correct for any destination.
    print('Saving image to:', str(path))
    return counter + 1
    
   
# Every entry of the training directory is treated as one image file.
patient_nrrds = os.listdir(train_path)

# Target voxel spacing per sequence type, keyed by the substring that is
# looked for in the file name.
desired_voxel = {'t2':(0.5,0.5,3.0),
                 'adc':(2.0,2.0,3.0),
                 'bval':(2.0,2.0,3.0),
                 'ktrans':(1.5,1.5,4.0)}


# Number of images written so far (starts at 0 so the reported total is exact).
counter = 0
for i in patient_nrrds:
    subdirectory = train_path+i
    print(subdirectory)

    # resample_image reads ``.parts`` from its path argument, so hand it a Path.
    source_path = Path(train_path) / i
    target_path = Path(path_to_resampled) / i

    # The sequence key is matched against the file name only, so a directory
    # component can never trigger a match. Keys are checked in insertion
    # order and independently of each other, as the original separate ``if``
    # statements did.
    # NOTE(review): matching is case-sensitive; generate_cases_meta_df later
    # looks for 'ADC' / 'BVAL' / 'Ktrans' in the resampled file names --
    # confirm the input names really contain the lowercase keys.
    for sequence_key, voxel_spacing in desired_voxel.items():
        if sequence_key in str(i):
            resampled = resample_image(voxel_spacing, source_path)
            counter = write_resampled_image(resampled, target_path, counter)

    print('\n ++++ Files reviewed for resampling:', counter)

# Compile data

In [None]:
def generate_cases_meta_df(is_training_data, sequence_type, resampled_dir=None):
    """
    Build a data frame describing the resampled image files of one sequence type.

    Each file in the resampled directory whose name contains ``sequence_type``
    yields one row with the patient id (``ProxID``), the series description
    reconstructed from the file name (``DCMSerDescr``), the full path to the
    file (``resampled_nrrd``) and the sequence type detected from the
    description (``sequence_type``). The frame is meant to be joined with the
    ProstateX-Images / ProstateX-Images-KTrans tables.

    Parameters
    ----------
    is_training_data : bool
        Selects the default training or test directory.
    sequence_type : str
        Substring a file name must contain to be included
        (the module calls this with 't2', 'ADC', 'BVAL' and 'Ktrans').
    resampled_dir : str or os.PathLike, optional
        Directory to scan instead of the built-in default.

    Returns
    -------
    pandas.DataFrame
        Columns ``index``, ``ProxID``, ``DCMSerDescr``, ``resampled_nrrd``,
        ``sequence_type``; rows are ordered by file name.
    """
    if resampled_dir is None:
        if is_training_data:
            #path_lesion_information = Path('/home/alexander/Documents/DataProjects/Data/MBI/ProstateX/raw/train/lesion_information_train_204')
            resampled_dir = '/project2/msca/projects/ProstateMRI/data/PROSTATEx-team1/ProstateX/Train/DATAPREP/nrrd-resampled/'
        else:
            #path_lesion_information = Path('/home/alexander/Documents/DataProjects/Data/MBI/ProstateX/raw/test/lesion_information_test_140')
            resampled_dir = '/project2/msca/projects/ProstateMRI/data/PROSTATEx-team1/ProstateX/Test/Test_Clean/nrrd-resampled/'
    # Work with a plain string for both branches; the test branch used to be
    # a Path, which made ``path + item`` raise TypeError.
    resampled_dir = os.fspath(resampled_dir)

    def generate_DCMSerDescr_from_filename(item):
        # Everything after the first '.' is treated as the extension.
        name_without_extension = item.split('.')[0]

        # The text before the first underscore is the patient id, the rest
        # is the series description.
        # NOTE(review): a name without any underscore yields the whole name
        # as description and the name minus its last character as patient
        # id (find() returns -1) -- confirm such names never occur.
        first_underscore = name_without_extension.find('_') + 1
        value = name_without_extension[first_underscore:]
        patient = name_without_extension[:first_underscore-1]
        return value, patient

    patient_data = []

    # Sorted so that the row order does not depend on the file system.
    for item in sorted(os.listdir(resampled_dir)):
        if sequence_type not in item:
            continue

        constructed_DCMSerDescr, key = generate_DCMSerDescr_from_filename(item)
        path_to_resampled = os.path.join(resampled_dir, item)

        # A separate local keeps the ``sequence_type`` filter intact;
        # overwriting it inside the loop changed which files matched on
        # later iterations.
        if 't2' in constructed_DCMSerDescr:
            detected_sequence = 't2'
        elif 'ADC' in constructed_DCMSerDescr:
            detected_sequence = 'ADC'
        elif 'BVAL' in constructed_DCMSerDescr:
            detected_sequence = 'BVAL'
        else:
            # Anything else is treated as a Ktrans image.
            detected_sequence = 'Ktrans'
            constructed_DCMSerDescr = 'Ktrans'

        patient_data.append(
            [key, constructed_DCMSerDescr, path_to_resampled, detected_sequence]
        )

    cases_meta_data_df = pd.DataFrame(
        patient_data,
        columns=['ProxID', 'DCMSerDescr', 'resampled_nrrd', 'sequence_type'],
    )
    # reset_index() adds the row number as an explicit 'index' column.
    cases_meta_data_df = cases_meta_data_df.reset_index()

    return cases_meta_data_df


In [None]:
def join_data(is_training_data , sequence_df_array, lesion_info_dir=None):
    """
    Join the per-sequence image metadata with the lesion tables.

    The ProstateX-Images, ProstateX-Images-KTrans and ProstateX-Findings CSV
    files (Train or Test variant) are read from ``lesion_info_dir``. The first
    three frames of ``sequence_df_array`` are merged with the image table on
    ``ProxID`` and the lower-cased ``DCMSerDescr``; the fourth frame is merged
    with the KTrans image table on ``ProxID`` only. Every result is then
    merged with the findings table on ``ProxID``, ``fid`` and ``pos``, and the
    four results are concatenated.

    Parameters
    ----------
    is_training_data : bool
        Selects the Train or Test CSV files (and default directory).
    sequence_df_array : list of pandas.DataFrame
        Four metadata frames: three DICOM series frames followed by the
        Ktrans frame. The frames are not modified.
    lesion_info_dir : str or os.PathLike, optional
        Directory containing the three CSV files; defaults to the project
        location for the selected split.

    Returns
    -------
    pandas.DataFrame
        Concatenation (with a fresh index) of all merged frames.
    """
    if is_training_data:
        default_dir = '/project2/msca/projects/ProstateMRI/data/PROSTATEx-team1/Training_Lesion_Information'
        split = 'Train'
    else:
        default_dir = '/project2/msca/projects/ProstateMRI/data/PROSTATEx-team1/Test_Lesion_Information'
        split = 'Test'
    base_dir = Path(default_dir if lesion_info_dir is None else lesion_info_dir)

    prostateX_images = pd.read_csv(base_dir / 'ProstateX-Images-{}.csv'.format(split))
    prostateX_images_ktrans = pd.read_csv(base_dir / 'ProstateX-Images-KTrans-{}.csv'.format(split))
    prostateX_findings = pd.read_csv(base_dir / 'ProstateX-Findings-{}.csv'.format(split))

    # Keep only important columns from researcher provided data and lower-case
    # the series description once, up front (this is loop-invariant).
    # ``.str.lower()`` leaves missing values as NaN instead of raising.
    prostateX_images = prostateX_images[['ProxID', 'DCMSerDescr', 'fid', 'pos','WorldMatrix', 'ijk']].copy()
    prostateX_images['DCMSerDescr'] = prostateX_images['DCMSerDescr'].str.lower()

    df_collection = []

    # Merging info for the DICOM series (the first three frames).
    for dataframe in sequence_df_array[0:3]:
        # Lower-case on a copy so the caller's frame is left untouched.
        series_meta = dataframe.assign(DCMSerDescr=dataframe['DCMSerDescr'].str.lower())

        # Merge nrrd paths with researcher provided data
        first_merge = pd.merge(series_meta, prostateX_images, how = 'inner', on = ['ProxID', 'DCMSerDescr'])

        # Merge findings (cancer/not cancer)
        final_merge = pd.merge(first_merge, prostateX_findings, how = 'inner', on = ['ProxID', 'fid', 'pos'])
        df_collection.append(final_merge)

    # Merging info for the KTRANS series (the fourth frame); it is matched on
    # the patient id only.
    first_merge = pd.merge(sequence_df_array[3], prostateX_images_ktrans, how = 'inner', on = ['ProxID'])

    # Merge findings (cancer/not cancer)
    final_merge = pd.merge(first_merge, prostateX_findings, how = 'inner', on = ['ProxID', 'fid', 'pos'])
    df_collection.append(final_merge)

    final_dataframe = pd.concat(df_collection, ignore_index=True)

    return final_dataframe


In [None]:
def repair_values(is_training_data, dataframe):
    """
    Reformat the coordinate columns of a joined data frame for patch analysis.

    The ``pos`` and ``ijk`` columns, stored as strings of space-delimited
    numbers, are replaced by tuples of float and tuples of int respectively.
    ``WorldMatrix`` is dropped and the remaining columns are returned in a
    fixed order.

    Parameters
    ----------
    is_training_data : bool
        When true the ``ClinSig`` column is kept in the result.
    dataframe : pandas.DataFrame
        Frame with at least the columns ``ProxID``, ``DCMSerDescr``,
        ``resampled_nrrd``, ``sequence_type``, ``fid``, ``pos``, ``ijk``,
        ``WorldMatrix`` and ``zone`` (plus ``ClinSig`` for training data).
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with converted ``pos`` / ``ijk`` columns, carrying the same
        index as ``dataframe``.
    """

    def convert_to_tuple(dataframe, column):
        """
        Convert each row value of ``column`` (a string of numbers delimited
        by whitespace) to a tuple: floats for 'pos', ints for any other
        column. The returned series carries the index of ``dataframe``.
        """
        number_type = float if column == 'pos' else int
        converted = [
            tuple(number_type(item) for item in list_of_strings)
            for list_of_strings in dataframe[column].str.split()
        ]
        # Reuse the frame's index: ``assign`` aligns on index labels, so a
        # fresh 0..n-1 index would produce NaN for any other row labels.
        return pd.Series(converted, index=dataframe.index)

    # Call function to convert select columns
    dataframe = dataframe.assign(
        pos_tuple=convert_to_tuple(dataframe, 'pos'),
        ijk_tuple=convert_to_tuple(dataframe, 'ijk'),
    )

    # Drop old columns, rename new ones, and reorder...
    dataframe = dataframe.drop(columns = ['pos','ijk', 'WorldMatrix'])
    dataframe = dataframe.rename(columns = {'pos_tuple':'pos', 'ijk_tuple':'ijk'})

    column_order = ['ProxID', 'DCMSerDescr', 'resampled_nrrd', 'sequence_type', 'fid', 'pos', 'ijk', 'zone']
    if is_training_data:
        # The clinical-significance label is only selected for training data.
        column_order.append('ClinSig')

    repaired_df = dataframe.loc[:, column_order]

    return repaired_df


# Build one metadata frame per sequence type from the training images,
# in the order expected by join_data (three DICOM series, then Ktrans).
t2_meta, adc_meta, bval_meta, ktrans_meta = (
    generate_cases_meta_df(True, sequence_name)
    for sequence_name in ('t2', 'ADC', 'BVAL', 'Ktrans')
)
sequence_df_array = [t2_meta, adc_meta, bval_meta, ktrans_meta]

# Join the metadata with the lesion tables, then convert the coordinate
# columns to tuples.
complete_df = join_data(True, sequence_df_array)
final_df = repair_values(is_training_data=True, dataframe=complete_df)

In [None]:
def save_data_to_directory(is_training_data, dataframe):
    """Write ``dataframe`` as both CSV and pickle.

    The training or test destination pair is chosen by ``is_training_data``;
    only a value equal to ``True`` selects the training files.
    """
    if is_training_data == True:
        csv_target, pickle_target = (
            '/project2/msca/projects/ProstateMRI/data/PROSTATEx-team1/Training_Lesion_Information/training_meta_data.csv',
            '/project2/msca/projects/ProstateMRI/data/PROSTATEx-team1/Training_Lesion_Information/training_meta_data.pkl',
        )
    else:
        csv_target, pickle_target = (
            '/project2/msca/projects/ProstateMRI/data/PROSTATEx-team1/Test_Lesion_Information/test_meta_data.csv',
            '/project2/msca/projects/ProstateMRI/data/PROSTATEx-team1/Test_Lesion_Information/test_meta_data.pkl',
        )

    # CSV first, then pickle, matching the original write order.
    dataframe.to_csv(csv_target)
    dataframe.to_pickle(pickle_target)


In [None]:
# Write the final training metadata frame to the training CSV and pickle files.
save_data_to_directory(True, final_df)