In [None]:
from pathlib import Path
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
import pickle
import argparse
import os

# Project root: two directory levels above the current working directory.
PATH_TO_PARENT_DIR = Path.cwd().resolve().parent.parent
PATH_TO_DATA_DIR = PATH_TO_PARENT_DIR.joinpath("Data")

# Input tables, all stored as feather files inside the data directory.
PATH_TO_CLEANED_DICOM_DF = PATH_TO_DATA_DIR.joinpath("cleaned_dicom_df.feather")

PATH_TO_SEGMENTATION_DF = PATH_TO_DATA_DIR.joinpath("segmentation_df.feather")
PATH_TO_BINCOUNT_HU_DF = PATH_TO_DATA_DIR.joinpath("bincount_HU_df.feather")
PATH_TO_BINCOUNT_STEP_75_DF = PATH_TO_DATA_DIR.joinpath("bincount_STEP_75_df.feather")
PATH_TO_BINCOUNT_STEP_150_DF = PATH_TO_DATA_DIR.joinpath("bincount_STEP_150_df.feather")
PATH_TO_TRAIN_TEST_SPLIT = PATH_TO_DATA_DIR.joinpath("train_test_split.feather")

# Destination for the pickled model, its parameters and the prediction table.
PATH_TO_OUTPUT_DIR = PATH_TO_PARENT_DIR.joinpath("Model", "Justin", "Final")

WindowsPath('C:/Users/schoe/Desktop/inferring_body_weight_from_ct_scans')

In [42]:
def apply_voxel_volume(row, voxel_columns):
    """Multiply the entries of `row` named in `voxel_columns` by the row's 'VoxelVolume'.

    Args:
        row: A row-like object (e.g. a pandas Series) that contains every label
            in `voxel_columns` as well as a 'VoxelVolume' entry.
        voxel_columns: Labels of the entries to scale.

    Returns:
        The selected entries, each multiplied by `row['VoxelVolume']`.
    """
    scale = row['VoxelVolume']
    selected = row[voxel_columns]
    return selected * scale

# def transform_df(df, voxel_columns):
#     transformed_voxel_values = df.apply(lambda row: apply_voxel_volume(row, voxel_columns), axis=1)
#     transformed_voxel_values = pd.DataFrame(transformed_voxel_values.values.tolist(), columns=voxel_columns, index=df.index)
#     for col in voxel_columns:
#         df[col] = transformed_voxel_values[col]
    
#     df['PatientSex_encoded'] = df['PatientSex'].map({'F': 0, 'M': 1})
#     return df

def calculate_voxel_volumes(df, voxel_columns, replace_original_columns):
    """Scale the given voxel columns by each row's 'VoxelVolume'.

    The multiplication is done column-wise in one vectorised operation instead
    of a Python-level function call per row.

    Args:
        df: DataFrame containing every column in `voxel_columns` plus a
            'VoxelVolume' column.
        voxel_columns: List of column labels to scale.
        replace_original_columns: If True, the scaled values overwrite the
            columns named in `voxel_columns`. If False, the original columns
            are kept and the scaled values are appended as new columns named
            ``volume_<col>``.

    Returns:
        A new DataFrame with the scaled values; the `df` passed in is never
        modified, so callers must use the return value.
    """
    # Row i of every voxel column is multiplied by df['VoxelVolume'][i].
    scaled = df[voxel_columns].mul(df['VoxelVolume'], axis=0)

    if replace_original_columns:
        # Work on a copy so the caller's frame is left untouched, matching the
        # non-mutating behaviour of the append branch below.
        result = df.copy()
        for col in voxel_columns:
            result[col] = scaled[col]
        return result

    # Single concat (rather than column-by-column insertion) avoids
    # fragmenting the frame when there are many voxel columns.
    volume_columns = scaled.rename(columns=lambda col: f"volume_{col}")
    return pd.concat([df, volume_columns], axis=1)

def loadAndPrepareDataFrame(cleaned_dicom_df, actual_data_df, voxel_columns, columns_to_drop, calculate_voxel_volume, replace_original_columns):
    """Join DICOM metadata with a feature table and derive the extra columns.

    Steps, in order:
      1. Left-join `actual_data_df` onto `cleaned_dicom_df` on 'SeriesInstanceUID'.
      2. Add 'VoxelVolume' = PixelSpacing ** 2 * SliceThickness.
      3. If `calculate_voxel_volume` is true, pass the frame through
         `calculate_voxel_volumes` for the columns in `voxel_columns`.
      4. Append 'PatientSex_encoded' ('F' -> 0, 'M' -> 1; any other value maps
         to NaN).
      5. Drop `columns_to_drop`; names that are not present are ignored.

    Args:
        cleaned_dicom_df: Metadata frame with at least 'SeriesInstanceUID',
            'PixelSpacing', 'SliceThickness' and 'PatientSex'.
        actual_data_df: Feature frame keyed by 'SeriesInstanceUID'.
        voxel_columns: Columns handed to `calculate_voxel_volumes`.
        columns_to_drop: Column names removed from the result.
        calculate_voxel_volume: Whether step 3 is performed.
        replace_original_columns: Forwarded to `calculate_voxel_volumes`.

    Returns:
        A copy of the prepared DataFrame.
    """
    # how="left" keeps every metadata row even without matching features.
    prepared = cleaned_dicom_df.merge(actual_data_df, how="left", on="SeriesInstanceUID")

    spacing_squared = prepared['PixelSpacing'] ** 2
    prepared.loc[:, 'VoxelVolume'] = spacing_squared * prepared['SliceThickness']

    if calculate_voxel_volume:
        prepared = calculate_voxel_volumes(prepared, voxel_columns, replace_original_columns)

    sex_codes = prepared['PatientSex'].map({'F': 0, 'M': 1}).rename('PatientSex_encoded')
    prepared = pd.concat([prepared, sex_codes], axis=1)

    return prepared.drop(columns=columns_to_drop, errors='ignore').copy()

def find_best_model_bayes(X, y, n_iter=1, cv=8, random_state=None):
    """Search GradientBoostingRegressor hyper-parameters with BayesSearchCV.

    Candidates are scored by cross-validated negative mean absolute error.

    Args:
        X: Feature matrix passed to `BayesSearchCV.fit`.
        y: Target values passed to `BayesSearchCV.fit`.
        n_iter: Number of parameter settings sampled by the search. Defaults
            to 1 to keep the previous behaviour; with a single iteration the
            search only evaluates one sampled configuration, so pass a larger
            value for a real optimisation run.
        cv: Cross-validation setting forwarded to BayesSearchCV (default 8).
        random_state: Seed forwarded to both the estimator and the search so
            a run can be reproduced. None (default) leaves both unseeded, as
            before.

    Returns:
        The `best_params_` mapping of the fitted search.
    """
    param_space = {
        'loss': Categorical(['squared_error', 'absolute_error']),
        'learning_rate': Real(0.01, 0.2, prior='log-uniform'),
        'n_estimators': Integer(50, 1000),
        'subsample': Real(0.5, 1.0, prior='uniform'),
        'min_samples_split': Integer(2, 10),
        'min_samples_leaf': Integer(1, 4),
        'min_weight_fraction_leaf': Real(0.0, 0.5, prior='uniform'),
        'max_depth': Integer(1, 9),
        'max_leaf_nodes': Integer(2, 10),
    }

    gb = GradientBoostingRegressor(random_state=random_state)
    bayes_search = BayesSearchCV(
        estimator=gb,
        search_spaces=param_space,
        n_iter=n_iter,
        cv=cv,
        n_jobs=-1,  # use all available cores
        verbose=2,
        scoring='neg_mean_absolute_error',
        random_state=random_state,
    )
    bayes_search.fit(X, y)

    print(bayes_search.best_params_)
    print(bayes_search.best_score_)
    return bayes_search.best_params_

def trainAndSaveResults(data_df, unique_name):
    """Tune, fit and persist a GradientBoostingRegressor, then predict for all rows.

    The rows with ``set_type == "Train"`` are used for the hyper-parameter
    search and for fitting the final model. Predictions are then produced for
    every row of `data_df` (not only the training rows).

    Files written to PATH_TO_OUTPUT_DIR (created if it does not exist):
      * ``model_<unique_name>.pkl``          - the fitted model (pickle)
      * ``best_params_<unique_name>.pkl``    - the selected parameters (pickle)
      * ``predictions_<unique_name>.feather`` - 'SeriesInstanceUID' and
        'PredictedPatientWeight' for every row of `data_df`

    Args:
        data_df: Prepared frame containing 'set_type', 'PatientWeight',
            'SeriesInstanceUID' and the feature columns.
        unique_name: Suffix used in the output file names.
    """
    # Columns that identify or label a row and must not be fed to the model.
    non_feature_columns = ["set_type", "PatientWeight", "SeriesInstanceUID"]

    train_df = data_df[data_df["set_type"] == "Train"]
    train_target = train_df["PatientWeight"]
    train_features = train_df.drop(columns=non_feature_columns, errors='ignore')

    best_params_bayes = find_best_model_bayes(train_features, train_target)
    best_model_bayes = GradientBoostingRegressor(**best_params_bayes)
    best_model_bayes.fit(train_features, train_target)

    # Without this, open() below fails with FileNotFoundError on a fresh checkout.
    PATH_TO_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    with open(PATH_TO_OUTPUT_DIR / f'model_{unique_name}.pkl', 'wb') as f:
        pickle.dump(best_model_bayes, f)

    with open(PATH_TO_OUTPUT_DIR / f'best_params_{unique_name}.pkl', 'wb') as f:
        pickle.dump(best_params_bayes, f)

    all_features = data_df.drop(columns=non_feature_columns, errors='ignore')
    predictions = best_model_bayes.predict(all_features)

    results_df = pd.DataFrame(data_df["SeriesInstanceUID"])
    results_df["PredictedPatientWeight"] = predictions
    results_df.to_feather(PATH_TO_OUTPUT_DIR / f'predictions_{unique_name}.feather', version=2, compression="zstd")

In [43]:
# Run configuration consumed by the data-preparation cell below.
replace_original_columns = False  # False: keep the raw columns and add "volume_<col>" columns
columns_to_drop_index = 0  # position of the drop list to pick from possible_columns_to_drop
calculate_voxel_volume = True  # True: scale the voxel columns by VoxelVolume

In [44]:
SERIES_ID_COLUMN = "SeriesInstanceUID"


def _voxel_columns(feature_df):
    """Return all column labels of `feature_df` except the series identifier.

    Selecting by name instead of slicing ``columns[0:-1]`` keeps the result
    correct even if the identifier is not stored as the last column.
    """
    return [col for col in feature_df.columns if col != SERIES_ID_COLUMN]


# --- DICOM metadata joined with the train/test assignment -------------------
cleaned_dicom_all_df = pd.read_feather(PATH_TO_CLEANED_DICOM_DF)
train_test_split_info_df = pd.read_feather(PATH_TO_TRAIN_TEST_SPLIT)
cleaned_dicom_df = pd.merge(cleaned_dicom_all_df, train_test_split_info_df, on=SERIES_ID_COLUMN)

# --- Columns that won't be used in training ---------------------------------
# The three variants differ only in whether PixelSpacing / SliceThickness /
# VoxelVolume are dropped as well; columns_to_drop_index selects one of them.
_always_dropped = ['PatientId', 'Rows', 'Columns', 'RescaleSlope', 'RescaleIntercept', 'SliceDirectory', 'PixelArrayFile', 'BodyPart']
possible_columns_to_drop = [
    _always_dropped + ['PixelSpacing', 'SliceThickness', 'VoxelVolume', 'PatientSex'],
    _always_dropped + ['PixelSpacing', 'SliceThickness', 'PatientSex'],
    _always_dropped + ['PatientSex'],
]

column_to_drop = possible_columns_to_drop[columns_to_drop_index]

# --- Feature tables ----------------------------------------------------------
segmentation_df = pd.read_feather(PATH_TO_SEGMENTATION_DF)
bincount_HU_df = pd.read_feather(PATH_TO_BINCOUNT_HU_DF)
bincount_STEP_75_df = pd.read_feather(PATH_TO_BINCOUNT_STEP_75_DF)
bincount_STEP_150_df = pd.read_feather(PATH_TO_BINCOUNT_STEP_150_DF)

# Segmentation features combined with each bin-count variant.
merged_segmentation_Air_df = pd.merge(segmentation_df, bincount_HU_df[['Air', SERIES_ID_COLUMN]], on=SERIES_ID_COLUMN, how="left")
merged_segmentation_HU_df = pd.merge(segmentation_df, bincount_HU_df, on=SERIES_ID_COLUMN, how="left")
merged_segmentation_75_df = pd.merge(segmentation_df, bincount_STEP_75_df, on=SERIES_ID_COLUMN, how="left")
merged_segmentation_150_df = pd.merge(segmentation_df, bincount_STEP_150_df, on=SERIES_ID_COLUMN, how="left")

# --- Columns to be scaled by VoxelVolume for each feature set ---------------
voxel_columns_bin_75 = _voxel_columns(bincount_STEP_75_df)
voxel_columns_bin_150 = _voxel_columns(bincount_STEP_150_df)
voxel_columns_bin_HU = _voxel_columns(bincount_HU_df)
voxel_columns_segmentation = _voxel_columns(segmentation_df)
voxel_columns_seg_Air = voxel_columns_segmentation + ['Air']
voxel_columns_seg_HU = voxel_columns_segmentation + voxel_columns_bin_HU
voxel_columns_seg_75 = voxel_columns_segmentation + voxel_columns_bin_75
voxel_columns_seg_150 = voxel_columns_segmentation + voxel_columns_bin_150

# --- Final frames, one per feature set ---------------------------------------
final_segmentation_df     = loadAndPrepareDataFrame(cleaned_dicom_df, segmentation_df           , voxel_columns_segmentation, column_to_drop, calculate_voxel_volume, replace_original_columns)
final_segmentation_Air_df = loadAndPrepareDataFrame(cleaned_dicom_df, merged_segmentation_Air_df, voxel_columns_seg_Air     , column_to_drop, calculate_voxel_volume, replace_original_columns)
final_segmentation_HU_df  = loadAndPrepareDataFrame(cleaned_dicom_df, merged_segmentation_HU_df , voxel_columns_seg_HU      , column_to_drop, calculate_voxel_volume, replace_original_columns)
final_segmentation_75_df  = loadAndPrepareDataFrame(cleaned_dicom_df, merged_segmentation_75_df , voxel_columns_seg_75      , column_to_drop, calculate_voxel_volume, replace_original_columns)
final_segmentation_150_df = loadAndPrepareDataFrame(cleaned_dicom_df, merged_segmentation_150_df, voxel_columns_seg_150     , column_to_drop, calculate_voxel_volume, replace_original_columns)
final_bin_HU_df           = loadAndPrepareDataFrame(cleaned_dicom_df, bincount_HU_df            , voxel_columns_bin_HU      , column_to_drop, calculate_voxel_volume, replace_original_columns)
final_bin_75_df           = loadAndPrepareDataFrame(cleaned_dicom_df, bincount_STEP_75_df       , voxel_columns_bin_75      , column_to_drop, calculate_voxel_volume, replace_original_columns)
final_bin_150_df          = loadAndPrepareDataFrame(cleaned_dicom_df, bincount_STEP_150_df      , voxel_columns_bin_150     , column_to_drop, calculate_voxel_volume, replace_original_columns)

In [45]:
# Sanity check: display the frame prepared from bincount_STEP_150_df.
final_bin_150_df

Unnamed: 0,PatientAge,PatientWeight,PatientSize,SliceCount,SeriesInstanceUID,set_type,0,150,300,450,...,volume_150,volume_300,volume_450,volume_600,volume_750,volume_900,volume_1050,volume_1200,volume_1350,PatientSex_encoded
0,61,46.0,1.61,263,1.2.40.0.13.1.28611523484845610500759615941748...,Train,52469016,839092,284233,182347,...,2.400661e+06,8.131971e+05,5.216990e+05,4.778681e+05,5.854311e+05,1.017237e+07,1.045658e+07,1.080863e+06,1.305076e+06,0
1,59,107.0,1.72,293,1.2.40.0.13.1.29372383852567236771106277061240...,Train,45197709,1115406,363675,192230,...,3.191202e+06,1.040483e+06,5.499744e+05,4.630308e+05,1.212862e+06,3.333923e+07,1.587456e+07,1.435284e+06,9.791193e+05,1
2,40,103.0,1.80,625,1.2.40.0.13.1.79959759832434680761991964822026...,Train,99062094,10483093,1782300,576921,...,4.318901e+07,7.342850e+06,2.376841e+06,2.160338e+06,2.330954e+06,5.900893e+07,5.710814e+07,3.203564e+06,1.689321e+06,1
3,79,73.0,1.77,567,1.2.40.0.13.1.24209483531338511718074425173759...,Train,95801339,8410955,1089065,590328,...,3.465207e+07,4.486810e+06,2.432076e+06,2.187171e+06,2.636014e+06,4.441908e+07,2.725920e+07,2.597638e+06,1.466139e+06,1
4,77,125.0,1.84,623,1.2.40.0.13.1.33709088158668319679757985067945...,Train,78937828,16443586,2384668,882332,...,6.774549e+07,9.824529e+06,3.635096e+06,4.443308e+06,6.005980e+06,1.015710e+08,3.173801e+07,3.162048e+06,1.475701e+06,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2926,77,75.0,1.70,567,1.2.40.0.13.1.14447032145816928538722256306280...,Train,92524548,8470579,1541096,941199,...,3.489771e+07,6.349120e+06,3.877620e+06,2.146289e+06,2.131367e+06,3.532814e+07,3.771580e+07,2.708952e+06,1.409281e+06,1
2927,55,49.0,1.58,567,1.2.40.0.13.1.22055339428679588669178744381225...,Train,120319092,2773707,1179018,748897,...,1.142732e+07,4.857404e+06,3.085361e+06,1.600414e+06,1.613956e+06,1.890777e+07,2.918827e+07,1.375650e+06,6.757251e+05,0
2928,51,63.0,1.74,293,1.2.40.0.13.1.15802889953337295246497758301352...,Train,56049001,974446,324573,197983,...,2.787912e+06,9.286108e+05,5.664339e+05,4.943247e+05,5.659304e+05,1.422764e+07,1.539870e+07,1.431198e+06,1.435513e+06,0
2929,90,69.0,1.76,567,1.2.40.0.13.1.56298341537091980176000389528343...,Train,100096171,6307908,1415207,929437,...,2.598778e+07,5.830473e+06,3.829162e+06,2.143150e+06,2.350355e+06,4.118877e+07,2.784339e+07,2.502378e+06,1.195332e+06,1
