In [2]:
from pathlib import Path
import os
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
import pickle

In [3]:
# Project root: two directory levels above the notebook's working directory.
PATH_TO_PARENT_DIR = Path.cwd().resolve().parent.parent

# --- Input data (feather files) ---
PATH_TO_DATA_DIR = PATH_TO_PARENT_DIR.joinpath("Data")

PATH_TO_CLEANED_DICOM_DF = PATH_TO_DATA_DIR.joinpath("cleaned_dicom_df.feather")

PATH_TO_SEGMENTATION_DF = PATH_TO_DATA_DIR.joinpath("segmentation_df.feather")
PATH_TO_BINCOUNT_HU_DF = PATH_TO_DATA_DIR.joinpath("bincount_HU_df.feather")
PATH_TO_BINCOUNT_STEP_75_DF = PATH_TO_DATA_DIR.joinpath("bincount_STEP_75_df.feather")
PATH_TO_BINCOUNT_STEP_150_DF = PATH_TO_DATA_DIR.joinpath("bincount_STEP_150_df.feather")
PATH_TO_TRAIN_TEST_SPLIT = PATH_TO_DATA_DIR.joinpath("train_test_split.feather")

# --- Output locations for the pickled models, one file per feature set ---
PATH_TO_MODEL_DIR = PATH_TO_PARENT_DIR.joinpath("Model", "Justin")
PATH_TO_MODEL_SEG = PATH_TO_MODEL_DIR.joinpath("model_seg.pkl")
PATH_TO_MODEL_SEG_AIR = PATH_TO_MODEL_DIR.joinpath("model_seg_air.pkl")
PATH_TO_MODEL_SEG_75 = PATH_TO_MODEL_DIR.joinpath("model_seg_75.pkl")
PATH_TO_MODEL_SEG_150 = PATH_TO_MODEL_DIR.joinpath("model_seg_150.pkl")
PATH_TO_MODEL_SEG_HU = PATH_TO_MODEL_DIR.joinpath("model_seg_HU.pkl")
PATH_TO_MODEL_BINCOUNT_75 = PATH_TO_MODEL_DIR.joinpath("model_bincount_75.pkl")
PATH_TO_MODEL_BINCOUNT_150 = PATH_TO_MODEL_DIR.joinpath("model_bincount_150.pkl")
PATH_TO_MODEL_HU = PATH_TO_MODEL_DIR.joinpath("model_bincount_HU.pkl")

In [4]:
# --- Load the DICOM metadata and keep only the series assigned to the training split ---
cleaned_dicom_df = pd.read_feather(PATH_TO_CLEANED_DICOM_DF)
train_test_split_df = pd.read_feather(PATH_TO_TRAIN_TEST_SPLIT)

train_test_split_df = train_test_split_df[train_test_split_df["set_type"] == "Train"]

# Default (inner) join: series without a "Train" row in the split table are dropped.
cleaned_dicom_df = pd.merge(cleaned_dicom_df, train_test_split_df, on="SeriesInstanceUID")


# --- Load the per-series feature tables ---
segmentation_df = pd.read_feather(PATH_TO_SEGMENTATION_DF)
bincount_HU_df = pd.read_feather(PATH_TO_BINCOUNT_HU_DF)
bincount_STEP_75_df = pd.read_feather(PATH_TO_BINCOUNT_STEP_75_DF)
bincount_STEP_150_df = pd.read_feather(PATH_TO_BINCOUNT_STEP_150_DF)


def _left_join_on_series_uid(left, right):
    """Left-join ``right`` onto ``left`` by SeriesInstanceUID, keeping the row count of ``left``.

    ``validate="many_to_one"`` makes pandas raise ``MergeError`` if ``right`` holds
    a SeriesInstanceUID more than once. Without it such a duplicate would silently
    add rows, and the frame would no longer line up row-for-row with ``target``.
    """
    return pd.merge(left, right, on="SeriesInstanceUID", how="left", validate="many_to_one")


# --- One merged frame per feature set; all share the rows (and row order) of cleaned_dicom_df ---
merged_bin_HU_df = _left_join_on_series_uid(cleaned_dicom_df, bincount_HU_df)
merged_bin_75_df = _left_join_on_series_uid(cleaned_dicom_df, bincount_STEP_75_df)
merged_bin_150_df = _left_join_on_series_uid(cleaned_dicom_df, bincount_STEP_150_df)
merged_segmentation_df = _left_join_on_series_uid(cleaned_dicom_df, segmentation_df)
merged_segmentation_Air_df = _left_join_on_series_uid(merged_segmentation_df, bincount_HU_df[['Air', 'SeriesInstanceUID']])
merged_segmentation_HU_df = _left_join_on_series_uid(merged_segmentation_df, bincount_HU_df)
merged_segmentation_75_df = _left_join_on_series_uid(merged_segmentation_df, bincount_STEP_75_df)
merged_segmentation_150_df = _left_join_on_series_uid(merged_segmentation_df, bincount_STEP_150_df)


# Regression target; taken from one merged frame and reused for every feature set.
target = merged_segmentation_df['PatientWeight']

# Feature column names = every column of a feature table except its last one.
# NOTE(review): this assumes SeriesInstanceUID is the last column of each table — confirm.
voxel_columns_bin_75 = bincount_STEP_75_df.columns[0:-1].tolist()
voxel_columns_bin_150 = bincount_STEP_150_df.columns[0:-1].tolist()
voxel_columns_bin_HU = bincount_HU_df.columns[0:-1].tolist()
voxel_columns_segmentation = segmentation_df.columns[0:-1].tolist()
voxel_columns_seg_Air = voxel_columns_segmentation + ['Air']
voxel_columns_seg_HU = voxel_columns_segmentation + bincount_HU_df.columns[0:-1].tolist()
voxel_columns_seg_75 = voxel_columns_segmentation + bincount_STEP_75_df.columns[0:-1].tolist()
voxel_columns_seg_150 = voxel_columns_segmentation + bincount_STEP_150_df.columns[0:-1].tolist()


# Add VoxelVolume = PixelSpacing^2 * SliceThickness to every merged frame (in place).
# NOTE(review): squaring a single PixelSpacing value presumes square in-plane pixels — confirm.
for _merged_frame in (
    merged_bin_HU_df,
    merged_bin_75_df,
    merged_bin_150_df,
    merged_segmentation_df,
    merged_segmentation_Air_df,
    merged_segmentation_HU_df,
    merged_segmentation_75_df,
    merged_segmentation_150_df,
):
    _merged_frame.loc[:, 'VoxelVolume'] = (_merged_frame['PixelSpacing'] ** 2) * _merged_frame['SliceThickness']


def apply_voxel_volume(row, voxel_columns):
    """Multiply the selected entries of one row by that row's 'VoxelVolume'.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide 'VoxelVolume' and every label in ``voxel_columns``.
    voxel_columns : list
        Labels of the entries to scale.

    Returns
    -------
    The selected entries, each multiplied by ``row['VoxelVolume']``.
    """
    selected_counts = row[voxel_columns]
    volume_per_voxel = row['VoxelVolume']
    return selected_counts * volume_per_voxel

# Independent working copies, so the scaling step leaves the merged_* frames untouched.
(
    transformed_bin_HU_df,
    transformed_bin_75_df,
    transformed_bin_150_df,
    transformed_segmentation_df,
    transformed_segmentation_Air_df,
    transformed_segmentation_HU_df,
    transformed_segmentation_75_df,
    transformed_segmentation_150_df,
) = (
    source_frame.copy()
    for source_frame in (
        merged_bin_HU_df,
        merged_bin_75_df,
        merged_bin_150_df,
        merged_segmentation_df,
        merged_segmentation_Air_df,
        merged_segmentation_HU_df,
        merged_segmentation_75_df,
        merged_segmentation_150_df,
    )
)

def transform_df(df, voxel_columns):
    """Scale voxel-count columns by the row's voxel volume and encode patient sex.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'VoxelVolume', 'PatientSex' and every name in
        ``voxel_columns``. The frame is modified in place.
    voxel_columns : list
        Names of the columns to multiply by 'VoxelVolume'.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the voxel columns rescaled and a new
        'PatientSex_encoded' column ('F' -> 0, 'M' -> 1; any other value maps to NaN).
    """
    # Insert the new column before the voxel columns are rewritten. The result is
    # the same (it is independent of the scaling and still ends up last), but the
    # insert then runs on a frame that has not yet been split up by many
    # per-column assignments.
    # NOTE(review): the warning lines echoed in the notebook output point at this
    # assignment; presumably pandas' "DataFrame is highly fragmented"
    # PerformanceWarning — confirm.
    df['PatientSex_encoded'] = df['PatientSex'].map({'F': 0, 'M': 1})

    # One vectorised, index-aligned multiply replaces the former per-row apply and
    # the rebuild of the result through Python lists.
    df[voxel_columns] = df[voxel_columns].mul(df['VoxelVolume'], axis=0)
    return df


# Scale the voxel columns and add the encoded sex column, one call per feature set.
transformed_bin_75_df = transform_df(df=transformed_bin_75_df, voxel_columns=voxel_columns_bin_75)
transformed_bin_150_df = transform_df(df=transformed_bin_150_df, voxel_columns=voxel_columns_bin_150)
transformed_bin_HU_df = transform_df(df=transformed_bin_HU_df, voxel_columns=voxel_columns_bin_HU)
transformed_segmentation_df = transform_df(df=transformed_segmentation_df, voxel_columns=voxel_columns_segmentation)
transformed_segmentation_Air_df = transform_df(df=transformed_segmentation_Air_df, voxel_columns=voxel_columns_seg_Air)
transformed_segmentation_HU_df = transform_df(df=transformed_segmentation_HU_df, voxel_columns=voxel_columns_seg_HU)
transformed_segmentation_75_df = transform_df(df=transformed_segmentation_75_df, voxel_columns=voxel_columns_seg_75)
transformed_segmentation_150_df = transform_df(df=transformed_segmentation_150_df, voxel_columns=voxel_columns_seg_150)


# Columns excluded from the training matrices: the target itself, identifiers,
# file/acquisition metadata, and raw columns that already have a derived
# replacement (VoxelVolume, PatientSex_encoded).
base_columns_to_drop = [
    'PatientWeight',
    'PatientId',
    'Rows',
    'Columns',
    'RescaleSlope',
    'RescaleIntercept',
    'SeriesInstanceUID',
    'SliceDirectory',
    'PixelArrayFile',
    'BodyPart',
    'PixelSpacing',
    'SliceThickness',
    'PatientSex',
    'set_type',
]

# errors='ignore' skips any listed name that a given frame does not contain.
cleaned_bin_HU_df = transformed_bin_HU_df.drop(base_columns_to_drop, axis="columns", errors='ignore')
cleaned_bin_75_df = transformed_bin_75_df.drop(base_columns_to_drop, axis="columns", errors='ignore')
cleaned_bin_150_df = transformed_bin_150_df.drop(base_columns_to_drop, axis="columns", errors='ignore')
cleaned_segmentation_df = transformed_segmentation_df.drop(base_columns_to_drop, axis="columns", errors='ignore')
cleaned_segmentation_Air_df = transformed_segmentation_Air_df.drop(base_columns_to_drop, axis="columns", errors='ignore')
cleaned_segmentation_HU_df = transformed_segmentation_HU_df.drop(base_columns_to_drop, axis="columns", errors='ignore')
cleaned_segmentation_75_df = transformed_segmentation_75_df.drop(base_columns_to_drop, axis="columns", errors='ignore')
cleaned_segmentation_150_df = transformed_segmentation_150_df.drop(base_columns_to_drop, axis="columns", errors='ignore')

  df['PatientSex_encoded'] = df['PatientSex'].map({'F': 0, 'M': 1})
  df['PatientSex_encoded'] = df['PatientSex'].map({'F': 0, 'M': 1})
  df['PatientSex_encoded'] = df['PatientSex'].map({'F': 0, 'M': 1})
  df['PatientSex_encoded'] = df['PatientSex'].map({'F': 0, 'M': 1})
  df['PatientSex_encoded'] = df['PatientSex'].map({'F': 0, 'M': 1})


In [13]:
# Display the 150-step bincount feature frame (identifier and target columns already dropped).
cleaned_bin_150_df

Unnamed: 0,PatientAge,PatientSize,SliceCount,0,150,300,450,600,750,900,1050,1200,1350,VoxelVolume,PatientSex_encoded
0,61,1.61,263,1.501151e+08,2.400661e+06,8.131971e+05,5.216990e+05,4.778681e+05,5.854311e+05,1.017237e+07,1.045658e+07,1.080863e+06,1.305076e+06,2.861023,0
1,59,1.72,293,1.293117e+08,3.191202e+06,1.040483e+06,5.499744e+05,4.630308e+05,1.212862e+06,3.333923e+07,1.587456e+07,1.435284e+06,9.791193e+05,2.861023,1
2,40,1.80,625,4.081233e+08,4.318901e+07,7.342850e+06,2.376841e+06,2.160338e+06,2.330954e+06,5.900893e+07,5.710814e+07,3.203564e+06,1.689321e+06,4.119873,1
3,79,1.77,567,3.946894e+08,3.465207e+07,4.486810e+06,2.432076e+06,2.187171e+06,2.636014e+06,4.441908e+07,2.725920e+07,2.597638e+06,1.466139e+06,4.119873,1
4,77,1.84,623,3.252138e+08,6.774549e+07,9.824529e+06,3.635096e+06,4.443308e+06,6.005980e+06,1.015710e+08,3.173801e+07,3.162048e+06,1.475701e+06,4.119873,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2339,77,1.70,567,3.811894e+08,3.489771e+07,6.349120e+06,3.877620e+06,2.146289e+06,2.131367e+06,3.532814e+07,3.771580e+07,2.708952e+06,1.409281e+06,4.119873,1
2340,55,1.58,567,4.956994e+08,1.142732e+07,4.857404e+06,3.085361e+06,1.600414e+06,1.613956e+06,1.890777e+07,2.918827e+07,1.375650e+06,6.757251e+05,4.119873,0
2341,51,1.74,293,1.603575e+08,2.787912e+06,9.286108e+05,5.664339e+05,4.943247e+05,5.659304e+05,1.422764e+07,1.539870e+07,1.431198e+06,1.435513e+06,2.861023,0
2342,90,1.76,567,4.123835e+08,2.598778e+07,5.830473e+06,3.829162e+06,2.143150e+06,2.350355e+06,4.118877e+07,2.784339e+07,2.502378e+06,1.195332e+06,4.119873,1


In [None]:

def find_best_model_bayes(X, y, n_iter=256, cv=8, random_state=None):
    """Tune a GradientBoostingRegressor with Bayesian search and return the best parameters.

    The search is scored with negative mean absolute error, uses all CPU cores
    (``n_jobs=-1``) and prints the best parameter set and its score.

    Parameters
    ----------
    X : feature matrix passed to ``BayesSearchCV.fit``.
    y : target values passed to ``BayesSearchCV.fit``.
    n_iter : int, default 256
        Number of parameter settings the search evaluates.
    cv : int, default 8
        Number of cross-validation folds.
    random_state : int or None, default None
        Seed forwarded to both the regressor and the search. The default
        ``None`` leaves the run unseeded, so results vary between runs; pass an
        integer for a repeatable search.

    Returns
    -------
    The ``best_params_`` of the fitted search. Only the searched
    hyper-parameters are included; ``random_state`` is not part of it, so a
    caller that refits a model from these parameters must seed that model
    itself.
    """
    param_space = {
        'loss': Categorical(['squared_error', 'absolute_error']),
        'learning_rate': Real(0.01, 0.2, prior='log-uniform'),
        'n_estimators': Integer(50, 700),
        'subsample': Real(0.5, 1.0, prior='uniform'),
        'min_samples_split': Integer(2, 10),
        'min_samples_leaf': Integer(1, 4),
        'min_weight_fraction_leaf': Real(0.0, 0.5, prior='uniform'),
        'max_depth': Integer(1, 9),
        'max_leaf_nodes': Integer(2, 10),
    }

    gb = GradientBoostingRegressor(random_state=random_state)
    bayes_search = BayesSearchCV(
        estimator=gb,
        search_spaces=param_space,
        n_iter=n_iter,
        cv=cv,
        n_jobs=-1,
        verbose=2,
        scoring='neg_mean_absolute_error',
        random_state=random_state,
    )
    bayes_search.fit(X, y)

    print(bayes_search.best_params_)
    print(bayes_search.best_score_)
    return bayes_search.best_params_



def _search_fit_and_save(features, y, model_path, params_path):
    """Tune, refit and persist one model.

    Runs ``find_best_model_bayes`` on ``features``/``y``, fits a fresh
    GradientBoostingRegressor with the returned parameters on the same data,
    then pickles the model to ``model_path`` and the parameters to
    ``params_path`` (in that order).

    Returns
    -------
    tuple
        ``(best_params, best_model)``.
    """
    best_params = find_best_model_bayes(features, y)
    best_model = GradientBoostingRegressor(**best_params)
    best_model.fit(features, y)

    with open(model_path, 'wb') as f:
        pickle.dump(best_model, f)

    with open(params_path, 'wb') as f:
        pickle.dump(best_params, f)

    return best_params, best_model


# Create the output directory before any search starts, so a missing directory
# cannot fail the first open() after a search has already been run.
PATH_TO_MODEL_DIR.mkdir(parents=True, exist_ok=True)


# --- Segmentation-based feature sets ---
best_params_seg_bayes, best_model_seg_bayes = _search_fit_and_save(
    cleaned_segmentation_df, target,
    PATH_TO_MODEL_SEG, PATH_TO_MODEL_DIR / "best_params_seg.pkl",
)

best_params_seg_air_bayes, best_model_seg_air_bayes = _search_fit_and_save(
    cleaned_segmentation_Air_df, target,
    PATH_TO_MODEL_SEG_AIR, PATH_TO_MODEL_DIR / "best_params_seg_air.pkl",
)

best_params_seg_HU_bayes, best_model_seg_HU_bayes = _search_fit_and_save(
    cleaned_segmentation_HU_df, target,
    PATH_TO_MODEL_SEG_HU, PATH_TO_MODEL_DIR / "best_params_seg_HU.pkl",
)

best_params_seg_75_bayes, best_model_seg_75_bayes = _search_fit_and_save(
    cleaned_segmentation_75_df, target,
    PATH_TO_MODEL_SEG_75, PATH_TO_MODEL_DIR / "best_params_seg_75.pkl",
)

best_params_seg_150_bayes, best_model_seg_150_bayes = _search_fit_and_save(
    cleaned_segmentation_150_df, target,
    PATH_TO_MODEL_SEG_150, PATH_TO_MODEL_DIR / "best_params_seg_150.pkl",
)


# --- Bincount-only feature sets ---
best_params_HU_bayes, best_model_HU_bayes = _search_fit_and_save(
    cleaned_bin_HU_df, target,
    PATH_TO_MODEL_HU, PATH_TO_MODEL_DIR / "best_params_HU.pkl",
)

print('Finding best model for 75')
best_params_75_bayes, best_model_75_bayes = _search_fit_and_save(
    cleaned_bin_75_df, target,
    PATH_TO_MODEL_BINCOUNT_75, PATH_TO_MODEL_DIR / "best_params_75.pkl",
)

print('Finding best model for 150')
best_params_150_bayes, best_model_150_bayes = _search_fit_and_save(
    cleaned_bin_150_df, target,
    PATH_TO_MODEL_BINCOUNT_150, PATH_TO_MODEL_DIR / "best_params_150.pkl",
)