In this tutorial, we will analyze Lipoma dataset from the [WORC Database](https://github.com/MStarmans91/WORCDatabase/tree/development).

More details on the dataset as well as the original analysis performed by their authors can be found here:

`Starmans, M. P. A. et al. (2021). The WORC* database: MRI and CT scans, segmentations, and clinical labels for 932 patients from six radiomics studies. Submitted, preprint available from https://doi.org/10.1101/2021.08.19.21262238`

`The experiments are described in the following paper: Starmans, M. P. A. et al. (2021). Reproducible radiomics through automated machine learning validated on twelve clinical applications. Submitted, preprint available from https://arxiv.org/abs/2108.08618.`

In [None]:
# In case you haven't installed AutoRadiomics
!pip install autorad

In [None]:
from autorad.external.download_WORC import download_WORCDatabase
from pathlib import Path
import pandas as pd

# Set where we will save our data and results
base_dir = Path.cwd() / "autorad_tutorial"
data_dir = base_dir / "data"
result_dir = base_dir / "results"
data_dir.mkdir(exist_ok=True, parents=True)
result_dir.mkdir(exist_ok=True, parents=True)

%load_ext autoreload
%autoreload 2

# download data (it may take a few minutes)
download_WORCDatabase(
    dataset="Lipo",
    data_folder=data_dir,
    n_subjects=30,
    )

In [101]:
!ls $data_dir

[1m[36mLipo-007[m[m   [1m[36mLipo-031[m[m   [1m[36mLipo-050[m[m   [1m[36mLipo-063[m[m   [1m[36mLipo-082[m[m   [1m[36mLipo-106[m[m   labels.csv
[1m[36mLipo-008[m[m   [1m[36mLipo-039[m[m   [1m[36mLipo-051[m[m   [1m[36mLipo-064[m[m   [1m[36mLipo-085[m[m   [1m[36mLipo-107[m[m
[1m[36mLipo-009[m[m   [1m[36mLipo-043[m[m   [1m[36mLipo-052[m[m   [1m[36mLipo-066[m[m   [1m[36mLipo-091[m[m   [1m[36mLipo-108[m[m
[1m[36mLipo-012[m[m   [1m[36mLipo-044[m[m   [1m[36mLipo-055[m[m   [1m[36mLipo-067[m[m   [1m[36mLipo-094[m[m   [1m[36mLipo-111[m[m
[1m[36mLipo-028[m[m   [1m[36mLipo-049[m[m   [1m[36mLipo-057[m[m   [1m[36mLipo-068[m[m   [1m[36mLipo-095[m[m   [1m[36mLipo-112[m[m


In [102]:
from autorad.data.utils import get_paths_with_separate_folder_per_case

paths_df = get_paths_with_separate_folder_per_case(data_dir, relative=True)
paths_df.sample(5)

Unnamed: 0,ID,image_path,segmentation_path
19,Lipo-007,Lipo-007/image.nii.gz,Lipo-007/segmentation.nii.gz
11,Lipo-028,Lipo-028/image.nii.gz,Lipo-028/segmentation.nii.gz
0,Lipo-112,Lipo-112/image.nii.gz,Lipo-112/segmentation.nii.gz
20,Lipo-064,Lipo-064/image.nii.gz,Lipo-064/segmentation.nii.gz
15,Lipo-111,Lipo-111/image.nii.gz,Lipo-111/segmentation.nii.gz


In [107]:
from autorad.data.dataset import ImageDataset
from autorad.feature_extraction.extractor import FeatureExtractor
import logging

logging.getLogger().setLevel(logging.CRITICAL)

image_dataset = ImageDataset(
    paths_df,
    ID_colname="ID",
    root_dir=data_dir,
)

#for faster computation, let's resize the images

#Let's take a look at the data, plotting random 10 cases
image_dataset.plot_examples(n=10, window="bone")

TypeError: ImageDataset.plot_examples() got an unexpected keyword argument 'window'

In [108]:
extractor = FeatureExtractor(image_dataset)
feature_df = extractor.run()

5it [00:14,  2.45s/it]

KeyboardInterrupt: 

In [67]:
feature_df.head()

Unnamed: 0,ID,image_path,segmentation_path,diagnostics_Versions_PyRadiomics,diagnostics_Versions_Numpy,diagnostics_Versions_SimpleITK,diagnostics_Versions_PyWavelet,diagnostics_Versions_Python,diagnostics_Configuration_Settings,diagnostics_Configuration_EnabledImageTypes,...,wavelet-LLL_gldm_LargeDependenceLowGrayLevelEmphasis,wavelet-LLL_gldm_LowGrayLevelEmphasis,wavelet-LLL_gldm_SmallDependenceEmphasis,wavelet-LLL_gldm_SmallDependenceHighGrayLevelEmphasis,wavelet-LLL_gldm_SmallDependenceLowGrayLevelEmphasis,wavelet-LLL_ngtdm_Busyness,wavelet-LLL_ngtdm_Coarseness,wavelet-LLL_ngtdm_Complexity,wavelet-LLL_ngtdm_Contrast,wavelet-LLL_ngtdm_Strength
0,Liver-035,/Users/p.woznicki/git/AutoRadiomics/examples/a...,/Users/p.woznicki/git/AutoRadiomics/examples/a...,v3.0.1,1.22.1,2.1.1.2,1.2.0,3.10.4,"{'minimumROIDimensions': 1, 'minimumROISize': ...","{'Original': {}, 'LoG': {'sigma': [3.0, 5.0]},...",...,0.490335565711314,0.0092392563097354,0.1049786317887887,17.108519817338614,0.0019064162301314,2.195837937351879,0.0010111456662735,426.9975349731685,0.0619020314532844,0.284974468309886
1,Liver-067,/Users/p.woznicki/git/AutoRadiomics/examples/a...,/Users/p.woznicki/git/AutoRadiomics/examples/a...,v3.0.1,1.22.1,2.1.1.2,1.2.0,3.10.4,"{'minimumROIDimensions': 1, 'minimumROISize': ...","{'Original': {}, 'LoG': {'sigma': [3.0, 5.0]},...",...,0.1726950112519562,0.0084378080208024,0.2387784244404366,83.97021323093149,0.0023728396471364,0.7591978668706331,0.001202498835943,4314.0961725974375,0.0530871088832545,3.480993082112052
2,Liver-033,/Users/p.woznicki/git/AutoRadiomics/examples/a...,/Users/p.woznicki/git/AutoRadiomics/examples/a...,v3.0.1,1.22.1,2.1.1.2,1.2.0,3.10.4,"{'minimumROIDimensions': 1, 'minimumROISize': ...","{'Original': {}, 'LoG': {'sigma': [3.0, 5.0]},...",...,0.0023819900248588,0.0012766642609044,0.5654204077815189,1949.3340316747724,0.0011009446497239,0.0266051312038575,0.0076363940620611,17684.10707042901,0.3745009155825436,22.73438458589251
3,Liver-034,/Users/p.woznicki/git/AutoRadiomics/examples/a...,/Users/p.woznicki/git/AutoRadiomics/examples/a...,v3.0.1,1.22.1,2.1.1.2,1.2.0,3.10.4,"{'minimumROIDimensions': 1, 'minimumROISize': ...","{'Original': {}, 'LoG': {'sigma': [3.0, 5.0]},...",...,1.440566612137536,0.029632596656669,0.0490308578977173,1.5859015043216873,0.0034147618304968,2.044400483735623,0.0063941564076542,29.814833689553108,0.0317578315752439,0.2661640404226968
4,Liver-146,/Users/p.woznicki/git/AutoRadiomics/examples/a...,/Users/p.woznicki/git/AutoRadiomics/examples/a...,v3.0.1,1.22.1,2.1.1.2,1.2.0,3.10.4,"{'minimumROIDimensions': 1, 'minimumROISize': ...","{'Original': {}, 'LoG': {'sigma': [3.0, 5.0]},...",...,0.5235105253912582,0.0144098607235655,0.1017286419590698,9.821812246039771,0.0024608166414348,1.9618138907602145,0.0021490398335429,178.82845259100122,0.0331384950258136,0.3598787334834973


In [72]:
label_df = pd.read_csv(data_dir / "labels.csv")
label_df.head()

Unnamed: 0,patient_ID,diagnosis
0,Liver-185,1
1,Liver-166,0
2,Liver-180,0
3,Liver-067,0
4,Liver-035,0


In [73]:
from autorad.data.dataset import FeatureDataset

merged_feature_df = feature_df.merge(label_df, left_on="ID",
    right_on="patient_ID", how="left")
feature_dataset = FeatureDataset(
    merged_feature_df,
    target="diagnosis",
    ID_colname="ID"
)

Split the data into training/validation/test sets:

In [91]:
splits_path = result_dir / "splits.json"
feature_dataset.split(method="train_val_test",
        save_path=splits_path,
        test_size=0.3,
    )

{'split_type': '50% train + 20% validation + 30% test',
 'train': ['Liver-158',
  'Liver-008',
  'Liver-031',
  'Liver-128',
  'Liver-065',
  'Liver-168',
  'Liver-180',
  'Liver-006',
  'Liver-037',
  'Liver-011',
  'Liver-086',
  'Liver-115',
  'Liver-126',
  'Liver-184',
  'Liver-035',
  'Liver-185',
  'Liver-067',
  'Liver-159',
  'Liver-145'],
 'val': ['Liver-147',
  'Liver-010',
  'Liver-024',
  'Liver-119',
  'Liver-124',
  'Liver-154',
  'Liver-167',
  'Liver-106',
  'Liver-079'],
 'test': ['Liver-107',
  'Liver-021',
  'Liver-166',
  'Liver-146',
  'Liver-038',
  'Liver-033',
  'Liver-156',
  'Liver-034',
  'Liver-030',
  'Liver-054',
  'Liver-023',
  'Liver-122']}

In [92]:
from autorad.training.trainer import Trainer
from autorad.models.classifier import MLClassifier

models = MLClassifier.initialize_default_sklearn_models()
print(models)

[Random Forest, Logistic Regression, SVM, XGBoost]


In [94]:
trainer = Trainer(
    dataset=feature_dataset,
    models=models,
    result_dir=result_dir,
    experiment_name="Liver_detection",
)
trainer.run_auto_preprocessing(
        selection_methods=['anova'],
        oversampling=False,
        )

In [97]:
trainer.set_optimizer("optuna", n_trials=30)
trainer.run(auto_preprocess=True)

[32m[I 2022-06-07 15:31:27,318][0m A new study created in memory with name: Liver_detection[0m
[33m[W 2022-06-07 15:31:27,321][0m Trial 0 failed because of the following error: ValueError('No preprocessing done!')[0m
Traceback (most recent call last):
  File "/Users/p.woznicki/git/AutoRadiomics/.venv/lib/python3.10/site-packages/optuna/study/_optimize.py", line 213, in _run_trial
    value_or_values = func(trial)
  File "/Users/p.woznicki/git/AutoRadiomics/autorad/training/trainer.py", line 120, in <lambda>
    lambda trial: self._objective(trial, auto_preprocess),
  File "/Users/p.woznicki/git/AutoRadiomics/autorad/training/trainer.py", line 153, in _objective
    for X_train, y_train, X_val, y_val in data.iter_training():
  File "/Users/p.woznicki/git/AutoRadiomics/autorad/data/dataset.py", line 58, in iter_training
    raise ValueError("No preprocessing done!")
ValueError: No preprocessing done!


ValueError: No preprocessing done!

In [96]:
from autorad.utils import io
from autorad.visualization import plotly_utils
from autorad.training.trainer import Inferrer

best_params = io.load_json(result_dir / "best_params.json")
inferrer = Inferrer(params=best_params, result_dir=result_dir)
inferrer.fit_eval(feature_dataset, result_name="test")

results = pd.read_csv(result_dir / "test.csv")
plotly_utils.plot_roc_curve(results.y_true, results.y_pred_proba)

In [79]:
results

Unnamed: 0,y_true,y_pred_proba
0,1,0.458062
1,1,0.458403
2,1,0.458564
3,1,0.458457
4,0,0.458511
5,0,0.458574
6,0,0.458334
7,0,0.458369
