Notebook to obtain patient, e-number and slide statistics

Imports

In [1]:
# Local dependencies
from NET_CUP.data_loading.data_tree import ENumber, Slide, DataTree
from NET_CUP.data_loading.feature_type import FeatureType
from NET_CUP.data_loading import data_tree
import NET_CUP.datasources_config as datasources_config

# Other dependencies
import pandas as pd
from anytree import findall

Functions

In [2]:
def print_info(data: DataTree) -> None:
    """Print patient, enumber and slide statistics for a data tree.

    Three sections are written to stdout:

    * Patient info: number of patients per ``data_tree.Origin`` member,
      number of patients per sex (``'m'`` / ``'f'``), and the mean and
      standard deviation of ``init_diagn - birthdate``.
    * Enumber info: number of ``ENumber`` nodes whose parent is one of the
      selected patients, in total and split by the node's ``biopsy`` flag.
    * Slide info: number of ``Slide`` nodes whose grandparent is one of the
      selected patients, in total and split by the parent's ``biopsy`` flag.

    Args:
        data: Tree to summarise. Patients are selected with
            ``data_tree.get_patients(data, 'pio')``.

    Raises:
        KeyError: If a patient's ``sex`` is neither ``'m'`` nor ``'f'``, or
            its ``origin`` is not a key built from ``data_tree.Origin``.
    """
    # ---- Patient level ----------------------------------------------------
    print('##### Patient info #####')
    cohort = data_tree.get_patients(data, 'pio')
    sex_counts = {'m': 0, 'f': 0}
    # Every Origin member starts at zero so that empty categories are printed.
    origin_counts = dict.fromkeys(data_tree.Origin, 0)
    age_deltas = []

    for person in cohort:
        sex_counts[person.sex] += 1
        origin_counts[person.origin] += 1
        # presumably the age at initial diagnosis - confirm against the
        # patient model
        age_deltas.append(person.init_diagn - person.birthdate)

    age_series = pd.Series(age_deltas)

    print(origin_counts)
    print(sex_counts)
    print(age_series.mean(), age_series.std())
    print()

    # ---- Enumber level ----------------------------------------------------
    def is_cohort_enumber(node) -> bool:
        # ENumber nodes that hang directly below a selected patient.
        return isinstance(node, ENumber) and node.parent in cohort

    enumber_nodes = findall(data, filter_=is_cohort_enumber)
    n_biopsy_enumbers = sum(1 for item in enumber_nodes if item.biopsy)
    n_resection_enumbers = sum(1 for item in enumber_nodes if not item.biopsy)

    print('##### Enumber info #####')
    print(f'Amount of enumbers: {len(enumber_nodes)}')
    print(f'Amount of biopsy enumbers: {n_biopsy_enumbers}')
    print(f'Amount of resection enumbers: {n_resection_enumbers}')
    print()

    # ---- Slide level ------------------------------------------------------
    def is_cohort_slide(node) -> bool:
        # Slide nodes whose grandparent is a selected patient.
        return isinstance(node, Slide) and node.parent.parent in cohort

    slide_nodes = findall(data, filter_=is_cohort_slide)
    # The biopsy flag is read from the slide's parent node.
    n_biopsy_slides = sum(1 for item in slide_nodes if item.parent.biopsy)
    n_resection_slides = sum(1 for item in slide_nodes if not item.parent.biopsy)

    print('##### Slide info #####')
    print(f'Amount of slides: {len(slide_nodes)}')
    print(f'Amount of biopsy slides: {n_biopsy_slides}')
    print(f'Amount of resection slides: {n_resection_slides}')
    print()


UKE data

In [3]:
# Build the tree from the configured patient and enumber sources.
uke_data = data_tree.create_tree(datasources_config.PATIENTS_PATH,
                             datasources_config.ENUMBER_PATH)
# The return value is not used, so this presumably prunes `uke_data` in place,
# keeping only slides with MTDP features under UKE_DATASET_DIR - TODO confirm.
data_tree.drop_slides_without_extracted_features(uke_data, FeatureType.MTDP, datasources_config.UKE_DATASET_DIR)

# Print patient, enumber and slide statistics for the filtered tree.
print_info(uke_data)

##### Patient info #####
{<Origin.PANCREAS: 0>: 40, <Origin.SI: 1>: 51, <Origin.OTHER: 2>: 8, <Origin.CUP: 3>: 0}
{'m': 59, 'f': 40}
20739 days 04:21:49.090909184 5077 days 15:00:17.358404736

##### Enumber info #####
Amount of enumbers: 142
Amount of biopsy enumbers: 39
Amount of resection enumbers: 103

##### Slide info #####
Amount of slides: 270
Amount of biopsy slides: 51
Amount of resection slides: 219



External data

In [4]:
# Build a fresh tree from the configured patient and enumber sources.
# NOTE(review): these are the same PATIENTS_PATH / ENUMBER_PATH used for the
# UKE cell; only the dataset directory below differs. Presumably the external
# cohort is isolated by the feature filter - confirm.
external_data = data_tree.create_tree(datasources_config.PATIENTS_PATH,
                             datasources_config.ENUMBER_PATH)
# The return value is not used, so this presumably prunes `external_data` in
# place, keeping only slides with MTDP features under EXTERNAL_DATASET_DIR -
# TODO confirm.
data_tree.drop_slides_without_extracted_features(external_data, FeatureType.MTDP, datasources_config.EXTERNAL_DATASET_DIR)

# Print patient, enumber and slide statistics for the filtered tree.
print_info(external_data)

##### Patient info #####
{<Origin.PANCREAS: 0>: 4, <Origin.SI: 1>: 5, <Origin.OTHER: 2>: 2, <Origin.CUP: 3>: 0}
{'m': 3, 'f': 8}
15106 days 17:27:16.363636480 3896 days 20:53:23.710549056

##### Enumber info #####
Amount of enumbers: 11
Amount of biopsy enumbers: 10
Amount of resection enumbers: 1

##### Slide info #####
Amount of slides: 19
Amount of biopsy slides: 18
Amount of resection slides: 1

