In [7]:
import pandas as pd
import random

### Extract Discovery set from Danenberg_et_al
* 1) Data filtering: exclude images without invasive tumor, patients with no clinical data, and images with 500 or fewer cells.
* 2) Random partition: the remaining patients are randomly split into a discovery set and a 200-patient validation set; a fixed random seed is used for reproducibility.

In [8]:
# Load the per-cell table and the per-patient clinical table, then filter the
# cohort step by step, printing patient/image counts after every step.

# Images are kept only if they contain strictly more than this many cells.
CELL_COUNT_CUTOFF = 500


def _patient_image_counts(cells_df):
    """Return (distinct patients, distinct images) present in ``cells_df``."""
    return (
        len(cells_df["metabric_id"].unique()),
        len(cells_df["ImageNumber"].unique()),
    )


# read single cell data and clinical data
cells = pd.read_csv("Datasets/Danenberg_et_al/cells.csv")
clinical = pd.read_csv("Datasets/Danenberg_et_al/clinical.csv")
print("Initially,")
print(
    "{} patients ({} images) with cell data, {} patients with clinical data, ".format(
        *_patient_image_counts(cells),
        len(clinical["metabric_id"].unique()),
    )
)

# remove images without invasive tumor: only rows with isTumour == 1 are kept
# NOTE(review): presumably isTumour is constant within an image -- confirm
print("\nRemove images without invasive tumor,")
cells = cells.loc[cells["isTumour"] == 1]
print(
    "{} patients ({} images) with cell data, {} patients with clinical data, ".format(
        *_patient_image_counts(cells),
        len(clinical["metabric_id"].unique()),
    )
)

# remove patients with no clinical data
print("\nRemove patients with no clinical data,")
cells = cells.loc[cells["metabric_id"].isin(clinical["metabric_id"])]
print(
    "{} patients ({} images) with cell data and clinical data, ".format(
        *_patient_image_counts(cells)
    )
)

# remove small images
# NOTE: the message says "less than", but the comparison below is strict (>),
# so images with exactly CELL_COUNT_CUTOFF cells are removed as well.
print("\nRemove images with less than {} cells".format(CELL_COUNT_CUTOFF))
cells_per_image = cells.groupby("ImageNumber").size()
kept_images = cells_per_image[cells_per_image > CELL_COUNT_CUTOFF].index
cells = cells.loc[cells["ImageNumber"].isin(kept_images)]
# Restrict the clinical table to the surviving patients. The explicit .copy()
# makes it an independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning.
clinical = clinical.loc[
    clinical["metabric_id"].isin(cells["metabric_id"].unique())
].copy()
print(
    "{} patients ({} images) with more than {} cells and clinical data, ".format(
        *_patient_image_counts(cells), CELL_COUNT_CUTOFF
    )
)

# Randomly partition the clinical rows into a discovery subset (Subset_id == 1)
# and a fixed-size validation subset (Subset_id == 2), then split the cell
# table accordingly.
# NOTE(review): labels are assigned per clinical row, so this assumes one row
# per metabric_id -- confirm.
N_VALIDATION = 200  # number of clinical rows held out for validation
SPLIT_SEED = 0  # fixed seed so the partition is reproducible

n_clinical_rows = len(clinical)
if n_clinical_rows < N_VALIDATION:
    # Without this guard the label list would silently be too long and pandas
    # would fail with a less informative length-mismatch ValueError.
    raise ValueError(
        "Cannot hold out {} validation patients: only {} clinical rows available".format(
            N_VALIDATION, n_clinical_rows
        )
    )

random.seed(SPLIT_SEED)
Subset_id = [1] * (n_clinical_rows - N_VALIDATION) + [2] * N_VALIDATION
random.shuffle(Subset_id)
# assign() returns a new frame with the labels attached positionally, which
# avoids SettingWithCopyWarning when `clinical` is itself a filtered slice.
clinical = clinical.assign(Subset_id=Subset_id)

discovery_ids = clinical.loc[clinical["Subset_id"] == 1, "metabric_id"]
validation_ids = clinical.loc[clinical["Subset_id"] == 2, "metabric_id"]
cells_discovery = cells.loc[cells["metabric_id"].isin(discovery_ids)]
cells_validation = cells.loc[cells["metabric_id"].isin(validation_ids)]

print("\nAfter splitting into discovery and validation sets,")
print(
    "{} patients ({} images) with more than 500 cells and clinical data in the discovery set, ".format(
        len(cells_discovery["metabric_id"].unique()),
        len(cells_discovery["ImageNumber"].unique()),
    )
)

Initially
718 patients (794 images) with cell data, 2604 patients with clinical data, 

Remove images without invasive tumor
693 patients (749 images) with cell data, 2604 patients with clinical data, 

Remove patients with no clinical data
683 patients (737 images) with cell data and clinical data, 

Remove images with less than 500 cells
579 patients (621 images) with more than 500 cells and clinical data, 

After splitting into discovery and validation sets
379 patients (404 images) with more than 500 cells and clinical data in the discovery set, 


### Standardize data, and Generate cellular graphs

### Identify TME patterns with Soft WL subtree kernel