In [1]:
import pandas as pd
import random
from definitions import get_node_color, get_node_id
import sys
import numpy as np
sys.path.append("./..")
from cell_graph import Cell_Graph
from soft_wl_subtree import Soft_WL_Subtree

### Extract Discovery set from Danenberg_et_al
* 1) data filtering: exclude patients with no clinical data and images with less than 500 cells
* 2) Random partition: for reproducibility purpose, we use a random seed.

In [2]:
# read single cell data and clinical data
cells = pd.read_csv("Datasets/Danenberg_et_al/cells.csv")
clinical = pd.read_csv("Datasets/Danenberg_et_al/clinical.csv")
print("Initially,")
print(
    "{} patients ({} images) with cell data, {} patients with clinical data, ".format(
        len(cells["metabric_id"].unique()),
        len(cells["ImageNumber"].unique()),
        len(clinical["metabric_id"].unique()),
    )
)
# remove images without invasive tumor
print("\nRemove images without invasive tumor,")
cells = cells.loc[cells.isTumour == 1]
print(
    "{} patients ({} images) with cell data, {} patients with clinical data, ".format(
        len(cells["metabric_id"].unique()),
        len(cells["ImageNumber"].unique()),
        len(clinical["metabric_id"].unique()),
    )
)
# remove patients with no clinical data
print("\nRemove patients with no clinical data,")
cells = cells.loc[cells["metabric_id"].isin(clinical["metabric_id"])]
print(
    "{} patients ({} images) with cell data and clinical data, ".format(
        len(cells["metabric_id"].unique()),
        len(cells["ImageNumber"].unique()),
    )
)
# remove images with less than 500 cells
print("\nRemove images with less than 500 cells")
cells_per_image = cells.groupby("ImageNumber").size()
cells = cells.loc[
    cells["ImageNumber"].isin(cells_per_image[cells_per_image > 500].index)
]
clinical = clinical.loc[clinical["metabric_id"].isin(cells["metabric_id"].unique())]
print(
    "{} patients ({} images) with more than 500 cells and clinical data, ".format(
        len(cells["metabric_id"].unique()),
        len(cells["ImageNumber"].unique()),
    )
)

random.seed(0)
Subset_id = [1] * (len(clinical) - 200) + [2] * 200
random.shuffle(Subset_id)
clinical['Subset_id'] = Subset_id
cells_discovery = cells.loc[cells["metabric_id"].isin(clinical.loc[clinical['Subset_id'] == 1, "metabric_id"])]
cells_validation = cells.loc[cells["metabric_id"].isin(clinical.loc[clinical['Subset_id'] == 2, "metabric_id"])]
print("\nAfter splitting into discovery and validation sets,")
print(
    "{} patients ({} images) with more than 500 cells and clinical data in the discovery set, ".format(
        len(cells_discovery["metabric_id"].unique()),
        len(cells_discovery["ImageNumber"].unique()),
    )
)

Initially,
718 patients (794 images) with cell data, 2604 patients with clinical data, 

Remove images without invasive tumor,
693 patients (749 images) with cell data, 2604 patients with clinical data, 

Remove patients with no clinical data,
683 patients (737 images) with cell data and clinical data, 

Remove images with less than 500 cells
579 patients (621 images) with more than 500 cells and clinical data, 

After splitting into discovery and validation sets,
379 patients (404 images) with more than 500 cells and clinical data in the discovery set, 


### Standardize data, and Generate cellular graphs

In [3]:
# Assign cell type Id based on meta description column
cells['cellTypeID'] = cells['meta_description'].map(get_node_id('Danenberg', 'CellType'))
# standardize column names
patientID_colname = "metabric_id"
imageID_colname = "ImageNumber"
celltypeID_colname = "cellTypeID"
coorX_colname = "Location_Center_X"
coorY_colname = "Location_Center_Y"
cells = cells.rename(
            columns={
                patientID_colname: "patientID",
                imageID_colname: "imageID",
                celltypeID_colname: "celltypeID",
                coorX_colname: "coorX",
                coorY_colname: "coorY",
            }
        )

In [4]:
cell_graph_ = Cell_Graph(a = 0.01)
Cell_graphs = cell_graph_.generate(cells)
print("There are {} patients/cell graphs".format(len(Cell_graphs)))


cell_graph = Cell_graphs[0]
print("The first cell graph is a tuple with 3 elements: (patient_id, graph, cell_types)")
print("\tThe first element is the patient id: {}".format(cell_graph[0]))
print("\tThe second element is the adjacnecy matrix, with the shape of {}".format(cell_graph[1].shape))
print("\tThe third element is the cell types, with the shape of {}".format(cell_graph[2].shape))
print(
        "There are {} cells with {} unique cell types".format(
            cell_graph[1].shape[0], np.unique(cell_graph[2]).shape[0]
        )
    )


There are 579 patients/cell graphs
The first cell graph is a tuple with 3 elements: (patient_id, graph, cell_types)
	The first element is the patient id: MB-0282
	The second element is the adjacnecy matrix, with the shape of (1624, 1624)
	The third element is the cell types, with the shape of (1624, 32)
There are 1624 cells with 2 unique cell types


### Identify TME patterns with Soft WL subtree kernel

In [5]:
! export OMP_NUM_THREADS=1 # to avoid the insufficient memory error
soft_wl_subtree_ = Soft_WL_Subtree(
            n_iter=2, k=100
        )
Cell_graphs_prime, Signatures = soft_wl_subtree_.discover_patterns(Cell_graphs)
print("There are {} discovered patterns".format(len(Signatures)))
cell_graph_prime = Cell_graphs_prime[0]
print("The first Cell_graphs_prime element (and all others) is a tuple: (patient_id, adj, patterns)")
print("\tThe first element is the patient id: {}".format(cell_graph_prime[0]))
print("\tThe second element is the adjacnecy matrix, with the shape of {}".format(cell_graph_prime[1].shape))
print("\tThe third element is the patterns, with the shape of {}".format(cell_graph_prime[2].shape))
print(
        "There are {} cells with {} unique patterns".format(
            cell_graph_prime[1].shape[0], np.unique(cell_graph_prime[2]).shape[0]
        )
    )



/cis/home/zwang/.bashrc: line 40: /cis/home/zwang/.local/bin/virtualenvwrapper.sh: No such file or directory
Initialize SoftWL: n_iter=2, n_jobs=-1, k=100, normalize=True
Discovering TME patterns from 579 graphs, median number of nodes is 1475.0, node feature matrix dimension is (1624, 32)
	 1) Graph Convolution
	 2) Clustering Subtrees
Finding 100 nearest neighbors using minkowski metric and 'auto' algorithm
