In [1]:
import h5py
import numpy as np
import pandas as pd
from collections import defaultdict
import phate
import scanpy as sc
import plotly.express as px

from anndata import AnnData

In [2]:
DATA_F = '../data/GSE103224.h5'
BIOMARKER_F = '../data/glioma_survival_associated_genes_Fatai.csv'

In [3]:
with h5py.File(DATA_F, 'r') as f:
    CELLS = [
        str(x)[2:-1]
        for x in f['cell'][:]
    ]
    TUMORS = [
        str(x)[2:-1]
        for x in f['tumor'][:]
    ]
    GENE_IDS = [
        str(x)[2:-1]
        for x in f['gene_id'][:]
    ]
    GENE_NAMES = [
        str(x)[2:-1]
        for x in f['gene_name'][:]
    ]
    
# Map each cell to its index in the data matrix
CELL_TO_INDEX = {
    cell: index
    for index, cell in enumerate(CELLS)
}

# Map each tumor to its indices in the data matrix
TUMOR_TO_INDICES = defaultdict(lambda: [])
for index, tumor in enumerate(TUMORS):
    TUMOR_TO_INDICES[tumor].append(index)
TUMOR_TO_INDICES = dict(TUMOR_TO_INDICES)

In [4]:
def counts_matrix_for_tumor(tumor):
    """
    Retrieve the counts matrix for a given tumor.

    Args:
        tumor: the tumor ID
    Returns:
        counts: matrix of counts for tumor
        cells: cell names correponding to the
            rows of counts
    """
    indices = TUMOR_TO_INDICES[tumor]
    with h5py.File(DATA_F, 'r') as f:
        counts = f['count'][indices]
    cells = list(np.array(CELLS)[indices])
    return counts, cells

# Subset to biomarker expressions

In [83]:
biomarkers = pd.read_table(BIOMARKER_F, )
genes = pd.Index(GENE_NAMES) & biomarkers["Gene"]
genes

Index(['TIMP1', 'CASP3', 'PLA2G5', 'CDC6', 'TIMP4', 'INHBB', 'HOMER1',
       'PDLIM3', 'RPRM', 'DKK1', 'DYNLT3', 'NET1', 'NET1', 'CCNA1', 'PDGFRA',
       'NSUN5', 'NSUN5', 'PYGB', 'SLC43A3', 'TAGLN2', 'CDK2', 'ID4', 'ATRX',
       'SH2B2', 'SFRP4', 'FAM3C', 'ACOX2', 'THBS2', 'ZNF609', 'SFRP1',
       'IGFBP3', 'MGMT', 'FZD3', 'CCNE1', 'LGALS8', 'LGALS8', 'TP53', 'THBS3',
       'IDH1', 'MDK', 'DCTD', 'PRKAR2B', 'KLF13', 'POLE3', 'IGFBP2', 'IGFBP2',
       'CCND1', 'CDKN2A', 'PDPN', 'ADM', 'FZD7', 'GHR', 'EZH2', 'TFRC',
       'RGS14', 'PIK3CA', 'VEGFA', 'REPS2', 'MT1M', 'CDKN1C', 'STMN1', 'IRS2',
       'HSPA1B', 'LIFR', 'G6PC3', 'GADD45G', 'FBXO17', 'PTPRN', 'WAC',
       'STAG3L4', 'EFEMP2', 'EFEMP2', 'PPARGC1A', 'EGFR', 'MT1E'],
      dtype='object')

In [94]:
gene_indices = [np.where(np.array(GENE_NAMES) == gene)[0][0] for gene in genes.unique()]

# Geting count matrix, normalizing, and calculate PHATE on biomarker expression

In [96]:
tumor_dfs = {}
TUMORS = np.unique(TUMORS)
for tumor in TUMORS:
    print(tumor)
    counts, cells = counts_matrix_for_tumor(tumor)
    ad = AnnData(
            X=counts, 
            obs=pd.DataFrame(data=cells, columns=['cell']),
            var=pd.DataFrame(
                index=GENE_NAMES, 
                data=GENE_NAMES, 
                columns=['gene_name']
            )
        )
    sc.pp.normalize_total(ad, target_sum=1e6)
    sc.pp.log1p(ad)
    tumor_dfs[tumor] = ad
    
    # Calculate PHATE on biomarker expressions
    phate_operator = phate.PHATE(n_jobs=-2, random_state=1)
    X_phate = phate_operator.fit_transform(tumor_dfs[tumor].X[:, gene_indices])
    tumor_dfs[tumor].obs = pd.DataFrame(
        data=[[x,y,cell] for (x,y),cell in zip(X_phate, cells)], columns=['PHATE 1', 'PHATE 2', 'cell'])

PJ016


Transforming to str index.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


Calculating PHATE...
  Running PHATE on 3085 cells and 70 genes.
  Calculating graph and diffusion operator...
    Calculating KNN search...
    Calculated KNN search in 0.34 seconds.
    Calculating affinities...
    Calculated affinities in 0.06 seconds.
  Calculated graph and diffusion operator in 0.41 seconds.
  Calculating landmark operator...
    Calculating SVD...
    Calculated SVD in 0.15 seconds.
    Calculating KMeans...
    Calculated KMeans in 15.77 seconds.
  Calculated landmark operator in 17.11 seconds.
  Calculating optimal t...
  Calculated optimal t in 1.98 seconds.
  Calculating diffusion potential...
  Calculated diffusion potential in 0.58 seconds.
  Calculating metric MDS...
  Calculated metric MDS in 10.64 seconds.
Calculated PHATE in 30.72 seconds.


AnnData expects string indices for some functionality, but your first two indices are: RangeIndex(start=0, stop=2, step=1). 


PJ017


Transforming to str index.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


Calculating PHATE...
  Running PHATE on 1261 cells and 70 genes.
  Calculating graph and diffusion operator...
    Calculating KNN search...
    Calculated KNN search in 0.12 seconds.
    Calculating affinities...
    Calculated affinities in 0.03 seconds.
  Calculated graph and diffusion operator in 0.16 seconds.
  Calculating optimal t...



Detected zero distance between 420 pairs of samples. Consider removing duplicates to avoid errors in downstream processing.


overflow encountered in power



  Calculated optimal t in 0.52 seconds.
  Calculating diffusion potential...
  Calculated diffusion potential in 0.20 seconds.
  Calculating metric MDS...
  Calculated metric MDS in 3.17 seconds.
Calculated PHATE in 4.06 seconds.


AnnData expects string indices for some functionality, but your first two indices are: RangeIndex(start=0, stop=2, step=1). 


PJ018


Transforming to str index.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


Calculating PHATE...
  Running PHATE on 2197 cells and 70 genes.
  Calculating graph and diffusion operator...
    Calculating KNN search...
    Calculated KNN search in 0.22 seconds.
    Calculating affinities...
    Calculated affinities in 0.05 seconds.
  Calculated graph and diffusion operator in 0.28 seconds.
  Calculating landmark operator...
    Calculating SVD...
    Calculated SVD in 0.11 seconds.
    Calculating KMeans...



Detected zero distance between samples 1539 and 2042, 1539 and 2127, 1539 and 2183, 2042 and 2127, 2042 and 2183, 2127 and 2183. Consider removing duplicates to avoid errors in downstream processing.



    Calculated KMeans in 13.76 seconds.
  Calculated landmark operator in 15.00 seconds.
  Calculating optimal t...
  Calculated optimal t in 1.95 seconds.
  Calculating diffusion potential...
  Calculated diffusion potential in 0.47 seconds.
  Calculating metric MDS...
  Calculated metric MDS in 10.61 seconds.
Calculated PHATE in 28.31 seconds.


AnnData expects string indices for some functionality, but your first two indices are: RangeIndex(start=0, stop=2, step=1). 


PJ025


Transforming to str index.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


Calculating PHATE...
  Running PHATE on 5924 cells and 70 genes.
  Calculating graph and diffusion operator...
    Calculating KNN search...
    Calculated KNN search in 0.76 seconds.
    Calculating affinities...
    Calculated affinities in 0.13 seconds.
  Calculated graph and diffusion operator in 0.90 seconds.
  Calculating landmark operator...
    Calculating SVD...



Detected zero distance between 704 pairs of samples. Consider removing duplicates to avoid errors in downstream processing.


overflow encountered in power



    Calculated SVD in 0.33 seconds.
    Calculating KMeans...
    Calculated KMeans in 14.80 seconds.
  Calculated landmark operator in 16.39 seconds.
  Calculating optimal t...
  Calculated optimal t in 2.05 seconds.
  Calculating diffusion potential...
  Calculated diffusion potential in 0.61 seconds.
  Calculating metric MDS...
  Calculated metric MDS in 10.65 seconds.
Calculated PHATE in 30.61 seconds.


AnnData expects string indices for some functionality, but your first two indices are: RangeIndex(start=0, stop=2, step=1). 


PJ030


Transforming to str index.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


Calculating PHATE...
  Running PHATE on 3097 cells and 70 genes.
  Calculating graph and diffusion operator...
    Calculating KNN search...
    Calculated KNN search in 0.33 seconds.
    Calculating affinities...
    Calculated affinities in 0.07 seconds.
  Calculated graph and diffusion operator in 0.41 seconds.
  Calculating landmark operator...
    Calculating SVD...



Detected zero distance between 2072 pairs of samples. Consider removing duplicates to avoid errors in downstream processing.


overflow encountered in power



    Calculated SVD in 0.17 seconds.
    Calculating KMeans...
    Calculated KMeans in 13.77 seconds.
  Calculated landmark operator in 15.14 seconds.
  Calculating optimal t...
  Calculated optimal t in 2.00 seconds.
  Calculating diffusion potential...
  Calculated diffusion potential in 0.39 seconds.
  Calculating metric MDS...
  Calculated metric MDS in 10.39 seconds.
Calculated PHATE in 28.34 seconds.


AnnData expects string indices for some functionality, but your first two indices are: RangeIndex(start=0, stop=2, step=1). 


PJ032


Transforming to str index.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


Calculating PHATE...
  Running PHATE on 1377 cells and 70 genes.
  Calculating graph and diffusion operator...
    Calculating KNN search...
    Calculated KNN search in 0.12 seconds.
    Calculating affinities...
    Calculated affinities in 0.04 seconds.
  Calculated graph and diffusion operator in 0.16 seconds.
  Calculating optimal t...



Detected zero distance between 1894 pairs of samples. Consider removing duplicates to avoid errors in downstream processing.


overflow encountered in power



  Calculated optimal t in 0.66 seconds.
  Calculating diffusion potential...
  Calculated diffusion potential in 0.19 seconds.
  Calculating metric MDS...
  Calculated metric MDS in 4.02 seconds.
Calculated PHATE in 5.04 seconds.


AnnData expects string indices for some functionality, but your first two indices are: RangeIndex(start=0, stop=2, step=1). 


PJ035


Transforming to str index.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


Calculating PHATE...
  Running PHATE on 3768 cells and 70 genes.
  Calculating graph and diffusion operator...
    Calculating KNN search...
    Calculated KNN search in 0.34 seconds.
    Calculating affinities...
    Calculated affinities in 0.09 seconds.
  Calculated graph and diffusion operator in 0.44 seconds.
  Calculating landmark operator...
    Calculating SVD...



Detected zero distance between 4306 pairs of samples. Consider removing duplicates to avoid errors in downstream processing.


overflow encountered in power



    Calculated SVD in 0.22 seconds.
    Calculating KMeans...
    Calculated KMeans in 11.09 seconds.
  Calculated landmark operator in 12.89 seconds.
  Calculating optimal t...
  Calculated optimal t in 2.21 seconds.
  Calculating diffusion potential...
  Calculated diffusion potential in 0.47 seconds.
  Calculating metric MDS...
  Calculated metric MDS in 10.24 seconds.
Calculated PHATE in 26.26 seconds.


AnnData expects string indices for some functionality, but your first two indices are: RangeIndex(start=0, stop=2, step=1). 


PJ048


Transforming to str index.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


Calculating PHATE...
  Running PHATE on 3084 cells and 70 genes.
  Calculating graph and diffusion operator...
    Calculating KNN search...
    Calculated KNN search in 0.24 seconds.
    Calculating affinities...
    Calculated affinities in 0.07 seconds.
  Calculated graph and diffusion operator in 0.32 seconds.
  Calculating landmark operator...
    Calculating SVD...



Detected zero distance between 998 pairs of samples. Consider removing duplicates to avoid errors in downstream processing.


overflow encountered in power



    Calculated SVD in 0.18 seconds.
    Calculating KMeans...
    Calculated KMeans in 12.80 seconds.
  Calculated landmark operator in 14.13 seconds.
  Calculating optimal t...
  Calculated optimal t in 2.04 seconds.
  Calculating diffusion potential...
  Calculated diffusion potential in 0.47 seconds.
  Calculating metric MDS...
  Calculated metric MDS in 10.30 seconds.
Calculated PHATE in 27.26 seconds.


AnnData expects string indices for some functionality, but your first two indices are: RangeIndex(start=0, stop=2, step=1). 


In [112]:
all_tumor_df = pd.concat([pd.DataFrame(tumor_dfs[tumor].X, 
                                       index=tumor_dfs[tumor].obs["cell"],
                                       columns=GENE_NAMES) for tumor in TUMORS])

In [None]:
# Calculate PHATE on biomarker expressions
phate_operator = phate.PHATE(n_jobs=-2, random_state=1)
X_phate = phate_operator.fit_transform(all_tumor_df)

Calculating PHATE...
  Running PHATE on 23793 cells and 60725 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...


In [None]:
for tumor in tumor_dfs:
    print(tumor)
    fig = px.scatter(x=tumor_dfs[tumor].obs["PHATE 1"], 
                     y=tumor_dfs[tumor].obs["PHATE 2"], 
                     labels=[tumor_dfs[tumor].obs["cell"]])
    fig.show()