In [None]:
import h5py
import numpy as np
import pandas as pd
from collections import defaultdict
import phate
import scanpy as sc
import plotly.express as px

from anndata import AnnData

In [None]:
# Input locations, relative to the notebook's working directory.
DATA_DIR = '../data'
# HDF5 file opened with h5py in the cells below.
DATA_F = f'{DATA_DIR}/GSE103224.h5'
# CSV path; not read in the cells shown here.
BIOMARKER_F = f'{DATA_DIR}/glioma_survival_associated_genes_Fatai.csv'

In [None]:
def _as_text(value):
    """Return one HDF5 string entry as a Python ``str``.

    ``bytes`` values (NumPy byte strings included) are decoded as UTF-8;
    anything else is passed through ``str()``. This replaces slicing the
    ``repr()`` of a bytes object, which leaves escape sequences in the text
    for non-ASCII bytes and truncates values that are already ``str``.

    Raises:
        UnicodeDecodeError: if a ``bytes`` value is not valid UTF-8.
    """
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return str(value)


# Read the label datasets once; the file is closed as soon as the block exits.
with h5py.File(DATA_F, 'r') as f:
    CELLS = [_as_text(x) for x in f['cell'][:]]
    TUMORS = [_as_text(x) for x in f['tumor'][:]]
    GENE_IDS = [_as_text(x) for x in f['gene_id'][:]]
    GENE_NAMES = [_as_text(x) for x in f['gene_name'][:]]

# Map each cell to its index in the data matrix
# (if a cell name repeats, the last occurrence wins).
CELL_TO_INDEX = {
    cell: index
    for index, cell in enumerate(CELLS)
}

# Map each tumor to its indices in the data matrix.
# Indices are appended in enumeration order, so each list is increasing.
TUMOR_TO_INDICES = defaultdict(list)
for index, tumor in enumerate(TUMORS):
    TUMOR_TO_INDICES[tumor].append(index)
# Freeze into a plain dict so a lookup of an unknown tumor raises KeyError
# instead of silently creating an empty entry.
TUMOR_TO_INDICES = dict(TUMOR_TO_INDICES)

def counts_matrix_for_tumor(tumor):
    """Load the rows of the ``count`` dataset that belong to one tumor.

    Args:
        tumor: Tumor label; must be a key of ``TUMOR_TO_INDICES``.

    Returns:
        A ``(counts, cells)`` tuple: ``counts`` is the result of indexing
        ``f['count']`` with the tumor's row indices, and ``cells`` is the list
        of cell names at those same indices, in the same order.

    Raises:
        KeyError: if ``tumor`` is not a key of ``TUMOR_TO_INDICES``.
    """
    indices = TUMOR_TO_INDICES[tumor]
    # Reopen the file per call so no handle is kept alive between calls.
    with h5py.File(DATA_F, 'r') as f:
        # NOTE(review): h5py list indexing expects increasing indices; the
        # lists in TUMOR_TO_INDICES are built in enumeration order.
        counts = f['count'][indices]
    # Index the list directly rather than converting all of CELLS to a NumPy
    # array on every call just to pick out a subset.
    cells = [CELLS[i] for i in indices]
    return counts, cells

In [None]:
# Build one normalized, log-transformed AnnData object per tumor.
tumor_dfs = {}
# From here on TUMORS holds the sorted distinct tumor labels rather than
# one label per cell.
TUMORS = np.unique(TUMORS)
for tumor in TUMORS:
    print(tumor)
    counts, cells = counts_matrix_for_tumor(tumor)

    # Observation table: one row per cell, with the cell name in a column.
    cell_table = pd.DataFrame(data=cells, columns=['cell'])
    # Variable table: indexed by gene name, which is also stored as a column.
    gene_table = pd.DataFrame(
        data=GENE_NAMES,
        index=GENE_NAMES,
        columns=['gene_name'],
    )
    ad = AnnData(X=counts, obs=cell_table, var=gene_table)

    # Both calls are used for their in-place effect on `ad`; return values
    # are discarded. Totals are scaled to 1e6 per cell before log1p.
    sc.pp.normalize_total(ad, target_sum=1e6)
    sc.pp.log1p(ad)

    tumor_dfs[tumor] = ad

In [None]:
def _tumor_expression_frame(tumor):
    """Wrap one tumor's matrix in a DataFrame: cells as rows, genes as columns."""
    tumor_ad = tumor_dfs[tumor]
    return pd.DataFrame(
        tumor_ad.X,
        index=tumor_ad.obs["cell"],
        columns=GENE_NAMES,
    )


# Stack the per-tumor frames row-wise, following the order of TUMORS.
all_tumor_df = pd.concat(
    [_tumor_expression_frame(tumor) for tumor in TUMORS]
)