In [2]:
#create jupyter section header for navigation



In [2]:
import torch


# import

In [3]:
#import
import json
import os
from pathlib import Path
from typing import Optional, Union
from sklearn.metrics import mutual_info_score

import numpy as np
import scanpy as sc
import pandas as pd
import torch
import short_utils

from anndata import AnnData
from torch.utils.data import DataLoader, SequentialSampler
from tqdm import tqdm


from scgpt import logger
from scgpt.data_collator import DataCollator
from scgpt.model import TransformerModel
from scgpt.tokenizer import GeneVocab
from scgpt.utils import load_pretrained


PathLike = Union[str, os.PathLike]

# batch embed func

In [3]:

def get_batch_cell_embeddings(
    adata,
    cell_embedding_mode: str = "cls",
    model=None,
    vocab=None,
    max_length=1200,
    batch_size=64,
    model_configs=None,
    gene_ids=None,
    use_batch_labels=False,
) -> np.ndarray:
    """
    Get the cell embeddings for a batch of cells.

    Args:
        adata (AnnData): The AnnData object.
        cell_embedding_mode (str): The mode to get the cell embeddings. Defaults to "cls".
        model (TransformerModel, optional): The model. Defaults to None.
        vocab (GeneVocab, optional): The vocabulary. Defaults to None.
        max_length (int): The maximum length of the input sequence. Defaults to 1200.
        batch_size (int): The batch size for inference. Defaults to 64.
        model_configs (dict, optional): The model configurations. Defaults to None.
        gene_ids (np.ndarray, optional): The gene vocabulary ids. Defaults to None.
        use_batch_labels (bool): Whether to use batch labels. Defaults to False.

    Returns:
        np.ndarray: The cell embeddings.
    """

    count_matrix = adata.X
    count_matrix = (
        count_matrix if isinstance(count_matrix, np.ndarray) else count_matrix.A
    )
    print("loaded count matrix")

    # gene vocabulary ids
    if gene_ids is None:
        gene_ids = np.array(adata.var["id_in_vocab"])
        assert np.all(gene_ids >= 0)

    if use_batch_labels:
        batch_ids = np.array(adata.obs["batch_id"].tolist())

    class Dataset(torch.utils.data.Dataset):
        def __init__(self, count_matrix, gene_ids, batch_ids=None):
            self.count_matrix = count_matrix
            self.gene_ids = gene_ids
            self.batch_ids = batch_ids

        def __len__(self):
            return len(self.count_matrix)

        def __getitem__(self, idx):
            row = self.count_matrix[idx]
            nonzero_idx = np.nonzero(row)[0]
            values = row[nonzero_idx]
            genes = self.gene_ids[nonzero_idx]
            # append <cls> token at the beginning
            genes = np.insert(genes, 0, vocab["<cls>"])
            values = np.insert(values, 0, model_configs["pad_value"])
            genes = torch.from_numpy(genes).long()
            values = torch.from_numpy(values)
            output = {
                "id": idx,
                "genes": genes,
                "expressions": values,
            }
            if self.batch_ids is not None:
                output["batch_labels"] = self.batch_ids[idx]
            return output

    if cell_embedding_mode == "cls":
        dataset = Dataset(
            count_matrix, gene_ids, batch_ids if use_batch_labels else None
        )
        print("created dataset")
        collator = DataCollator(
            do_padding=True,
            pad_token_id=vocab[model_configs["pad_token"]],
            pad_value=model_configs["pad_value"],
            do_mlm=False,
            do_binning=True,
            max_length=max_length,
            sampling=True,
            keep_first_n_tokens=1,
        )
        print("created collator")

        data_loader = DataLoader(
            dataset,
            batch_size=batch_size,
            sampler=SequentialSampler(dataset),
            collate_fn=collator,
            drop_last=False,
            num_workers=min(len(os.sched_getaffinity(0)), batch_size),
            pin_memory=True,
        )
        print("created data loader")

        device = next(model.parameters()).device
        print("created device")
        cell_embeddings = np.zeros(
            (len(dataset), model_configs["embsize"]), dtype=np.float32
        )
        print("created intial cell embeddings")
        with torch.no_grad(), torch.cuda.amp.autocast(enabled=True):
            count = 0
            for data_dict in tqdm(data_loader, desc="Embedding cells"):
                input_gene_ids = data_dict["gene"].to(device)
                src_key_padding_mask = input_gene_ids.eq(
                    vocab[model_configs["pad_token"]]
                )
                print(" input gene ids to device")
                embeddings = model._encode(
                    input_gene_ids,
                    data_dict["expr"].to(device),
                    src_key_padding_mask=src_key_padding_mask,
                    batch_labels=data_dict["batch_labels"].to(device)
                    if use_batch_labels
                    else None,
                )
                print("encoded embeddings")

                embeddings = embeddings[:, 0, :]  # get the <cls> position embedding
                embeddings = embeddings.cpu().numpy()
                cell_embeddings[count : count + len(embeddings)] = embeddings
                count += len(embeddings)
        cell_embeddings = cell_embeddings / np.linalg.norm(
            cell_embeddings, axis=1, keepdims=True
        )
    else:
        raise ValueError(f"Unknown cell embedding mode: {cell_embedding_mode}")
    return cell_embeddings



# embedd all data func

In [5]:

def embed_data(
    adata_or_file: Union[AnnData, PathLike],
    model_dir: PathLike,
    cell_type_key: str = "cell_type",
    gene_col: str = "feature_name",
    max_length=1200,
    batch_size=64,
    obs_to_save: Optional[list] = None,
    device: Union[str, torch.device] = "cuda",
    use_fast_transformer: bool = True,
    return_new_adata: bool = False,
) -> AnnData:
    """
    Preprocess anndata and embed the data using the model.

    Args:
        adata_or_file (Union[AnnData, PathLike]): The AnnData object or the path to the
            AnnData object.
        model_dir (PathLike): The path to the model directory.
        cell_type_key (str): The key in adata.obs that contains the cell type labels.
            Defaults to "cell_type".
        gene_col (str): The column in adata.var that contains the gene names.
        max_length (int): The maximum length of the input sequence. Defaults to 1200.
        batch_size (int): The batch size for inference. Defaults to 64.
        obs_to_save (Optional[list]): The list of obs columns to save in the output adata.
            If None, will only keep the column of :attr:`cell_type_key`. Defaults to None.
        device (Union[str, torch.device]): The device to use. Defaults to "cuda".
        use_fast_transformer (bool): Whether to use flash-attn. Defaults to True.
        return_new_adata (bool): Whether to return a new AnnData object. If False, will
            add the cell embeddings to a new :attr:`adata.obsm` with key "X_scGPT".

    Returns:
        AnnData: The AnnData object with the cell embeddings.
    """
    if isinstance(adata_or_file, AnnData):
        adata = adata_or_file
    else:
        adata = sc.read_h5ad(adata_or_file)

    # verify cell type key and gene col
    assert cell_type_key in adata.obs
    if gene_col == "index":
        adata.var["index"] = adata.var.index
    else:
        assert gene_col in adata.var

    if device == "cuda":
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if not torch.cuda.is_available():
            print("WARNING: CUDA is not available. Using CPU instead.")

    # LOAD MODEL
    model_dir = Path(model_dir)
    vocab_file = model_dir / "vocab.json"
    model_config_file = model_dir / "args.json"
    model_file = model_dir / "best_model.pt"
    pad_token = "<pad>"
    special_tokens = [pad_token, "<cls>", "<eoc>"]

    # vocabulary
    vocab = GeneVocab.from_file(vocab_file)
    for s in special_tokens:
        if s not in vocab:
            vocab.append_token(s)
    adata.var["id_in_vocab"] = [
        vocab[gene] if gene in vocab else -1 for gene in adata.var[gene_col]
    ]
    gene_ids_in_vocab = np.array(adata.var["id_in_vocab"])
    logger.info(
        f"match {np.sum(gene_ids_in_vocab >= 0)}/{len(gene_ids_in_vocab)} genes "
        f"in vocabulary of size {len(vocab)}."
    )
    adata = adata[:, adata.var["id_in_vocab"] >= 0]

    with open(model_config_file, "r") as f:
        model_configs = json.load(f)

    # Binning will be applied after tokenization. A possible way to do is to use the unified way of binning in the data collator.

    vocab.set_default_index(vocab["<pad>"])
    genes = adata.var[gene_col].tolist()
    gene_ids = np.array(vocab(genes), dtype=int)

    # all_counts = adata.layers["counts"]
    # num_of_non_zero_genes = [
    #     np.count_nonzero(all_counts[i]) for i in range(all_counts.shape[0])
    # ]
    # max_length = min(max_length, np.max(num_of_non_zero_genes) + 1)

    model = TransformerModel(
        ntoken=len(vocab),
        d_model=model_configs["embsize"],
        nhead=model_configs["nheads"],
        d_hid=model_configs["d_hid"],
        nlayers=model_configs["nlayers"],
        nlayers_cls=model_configs["n_layers_cls"],
        n_cls=1,
        vocab=vocab,
        dropout=model_configs["dropout"],
        pad_token=model_configs["pad_token"],
        pad_value=model_configs["pad_value"],
        do_mvc=True,
        do_dab=False,
        use_batch_labels=False,
        domain_spec_batchnorm=False,
        explicit_zero_prob=False,
        use_fast_transformer=use_fast_transformer,
        fast_transformer_backend="flash",
        pre_norm=False,
    )
    load_pretrained(model, torch.load(model_file), verbose=False)
    model.to(device)
    model.eval()
    print("loaded model")

    # get cell embeddings
    cell_embeddings = get_batch_cell_embeddings(
        adata,
        cell_embedding_mode="cls",
        model=model,
        vocab=vocab,
        max_length=max_length,
        batch_size=batch_size,
        model_configs=model_configs,
        gene_ids=gene_ids,
        use_batch_labels=False,
    )
    print("got cell embeddings")

    if return_new_adata:
        obs_to_save = [cell_type_key] if obs_to_save is None else obs_to_save
        obs_df = adata.obs[obs_to_save]
        return sc.AnnData(X=cell_embeddings, obs=obs_df, dtype="float32")

    adata.obsm["X_scGPT"] = cell_embeddings
    return adata

# run embedding

In [6]:
#check cuda available
torch.cuda.is_available()

NameError: name 'os' is not defined

In [None]:
#print working directory:
print(os.getcwd())
#print working dir contents:
print(os.listdir(os.getcwd()))


In [None]:
#set working directory:
#os.chdir("/workspace")

In [None]:
base_dir = short_utils.get_base_dir()
base_dir


In [38]:
# load data
full_adata = sc.read_h5ad(base_dir / 'training_data/cell_lines/cell_line_states/hypoxia_acidosis_PMID_19057672/hypoxia_acidosis_PMID_19057672.h5ad')
# load and examine the data at data/brca_scrna_epithelial.h5ad
# obs_names a list of 33K ensembl gene ids.
# var_names is an index of 240K sample names r1...hbac...
# need to conv obs_names to hgnc names


In [None]:
#check size of data
print(full_adata.X.nnz)

In [None]:
all_genes_adata = sc.read_h5ad('data/tcga_brca_erbb2_oncosig_all_genes.h5ad')

In [9]:
#truncate the data to 1000 cells
my_adata = full_adata

In [20]:
#deal with na
var = my_adata.var
var.isna().sum()

gene_hgnc    0
Gene ID      0
dtype: int64

In [19]:
#replace nan with 'unknwo'

col_name = 'gene_hgnc'

# Add 'unknown' to the categories
if 'unknown' not in my_adata.var[col_name].cat.categories:
    my_adata.var[col_name] = my_adata.var[col_name].cat.add_categories('unknown')

# Now you can fill NaN values with 'unknown'
my_adata.var[col_name] = my_adata.var[col_name].fillna('unknown')


In [25]:
# prep args for embed:

#if plot by label, set the cell type arg to the cool with label

embed_args = {'adata_or_file': my_adata,
              'model_dir': Path(base_dir / 'scgpt/models/scGPT_pancancer'),
              'cell_type_key': "cell_type",
                'gene_col': "gene_hgnc",
              'max_length' : 20000,
              'batch_size' : 1,
              'obs_to_save':  None,
              'device':  "cuda",
              'use_fast_transformer': False,
              'return_new_adata':  True,
              }

In [24]:
torch.cuda.empty_cache()

In [26]:
cell_embbed = embed_data(**embed_args)

#clean cell output

scGPT - INFO - match 19094/21057 genes in vocabulary of size 60697.
loaded model
loaded count matrix
created dataset
created collator
created data loader
created device
created intial cell embeddings


Embedding cells:   0%|          | 0/26 [00:00<?, ?it/s]

 input gene ids to device
encoded embeddings


Embedding cells:   4%|▍         | 1/26 [00:02<00:52,  2.08s/it]

 input gene ids to device
encoded embeddings


Embedding cells:   8%|▊         | 2/26 [00:04<00:48,  2.02s/it]

 input gene ids to device
encoded embeddings


Embedding cells:  12%|█▏        | 3/26 [00:06<00:46,  2.00s/it]

 input gene ids to device
encoded embeddings


Embedding cells:  15%|█▌        | 4/26 [00:08<00:43,  1.99s/it]

 input gene ids to device
encoded embeddings


Embedding cells:  19%|█▉        | 5/26 [00:09<00:41,  1.97s/it]

 input gene ids to device
encoded embeddings


Embedding cells:  23%|██▎       | 6/26 [00:11<00:39,  1.97s/it]

 input gene ids to device
encoded embeddings


Embedding cells:  27%|██▋       | 7/26 [00:13<00:37,  1.95s/it]

 input gene ids to device
encoded embeddings


Embedding cells:  31%|███       | 8/26 [00:15<00:35,  1.95s/it]

 input gene ids to device
encoded embeddings


Embedding cells:  35%|███▍      | 9/26 [00:17<00:33,  1.95s/it]

 input gene ids to device
encoded embeddings


Embedding cells:  38%|███▊      | 10/26 [00:19<00:31,  2.00s/it]

 input gene ids to device
encoded embeddings


Embedding cells:  42%|████▏     | 11/26 [00:21<00:29,  1.99s/it]

 input gene ids to device
encoded embeddings


Embedding cells:  46%|████▌     | 12/26 [00:23<00:28,  2.01s/it]

 input gene ids to device
encoded embeddings


Embedding cells:  50%|█████     | 13/26 [00:25<00:25,  1.98s/it]

 input gene ids to device
encoded embeddings


Embedding cells:  54%|█████▍    | 14/26 [00:27<00:23,  1.96s/it]

 input gene ids to device
encoded embeddings


Embedding cells:  58%|█████▊    | 15/26 [00:29<00:21,  1.94s/it]

 input gene ids to device
encoded embeddings


Embedding cells:  62%|██████▏   | 16/26 [00:31<00:19,  1.94s/it]

 input gene ids to device
encoded embeddings


Embedding cells:  65%|██████▌   | 17/26 [00:33<00:17,  1.94s/it]

 input gene ids to device
encoded embeddings


Embedding cells:  69%|██████▉   | 18/26 [00:35<00:15,  1.95s/it]

 input gene ids to device
encoded embeddings


Embedding cells:  73%|███████▎  | 19/26 [00:37<00:13,  1.94s/it]

 input gene ids to device
encoded embeddings


Embedding cells:  77%|███████▋  | 20/26 [00:39<00:11,  2.00s/it]

 input gene ids to device
encoded embeddings


Embedding cells:  81%|████████  | 21/26 [00:41<00:09,  1.97s/it]

 input gene ids to device
encoded embeddings


Embedding cells:  85%|████████▍ | 22/26 [00:43<00:07,  1.93s/it]

 input gene ids to device
encoded embeddings


Embedding cells:  88%|████████▊ | 23/26 [00:45<00:05,  1.92s/it]

 input gene ids to device
encoded embeddings


Embedding cells:  92%|█████████▏| 24/26 [00:47<00:03,  1.91s/it]

 input gene ids to device
encoded embeddings


Embedding cells:  96%|█████████▌| 25/26 [00:48<00:01,  1.90s/it]

 input gene ids to device
encoded embeddings


Embedding cells: 100%|██████████| 26/26 [00:50<00:00,  1.95s/it]

got cell embeddings





consider to avoid memory outage.
figure out why reserving so much memory.
https://github.com/rentruewang/koila

# add clinical data to adata

In [45]:
#load clinical data 
clin_path = Path(base_dir / 'training_data/tcga/clinical/brca_tcga_pan_can_atlas_2018_clinical_data.tsv')

clin_df = pd.read_csv(clin_path, sep='\t', index_col=0)

In [46]:
clin_df.iloc[0:5,:]

Unnamed: 0_level_0,Patient ID,Sample ID,Diagnosis Age,Neoplasm Disease Stage American Joint Committee on Cancer Code,American Joint Committee on Cancer Publication Version Type,Aneuploidy Score,Buffa Hypoxia Score,Cancer Type,TCGA PanCanAtlas Cancer Type Acronym,Cancer Type Detailed,...,Subtype,Tissue Prospective Collection Indicator,Tissue Retrospective Collection Indicator,Tissue Source Site,Tissue Source Site Code,TMB (nonsynonymous),Tumor Disease Anatomic Site,Tumor Type,Patient Weight,Winter Hypoxia Score
Study ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
brca_tcga_pan_can_atlas_2018,TCGA-3C-AAAU,TCGA-3C-AAAU-01,55,STAGE X,6TH,19.0,-21.0,Breast Cancer,BRCA,Breast Invasive Lobular Carcinoma,...,BRCA_LumA,No,Yes,Columbia University,3C,0.8,Breast,Infiltrating Lobular Carcinoma,,-28.0
brca_tcga_pan_can_atlas_2018,TCGA-3C-AALI,TCGA-3C-AALI-01,50,STAGE IIB,6TH,22.0,5.0,Breast Cancer,BRCA,Breast Invasive Ductal Carcinoma,...,BRCA_Her2,No,Yes,Columbia University,3C,15.266667,Breast,Infiltrating Ductal Carcinoma,,20.0
brca_tcga_pan_can_atlas_2018,TCGA-3C-AALJ,TCGA-3C-AALJ-01,62,STAGE IIB,7TH,13.0,-5.0,Breast Cancer,BRCA,Breast Invasive Ductal Carcinoma,...,BRCA_LumB,No,Yes,Columbia University,3C,0.933333,Breast,Infiltrating Ductal Carcinoma,,-10.0
brca_tcga_pan_can_atlas_2018,TCGA-3C-AALK,TCGA-3C-AALK-01,52,STAGE IA,7TH,4.0,-27.0,Breast Cancer,BRCA,Breast Invasive Ductal Carcinoma,...,BRCA_LumA,No,Yes,Columbia University,3C,1.5,Breast,Infiltrating Ductal Carcinoma,,4.0
brca_tcga_pan_can_atlas_2018,TCGA-4H-AAAK,TCGA-4H-AAAK-01,50,STAGE IIIA,7TH,7.0,-27.0,Breast Cancer,BRCA,Breast Invasive Lobular Carcinoma,...,BRCA_LumA,Yes,No,"Proteogenex, Inc.",4H,0.7,Breast,Infiltrating Lobular Carcinoma,,-20.0


AttributeError: 'DataFrame' object has no attribute 'Summarise'

In [47]:
cols = pd.Series(clin_df.columns)
keep_cols = [0,1,2,3,5,6,19,21,28,29,30,33,34,36,38,40,41,46,49,50,51,56]
#keep selected cols
clin_df = clin_df.iloc[:,keep_cols]

In [48]:
clin_df = clin_df.set_index(['Sample ID'])
clin_df.index = clin_df.index.str.replace('-','.')


In [49]:
#keep only samples in my_adata.obs['Sample_ID']
clin_df = clin_df.loc[my_adata.obs['Sample_ID'],:]

In [55]:
#add clinical data to adata by joining on sample id
new_obs = pd.merge(my_adata.obs.copy(), clin_df, left_on='Sample_ID', right_on='Sample ID', how='left')

In [56]:
#for each col, count na:
for col in new_obs.columns:
    print(col, new_obs[col].isna().sum())

Sample_ID 0
oncosig_label_ERBB2 0
cell_type 0
Patient ID 0
Diagnosis Age 0
Neoplasm Disease Stage American Joint Committee on Cancer Code 5
Aneuploidy Score 24
Buffa Hypoxia Score 0
Fraction Genome Altered 2
Neoplasm Histologic Grade 949
MSI MANTIS Score 34
MSIsensor Score 0
Mutation Count 0
Overall Survival (Months) 0
Overall Survival Status 0
American Joint Committee on Cancer Metastasis Stage Code 0
American Joint Committee on Cancer Tumor Stage Code 0
Progress Free Survival (Months) 1
Progression Free Status 0
Ragnum Hypoxia Score 0
Sex 0
Somatic Status 0
Subtype 0
TMB (nonsynonymous) 0


In [58]:

new_obs = new_obs.drop(['Neoplasm Histologic Grade'], axis=1) if 'Neoplasm Histologic Grade' in new_obs.columns else new_obs

In [59]:
#add the new obs to cell em
cell_embbed.obs = new_obs

AnnData expects .obs.index to contain strings, but got values like:
    [0, 1, 2, 3, 4]

    Inferred to be: integer

  value_idx = self._prep_dim_index(value.index, attr)


In [67]:
cell_embbed.write_h5ad(base_dir / 'scgpt/data/bulk_brca_erbb2/tcga_brca_erbb2_scgpt_emb_oncosig_sub_genes_clin.h5ad')

In [65]:
#add new obs to my adata and make smaple id the index
my_adata.obs = new_obs
my_adata.obs.set_index('Sample_ID', inplace=True) 
#full_adata = sc.read_h5ad(base_dir / 'scgpt/data/bulk_brca_erbb2/tcga_brca_erbb2_oncosig_sub_genes.h5ad')
# save my_adata as 'tcga_brca_erbb2_oncosig_sub_genes_clin.h5ad'


In [66]:
my_adata.write_h5ad(base_dir / 'scgpt/data/bulk_brca_erbb2/tcga_brca_erbb2_oncosig_sub_genes_clin.h5ad')

# check corr between X and obs

In [200]:
from sklearn.metrics import mutual_info_score
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd

In [195]:
#drop some of the meta data cols
cell_embbed.obs.columns
cols_drop = [ 'cell_type', 
       'Neoplasm Disease Stage American Joint Committee on Cancer Code',
             'MSI MANTIS Score', 'MSIsensor Score', 'Mutation Count',
 'Overall Survival Status',
       'American Joint Committee on Cancer Metastasis Stage Code',
       'American Joint Committee on Cancer Tumor Stage Code',
     'Progression Free Status',
   'Sex', 'Somatic Status', ]
cell_embbed.obs = cell_embbed.obs.drop(cols_drop, axis=1)

In [239]:
# get data frames and scale


# Convert the X matrix to a DataFrame
values_df = pd.DataFrame(cell_embbed.X, columns=[f'X_{i}' for i in range(cell_embbed.X.shape[1])])

metadata = pd.DataFrame(cell_embbed.obs.copy().drop(['Patient ID'], axis=1))


In [240]:
#get numerical cols as num_metadata
num_meta_cols = metadata.select_dtypes(include='number').columns

#get cols of values

#get a list of strings: Dim_0, Dim_2, until Dim_511
val_cols = values_df.columns

In [241]:
#scale
#transform the vals df to log p 1
values_df = np.log1p(values_df)
#scale the vals df 0 to 1
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
values_df = scaler.fit_transform(values_df)
#scale the numerical values of the metadata
metadata[num_meta_cols] = scaler.fit_transform(metadata[num_meta_cols])


In [246]:
# generate null MI with bootstrapping
null_mi_distribution = []
#take a number of bootsraps ~ TWICE NUMBER OF COLUMNS

n_bootstraps = 1000

# set threshold of MI 
mi_threshold = 0.95
#choose 3 random test runs on which to check the data
test_runs = np.random.randint(0, n_bootstraps, size=3)
test_runs= np.append(test_runs, 0)
test_runs

# Flatten the DataFrame into a 1D NumPy array
flattened_values = values_df.flatten()

In [None]:
# find null mi dist using boot strap. 
for i in range(n_bootstraps):
    # Randomly draw values from the flattened array
    resampled_values = np.random.choice(flattened_values, size=len(values_df))
    
    # Randomly select a column from metadata
    random_meta_col = np.random.choice(metadata[num_meta_cols].columns)
    selected_metadata = metadata[random_meta_col]
    
    # Handle NA values - drop rows with NA in either column
    na_mask =  selected_metadata.isna()
    selected_values = resampled_values[~na_mask]
    selected_metadata = selected_metadata[~na_mask]
    
    #sanity check
    # if i in test_runs:
    #     #print length of bothg cols
    #     print('selected values len:', len(selected_values))
    #     print('selected metadata len:', len(selected_metadata))
    #     print('shapes: val, meta: \n' , selected_values.shape, selected_metadata.shape)
    #     #print the first 5 rows of both cols
    #     print('selected values 1:5:', selected_values[:5])
    #     print('selected metadata 1:5:', selected_metadata[:5])
    
    # If the metadata column is categorical, encode it
    if selected_metadata.dtype == 'object' or selected_metadata.dtype.name == 'category':
        encoder = LabelEncoder()
        selected_metadata = encoder.fit_transform(selected_metadata)


    # Calculate MI with the resampled data
    mi = mutual_info_score(selected_values, selected_metadata)
    null_mi_distribution.append(mi)

# sort, and print the MI value at threshold percentile. do not rescale!
null_mi_distribution = np.sort(null_mi_distribution)
mi_threshold_val = null_mi_distribution[int(mi_threshold * len(null_mi_distribution))]
print('MI threshold:', mi_threshold_val)


In [251]:
selected_metadata = metadata['Subtype']
    
# Handle NA values - drop rows with NA in either column
na_mask =  selected_metadata.isna()

selected_metadata = selected_metadata[~na_mask]
  # If the metadata column is categorical, encode it
if selected_metadata.dtype == 'object' or selected_metadata.dtype.name == 'category':
    encoder = LabelEncoder()
    selected_metadata = encoder.fit_transform(selected_metadata)

In [252]:

curr_vals = values_df[:,10]
curr_vals = curr_vals[~na_mask]

In [253]:

# Calculate MI with the resampled data
actual_mi = mutual_info_score(selected_values, selected_metadata)
actual_mi


Clustering metrics expects discrete values but received continuous values for label, and multiclass values for target



1.2796347848861829

In [257]:

#check if any na in val df numpy arr
np.isnan(values_df).any()

False

In [None]:
#init a db: val_col, mi, p_val
mi_df = pd.DataFrame(columns=['val_col', 'mi', 'p_val'])

selected_metadata = metadata['Buffa Hypoxia Score']

# Handle NA values - drop rows with NA in either column
na_mask =  selected_metadata.isna()

selected_metadata = selected_metadata[~na_mask]
  # If the metadata column is categorical, encode it
if selected_metadata.dtype == 'object' or selected_metadata.dtype.name == 'category':
    encoder = LabelEncoder()
    selected_metadata = encoder.fit_transform(selected_metadata)

for val_col in range(values_df.shape[1]):
    curr_vals = values_df[:,val_col]
    curr_vals = curr_vals[~na_mask]
    
    # Calculate MI with the resampled data
    actual_mi = mutual_info_score(curr_vals, selected_metadata)
    
    #check mi above threshold
    if actual_mi > mi_threshold_val:
        #calc p val:
        print('actual mi:', actual_mi)
        # Calculate the percentile of the actual MI value
        percentile = np.sum(null_mi_distribution <= actual_mi) / len(null_mi_distribution)
        p_value = 1 - percentile
        #add to df
    mi_df = mi_df.append({'val_col': val_col, 'mi': actual_mi, 'p_val': p_value}, ignore_index=True)

In [None]:
# calc MI between each col in metadata and each col in values
# create a dict whose keys are the meta data cols, and the values are lists of tuples of (col, mi, p-val) of cols whose mi was above the threshold
mi_dict = {}
for meta_col in metadata.columns:
    selected_metadata = metadata[meta_col]
    
    # Handle NA values - drop rows with NA in either column
    na_mask =  selected_metadata.isna()
    
    selected_metadata = selected_metadata[~na_mask]
      # If the metadata column is categorical, encode it
    if selected_metadata.dtype == 'object' or selected_metadata.dtype.name == 'category':
        encoder = LabelEncoder()
        selected_metadata = encoder.fit_transform(selected_metadata)
    
    for val_col in range(values_df.shape[1]):
        curr_vals = values_df[:,val_col]
        curr_vals = curr_vals[~na_mask]
        
        # Calculate MI with the resampled data
        actual_mi = mutual_info_score(curr_vals, selected_metadata)
        
        #check mi above threshold
        if actual_mi > mi_threshold_val:
            #calc p val:
            print('actual mi:', actual_mi)
            # Calculate the percentile of the actual MI value
            percentile = np.sum(null_mi_distribution <= actual_mi) / len(null_mi_distribution)
            p_value = 1 - percentile
            #add to dict
            if meta_col in mi_dict.keys():
                mi_dict[meta_col].append((val_col, actual_mi, p_value))
            else:
                mi_dict[meta_col] = [(val_col, actual_mi, p_value)]
    

In [226]:
# find the cols with highest MI for each col in metadata





MI threshold: 6.707322570859226


float32 int64


0.0831414634121193

In [None]:
# get corrs for scaled data

# Identify numerical columns (excluding 'UMAP 1' and 'UMAP 2' as they were used for subsetting)
numerical_cols = cell_embbed.obs.select_dtypes(include='number')
# Initialize a DataFrame to store the highest correlation for each numerical column
highest_correlations = pd.DataFrame(columns=['obs_column', 'X_column', 'correlation'])


# Loop through each column in the obs matrix
for obs_col in metadata.columns:
    #print('obs col 1[1]:', cell_embbed.obs[obs_col][0])
    # Initialize a variable to store the highest correlation for this obs column
    highest_corr = {'obs_column': obs_col, 'X_column': None, 'correlation': 0}

    

      # If the metadata column is categorical, encode it
    if selected_metadata.dtype == 'object' or selected_metadata.dtype.name == 'category':
        #move to next col
        continue
    # Loop through each column in the X matrix
    for val_col in range(values_df.shape[1]):
        curr_vals = values_df[:,val_col]
        #convert curr vals to pd series
        curr_vals = pd.Series(curr_vals)
        corr = cell_embbed.obs[obs_col].corr(curr_vals)
        print('corr:', corr)
        # Check if this is the highest correlation so far for this obs column
        if abs(corr) > abs(highest_corr['correlation']):
            highest_corr['X_column'] = val_col
            highest_corr['correlation'] = corr
            
    print(highest_corr)
    # Append the highest correlation for this obs column to the DataFrame
    highest_correlations = highest_correlations.append(highest_corr, ignore_index=True)

# Display the results
highest_correlations


In [None]:


# Identify numerical columns (excluding 'UMAP 1' and 'UMAP 2' as they were used for subsetting)
numerical_cols = cell_embbed.obs.select_dtypes(include='number')
# Initialize a DataFrame to store the highest correlation for each numerical column
highest_correlations = pd.DataFrame(columns=['obs_column', 'X_column', 'correlation'])


# Loop through each column in the obs matrix
for obs_col in metadata.columns:
    #print('obs col 1[1]:', cell_embbed.obs[obs_col][0])
    # Initialize a variable to store the highest correlation for this obs column
    highest_corr = {'obs_column': obs_col, 'X_column': None, 'correlation': 0}
    
    # Loop through each column in the X matrix
    for x_col in X_df.columns:
        #print('x col 1[1]:', X_df[x_col][0])
        # Compute the correlation
        corr = cell_embbed.obs[obs_col].corr(X_df[x_col])
        #print('corr:', corr)
        # Check if this is the highest correlation so far for this obs column
        if abs(corr) > abs(highest_corr['correlation']):
            highest_corr['X_column'] = x_col
            highest_corr['correlation'] = corr
            
    print(highest_corr)
    # Append the highest correlation for this obs column to the DataFrame
    highest_correlations = highest_correlations.append(highest_corr, ignore_index=True)

# Display the results
highest_correlations


In [None]:
# and now with MI:


# Initialize a DataFrame to store the highest mutual information for each numerical column
highest_mutual_info = pd.DataFrame(columns=['obs_column', 'X_column', 'mutual_info'])

# Loop through each column in the obs matrix
for obs_col in numerical_cols.columns:
    # Initialize a variable to store the highest mutual information for this obs column
    highest_mi = {'obs_column': obs_col, 'X_column': None, 'mutual_info': 0}
    
    # Select and process the obs column, drop NA values
    curr_obs = cell_embbed.obs[obs_col].dropna()
    print('curr obs:', obs_col, len(curr_obs))
    
    # Loop through each column in the X matrix
    for x_col in X_df.columns:
        # Compute the mutual information
         # Select the x_col values, aligning with curr_obs by index
        x_col_vals = X_df.loc[curr_obs.index, x_col]
        
        mi = mutual_info_score(curr_obs, x_col_vals)
       

        # Check if this is the highest mutual information so far for this obs column
        if mi > highest_mi['mutual_info']:
            highest_mi['X_column'] = x_col
            highest_mi['mutual_info'] = mi

    # Append the highest mutual information for this obs column to the DataFrame
    highest_mutual_info = highest_mutual_info.append(highest_mi, ignore_index=True)

# Display the results
print(highest_mutual_info)

# plotting

In [27]:
#plot the result cell_embbed.X which is #num cell rows of 512 collums
import umap
import matplotlib.pyplot as plt

  warn(


In [None]:
full_adata.X.shape

In [28]:
projection_data = cell_embbed.X

In [29]:
#fit the projection
reducer = umap.UMAP(random_state=42)
embedding = reducer.fit_transform(projection_data)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [144]:
# create a PCA

from sklearn.decomposition import PCA
import pandas as pd

# Perform PCA on the embeddings
pca = PCA(n_components=2)
pca_result = pca.fit_transform(projection_data)

# Create a DataFrame for the PCA results
pca_df = pd.DataFrame(pca_result, columns=['PCA 1', 'PCA 2'])
pca_df.index = my_adata.obs.index

# If you have the obs data as a DataFrame named my_adata.obs, concatenate it with the PCA results
full_df = pd.concat([my_adata.obs, pca_df], axis=1)


In [None]:
import plotly.express as px

# Create a PCA plot
fig = px.scatter(pca_df, x='PCA 1', y='PCA 2',color=my_adata.obs['oncosig_label_ERBB2'], title="PCA Plot")
fig.update_layout(xaxis_title="PCA 1", yaxis_title="PCA 2")
fig.show()

In [146]:
# Assuming my_adata.obs is a DataFrame containing the original observational data
# Calculate correlations
correlation_matrix = full_df.corr()

# Display the correlation matrix
correlation_matrix


Unnamed: 0,oncosig_label_ERBB2,Diagnosis Age,Aneuploidy Score,Buffa Hypoxia Score,Fraction Genome Altered,MSI MANTIS Score,MSIsensor Score,Mutation Count,Overall Survival (Months),Progress Free Survival (Months),Ragnum Hypoxia Score,TMB (nonsynonymous),PCA 1,PCA 2
oncosig_label_ERBB2,1.0,-0.041841,0.04536,0.094829,-0.003064,-0.066289,-0.03852,0.038359,-0.028976,-0.03478,0.179141,0.037966,0.06221735,-0.03819977
Diagnosis Age,-0.041841,1.0,0.051049,-0.05628,0.014275,0.02917,0.067884,0.060284,-0.156524,-0.127021,-0.135804,0.060228,-0.08804612,-0.1375311
Aneuploidy Score,0.04536,0.051049,1.0,0.338573,0.572522,-0.003335,0.051344,0.026769,-0.033938,-0.033109,0.335994,0.024584,0.2546548,0.1240515
Buffa Hypoxia Score,0.094829,-0.05628,0.338573,1.0,0.510997,0.107306,0.172723,0.073004,-0.036369,-0.058704,0.742164,0.076077,0.7789725,0.2611648
Fraction Genome Altered,-0.003064,0.014275,0.572522,0.510997,1.0,0.061065,0.169252,0.01171,-0.032953,-0.066046,0.478391,0.011306,0.3790246,0.1429963
MSI MANTIS Score,-0.066289,0.02917,-0.003335,0.107306,0.061065,1.0,0.698591,0.176453,-0.107241,-0.096528,0.035729,0.170774,0.1307817,0.03957399
MSIsensor Score,-0.03852,0.067884,0.051344,0.172723,0.169252,0.698591,1.0,0.226222,-0.059285,-0.063982,0.133691,0.224036,0.125476,0.07777142
Mutation Count,0.038359,0.060284,0.026769,0.073004,0.01171,0.176453,0.226222,1.0,-0.06039,-0.058729,0.067501,0.997975,0.03906375,0.05051949
Overall Survival (Months),-0.028976,-0.156524,-0.033938,-0.036369,-0.032953,-0.107241,-0.059285,-0.06039,1.0,0.914193,-0.015474,-0.062871,-0.00409451,0.01299708
Progress Free Survival (Months),-0.03478,-0.127021,-0.033109,-0.058704,-0.066046,-0.096528,-0.063982,-0.058729,0.914193,1.0,-0.025935,-0.060804,-0.02679977,0.01201868


# select and analyze subsets based on umap

In [42]:
import plotly.express as px

# Prepare your data
umap_x = embedding[:, 0]
umap_y = embedding[:, 1]



ModuleNotFoundError: No module named 'plotly'

In [148]:

# Create a DataFrame for Plotly: add the UMAP cols to my_adata.obs
umap_df = pd.DataFrame(my_adata.obs.copy())
umap_df['UMAP 1'] = umap_x
umap_df['UMAP 2'] = umap_y

plt_title = 'UMAP tcga brca erbb2 scgpt sub genes & onc label'


In [151]:
# Create a Plotly figure
fig = px.scatter(umap_df, x='UMAP 1', y='UMAP 2', color='oncosig_label_ERBB2',
                 color_continuous_scale=['darkblue', 'red'],
                 title=plt_title,
                 labels={'Label': 'Oncosig Labels'},
                 opacity=0.4)

# Update layout
fig.update_layout(legend_title_text='Oncosig Labels',
                  xaxis_title='UMAP 1',
                  yaxis_title='UMAP 2')

# Show the plot
fig.show()

In [152]:

# Compute the correlation matrix
correlation_matrix = umap_df.corr()
correlation_matrix

Unnamed: 0,oncosig_label_ERBB2,Diagnosis Age,Aneuploidy Score,Buffa Hypoxia Score,Fraction Genome Altered,MSI MANTIS Score,MSIsensor Score,Mutation Count,Overall Survival (Months),Progress Free Survival (Months),Ragnum Hypoxia Score,TMB (nonsynonymous),UMAP 1,UMAP 2
oncosig_label_ERBB2,1.0,-0.041841,0.04536,0.094829,-0.003064,-0.066289,-0.03852,0.038359,-0.028976,-0.03478,0.179141,0.037966,-0.040236,-0.071903
Diagnosis Age,-0.041841,1.0,0.051049,-0.05628,0.014275,0.02917,0.067884,0.060284,-0.156524,-0.127021,-0.135804,0.060228,0.12157,-0.074331
Aneuploidy Score,0.04536,0.051049,1.0,0.338573,0.572522,-0.003335,0.051344,0.026769,-0.033938,-0.033109,0.335994,0.024584,-0.256623,-0.072141
Buffa Hypoxia Score,0.094829,-0.05628,0.338573,1.0,0.510997,0.107306,0.172723,0.073004,-0.036369,-0.058704,0.742164,0.076077,-0.753224,-0.266112
Fraction Genome Altered,-0.003064,0.014275,0.572522,0.510997,1.0,0.061065,0.169252,0.01171,-0.032953,-0.066046,0.478391,0.011306,-0.365822,-0.155833
MSI MANTIS Score,-0.066289,0.02917,-0.003335,0.107306,0.061065,1.0,0.698591,0.176453,-0.107241,-0.096528,0.035729,0.170774,-0.112037,-0.034119
MSIsensor Score,-0.03852,0.067884,0.051344,0.172723,0.169252,0.698591,1.0,0.226222,-0.059285,-0.063982,0.133691,0.224036,-0.127525,-0.030202
Mutation Count,0.038359,0.060284,0.026769,0.073004,0.01171,0.176453,0.226222,1.0,-0.06039,-0.058729,0.067501,0.997975,-0.060614,0.024009
Overall Survival (Months),-0.028976,-0.156524,-0.033938,-0.036369,-0.032953,-0.107241,-0.059285,-0.06039,1.0,0.914193,-0.015474,-0.062871,-0.010372,-0.00257
Progress Free Survival (Months),-0.03478,-0.127021,-0.033109,-0.058704,-0.066046,-0.096528,-0.063982,-0.058729,0.914193,1.0,-0.025935,-0.060804,0.012493,0.022562


In [113]:
#define grid subset: umap 1 min, max; umap 2 min, max:
box_select = {'UMAP 1': [0, 12], 'UMAP 2': [5, 9]}
# select from umap df:
umap_subset = umap_df.loc[(umap_df['UMAP 1'] > box_select['UMAP 1'][0]) & (umap_df['UMAP 1'] <= box_select['UMAP 1'][1]) & (umap_df['UMAP 2'] > box_select['UMAP 2'][0]) & (umap_df['UMAP 2'] <= box_select['UMAP 2'][1]), :]
#select the opposite as well
umap_subset_opp = umap_df.loc[(umap_df['UMAP 1'] <= box_select['UMAP 1'][0]) | (umap_df['UMAP 1'] > box_select['UMAP 1'][1]) | (umap_df['UMAP 2'] <= box_select['UMAP 2'][0]) | (umap_df['UMAP 2'] > box_select['UMAP 2'][1]), :]

In [83]:
#print data type for each collumn
for col in umap_subset.columns:
    print(col, umap_subset[col].dtype)


oncosig_label_ERBB2 int64
cell_type category
Patient ID object
Diagnosis Age int64
Neoplasm Disease Stage American Joint Committee on Cancer Code category
Aneuploidy Score float64
Buffa Hypoxia Score float64
Fraction Genome Altered float64
MSI MANTIS Score float64
MSIsensor Score float64
Mutation Count float64
Overall Survival (Months) float64
Overall Survival Status category
American Joint Committee on Cancer Metastasis Stage Code category
American Joint Committee on Cancer Tumor Stage Code category
Progress Free Survival (Months) float64
Progression Free Status category
Ragnum Hypoxia Score float64
Sex category
Somatic Status category
Subtype category
TMB (nonsynonymous) float64
UMAP 1 float32
UMAP 2 float32


In [114]:
from scipy.stats import ttest_ind
#create a table of the mean of the numerical cols in each df as well as the result of a 
# two tailed t test

# Identify numerical columns (excluding 'UMAP 1' and 'UMAP 2' as they were used for subsetting)
numerical_cols = umap_df.select_dtypes(include='number').columns.drop(['UMAP 1', 'UMAP 2'])

# Initialize a DataFrame to store the results
results_df = pd.DataFrame(columns=['Column', 'Mean Subset', 'Mean Opposite Subset', 'T-test P-value'])

# Loop through each numerical column to compute means and perform t-tests
for col in numerical_cols:
    mean_subset = umap_subset[col].mean()
    mean_subset_opp = umap_subset_opp[col].mean()

    # Perform two-tailed t-test
    t_test_result = ttest_ind(umap_subset[col], umap_subset_opp[col], nan_policy='omit')

    # Append results to the DataFrame
    results_df = results_df.append({
        'Column': col,
        'Mean Subset': mean_subset,
        'Mean Opposite Subset': mean_subset_opp,
        'T-test P-value': t_test_result.pvalue
    }, ignore_index=True)

# Display the results
results_df

Unnamed: 0,Column,Mean Subset,Mean Opposite Subset,T-test P-value
0,oncosig_label_ERBB2,0.047386,0.278932,1.572582e-25
1,Diagnosis Age,58.689542,57.540059,0.2010629
2,Aneuploidy Score,10.962775,14.389222,1.2713e-10
3,Buffa Hypoxia Score,-13.650327,-3.403561,8.71019e-13
4,Fraction Genome Altered,0.258496,0.361149,1.36273e-13
5,MSI MANTIS Score,0.304848,0.299343,0.070247
6,MSIsensor Score,0.44348,0.628338,0.1002101
7,Mutation Count,71.294118,101.735905,0.1196262
8,Overall Survival (Months),43.757278,36.785711,0.01080943
9,Progress Free Survival (Months),40.260671,34.264378,0.01457105


In [89]:
# Identify categorical columns
categorical_cols = umap_df.select_dtypes(include='category').columns
categorical_cols

Index(['cell_type',
       'Neoplasm Disease Stage American Joint Committee on Cancer Code',
       'Overall Survival Status',
       'American Joint Committee on Cancer Metastasis Stage Code',
       'American Joint Committee on Cancer Tumor Stage Code',
       'Progression Free Status', 'Sex', 'Somatic Status', 'Subtype'],
      dtype='object')

In [115]:
from scipy.stats import chi2_contingency
# First, create a DataFrame without the columns you want to exclude
reduced_df = umap_df.drop(['UMAP 1', 'UMAP 2', 'cell_type', 'Sex'], axis=1)

# Now, select only the categorical columns from this reduced DataFrame
categorical_cols = reduced_df.select_dtypes(include='category').columns

# Initialize a DataFrame to store the results
results_df = pd.DataFrame(columns=['Column', 'Counts in Subset', 'Counts in Opposite Subset', 'Chi-Squared P-value'])

# Loop through each categorical column to count levels and perform chi-squared tests
for col in categorical_cols:
    # Count the levels in each subset
    counts_subset = umap_subset[col].value_counts()
    counts_subset_opp = umap_subset_opp[col].value_counts()

    # Create a contingency table
    contingency_table = pd.DataFrame({
        'Subset': counts_subset,
        'Opposite Subset': counts_subset_opp
    }).fillna(0)

    # Perform chi-squared test
    chi2, p, _, _ = chi2_contingency(contingency_table)

    # Append results to the DataFrame
    results_df = results_df.append({
        'Column': col,
        'Counts in Subset': dict(counts_subset),
        'Counts in Opposite Subset': dict(counts_subset_opp),
        'Chi-Squared P-value': p
    }, ignore_index=True)

# Display the results
results_df

Unnamed: 0,Column,Counts in Subset,Counts in Opposite Subset,Chi-Squared P-value
0,Neoplasm Disease Stage American Joint Committe...,"{'STAGE IIA': 214, 'STAGE IIB': 137, 'STAGE II...","{'STAGE IIA': 105, 'STAGE IIB': 83, 'STAGE III...",0.6730802
1,Overall Survival Status,"{'0:LIVING': 539, '1:DECEASED': 73}","{'0:LIVING': 282, '1:DECEASED': 55}",0.07244549
2,American Joint Committee on Cancer Metastasis ...,"{'M0': 495, 'MX': 100, 'M1': 11, 'CM0 (I+)': 6}","{'M0': 294, 'MX': 36, 'M1': 7, 'CM0 (I+)': 0}",0.02551072
3,American Joint Committee on Cancer Tumor Stage...,"{'T2': 343, 'T1C': 139, 'T3': 73, 'T1': 24, 'T...","{'T2': 212, 'T1C': 55, 'T3': 34, 'T4B': 13, 'T...",0.2451361
4,Progression Free Status,"{'0:CENSORED': 534, '1:PROGRESSION': 78}","{'0:CENSORED': 289, '1:PROGRESSION': 48}",0.5816681
5,Somatic Status,{'Matched': 612},{'Matched': 337},1.0
6,Subtype,"{'BRCA_LumA': 393, 'BRCA_Basal': 155, 'BRCA_No...","{'BRCA_LumB': 160, 'BRCA_LumA': 98, 'BRCA_Her2...",2.611642e-97


In [116]:
from scipy.stats import hypergeom


# Extracting the relevant data from results_df
subtype_data = results_df.loc[results_df['Column'] == 'Subtype'].iloc[0]
counts_subset = subtype_data['Counts in Subset']
counts_subset_opp = subtype_data['Counts in Opposite Subset']
chi_squared_p_value = subtype_data['Chi-Squared P-value']

# Getting the union of keys from both count dictionaries
all_subtypes = set(counts_subset.keys()).union(set(counts_subset_opp.keys()))

# Creating a new DataFrame
subtype_counts_df = pd.DataFrame(index=all_subtypes, columns=['Subset', 'Opposite Subset'])

# Populating the DataFrame
for subtype in all_subtypes:
    subtype_counts_df.loc[subtype, 'Subset'] = counts_subset.get(subtype, 0)
    subtype_counts_df.loc[subtype, 'Opposite Subset'] = counts_subset_opp.get(subtype, 0)

# Adding the chi-squared p-value as a new column
subtype_counts_df['Chi-Squared P-value'] = chi_squared_p_value

# Total number of samples in each subset
n_subset = umap_subset.shape[0]
n_subset_opp = umap_subset_opp.shape[0]
total_samples = n_subset + n_subset_opp

# Add a new column for the probability calculation
subtype_counts_df['Random Selection Probability'] = 0

for subtype in all_subtypes:
    # Total number of samples of this subtype in the entire dataset
    total_subtype_samples = counts_subset.get(subtype, 0) + counts_subset_opp.get(subtype, 0)

    # Number of samples of this subtype in the umap_subset
    subtype_samples_in_subset = counts_subset.get(subtype, 0)

    # Calculate the probability using the hypergeometric distribution
    p_value = hypergeom(total_samples, total_subtype_samples, n_subset).pmf(subtype_samples_in_subset)
    subtype_counts_df.loc[subtype, 'Random Selection Probability'] = p_value

# Display the new DataFrame
subtype_counts_df


Unnamed: 0,Subset,Opposite Subset,Chi-Squared P-value,Random Selection Probability
BRCA_Her2,1,71,2.611642e-97,3.0928780000000004e-33
BRCA_Normal,32,3,2.611642e-97,0.000192366
BRCA_LumB,31,160,2.611642e-97,3.713985e-54
BRCA_Basal,155,5,2.611642e-97,1.028707e-26
BRCA_LumA,393,98,2.611642e-97,8.646483999999999e-26


In [None]:
cell_embbed.obs.head()

In [None]:

# Set style for a light background
plt.style.use('seaborn-v0_8-pastel')  # or 'classic'

#create plot
plt.figure(figsize=(12, 10))

# Use the 'oncosig_label' values for coloring the points
# 'cmap' can be adjusted to your preferred color map

# Manually set colors based on 'oncosig_label_ERBB2'
colors = ['darkblue' if label == 0 else 'red' for label in my_adata.obs['oncosig_label_ERBB2']]


scatter = plt.scatter(embedding[:, 0], embedding[:, 1], c=colors, s=8)
plt_title = 'UMAP tcga brca scgpt emb sub genes & onc label'
plt.title(plt_title, fontsize=18)
plt.xlabel('UMAP 1', fontsize=12)
plt.ylabel('UMAP 2', fontsize=12)


# Create a legend
legend_labels = ['Label 0', 'Label 1']
legend_elements = [plt.Line2D([0], [0], marker='o', color='w', label=legend_labels[i],
                              markerfacecolor=scatter.cmap(scatter.norm(i)), markersize=10)
                   for i in range(2)]
plt.legend(handles=legend_elements, title='Oncosig Labels')

# Save the plot to a file

#plt.savefig(f'plots/{plt_title}.png', dpi=300, bbox_inches='tight')

# Display the plot in the notebook
plt.show()

! compare to raw all genes

In [None]:
#create plot
raw_all_genes_embedding = reducer.fit_transform(all_genes_adata.X)

In [None]:


plt.figure(figsize=(12, 10))

# 'cmap' can be adjusted to your preferred color map


# Manually set colors based on 'oncosig_label_ERBB2'
colors = ['darkblue' if label == 0 else 'red' for label in cell_embbed.obs['oncosig_label_ERBB2']]



scatter = plt.scatter(raw_all_genes_embedding[:, 0], raw_all_genes_embedding[:, 1], c=colors, s=7)
plt_title = 'UMAP tcga brca raw all genes emb + onc label'

plt.title(plt_title, fontsize=18)
plt.xlabel('UMAP 1', fontsize=12)
plt.ylabel('UMAP 2', fontsize=12)

# Create a legend
legend_labels = ['Label 0', 'Label 1']
legend_elements = [plt.Line2D([0], [0], marker='o', color='w', label=legend_labels[i],
                              markerfacecolor=scatter.cmap(scatter.norm(i)), markersize=10)
                   for i in range(2)]
plt.legend(handles=legend_elements, title='Oncosig Labels')
# Save the plot to a file

plt.savefig(f'plots/{plt_title}.png', dpi=300, bbox_inches='tight')
# Display the plot
plt.show()



! compare to umap of maually selected genes

In [None]:
#create plot
raw_selected_embedding = reducer.fit_transform(my_adata.X)

In [None]:


plt.figure(figsize=(12, 10))



# Manually set colors based on 'oncosig_label_ERBB2'
colors = ['darkblue' if label == 0 else 'red' for label in cell_embbed.obs['oncosig_label_ERBB2']]



scatter = plt.scatter(raw_selected_embedding[:, 0], raw_selected_embedding[:, 1], c=colors, s=7)
plt_title = 'UMAP tcga brca raw selected genes emb + onc label'

plt.title(plt_title, fontsize=18)
plt.xlabel('UMAP 1', fontsize=12)
plt.ylabel('UMAP 2', fontsize=12)

# Create a legend
legend_labels = ['Label 0', 'Label 1']
legend_elements = [plt.Line2D([0], [0], marker='o', color='w', label=legend_labels[i],
                              markerfacecolor=scatter.cmap(scatter.norm(i)), markersize=10)
                   for i in range(2)]
plt.legend(handles=legend_elements, title='Oncosig Labels')


# Save the plot to a file

plt.savefig(f'plots/{plt_title}.png', dpi=300, bbox_inches='tight')

# Display the plot
plt.show()


In [None]:
plt.style.available

In [None]:
cell_embbed.write_h5ad('data/brca_scrna_epithelial_scGPT.h5ad')