# Create gene vectors from mutations and CNA data

Use co-occurrence statistics to create gene vectors.

[Stop Using word2vec](https://multithreaded.stitchfix.com/blog/2017/10/18/stop-using-word2vec/)

[PMI Word Vectors from Wikipedia](https://www.kaggle.com/code/kenshoresearch/kdwd-pmi-word-vectors)

[Improving Distributional Similarity with Lessons Learned from Word Embeddings](https://aclanthology.org/Q15-1016/) (LGD15)



# Create Gene Embeddings from Tumor Sample "Sentences" 

We are going to create Gene vectors by treating them like words in sentences.
We can then combine these to make tumor sample embeddings.
The descriptions below are copied from LGD15. In their work, they refer to "words and their contexts". 
We will map these ideas to "genes and their contexts". 
To create our gene embeddings we will, 

* create gene-gene co-occurrence matrices. 
* calculate pointwise mutual information matrices. 
* reduce the dimensionality using singular value decomposition. 


# Pointwise Mutual Information Matrices (Notation from LGD15)


## Notation


We assume a collection of words $w \in V_W$ and their
contexts $c \in V_C$, where $V_W$ and $V_C$
are the word and context vocabularies, and denote
the collection of observed word-context pairs as $D$.

We use $\#(w,c)$ to denote the number of times the pair
$(w,c)$ appears in $D$ and $\#(w)$ and $\#(c)$ to denote 
the number of times $w$ and $c$ occurred in $D$, respectively.

$$
\begin{align}
\#(w) = \sum_{c^{\prime}} \#(w, c^{\prime})
, \quad
\#(c) = \sum_{w^{\prime}} \#(w^{\prime}, c)
, \quad
\lvert D \rvert = \sum_{w,c} \#(w, c)
\end{align}
$$


$$
\begin{align}
\hat{P}(w) = \frac{\#(w)}{\lvert D \rvert}
, \quad
\hat{P}(c) = \frac{\#(c)}{\lvert D \rvert}
, \quad
\hat{P}(w,c) = \frac{\#(w,c)}{\lvert D \rvert}
\end{align}
$$


## Contexts

$D$ is commonly obtained by taking a
corpus $w_1, w_2, \ldots, w_n$ and defining the contexts
of word $w_i$ as the words surrounding it in an 
$L$-sized window $w_{i-L}, \ldots, w_{i-1}, w_{i+1}, \ldots, w_{i+L}$.

In our case, the corpus will be genes and their contexts will be 
other genes that co-occur in the same sample.


## Definitions

$$
\begin{align}
PMI(w, c) = 
\log \frac
{\hat{P}(w,c)}
{\hat{P}(w)\hat{P}(c)} =
\log \frac
{\#(w,c) \, \cdot \lvert D \rvert}
{\#(w) \cdot \#(c)}
\end{align}
$$

$$
\begin{align}
PPMI(w, c) = {\rm max} \left[ PMI(w, c), 0 \right]
\end{align}
$$


## Context Distribution Smoothing

$$
\begin{align}
PMI_{\alpha}(w, c) = 
\log \frac
{\hat{P}(w,c)}
{\hat{P}(w)\hat{P}_{\alpha}(c)} = 
\log \frac
{\#(w,c) \cdot \sum_{c^{\prime}} \#(c^{\prime})^{\alpha}}
{\#(w) \cdot \#(c)^{\alpha}}
\end{align}
$$

$$
\begin{align}
\hat{P}_{\alpha}(c) = 
\frac
{\#(c)^{\alpha}}
{\sum_{c^{\prime}} \#(c^{\prime})^{\alpha}}
\end{align}
$$

In [None]:
import os
import pandas as pd

In [None]:
from nextgenlp import synapse 
from nextgenlp import genie
from nextgenlp import genie_constants
from nextgenlp import embedders
from nextgenlp.config import config

In [None]:
# Read the Synapse data location and the embeddings output location from
# the project configuration, then echo both so the run records them.
_paths_section = config['Paths']
SYNC_PATH = _paths_section['SYNAPSE_PATH']
EMBEDDINGS_PATH = _paths_section['EMBEDDINGS_PATH']
print(SYNC_PATH, EMBEDDINGS_PATH, sep="\n")

In [None]:
# Passed as `min_unigram_weight` to every embedders.GenePpmiEmbeddings
# constructed in this notebook (mutation and CNA alike).
MIN_UNIGRAM_COUNT = 10
# Embedding dimensionalities to produce; the main loop creates and writes one
# set of embeddings per entry.
EMBEDDING_SIZES = [50, 100, 200]

# RAS Pathway data

In [None]:
# Spreadsheet of RAS pathway gene names, located relative to the Synapse path.
_ras_gene_names_path = os.path.join(
    SYNC_PATH, '../nci-ras-initiative/ras-pathway-gene-names.xlsx'
)
df_ras = pd.read_excel(_ras_gene_names_path)

In [None]:
# Show the RAS pathway table as the cell output for a quick visual check.
df_ras

In [None]:
def get_meta_extra(df_ras, embds, unigram_name):
    """Build per-unigram metadata flagging RAS pathway membership.

    Args:
        df_ras: DataFrame with a "Gene name" column listing RAS pathway genes.
        embds: Object exposing ``index_to_unigram``, indexable by the integers
            ``0 .. len(index_to_unigram) - 1``.
        unigram_name: Name of the output column that holds the unigrams.

    Returns:
        DataFrame with exactly one row per unigram index, in index order, and
        two columns: ``unigram_name`` and ``path_RAS_flag`` (1 if the unigram
        appears in ``df_ras["Gene name"]``, otherwise 0).
    """
    index_to_unigram = embds.index_to_unigram
    # Look entries up by position so that row i always describes unigram i,
    # whether index_to_unigram is a sequence or an int-keyed mapping.
    unigrams = [index_to_unigram[ii] for ii in range(len(index_to_unigram))]
    df_meta_extra = pd.DataFrame(unigrams, columns=[unigram_name])

    # Use a membership test rather than a left merge: a merge emits one row
    # per match, so a gene listed more than once in df_ras would duplicate
    # rows here and shift every later row out of step with the unigram index.
    df_meta_extra["path_RAS_flag"] = (
        df_meta_extra[unigram_name].isin(df_ras["Gene name"]).astype(int)
    )

    return df_meta_extra

# Loop over all options

In [None]:
# Build and write PPMI gene embeddings for every combination of
# GENIE version x sequencing-assay subset x embedding size, once from the
# mutation data and once from the copy number alteration (CNA) data.
# NOTE: the loop variables are notebook globals; `genie_version` keeps its
# last value after the loop and is read again by a later cell.
unigram_name = "gene"

for genie_version in synapse.VALID_GENIE_VERSIONS:
    print(genie_version)

    # Load everything that depends only on the GENIE version once per version.
    syn_file_paths = synapse.get_file_name_to_path(genie_version=genie_version)
    df_gp_wide = genie.read_gene_panels(syn_file_paths["gene_panels"], style="wide")

    df_psm_all = genie.read_pat_sam_mut(
        syn_file_paths["data_clinical_patient"],
        syn_file_paths["data_clinical_sample"],
        syn_file_paths["data_mutations_extended"],
    )
    df_dcs_all = genie.read_clinical_sample(syn_file_paths["data_clinical_sample"])
    # Keep only the clinical-sample rows whose label is a SAMPLE_ID present in
    # the mutation table (assumes df_dcs_all is indexed by sample id -- TODO confirm).
    df_dcs_all = df_dcs_all.loc[df_psm_all["SAMPLE_ID"].unique()]
    df_cna_all = genie.read_cna(syn_file_paths["data_CNA"])

    print("df_psm_all.shape: ", df_psm_all.shape)
    print("df_dcs_all.shape: ", df_dcs_all.shape)
    print("df_cna_all.shape: ", df_cna_all.shape)

    for subset_name, seq_assay_ids in genie_constants.SEQ_ASSAY_ID_GROUPS.items():
        print(subset_name)
        # One output directory per (GENIE version, subset).
        out_path = os.path.join(EMBEDDINGS_PATH, genie_version, subset_name)
        os.makedirs(out_path, exist_ok=True)

        (
            subset_sample_ids,
            subset_genes,
        ) = genie.get_genes_and_samples_from_seq_assay_ids(
            df_gp_wide, df_dcs_all, seq_assay_ids
        )

        df_dcs = df_dcs_all.loc[subset_sample_ids]
        
        # Mutation "sentences": restrict to this subset's samples and genes,
        # then attach them to the matching clinical-sample rows.
        df_psm = df_psm_all[df_psm_all["SAMPLE_ID"].isin(df_dcs.index)]
        mut_sentences = genie.get_psm_sentences(df_psm)
        mut_sentences = genie.filter_sentences_by_gene(mut_sentences, subset_genes)
        df_dcs_mut = df_dcs.loc[mut_sentences.index].copy()
        df_dcs_mut["mut_sentences"] = mut_sentences

        for embedding_size in EMBEDDING_SIZES:
            print(embedding_size)

            # mutations extended embeddings
            # ===========================================================
            embds_mut = embedders.GenePpmiEmbeddings(
                df_dcs_mut["mut_sentences"],
                subset_name,
                min_unigram_weight=MIN_UNIGRAM_COUNT,
                unigram_weighter=embedders.unigram_weighter_identity,
                skipgram_weighter=embedders.skipgram_weighter_product,
                embedding_size=embedding_size,
            )
            embds_mut.create_embeddings()
            df_meta_extra = get_meta_extra(df_ras, embds_mut, unigram_name)
            embds_mut.write_gene_projector_files(
                out_path, f"mut_{subset_name}", unigram_name, df_meta_extra
            )

            # for mutations, remove weight from sentences for metadata
            # (keeps only the first item of each sentence element; presumably
            # elements are (gene, weight) pairs -- TODO confirm)
            df_dcs_mut_meta = df_dcs_mut.copy()
            df_dcs_mut_meta["mut_sentences"] = df_dcs_mut_meta["mut_sentences"].apply(
                lambda x: [el[0] for el in x]
            )
            embds_mut.write_sample_projector_files(
                out_path, f"mut_{subset_name}", unigram_name, df_dcs_mut_meta
            )

        
        # CNA rows for this subset's samples. With fewer than 10 such rows the
        # CNA embeddings are skipped; the `continue` moves on to the next
        # subset, and the mutation outputs above have already been written.
        df_cna = df_cna_all[df_cna_all.index.isin(df_dcs.index)]
        if df_cna.shape[0] < 10:
            continue
        cna_sentences = genie.get_cna_sentences(df_cna)
        cna_sentences = genie.filter_sentences_by_gene(cna_sentences, subset_genes)
        df_dcs_cna = df_dcs.loc[cna_sentences.index].copy()
        df_dcs_cna["cna_sentences"] = cna_sentences

        for embedding_size in EMBEDDING_SIZES:
            print(embedding_size)


            # copy number alterations embeddings
            # ===========================================================
            # Uses different weighters than the mutation embeddings
            # (unigram_weighter_abs / skipgram_weighter_norm).
            embds_cna = embedders.GenePpmiEmbeddings(
                df_dcs_cna["cna_sentences"],
                subset_name,
                min_unigram_weight=MIN_UNIGRAM_COUNT,
                unigram_weighter=embedders.unigram_weighter_abs,
                skipgram_weighter=embedders.skipgram_weighter_norm,
                embedding_size=embedding_size,
            )
            embds_cna.create_embeddings()
            df_meta_extra = get_meta_extra(df_ras, embds_cna, unigram_name)
            embds_cna.write_gene_projector_files(
                out_path, f"cna_{subset_name}", unigram_name, df_meta_extra
            )
            # Unlike the mutation case, the CNA sentences are passed to the
            # sample metadata writer unchanged.
            embds_cna.write_sample_projector_files(
                out_path, f"cna_{subset_name}", unigram_name, df_dcs_cna
            )

In [None]:
# Subset whose written embeddings are inspected in the cells that follow;
# presumably one of the keys of genie_constants.SEQ_ASSAY_ID_GROUPS -- confirm.
subset_name = "MSK"

In [None]:
# NOTE: `genie_version` is not set in this cell; it is the loop variable left
# over from the embedding loop, i.e. the last GENIE version that was iterated.
in_path = os.path.join(EMBEDDINGS_PATH, genie_version, subset_name)

In [None]:
# Load the size-100 mutation sample vectors; the file is read without a
# header row, so the columns are numbered positionally.
_sample_vecs_path = os.path.join(in_path, f"mut_{subset_name}_sample_100_vecs.tsv")
df_v = pd.read_csv(_sample_vecs_path, sep="\t", header=None)

In [None]:
# Show the loaded sample vectors as the cell output.
df_v

In [None]:
# Load the mutation sample metadata; the first row of the file is used as
# the column names (pandas default).
_sample_meta_path = os.path.join(in_path, f'mut_{subset_name}_sample_meta.tsv')
df_m = pd.read_csv(_sample_meta_path, sep='\t')

In [None]:
# Show the loaded sample metadata as the cell output.
df_m