In [None]:
import glob
import itertools
import os
import pickle
import time

import anndata as ann
import colorcet as cc
import gseapy
import joblib
import numpy as np
import pandas as pd
import plotly.express as px
import PyWGCNA
import scanorama
import scanpy as sc
import seaborn as sns
import sklearn as sk
import sklearn.base as skbase
import sklearn.decomposition as decomp
import sklearn.metrics as skm
import sklearn.model_selection as skms
import sklearn.neighbors as nbr
import sklearn.pipeline as pipe
from matplotlib import pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from pydeseq2.dds import DeseqDataSet as DDS
from pydeseq2.default_inference import DefaultInference
from pydeseq2.ds import DeseqStats as DS
from seaborn import (axes_style, plotting_context)


# Notebook-wide plot styling.
sns.set_style("whitegrid")
# Continuous colormap used for expression plots below. `set_extremes` modifies the
# colorcet colormap object in place (no copy is made): values under a plot's `vmin`
# are drawn in light grey.
custom_cm = cc.m_CET_L17_r
custom_cm.set_extremes(under= "lightgrey")

In [None]:
# Make sure every output directory written to later in the notebook exists.
for out_dir in ("figures", "pickles", "data", "outputs/wgcna"):
    os.makedirs(out_dir, exist_ok=True)

In [None]:
# Read every filtered per-sample .h5ad file under the portal download and stack them.
files = glob.glob("Alex_Lemonade_portal/**/*_filtered*.h5ad", recursive=True)
datasets = list(map(sc.read_h5ad, files))
# Sample key: "sample" + last three characters of the file name's first "_"-separated token.
ids = ["sample" + path.split("/")[-1].split("_")[0][-3:] for path in files]
merged_data = ann.concat(
    datasets,
    keys=ids,
    label="sample",
    index_unique="-",
    merge="first",
)
# Metadata tidy-up: numeric age, no "subdiagnosis" column, and the per-gene
# "detected" column renamed so it is not confused with the per-cell one.
merged_data.obs["age"] = merged_data.obs["age"].astype(float)
merged_data.obs = merged_data.obs.drop(columns="subdiagnosis")
merged_data.var = merged_data.var.rename(columns={"detected": "pct_cells_detected"})

In [None]:
# Per-cell QC distributions before filtering, on a log axis.
# NOTE(review): the following cell passes the same `save` suffix, so it presumably
# overwrites this figure on disk — confirm that is intended.
sc.pl.violin(
    merged_data,
    ["sum", "detected", "subsets_mito_percent"],
    log=True,
    multi_panel=True,
    save="_others_preproc.pdf",
)

In [None]:
# Keep cells with sum > 100, detected > 100 and subsets_mito_percent < 50.
mask = (
    (merged_data.obs["sum"] > 100)
    & (merged_data.obs["detected"] > 100)
    & (merged_data.obs["subsets_mito_percent"] < 50)
)
merged_data = merged_data[mask, :].copy()
# Same QC panel as before, now on the filtered cells.
sc.pl.violin(
    merged_data,
    ["sum", "detected", "subsets_mito_percent"],
    log=True,
    multi_panel=True,
    save="_others_preproc.pdf",
)

In [None]:
# Transposing turns genes into observations so the per-gene `pct_cells_detected`
# column can be drawn with the violin helper.
ax = sc.pl.violin(merged_data.T, ["pct_cells_detected"], log=True)


In [None]:
# Keep genes whose pct_cells_detected exceeds 0.1, then re-plot the distribution.
keep_genes = merged_data.var["pct_cells_detected"] > 0.1
merged_data = merged_data[:, keep_genes].copy()
sc.pl.violin(merged_data.T, ["pct_cells_detected"], log=True)

In [None]:
# Column names, dtypes and non-null counts of the gene annotation table.
merged_data.var.info()

In [None]:
# Turn gene symbols into unique string labels.
merged_data.var["gene_symbol"] = merged_data.var["gene_symbol"].astype("str")
# Missing symbols (stringified to "nan") fall back to the gene's var index entry.
no_symbol = merged_data.var["gene_symbol"] == "nan"
merged_data.var.loc[no_symbol, "gene_symbol"] = merged_data.var.loc[no_symbol].index
# Every repeat of a symbol (all but its first occurrence) gets
# "-<last six characters of gene_ids>" appended.
dupes = merged_data.var["gene_symbol"].duplicated()
id_suffix = merged_data.var.loc[dupes, "gene_ids"].astype(str).str[-6:]
merged_data.var.loc[dupes, "gene_symbol"] = (
    merged_data.var.loc[dupes, "gene_symbol"].str.cat(id_suffix, sep="-")
)

In [None]:
# Genes taking the largest share of counts, labelled by symbol; figure is saved by scanpy.
sc.pl.highest_expr_genes(merged_data, save=True, gene_symbols="gene_symbol")

In [None]:
# First pass: flag the 2000 most variable genes (seurat_v3 flavour) and plot them.
sc.pp.highly_variable_genes(merged_data, flavor="seurat_v3", n_top_genes=2000)

sc.pl.highly_variable_genes(merged_data, save=True)

In [None]:
# Inspect the per-gene "mean" column, largest first, to choose the cutoff used in the next cell.
merged_data.var["mean"].sort_values(ascending= False)

In [None]:
# Drop genes whose var["mean"] is 20 or more, then re-select variable genes.
# `.copy()` materialises the subset (as the earlier filtering cells do):
# `highly_variable_genes` annotates the object it is given, and writing to an
# AnnData *view* makes anndata copy it implicitly with a warning.
merged_data = merged_data[:, merged_data.var["mean"] < 20].copy()

sc.pp.highly_variable_genes(
    merged_data,
    n_top_genes=2000,
    flavor="seurat_v3",
)

sc.pl.highly_variable_genes(
    merged_data,
    save=True,
    log=True,
)

In [None]:
# Per-cell sum against detected, coloured by subsets_mito_percent.
sc.pl.scatter(
    merged_data,
    x="sum",
    y="detected",
    color="subsets_mito_percent",
    color_map="viridis",
    save="_others_sum_vs_detected.pdf",
)

In [None]:
# Checkpoint the preprocessed object to disk.
merged_data.write_h5ad("data/merged_w_others_preproc.h5ad")

In [None]:
# Submit scripts/others_gs_sbatch.sh to SLURM (48 CPUs, "himem" partition).
# NOTE(review): presumably this job produces the grid-search pickle loaded further down — confirm.
!sbatch -c48 -p himem scripts/others_gs_sbatch.sh

In [None]:
# Reload the preprocessed checkpoint written above.
merged_data = sc.read_h5ad("data/merged_w_others_preproc.h5ad")

In [None]:
# Load the clustered checkpoint; this replaces whatever `merged_data` held before.
# NOTE(review): this file is only written further down in this notebook, so on a
# fresh top-to-bottom run it will not exist yet unless something else creates it — confirm.
merged_data = sc.read_h5ad("data/merged_w_others_filt.h5ad")
# Summary table of a Cox model from a sibling project; its "covariate" column is used later.
cox_model = pd.read_csv("../ewing_survival/outputs/cox_model_summary.csv")

In [None]:
# Build three derived layers while leaving `.X` untouched. Each step works on a
# full copy of the object (`copy= True`) and keeps only the transformed matrix.

# "norm": total-count normalised `.X`.
# NOTE(review): `key_added` is applied to the temporary copy, of which only `.X`
# is kept, so "norm_factor" presumably never reaches `merged_data.obs` — confirm.
merged_data.layers["norm"] = sc.pp.normalize_total(
    merged_data,
    copy= True,
    exclude_highly_expressed= True,
    key_added= "norm_factor",
).X

# "log": log1p of the "norm" layer.
merged_data.layers["log"] = sc.pp.log1p(
    merged_data,
    copy= True,
    layer= "norm", 
).layers["norm"]


# "norm_scaled_genes": the "log" layer passed through `sc.pp.scale`.
merged_data.layers["norm_scaled_genes"] = sc.pp.scale(
    merged_data,
    copy= True,
    layer= "log"
).layers["log"]

In [None]:
class ScPCA(skbase.TransformerMixin, skbase.BaseEstimator):
    """Stateless scikit-learn step wrapping ``sc.pp.pca``.

    Parameters
    ----------
    layer
        Forwarded to ``sc.pp.pca`` as ``layer``.
    n_comps
        Forwarded to ``sc.pp.pca`` as ``n_comps``.
    mask
        Forwarded to ``sc.pp.pca`` as ``mask_var``.
    """

    def __init__(self, layer=None, n_comps=None, mask=None):
        self.layer = layer
        self.n_comps = n_comps
        self.mask = mask

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` so the step can sit in a pipeline."""
        return self

    def transform(self, X):
        """Return what ``sc.pp.pca`` yields for ``X`` with ``copy=True``."""
        pca_kwargs = dict(
            layer=self.layer,
            mask_var=self.mask,
            n_comps=self.n_comps,
            copy=True,
        )
        return sc.pp.pca(X, **pca_kwargs)

class ScNeighbors(skbase.TransformerMixin, skbase.BaseEstimator):
    """Stateless scikit-learn step wrapping ``sc.pp.neighbors``.

    Parameters
    ----------
    n_neighbors
        Forwarded to ``sc.pp.neighbors`` (default 15).
    n_pcs
        Forwarded to ``sc.pp.neighbors`` (default ``None``).
    """

    def __init__(self, n_neighbors=15, n_pcs=None):
        self.n_neighbors = n_neighbors
        self.n_pcs = n_pcs

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` so the step can sit in a pipeline."""
        return self

    def transform(self, X):
        """Return what ``sc.pp.neighbors`` yields for ``X`` with ``copy=True``."""
        neighbor_kwargs = dict(
            n_pcs=self.n_pcs,
            n_neighbors=self.n_neighbors,
            copy=True,
        )
        return sc.pp.neighbors(X, **neighbor_kwargs)
    
class ScLeiden(skbase.TransformerMixin, skbase.BaseEstimator):
    """Stateless scikit-learn step wrapping ``sc.tl.leiden`` (igraph flavour).

    Parameters
    ----------
    resolution
        Forwarded to ``sc.tl.leiden`` (default 1).
    """

    def __init__(self, resolution=1):
        self.resolution = resolution

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` so the step can sit in a pipeline."""
        return self

    def transform(self, X):
        """Return what ``sc.tl.leiden`` yields for ``X`` with ``copy=True``."""
        leiden_kwargs = dict(
            flavor="igraph",
            resolution=self.resolution,
            copy=True,
        )
        return sc.tl.leiden(X, **leiden_kwargs)

class ScScore(skbase.TransformerMixin, skbase.BaseEstimator):
    """Scoring step: silhouette score of the "leiden" labels in PCA space."""

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def score(estimator, X, y=None, sample_weight=None):
        """Silhouette score of ``X.obs["leiden"]`` over ``X.obsm["X_pca"]``.

        ``estimator`` receives the instance when this is called as a bound
        method. ``y`` and ``sample_weight`` are accepted but not used.
        """
        embedding = X.obsm["X_pca"]
        cluster_labels = X.obs["leiden"]
        return skm.silhouette_score(embedding, labels=cluster_labels)

In [None]:
# Load the pickled grid-search result object.
# NOTE(review): unpickling executes arbitrary code — only load pickles produced by
# your own trusted run. The pickle presumably references the Sc* classes defined
# above, in which case that cell must have been run first — confirm.
with open("pickles/gridsearch_others_1000", mode= "br") as f:
    grids = pickle.load(f)

In [None]:
# Best hyper-parameter combination found by the search.
grids.best_params_

In [None]:
# Flatten the search results into one row per evaluated candidate.
# Resolution is rounded to two decimals.
grids_df = pd.DataFrame(
    {
        "n_neighbors": grids.cv_results_["param_scneighbors__n_neighbors"],
        "n_comps": grids.cv_results_["param_scpca__n_comps"],
        "resolution": grids.cv_results_["param_scleiden__resolution"],
        "iter": grids.cv_results_["iter"],
        "mean_test_score": grids.cv_results_["mean_test_score"],
    }
)
grids_df["resolution"] = grids_df["resolution"].round(2)

In [None]:
# One figure per pair of tuned hyper-parameters, one heatmap panel per search iteration.
combos = itertools.combinations(
    grids_df.columns.drop(["mean_test_score", "iter"]),
    r=2,
)
for combo in combos:
    # All-NaN pivot spanning every (combo[0], combo[1]) pair seen anywhere in the
    # search; each iteration's scores are filled into it so all panels share one grid.
    dummy = grids_df.groupby(
        by=list(combo)
    ).agg(lambda x: np.nan).pivot_table(
        columns=combo[0],
        index=combo[1],
        values="mean_test_score",
        dropna=False,
    )
    fig, axs = plt.subplots(
        ncols=grids_df["iter"].nunique(),
        figsize=(15, 5),
        # Always get a 2-D array of axes; without this a single-iteration search
        # yields a bare Axes object and the indexing below raises.
        squeeze=False,
    )
    # Panels are positioned by the iteration number itself, as before.
    # `data` is kept as a notebook-level name because later cells read it.
    for i in grids_df["iter"].unique():
        data = grids_df.loc[grids_df["iter"] == i]
        sns.heatmap(
            dummy.fillna(data.groupby(
                by=list(combo)
            ).mean().pivot_table(
                columns=combo[0],
                index=combo[1],
                values="mean_test_score"
            )),
            cmap="cet_rainbow4",
            ax=axs[0, i],
        )
        axs[0, i].set_title("iter {}".format(i))
    fig.tight_layout()
    fig.savefig("figures/others_{}_{}_heatmap.pdf".format(combo[0], combo[1]), bbox_inches="tight")
    plt.show()

In [None]:
# Score distribution per resolution for the final search iteration.
# `data` used to be whatever the heatmap loop in the previous cell left behind;
# select it explicitly so this cell does not depend on that leftover.
data = grids_df.loc[grids_df["iter"] == grids_df["iter"].max()]
ax = sns.boxplot(
    data,
    x="resolution",
    y="mean_test_score",
)
# Overlay the individual candidates on the boxes.
sns.stripplot(
    data,
    x="resolution",
    y="mean_test_score",
    alpha=0.6,
    size=3,
    ax=ax,
)

In [None]:
def make_heatmap_df(data, **kwargs):
    """Draw a resolution x n_neighbors heatmap of ``mean_test_score``.

    ``data`` needs the columns "n_neighbors", "resolution" and
    "mean_test_score". It is pivoted (rows = resolution, columns =
    n_neighbors, ``dropna=False``) and passed to ``sns.heatmap`` with
    annotated, square cells. Extra keyword arguments go to ``sns.heatmap``
    unchanged. Returns ``None``.
    """
    pivoted = data.pivot_table(
        index="resolution",
        columns="n_neighbors",
        values="mean_test_score",
        dropna=False,
    )
    sns.heatmap(pivoted, cmap=cc.rainbow4, annot=True, square=True, **kwargs)

# One heatmap facet per n_comps value; a shared vmin/vmax keeps colours comparable.
fg = sns.FacetGrid(
    data=data,
    col="n_comps",
    col_wrap=2,
    height=6,
    aspect=2,
    sharex=False,
    sharey=False,
)
fg.map_dataframe(
    make_heatmap_df,
    vmax=data["mean_test_score"].max(),
    vmin=data["mean_test_score"].min(),
)

In [None]:
# Interactive 3-D view of the final search iteration, coloured by score.
last_iter = grids_df["iter"].max()
data = grids_df.loc[grids_df["iter"] == last_iter]
fig = px.scatter_3d(
    data,
    x="n_comps",
    y="n_neighbors",
    z="resolution",
    color="mean_test_score",
    color_continuous_scale=cc.rainbow4,
)
fig.show()

In [None]:
# PCA on the scaled layer, restricted to highly variable genes, with the tuned
# number of components; then the variance explained per component.
sc.pp.pca(
    merged_data,
    n_comps=grids.best_params_["scpca__n_comps"],
    mask_var="highly_variable",
    layer="norm_scaled_genes",
)

sc.pl.pca_variance_ratio(merged_data, log=True)

In [None]:
# First eight components as four 2-D projections, coloured by subsets_mito_percent.
sc.pl.pca(
    merged_data,
    color="subsets_mito_percent",
    dimensions=[(0, 1), (2, 3), (4, 5), (6, 7)],
    ncols=2,
)

In [None]:
# Neighbour graph and Leiden clustering with the tuned parameters; the cell's
# output is the silhouette score of the clusters in PCA space.
sc.pp.neighbors(
    merged_data,
    n_neighbors=grids.best_params_["scneighbors__n_neighbors"],
)
sc.tl.leiden(
    merged_data,
    resolution=grids.best_params_["scleiden__resolution"],
    flavor="igraph",
)
skm.silhouette_score(merged_data.obsm["X_pca"], labels=merged_data.obs["leiden"])

In [None]:
# 2-D UMAP embedding computed from the neighbour graph.
sc.tl.umap(merged_data, spread=1.1, min_dist=0.5)

In [None]:
# Checkpoint the clustered object, then immediately read it back from disk.
merged_data.write_h5ad("data/merged_w_others_filt.h5ad")
merged_data = sc.read_h5ad("data/merged_w_others_filt.h5ad")

In [None]:
# log10 of the per-cell "sum" column, used for colouring the UMAPs below.
merged_data.obs["sum_log"] = np.log10(merged_data.obs["sum"])

In [None]:
# UMAP overview coloured by sample metadata, QC metrics and clusters; saved as PNG.
with plotting_context("talk"):
    fig = sc.pl.umap(
        merged_data,
        color=[
            "tissue_location",
            "disease_timing",
            "subsets_mito_percent",
            "sum_log",
            "leiden",
            "diagnosis",
            "sample_id",
        ],
        gene_symbols="gene_symbol",
        palette=cc.glasbey_category10,
        cmap="jet",
        vmin=0.1,
        alpha=0.6,
        size=5,
        ncols=2,
        wspace=0.4,
        return_fig=True,
    )
    # Panel titles: underscores become spaces, then Title Case.
    for ax in fig.axes:
        ax.set_title(ax.get_title().replace("_", " ").title())
    fig.savefig("figures/overview_others_umap.png", bbox_inches="tight")

In [None]:
# UMAP of the genes of interest (scaled layer) next to timing, cluster and diagnosis.
# NOTE(review): a later cell saves to the same "figures/others_goi_umap.png" path,
# so this file is presumably overwritten — confirm.
with plotting_context("talk"):
    fig = sc.pl.umap(
        merged_data,
        color=[
            "disease_timing",
            "leiden",
            "MACROH2A1",
            "MACROH2A2",
            "KDM1A",
            "FLI1",
            "diagnosis",
        ],
        layer="norm_scaled_genes",
        gene_symbols="gene_symbol",
        palette=cc.glasbey_category10,
        cmap=custom_cm,
        vmin=0.1,
        alpha=0.6,
        size=5,
        ncols=2,
        wspace=0.4,
        return_fig=True,
    )
    # Only titles containing an underscore are prettified; others stay verbatim.
    for ax in fig.axes:
        if "_" not in ax.get_title():
            continue
        ax.set_title(ax.get_title().replace("_", " ").title())
    fig.savefig("figures/others_goi_umap.png", bbox_inches="tight")

In [None]:
# Wilcoxon marker genes per Leiden cluster on the "log" layer (highly variable
# genes only), followed by scanpy's default marker filtering.
sc.tl.rank_genes_groups(
    merged_data,
    groupby="leiden",
    method="wilcoxon",
    layer="log",
    mask_var="highly_variable",
)

sc.tl.filter_rank_genes_groups(merged_data)

In [None]:
# Top-ranked genes per cluster, labelled by gene symbol.
sc.pl.rank_genes_groups(merged_data, key="rank_genes_groups", gene_symbols="gene_symbol")

In [None]:
# Dot plot of the top five marker genes per Leiden cluster (scaled layer,
# diverging colours centred on zero).
sc.pl.rank_genes_groups_dotplot(
    merged_data,
    groupby="leiden",
    n_genes=5,
    layer="norm_scaled_genes",
    gene_symbols="gene_symbol",
    cmap="bwr",
    vcenter=0,
    save="rank_genes_disease_timing.png",
)

In [None]:
# Manual annotation of selected Leiden clusters. The cluster labels are hard-coded,
# so they only apply to the clustering these numbers were read from.
idents = {
    "5": "Lung Epithilial",
    "6": "Immune",
    "7": "Endothelial",
    "4": "Fibroblasts",
}
# A cell counts as tumor when its cluster is not one of the annotated clusters above.
merged_data.obs["tumor"] = ~merged_data.obs["leiden"].isin(idents.keys())
# "idents" starts as the cluster label and gets the names substituted where mapped.
merged_data.obs["idents"] = merged_data.obs["leiden"]
merged_data.obs["idents"] = merged_data.obs["idents"].replace(idents)
merged_data.obs["idents"].unique()

In [None]:
# Symbols of the five top-ranked genes of cluster "7".
# `cluster_genes` is not referenced by the plot call below.
cluster_genes = sc.get.rank_genes_groups_df(merged_data, group="7").head()["names"]
cluster_genes = merged_data.var.loc[cluster_genes, "gene_symbol"]
# UMAP coloured by the manual annotation.
sc.pl.umap(
    merged_data,
    color=["idents"],
    gene_symbols="gene_symbol",
    palette=cc.glasbey_category10,
    cmap=custom_cm,
    vmin=0.1,
    alpha=0.6,
    size=5,
    ncols=3,
)

In [None]:
# Second pass with hand-picked parameters: PCA with 11 components on the scaled
# layer (highly variable genes only), then the variance explained per component.
sc.pp.pca(
    merged_data,
    n_comps=11,
    mask_var="highly_variable",
    layer="norm_scaled_genes",
)

sc.pl.pca_variance_ratio(merged_data, log=True)

In [None]:
# First eight components as four 2-D projections, coloured by subsets_mito_percent.
sc.pl.pca(
    merged_data,
    color="subsets_mito_percent",
    dimensions=[(0, 1), (2, 3), (4, 5), (6, 7)],
    ncols=2,
)

In [None]:
# Neighbour graph (45 neighbours) and Leiden clustering (resolution 0.76); the
# cell's output is the silhouette score of the clusters in PCA space.
sc.pp.neighbors(merged_data, n_neighbors=45)
sc.tl.leiden(merged_data, resolution=0.76, flavor="igraph")
skm.silhouette_score(merged_data.obsm["X_pca"], labels=merged_data.obs["leiden"])

In [None]:
# Recompute the UMAP embedding for the new neighbour graph.
sc.tl.umap(merged_data, spread=1.1, min_dist=0.5)

In [None]:
# Symbols of the five top-ranked genes of cluster "7".
# `cluster_genes` is not referenced by the plot call below.
# NOTE(review): the marker ranking read here presumably predates the re-clustering
# above and is only recomputed in a later cell — confirm.
cluster_genes = sc.get.rank_genes_groups_df(merged_data, group="7").head()["names"]
cluster_genes = merged_data.var.loc[cluster_genes, "gene_symbol"]
# Annotation and new clusters side by side, with neighbour-graph edges drawn.
sc.pl.umap(
    merged_data,
    color=["idents", "leiden"],
    gene_symbols="gene_symbol",
    palette=cc.glasbey_category10,
    cmap=custom_cm,
    vmin=0.1,
    alpha=0.6,
    size=5,
    ncols=3,
    edges=True,
    edges_width=0.01,
)

In [None]:
# Recompute Wilcoxon markers for the new Leiden clusters ("log" layer, highly
# variable genes only) and apply scanpy's default marker filtering.
sc.tl.rank_genes_groups(
    merged_data,
    groupby="leiden",
    method="wilcoxon",
    layer="log",
    mask_var="highly_variable",
)

sc.tl.filter_rank_genes_groups(merged_data)

In [None]:
# Top-ranked genes per cluster, labelled by gene symbol.
sc.pl.rank_genes_groups(merged_data, key="rank_genes_groups", gene_symbols="gene_symbol")

In [None]:
# Cluster dendrogram, then a dot plot of the top five markers per cluster
# (scaled layer, genes on rows, diverging colours centred on zero).
sc.tl.dendrogram(merged_data, groupby="leiden")
sc.pl.rank_genes_groups_dotplot(
    merged_data,
    groupby="leiden",
    n_genes=5,
    layer="norm_scaled_genes",
    gene_symbols="gene_symbol",
    swap_axes=True,
    cmap="bwr",
    vcenter=0,
)

In [None]:
# Manual annotation of every Leiden cluster from the second clustering pass.
# The labels are hard-coded, so they only apply to the clustering they were read
# from. Several clusters share a name ("9"/"12", "2"/"10").
idents = {
    "5": "Lung Epithilial",
    "7": "Immune",
    "8": "Endothelial",
    "6": "Fibroblasts",
    "0": "Fibrous Ewing",
    "2": "Neural Ewing",
    "13": "Skeletal Muscle",
    "15": "Neural Rhabdo",
    "14": "Initial Rhabdo",
    "16": "Mesenchymal Rhabdo",
    "9": "Ribosomal Rhabdo",
    "12": "Ribosomal Rhabdo",
    "18": "Ribosomal Ewing",
    "3": "FGF Ribo Ewing",
    "4": "Neural Ribo Ewing",
    "10": "Neural Ewing",
    "11": "Initial DSRCT",
    "17": "Recurrence DSRCT",
    "1": "Recurrence Ewing",
}
# Tumor = every cluster except the five listed here (lung epithelial, immune,
# endothelial, fibroblast and skeletal-muscle clusters in the mapping above).
merged_data.obs["tumor"] = ~merged_data.obs["leiden"].isin(["5", "7", "8", "6", "13"])
# "idents" starts as the cluster label and gets the names substituted.
merged_data.obs["idents"] = merged_data.obs["leiden"]
merged_data.obs["idents"] = merged_data.obs["idents"].replace(idents)
merged_data.obs["idents"].unique()

In [None]:
# UMAP overview after annotation: metadata, clusters, identities and QC metrics.
with plotting_context("talk"):
    fig = sc.pl.umap(
        merged_data,
        color=[
            "disease_timing",
            "diagnosis",
            "tissue_location",
            "sample_id",
            "leiden",
            "idents",
            "subsets_mito_percent",
            "sum_log",
        ],
        layer="norm_scaled_genes",
        gene_symbols="gene_symbol",
        palette=cc.glasbey_category10,
        cmap="jet",
        vmin=0.1,
        alpha=0.6,
        size=5,
        ncols=2,
        wspace=0.5,
        return_fig=True,
    )
    # Panel titles: underscores become spaces, then Title Case.
    for ax in fig.axes:
        ax.set_title(ax.get_title().replace("_", " ").title())

In [None]:
# UMAP of the genes of interest (scaled layer) next to timing and identities;
# written to "figures/others_goi_umap.png".
with plotting_context("talk"):
    fig = sc.pl.umap(
        merged_data,
        color=[
            "disease_timing",
            "idents",
            "MACROH2A1",
            "MACROH2A2",
            "FLI1",
            "STAG2",
            "KDM1A",
            "FTL",
        ],
        layer="norm_scaled_genes",
        gene_symbols="gene_symbol",
        palette=cc.glasbey_category10,
        cmap=custom_cm,
        vmin=0.1,
        alpha=0.6,
        size=5,
        ncols=2,
        wspace=0.5,
        return_fig=True,
    )
    # Only titles containing an underscore are prettified; others stay verbatim.
    for ax in fig.axes:
        if "_" not in ax.get_title():
            continue
        ax.set_title(ax.get_title().replace("_", " ").title())
    fig.savefig("figures/others_goi_umap.png", bbox_inches="tight")

In [None]:
# Tumor-only subset. `.copy()` makes it an independent object: later cells write
# to it (dendrograms, rank_genes_groups, `uns` edits), and writing to an AnnData
# view forces an implicit copy with a warning.
tumor = merged_data[merged_data.obs["tumor"]].copy()

In [None]:
# Two panels for the tumor cells: gene-of-interest dot plot by disease timing
# (left) and the UMAP coloured by disease timing (right).
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
with plotting_context("notebook"):
    sc.pl.dotplot(
        tumor,
        var_names=[
            "MACROH2A1",
            "MACROH2A2",
            "FLI1",
            "STAG2",
            "KDM1A",
            "FTL",
        ],
        groupby="disease_timing",
        layer="norm_scaled_genes",
        gene_symbols="gene_symbol",
        dendrogram=True,
        swap_axes=True,
        cmap="bwr",
        vcenter=0,
        ax=axs[0],
        show=False,
    )
    sc.pl.umap(
        tumor,
        color="disease_timing",
        palette=cc.glasbey_category10,
        alpha=0.6,
        size=5,
        ax=axs[1],
    )
    fig

In [None]:
# Dendrogram over diagnoses, then a gene-of-interest dot plot grouped by diagnosis.
sc.tl.dendrogram(tumor, groupby="diagnosis")
with plotting_context("notebook"):
    sc.pl.dotplot(
        tumor,
        var_names=[
            "FLI1",
            "STAG2",
            "MACROH2A1",
            "MACROH2A2",
            "KDM1A",
            "FTL",
        ],
        groupby="diagnosis",
        layer="norm_scaled_genes",
        gene_symbols="gene_symbol",
        dendrogram=True,
        cmap="bwr",
        vcenter=0,
        save="tumor_goi_diagnosis_matrix.png",
    )

In [None]:
# Gene-of-interest UMAP restricted to tumor cells; saved as "figures/tumor.png".
with plotting_context("talk"):
    fig = sc.pl.umap(
        tumor,
        color=[
            "disease_timing",
            "leiden",
            "MACROH2A1",
            "MACROH2A2",
            "KDM1A",
            "FLI1",
            "diagnosis",
        ],
        layer="norm_scaled_genes",
        gene_symbols="gene_symbol",
        palette=cc.glasbey_category10,
        cmap=custom_cm,
        vmin=0.1,
        alpha=0.6,
        size=5,
        ncols=2,
        wspace=0.4,
        return_fig=True,
    )
    # Only titles containing an underscore are prettified; others stay verbatim.
    for ax in fig.axes:
        if "_" not in ax.get_title():
            continue
        ax.set_title(ax.get_title().replace("_", " ").title())
    fig.savefig("figures/tumor.png", bbox_inches="tight")

In [None]:
# Remove the filtered marker table before the object is written to disk.
# The key is dropped rather than set to None so the write in the next cell does
# not depend on whether the installed anndata can serialise a None entry in
# `uns`. A missing key is tolerated, so the cell is safe to re-run.
tumor.uns.pop("rank_genes_groups_filtered", None)

In [None]:
# Save the tumor subset; the path is passed to the batch script in the next cell.
tumor.write_h5ad("data/tumor.h5ad")

In [None]:
# Submit the same batch script for the tumor subset (48 CPUs, "himem" partition).
# NOTE(review): `-a` looks like the input AnnData path and `-p tumor` an output
# prefix — confirm against scripts/others_gs_sbatch.sh.
!sbatch -c48 -p himem scripts/others_gs_sbatch.sh -a data/tumor.h5ad -p tumor

In [None]:
# Whitespace-separated gene list from gene_lists/CDH1_up.txt, reduced to the
# symbols that exist in this dataset.
cdh1_up = []
with open("gene_lists/CDH1_up.txt") as f:
    cdh1_up = pd.Series(f.read().split())
in_dataset = cdh1_up.isin(merged_data.var["gene_symbol"])
cdh1_up = cdh1_up[in_dataset]

In [None]:
# Whitespace-separated gene list from gene_lists/CDH1_dn.txt, reduced to the
# symbols that exist in this dataset.
cdh1_dn = []
with open("gene_lists/CDH1_dn.txt") as f:
    cdh1_dn = pd.Series(f.read().split())
in_dataset = cdh1_dn.isin(merged_data.var["gene_symbol"])
cdh1_dn = cdh1_dn[in_dataset]

In [None]:
# Gene-of-interest dot plot per sample (tumor cells, scaled layer).
with plotting_context("notebook"):
    sc.pl.dotplot(
        tumor,
        var_names=["FLI1", "KDM1A", "MACROH2A1", "MACROH2A2"],
        groupby="sample_id",
        layer="norm_scaled_genes",
        gene_symbols="gene_symbol",
        dendrogram=True,
        cmap="bwr",
        vcenter=0,
        save="other_survival_sample_matrix.png",
    )

In [None]:
# Two panels for the tumor cells: gene-of-interest dot plot per Leiden cluster
# (left) and the UMAP coloured by cluster (right).
sc.tl.dendrogram(tumor, groupby="leiden")
fig, axs = plt.subplots(ncols=2, figsize=(10, 5))
with plotting_context("notebook"):
    sc.pl.dotplot(
        tumor,
        var_names=["FLI1", "KDM1A", "MACROH2A1", "MACROH2A2"],
        groupby="leiden",
        layer="norm_scaled_genes",
        gene_symbols="gene_symbol",
        dendrogram=True,
        cmap="bwr",
        vcenter=0,
        ax=axs[0],
        show=False,
    )
    sc.pl.umap(
        tumor,
        color="leiden",
        palette=cc.glasbey_category10,
        alpha=0.6,
        size=5,
        ax=axs[1],
    )

In [None]:
# Cox-model covariates plus KDM1A and CDH1, per sample (tumor cells).
with plotting_context("notebook"):
    sc.pl.dotplot(
        tumor,
        var_names=[*cox_model["covariate"], "KDM1A", "CDH1"],
        groupby="sample_id",
        layer="norm_scaled_genes",
        gene_symbols="gene_symbol",
        dendrogram=True,
        cmap="bwr",
        vcenter=0,
        save="full_model_matrixplot_sampleid.pdf",
    )

In [None]:
# Cox-model covariates plus KDM1A and CDH1, per Leiden cluster (all cells).
with plotting_context("notebook"):
    sc.pl.dotplot(
        merged_data,
        var_names=[*cox_model["covariate"], "KDM1A", "CDH1"],
        groupby="leiden",
        layer="norm_scaled_genes",
        gene_symbols="gene_symbol",
        dendrogram=True,
        cmap="bwr",
        vcenter=0,
        save="survival_leiden.png",
    )

In [None]:
# Mean scaled expression of the CDH1-down gene list plus CDH1 and KDM1A per cluster.
# `np.concatenate` replaces `np.concat`, which only exists in NumPy 2.0 and later;
# the result is the same.
sc.pl.matrixplot(
    merged_data,
    groupby="leiden",
    var_names=np.concatenate([cdh1_dn, ["CDH1", "KDM1A"]]),
    gene_symbols="gene_symbol",
    layer="norm_scaled_genes",
    dendrogram=True,
    save="cdh1_dn.png",
    swap_axes=True,
)

In [None]:
# Per-cell heatmap of the CDH1-down gene list plus CDH1 and KDM1A, grouped by cluster.
# `np.concatenate` replaces `np.concat`, which only exists in NumPy 2.0 and later;
# the result is the same.
sc.pl.heatmap(
    merged_data,
    groupby="leiden",
    var_names=np.concatenate([cdh1_dn, ["CDH1", "KDM1A"]]),
    gene_symbols="gene_symbol",
    layer="norm_scaled_genes",
    cmap=custom_cm,
    vmax=10,
    save="cm_cdh1_dn.png",
)

In [None]:
# Cluster-mean scaled expression of the CDH1-down genes (genes on rows, clusters
# on columns), hierarchically clustered and saved.
in_cdh1_dn = merged_data.var["gene_symbol"].isin(cdh1_dn)
cm_df = merged_data[:, in_cdh1_dn].to_df("norm_scaled_genes")
cm_df.columns = cm_df.columns.map(merged_data.var["gene_symbol"])
cm_df["leiden"] = merged_data.obs["leiden"]

cluster_means = cm_df.groupby("leiden").mean()
g = sns.clustermap(cluster_means.T, cmap="viridis")
g.savefig("figures/cdh1_dn_cm.png")

In [None]:
# Cluster-mean scaled expression of the CDH1-up genes (genes on rows, clusters
# on columns), hierarchically clustered and saved.
in_cdh1_up = merged_data.var["gene_symbol"].isin(cdh1_up)
cm_df = merged_data[:, in_cdh1_up].to_df("norm_scaled_genes")
cm_df.columns = cm_df.columns.map(merged_data.var["gene_symbol"])
cm_df["leiden"] = merged_data.obs["leiden"]
cluster_means = cm_df.groupby("leiden").mean()
g = sns.clustermap(cluster_means.T, cmap="viridis")
g.savefig("figures/cdh1_up_cm.png")

In [None]:
# Marker heatmap per cluster with n_genes=-10.
# NOTE(review): a negative n_genes presumably selects the bottom-ranked genes — confirm.
sc.pl.rank_genes_groups_heatmap(
    merged_data,
    groupby="leiden",
    n_genes=-10,
    layer="norm_scaled_genes",
    gene_symbols="gene_symbol",
    standard_scale="var",
    cmap="viridis",
    figsize=(10, 10),
)

In [None]:
# Tumor cells: Wilcoxon markers per disease timing, stored under
# "timing_rank_genes"; the filtered version goes to "filtered_timing_rank_genes".
sc.tl.rank_genes_groups(
    tumor,
    groupby="disease_timing",
    method="wilcoxon",
    layer="log",
    mask_var="highly_variable",
    key_added="timing_rank_genes",
)

sc.tl.filter_rank_genes_groups(
    tumor,
    groupby="disease_timing",
    key="timing_rank_genes",
    key_added="filtered_timing_rank_genes",
)

sc.tl.dendrogram(tumor, groupby="disease_timing")

In [None]:
# Top-ranked genes per disease timing, labelled by gene symbol.
sc.pl.rank_genes_groups(tumor, key="timing_rank_genes", gene_symbols="gene_symbol")

In [None]:
# Dot plot of the top 20 genes per disease timing (tumor cells, scaled layer).
sc.pl.rank_genes_groups_dotplot(
    tumor,
    groupby="disease_timing",
    key="timing_rank_genes",
    n_genes=20,
    layer="norm_scaled_genes",
    gene_symbols="gene_symbol",
    cmap="bwr",
)

In [None]:
# Seven top-ranked "Initial diagnosis" genes, shown on the tumor UMAP by symbol.
top_initial = sc.get.rank_genes_groups_df(
    tumor,
    group="Initial diagnosis",
    key="timing_rank_genes",
)
cluster_genes = top_initial.head(7)["names"]
cluster_genes = tumor.var.loc[cluster_genes, "gene_symbol"]
sc.pl.umap(
    tumor,
    color=[*cluster_genes, "leiden", "disease_timing"],
    gene_symbols="gene_symbol",
    cmap=custom_cm,
    vmin=0.1,
    ncols=3,
)

In [None]:
# Seven top-ranked "Recurrence" genes, shown on the tumor UMAP by symbol.
top_recurrence = sc.get.rank_genes_groups_df(
    tumor,
    group="Recurrence",
    key="timing_rank_genes",
)
cluster_genes = top_recurrence.head(7)["names"]
cluster_genes = tumor.var.loc[cluster_genes, "gene_symbol"]
sc.pl.umap(
    tumor,
    color=[*cluster_genes, "leiden", "disease_timing"],
    gene_symbols="gene_symbol",
    cmap=custom_cm,
    vmin=0.1,
    ncols=3,
)

In [None]:
# All cells: Wilcoxon markers per diagnosis, stored under "diag_rank_genes";
# the filtered version goes to "filtered_diag_rank_genes".
sc.tl.rank_genes_groups(
    merged_data,
    groupby="diagnosis",
    method="wilcoxon",
    layer="log",
    mask_var="highly_variable",
    key_added="diag_rank_genes",
)

sc.tl.filter_rank_genes_groups(
    merged_data,
    groupby="diagnosis",
    key="diag_rank_genes",
    key_added="filtered_diag_rank_genes",
)

sc.tl.dendrogram(merged_data, groupby="diagnosis")

In [None]:
# Top-ranked genes per diagnosis, labelled by gene symbol.
sc.pl.rank_genes_groups(merged_data, key="diag_rank_genes", gene_symbols="gene_symbol")

In [None]:
# Dot plot of the top 20 genes per diagnosis (scaled layer).
sc.pl.rank_genes_groups_dotplot(
    merged_data,
    groupby="diagnosis",
    key="diag_rank_genes",
    n_genes=20,
    layer="norm_scaled_genes",
    gene_symbols="gene_symbol",
)

In [None]:
# Symbols of the top 20 ranked genes for each disease-timing group.
# The "timing_rank_genes" result was computed on `tumor`, so it is read from
# `tumor.uns`; `merged_data.uns` only received the diagnosis ranking.
gene_list = [
    tumor.uns["timing_rank_genes"]["names"][:20][timing]
    for timing in ["Initial diagnosis", "Recurrence"]
]
# `np.concatenate` rather than `np.concat`, which only exists in NumPy 2.0 and later.
gene_list = np.concatenate(gene_list)
gene_list = tumor.var.loc[gene_list, "gene_symbol"].reset_index(drop=True)
gene_list

In [None]:
# Score each cell on the combined CDH1 up/down gene lists.
# The boolean mask selects the matching var names. Previously `.isin(...).index`
# was passed, which is the index of *every* gene, so all genes were scored.
is_signature = merged_data.var["gene_symbol"].isin(pd.concat([cdh1_dn, cdh1_up]))
sc.tl.score_genes(
    merged_data,
    merged_data.var_names[is_signature.to_numpy()],
    layer="norm_scaled_genes",
)

In [None]:
# Pseudobulk: aggregate cells per (leiden, disease_timing) group, keeping both
# the summed values and the number of non-zero entries as layers.
pb_data = sc.get.aggregate(
    merged_data,
    by=["leiden", "disease_timing"],
    func=["sum", "count_nonzero"],
)

In [None]:
# Show a summary of the pseudobulk object.
pb_data

In [None]:
# Exclude cluster "11" from the pseudobulk analysis. `.copy()` makes the subset
# an independent object: the next cells reassign its `.X` and hand it to DESeq2,
# and writing to an AnnData view triggers an implicit copy with a warning.
pb_data = pb_data[~pb_data.obs["leiden"].isin(["11"])].copy()

In [None]:
# Non-zero counts across pseudobulk samples for 30 randomly chosen genes.
# NOTE(review): no random_state is set, so the genes differ on every run.
nonzero_counts = pb_data.to_df("count_nonzero")
ax = sns.boxplot(nonzero_counts.sample(30, axis=1))
# Tilt the gene labels so they stay readable.
_ = ax.set_xticks(ax.get_xticks(), ax.get_xticklabels(), rotation=40, ha="right")
#ax.set_yscale("log")

In [None]:
# Make the summed layer the main matrix: `.X` is cleared first, then set to the "sum" layer.
pb_data.X = None
pb_data.X = pb_data.layers["sum"]

In [None]:
# DESeq2 dataset on the pseudobulk data, modelling disease timing and cluster.
dds = DDS(adata=pb_data, design="~ disease_timing + leiden")

In [None]:
# Run the DESeq2 fitting pipeline on the dataset.
dds.deseq2()

In [None]:
# Recurrence vs. Initial diagnosis contrast. The results table gets a gene-symbol
# column and a "-log(p)" column holding -log10 of the adjusted p-value.
timing_contrast = ["disease_timing", "Recurrence", "Initial diagnosis"]
ds = DS(dds, contrast=timing_contrast)
ds.summary()
rdf = ds.results_df
rdf["gene"] = pb_data.var["gene_symbol"]
rdf["-log(p)"] = -np.log10(rdf["padj"])

In [None]:
# Volcano-style scatter: fold change against -log10 adjusted p-value.
sns.scatterplot(rdf, y="-log(p)", x="log2FoldChange")

In [None]:
# Significant = |log2FoldChange| >= 1.5 and adjusted p < 0.05; the cell's output is the count.
large_effect = rdf["log2FoldChange"].abs() >= 1.5
rdf["sig"] = large_effect & (rdf["padj"] < 0.05)
rdf["sig"].sum()

In [None]:
# Significant genes ordered by fold change, without entries whose label is still
# a raw "ENSG000..." identifier.
plot_df = rdf.loc[rdf["sig"]].sort_values("log2FoldChange", ascending=False)
is_ensembl_label = plot_df["gene"].str.startswith("ENSG000")
plot_df = plot_df.loc[~is_ensembl_label]
plot_df

In [None]:
# One row per significant gene: fold change on x, coloured by -log10 adjusted p.
with plotting_context("talk"):
    fig, ax = plt.subplots(figsize=(10, 13))
    ax = sns.scatterplot(
        plot_df,
        x="log2FoldChange",
        y="gene",
        hue="-log(p)",
        palette="viridis",
    )
    # Anchor the legend at the axes' upper-right corner.
    ax.legend(bbox_to_anchor=(1, 1))

In [None]:
# Expression table for GSEA: genes on rows (labelled by symbol), pseudobulk
# samples on columns, without genes whose total is zero.
gs_df = pb_data.to_df()
gs_df.columns = gs_df.columns.map(pb_data.var["gene_symbol"])
nonzero_total = gs_df.sum(axis=0) != 0
gs_df = gs_df.T.loc[nonzero_total]
gs_df

In [None]:
# GSEA on the Hallmark collection, with disease timing as the class labels.
gs_res = gseapy.gsea(
    data=gs_df,
    cls=pb_data.obs["disease_timing"],
    gene_sets=["MSigDB_Hallmark_2020"],
)

In [None]:
# Plot the first five terms of the GSEA result table.
ax = gs_res.plot(gs_res.res2d.Term[:5])


In [None]:
# GSEA result table.
gs_res.res2d

In [None]:
# Enrichr over-representation test of the Cox-model covariates against KEGG 2021.
gs = gseapy.enrichr(
    gene_list=cox_model["covariate"],
    gene_sets="KEGG_2021_Human",
    outdir="outputs",
)

In [None]:
# Enrichr result table for the Cox-model covariates.
gs.res2d

In [None]:
# Top five KEGG terms by adjusted p-value; cutoff=1 so no term is filtered out.
ax = gseapy.barplot(
    gs.results,
    title="KEGG",
    column="Adjusted P-value",
    cutoff=1,
    top_term=5,
)

In [None]:
# Same Enrichr test for the significant differential genes; output is the result table.
gs = gseapy.enrichr(
    gene_list=plot_df["gene"],
    gene_sets="KEGG_2021_Human",
    cutoff=1,
    outdir="outputs",
)
gs.res2d

In [None]:
# Top five KEGG terms by adjusted p-value; cutoff=1 so no term is filtered out.
ax = gseapy.barplot(
    gs.results,
    title="KEGG",
    column="Adjusted P-value",
    cutoff=1,
    top_term=5,
)

In [None]:
# Remove the "size_factors" and "replaceable" columns from the sample metadata
# before handing the object to WGCNA.
# NOTE(review): presumably these were added by the DESeq2 run above — confirm.
# `errors="ignore"` keeps the cell re-runnable: a second execution, or a run where
# the columns were never added, no longer raises KeyError.
pb_data.obs = pb_data.obs.drop(columns=["size_factors", "replaceable"], errors="ignore")

In [None]:
# Build a WGCNA object on the pseudobulk data; outputs go to outputs/wgcna.
# Requires the PyWGCNA package to be imported.
# NOTE(review): `species` is spelled "homo sapien" here; check whether PyWGCNA
# expects "homo sapiens" for this argument.
dt_wgcna = PyWGCNA.WGCNA(
    anndata= pb_data,
    name= "disease_timing",
    outputPath= "outputs/wgcna",
    species= "homo sapien"
)

In [None]:
# Run PyWGCNA's preprocessing step, then module detection.
dt_wgcna.preprocess()
dt_wgcna.findModules()

In [None]:
# Colours for the sample metadata shown in the WGCNA plots: one palette entry per
# cluster, and the last two palette entries for the two disease timings.
leiden_colors = dict(zip(dt_wgcna.geneExpr.obs["leiden"].unique(), cc.glasbey_category10))
dt_wgcna.setMetadataColor("leiden", leiden_colors)
timing_colors = {
    "Initial diagnosis": cc.glasbey_category10[-1],
    "Recurrence": cc.glasbey_category10[-2],
}
dt_wgcna.setMetadataColor("disease_timing", timing_colors)
# Fetch gene annotation (id, name, biotype) for the human Ensembl dataset and
# attach it to the WGCNA object.
geneList = PyWGCNA.getGeneList(
    dataset="hsapiens_gene_ensembl",
    attributes=["ensembl_gene_id", "external_gene_name", "gene_biotype"],
    maps=["gene_id", "gene_name", "gene_biotype"],
)
dt_wgcna.updateGeneInfo(geneList)

In [None]:
# Run PyWGCNA's downstream analysis on the detected modules.
dt_wgcna.analyseWGCNA()

In [None]:
# Annotation rows of every gene whose symbol contains "MACRO".
idx = merged_data.var["gene_symbol"].str.contains("MACRO")
merged_data.var[idx]

In [None]:
# UMAP of metadata plus MACROH2A1/2, FLI1 and STAG2 (scaled layer); the point
# size scales inversely with the number of cells.
with plotting_context("talk"):
    fig = sc.pl.umap(
        merged_data,
        color=[
            "tissue_location",
            "disease_timing",
            "sample_id",
            "leiden",
            "MACROH2A1",
            "MACROH2A2",
            "FLI1",
            "STAG2",
        ],
        layer="norm_scaled_genes",
        gene_symbols="gene_symbol",
        palette=cc.glasbey_category10,
        color_map=custom_cm,
        vmin=0.1,
        size=240000 / merged_data.n_obs,
        ncols=2,
        wspace=0.4,
        return_fig=True,
    )
    # Panel titles: underscores become spaces, then Title Case.
    for ax in fig.axes:
        ax.set_title(ax.get_title().replace("_", " ").title())
    fig.savefig("figures/MACRO_umap.png", bbox_inches="tight")

In [None]:
# Gene-of-interest dot plot by disease timing (all cells, genes on rows); saved as PNG.
with plotting_context("talk"):
    fig, ax = plt.subplots()
    sc.pl.dotplot(
        merged_data,
        var_names=["FLI1", "STAG2", "MACROH2A1", "MACROH2A2", "KDM1A"],
        groupby="disease_timing",
        layer="norm_scaled_genes",
        gene_symbols="gene_symbol",
        swap_axes=True,
        cmap="bwr",
        ax=ax,
    )
    fig.savefig("figures/timing_macro_dot.png", bbox_inches="tight")

In [None]:
# Gene-of-interest dot plot per sample (all cells, genes on rows); saved as PNG.
with plotting_context("talk"):
    fig, ax = plt.subplots()
    sc.pl.dotplot(
        merged_data,
        var_names=["FLI1", "STAG2", "MACROH2A1", "MACROH2A2", "KDM1A"],
        groupby="sample_id",
        layer="norm_scaled_genes",
        gene_symbols="gene_symbol",
        swap_axes=True,
        cmap="bwr",
        ax=ax,
    )
    fig.savefig("figures/sample_Macro_dot.png", bbox_inches="tight")