## Runs DE on different clusters. Also runs DE within each cluster for each condition.
Does this over each donor and combined

In [1]:
# Input info
se_cells_meta_f = "/data/Mito_Trace/output/pipeline/v02/CHIP_b1/MTBlacklist_A2/data/merged/MT/cellr_True/numread_200/filters/minC10_minR50_topN0_hetT0.001_hetC10_hetCount5_bq20/mgatk/vireoIn/clones/variants_init/knn/kparam_3/gff_A2_black/annotation_clones/se_cells_meta.tsv"
p_thresh = 0.1 
out_f = "/data/Mito_Trace/output/pipeline/v02/CHIP_b1/MTBlacklist_A2/data/merged/MT/cellr_True/numread_200/filters/minC10_minR50_topN0_hetT0.001_hetC10_hetCount5_bq20/mgatk/vireoIn/clones/variants_init/knn/kparam_3/gff_A2_black/annotation_clones/hypergeom_clone_clust/mincl.10_bothConds.False_p0.1/noInput_hypergeom.csv"

min_clone_size = 10

input_cond = "Input"
#conds_sep = False

In [2]:
import pandas as pd
import numpy as np
from os.path import join
from tqdm.notebook import tqdm

from scipy.stats import hypergeom, fisher_exact
from statsmodels.stats import multitest 

import seaborn as sns
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

## Data prep

In [3]:
cells_meta = pd.read_csv(se_cells_meta_f, sep="\t")
cells_meta = cells_meta.loc[~(cells_meta["name"]=="None")]
sizes = cells_meta.groupby("name").size().sort_values(ascending=False)

cells_meta = cells_meta.loc[~(cells_meta["condition"]==input_cond)]


name_cond_size = cells_meta.groupby(["name"]).size()
name_cond_size = name_cond_size[name_cond_size>min_clone_size]
clones_filt = name_cond_size.index 

sizes = sizes.loc[clones_filt].sort_values(ascending=False)
clones_filt

groups = cells_meta.groupby(["seurat_clusters", "name"]).size().reset_index().rename({0:"count"}, axis=1)
groups

clones = clones_filt#np.unique(groups["name"])
clones

atac_cl = np.unique(groups["seurat_clusters"])
atac_cl

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 13, 14])

## Run hypergeometric for each clone and cluster

In [11]:

def run_hypergeom(groups, clones, atac_cl):
    #p(k,M,n,N) = (n choose k)((M-n)choose(N-k))/(MchooseN)
    #pmf(k, M, n, N) = choose(n, k) * choose(M - n, N - k) / choose(M, N),
                                   #for max(0, N - (M-n)) <= k <= min(n, N)

    # M: Total number of cells
    # n: Number of cells in the atac cluster (group population)
    # N: Number of cells in clone (the draw)
    # x: Number of cells in specific clone and cluster
    enrichment_df = pd.DataFrame(index=clones,
                                 columns=atac_cl, dtype=np.float128)

    M = groups["count"].sum()
    for cl in clones:
        for atac in atac_cl:     
            n = groups[groups["seurat_clusters"]==atac]["count"].sum()
            N = groups[groups["name"]==cl]["count"].sum()

            x = groups[((groups["name"]==cl) & (groups["seurat_clusters"]==atac))]["count"].sum()

            # rv = hypergeom(M, n, N)
            prb = 1 - hypergeom.cdf(x, M, n, N)
            enrichment_df.loc[cl, atac] = prb
    nrows, ncols = enrichment_df.shape
    reject, pvals_corrected, _, _ = multitest.multipletests(enrichment_df.values.flatten(),
                                                            alpha=p_thresh, method="fdr_bh")
    pvals_corrected = np.reshape(pvals_corrected, [nrows,ncols])
    bh_enrichment_df = enrichment_df.copy()
    bh_enrichment_df.loc[:,:] = pvals_corrected

    return bh_enrichment_df


def process_hypergeom(bh_enrichment_df, sizes, p_thresh):
    output_df = pd.DataFrame(index=sizes.index)
    output_df["significant clusters"] = ""
    output_df["size"] = sizes
    #output_df["min_significance"] = None

    sig_results = []
    sig_order = []
    for ind, val in bh_enrichment_df.loc[sizes.index].iterrows():
        passed = val[val<p_thresh].index.values
        if len(passed)>0:
            output_df.loc[ind, "significant clusters"] = ";".join([str(x) for x in passed])
            output_df.loc[ind, "min_significance"] = min(val)
           # sig_results.append((ind, passed))
    output_df.loc[:, bh_enrichment_df.columns] = bh_enrichment_df.loc[output_df.index]
    #output_df = output_df.sort_values("min_significance")
    output_df=output_df.sort_values("size", ascending=True)
    output_df = output_df.loc[~(output_df["min_significance"].isnull())]
    return output_df

def set_nonsig_to_one(bh_enrichment_df):
    bh_enrichment_df[bh_enrichment_df>p_thresh] = 1
    bh_enrichment_df[bh_enrichment_df==0] = min(p_thresh,min(set((bh_enrichment_df.values).flatten())-{0})) # Set to the next min, or p_thresh, whichever is smaller
    return bh_enrichment_df


def plot(output_df, bh_enrichment_df, out_f):
    # Generate the colors based on size
    anno_labels = np.sort(output_df["size"].unique())

    anno_pal = sns.cubehelix_palette(len(anno_labels),
                                        light=.9, dark=.2, reverse=True,
                                        rot=.1, start=2.8)
    anno_lut = dict(zip(map(str, anno_labels), anno_pal))

    anno_colors = pd.Series(anno_lut)
    anno_colors

    output_df["clone_size"] = output_df["size"].apply(lambda x: anno_colors.loc[str(x)])

    # Clustermap
    g = sns.clustermap(-np.log10(bh_enrichment_df.loc[output_df.index].fillna(1)), 
                       row_cluster=True, row_colors=output_df[["clone_size"]])
    g.ax_heatmap.set(xlabel="Cluster ID")
    g.ax_cbar.set(title="-log10 p-value")

    for label in anno_labels: #[::step]:
        g.ax_col_dendrogram.bar(0, 0, color=anno_lut[str(label)],
                                label=label, linewidth=0)
    g.ax_col_dendrogram.legend(loc="best", ncol=6)
    g.fig.suptitle("Clones overrepresented in certain cell-types (no input condition) ")
    plt.tight_layout()
    g.ax_col_dendrogram.legend(loc="right", ncol=6)
    plt.savefig(out_f+".png")
    return


def wrap_hyper(cells_meta_f, out_f):
    # Get groups, sizes, and clones
    name_cond_size = cells_meta.groupby(["name"]).size()
    name_cond_size = name_cond_size[name_cond_size>min_clone_size]
    clones_filt = name_cond_size.index 
    sizes = sizes.loc[clones_filt].sort_values(ascending=False)
    groups = cells_meta.groupby(["seurat_clusters", "name"]).size().reset_index().rename({0:"count"}, axis=1)
    clones = clones_filt#np.unique(groups["name"])
    atac_cl = np.unique(groups["seurat_clusters"])

    # Run enrichment
    bh_enrichment_df = run_hypergeom(bh_enrichment_df, clones, atac_cl)
    # Process results and create df
    output_df = process_hypergeom(bh_enrichment_df, sizes, p_thresh)
    # Save df
    output_df.to_csv(out_f, sep=",")
    
    ## Plot 
    bh_enrichment_df = set_nonsig_to_one(bh_enrichment_df)
    plot(output_df, bh_enrichment_df, out_f)
    

In [4]:
#p(k,M,n,N) = (n choose k)((M-n)choose(N-k))/(MchooseN)
#pmf(k, M, n, N) = choose(n, k) * choose(M - n, N - k) / choose(M, N),
                               #for max(0, N - (M-n)) <= k <= min(n, N)
    
# M: Total number of cells
# n: Number of cells in the atac cluster (group population)
# N: Number of cells in clone (the draw)
# x: Number of cells in specific clone and cluster
enrichment_df = pd.DataFrame(index=clones,
                             columns=atac_cl, dtype=np.float128)

M = groups["count"].sum()
for cl in clones:
    for atac in atac_cl:     
        n = groups[groups["seurat_clusters"]==atac]["count"].sum()
        N = groups[groups["name"]==cl]["count"].sum()
        
        x = groups[((groups["name"]==cl) & (groups["seurat_clusters"]==atac))]["count"].sum()
        
        # rv = hypergeom(M, n, N)
        prb = 1 - hypergeom.cdf(x, M, n, N)
        enrichment_df.loc[cl, atac] = prb
        

## Adjust p-value

In [5]:
nrows, ncols = enrichment_df.shape
reject, pvals_corrected, _, _ = multitest.multipletests(enrichment_df.values.flatten(),
                                                        alpha=p_thresh, method="fdr_bh")
pvals_corrected = np.reshape(pvals_corrected, [nrows,ncols])
bh_enrichment_df = enrichment_df.copy()
bh_enrichment_df.loc[:,:] = pvals_corrected
bh_enrichment_df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,13,14
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0_0,0.192438,0.999899,0.955882,0.169234,0.749909,0.847234,0.289200,0.487407,0.779893,0.794656,0.399851,0.228322,0.349039
0_1,0.983173,0.275959,0.905307,0.154259,0.999899,0.809083,0.954721,0.419668,0.089638,0.988798,0.152397,0.207869,0.683404
0_10,0.002548,0.986795,0.898973,0.207274,0.999899,0.999899,0.808591,0.683301,0.999899,0.257624,0.148450,0.295869,0.140015
0_11,0.896642,0.207869,0.380179,0.140015,0.988798,0.860469,0.593488,0.458119,0.518134,0.905307,0.605796,0.157462,0.510934
0_12,0.032305,0.999899,0.893181,0.204298,0.999899,0.988798,0.247329,0.999899,0.999899,0.000032,0.056503,0.291443,0.850600
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1_5,0.999899,0.897304,0.765279,0.251038,0.117989,0.255480,0.999899,0.999899,0.936833,0.016979,0.875746,0.377110,0.140015
1_6,0.999899,0.899091,0.337614,0.152397,0.001079,0.552430,0.999899,0.476020,0.775948,0.977406,0.999899,0.205392,0.678091
1_7,0.610114,0.605383,0.518177,0.149226,0.852327,0.140015,0.977406,0.543449,0.252373,0.999899,0.999899,0.184281,0.605796
1_8,0.331294,0.977406,0.395778,0.218586,0.432699,0.961536,0.275959,0.999899,0.949996,0.015075,0.791840,0.326802,0.604512


## Create csv file of results and normalized counts

In [6]:
output_df = pd.DataFrame(index=sizes.index)
output_df["significant clusters"] = ""
output_df["size"] = sizes
#output_df["min_significance"] = None

sig_results = []
sig_order = []
for ind, val in bh_enrichment_df.loc[sizes.index].iterrows():
    passed = val[val<p_thresh].index.values
    if len(passed)>0:
        output_df.loc[ind, "significant clusters"] = ";".join([str(x) for x in passed])
        output_df.loc[ind, "min_significance"] = min(val)
       # sig_results.append((ind, passed))
output_df.loc[:, bh_enrichment_df.columns] = bh_enrichment_df.loc[output_df.index]
output_df = output_df.sort_values("min_significance")

output_df = output_df.loc[~(output_df["min_significance"].isnull())]
output_df

Unnamed: 0_level_0,significant clusters,size,min_significance,0,1,2,3,4,5,6,7,8,9,10,13,14
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1_3,3;4,453,1e-05,0.999899,0.765279,0.808806,0.035296,1e-05,0.188046,0.999899,0.940525,0.614347,0.999899,0.999899,0.326802,0.893181
0_12,0;9;10,265,3.2e-05,0.032305,0.999899,0.893181,0.204298,0.999899,0.988798,0.247329,0.999899,0.999899,3.2e-05,0.056503,0.291443,0.8506
1_10,9,297,0.000102,0.811133,0.897304,0.108909,0.21319,0.29304,0.999899,0.679393,0.966379,0.999899,0.000102,0.158925,0.316239,0.878746
1_6,4,330,0.001079,0.999899,0.899091,0.337614,0.152397,0.001079,0.55243,0.999899,0.47602,0.775948,0.977406,0.999899,0.205392,0.678091
0_10,0,279,0.002548,0.002548,0.986795,0.898973,0.207274,0.999899,0.999899,0.808591,0.683301,0.999899,0.257624,0.14845,0.295869,0.140015
1_33,9,46,0.003459,0.458119,0.778934,0.639651,0.127439,0.375616,0.999899,0.688263,0.990758,0.999899,0.003459,0.893181,0.143904,0.140015
1_34,9;10,39,0.007154,0.420318,0.999899,0.594645,0.12626,0.379573,0.999899,0.24295,0.999899,0.999899,0.007154,0.088198,0.140015,0.368846
1_4,4,438,0.008225,0.999899,0.817038,0.710618,0.207274,0.008225,0.323309,0.963565,0.33145,0.834808,0.999899,0.847234,0.295869,0.856774
0_3,10,342,0.010706,0.817038,0.547119,0.594645,0.218586,0.999899,0.999899,0.765279,0.911937,0.89559,0.148426,0.010706,0.325257,0.596246
0_16,3;13,199,0.011461,0.140015,0.563216,0.551072,0.011461,0.855482,0.988798,0.671003,0.856774,0.896642,0.918004,0.889704,0.032305,0.631376


In [7]:
output_df.to_csv(out_f, sep=",")

## Plot

In [8]:
output_df=output_df.sort_values("size", ascending=True)
bh_enrichment_df[bh_enrichment_df>p_thresh] = 1
bh_enrichment_df[bh_enrichment_df==0] = min(p_thresh,min(set((bh_enrichment_df.values).flatten())-{0})) # Set to the next min, or p_thresh, whichever is smaller


In [None]:
%matplotlib inline
anno_labels = np.sort(output_df["size"].unique())

anno_pal = sns.cubehelix_palette(len(anno_labels),
                                    light=.9, dark=.2, reverse=True,
                                    rot=.1, start=2.8)
anno_lut = dict(zip(map(str, anno_labels), anno_pal))

anno_colors = pd.Series(anno_lut)
anno_colors

output_df["clone_size"] = output_df["size"].apply(lambda x: anno_colors.loc[str(x)])


g = sns.clustermap(-np.log10(bh_enrichment_df.loc[output_df.index].fillna(1)), 
                   row_cluster=True, row_colors=output_df[["clone_size"]])
g.ax_heatmap.set(xlabel="Cluster ID")
g.ax_cbar.set(title="-log10 p-value")

for label in anno_labels: #[::step]:
    g.ax_col_dendrogram.bar(0, 0, color=anno_lut[str(label)],
                            label=label, linewidth=0)
g.ax_col_dendrogram.legend(loc="best", ncol=6)
g.fig.suptitle("Clones overrepresented in certain cell-types (no input condition) ")
plt.tight_layout()
g.ax_col_dendrogram.legend(loc="right", ncol=6)
plt.savefig(out_f+".png")