In [1]:
pwd

'/data2/mito_lineage/Analysis/multiplex'

In [2]:
# Input directory; later cells read the cellSNP.tag.*.mtx matrices from here.
INDIR = "data/jan21_2021/chrM/pseudo/minC200_minAF0.01/numC25000_ispropFalse"
# Output directory: the "flt3" sub-directory of INDIR.
OUTDIR = f"{INDIR}/flt3"
# Number of donors; the analysis loop reads cluster0 .. cluster{N_DONORS-1} files.
N_DONORS = 4


In [3]:
from os.path import join, exists, dirname
from glob import glob
import pickle
import mplh.cluster_help as ch
import os
import vireoSNP
import numpy as np
from scipy import sparse
from scipy.io import mmread
import matplotlib.pyplot as plt
from scipy.stats import hypergeom, fisher_exact
print(vireoSNP.__version__)

import pandas as pd
import seaborn as sns
from vireoSNP import Vireo
np.set_printoptions(formatter={'float': lambda x: format(x, '.5f')})

fig_utils from mplh
here
0.4.2


In [4]:
# Paths to the AD and DP Matrix Market files under INDIR
# (presumably alternate-allele depth and total depth, as in cellSNP output — TODO confirm).
AD_F, DP_F = (
    join(INDIR, "cellSNP.tag.AD.mtx"),
    join(INDIR, "cellSNP.tag.DP.mtx"),
)
# Glob pattern matching the cell-index text files stored next to the matrices.
cell_inds_names = join(INDIR, "cell_indices_*.txt")

# Using vireo for clonal reconstruction - mitochondrial mutations

The mitochondrial mutations data set is extracted from [Ludwig et al, Cell, 2019](https://doi.org/10.1016/j.cell.2019.01.022); the 9 variants used here are from Supp. Fig. 2F (and main Fig. 2F).

For clonal reconstruction, we use the core class `vireoSNP.Vireo` with multiple initializations.

## Load cluster results and test for enrichment

In [5]:
def extract_clusters(modelCA, ad, dp, sample_colors, prob_thresh=0.9, doublet_thresh=0.9):
    """Assign confidently-called, non-doublet cells to each cluster of a fitted model.

    Parameters
    ----------
    modelCA : object exposing ``ID_prob`` (cells x clusters array) and
        ``predict_doublet(ad, dp, update_GT=..., update_ID=...)``.
    ad, dp : count matrices forwarded unchanged to ``predict_doublet``.
    sample_colors : pd.DataFrame with one row per cell, in the same positional
        order as the rows of ``modelCA.ID_prob``.
    prob_thresh : a cell belongs to a cluster when its assignment probability
        is strictly greater than this value.
    doublet_thresh : a cell is discarded when its summed doublet probability
        is strictly greater than this value.

    Returns
    -------
    (cell_clusters, sample_labels) : two dicts keyed by 0-based cluster ID.
        ``cell_clusters[k]`` is an array of 0-based cell positions and
        ``sample_labels[k]`` is the matching slice of ``sample_colors`` with
        its index reset (the old index becomes a column).
    """
    # Element 0 of the predict_doublet result is summed across axis 1 to get one
    # value per cell (presumably the total probability over all donor pairs —
    # TODO confirm against the vireoSNP documentation).
    doublet_result = modelCA.predict_doublet(ad, dp,
                                             update_GT=False,
                                             update_ID=False)
    per_cell_doublet = doublet_result[0].sum(axis=1)
    doublet_cells = np.flatnonzero(per_cell_doublet > doublet_thresh)

    cell_clusters, sample_labels = {}, {}
    n_clusters = modelCA.ID_prob.shape[1]
    for cluster_id in range(n_clusters):
        # Cells whose assignment probability for this cluster clears the threshold.
        confident = np.flatnonzero(modelCA.ID_prob[:, cluster_id] > prob_thresh)
        print('before doublet removal')
        print(len(confident))

        # Drop every confident cell that was also flagged as a doublet.
        singlets = confident[np.isin(confident, doublet_cells, invert=True)]
        print('after doublet removal')
        print(len(singlets))

        cell_clusters[cluster_id] = singlets
        sample_labels[cluster_id] = sample_colors.iloc[singlets].copy().reset_index()
        print(f"Cluster {cluster_id}: {len(singlets)} cells ")

    return cell_clusters, sample_labels

def run_enrichment(df, flt3_row="# Flt3 Cells in Cluster"):
    """Test every cluster for over-representation of Flt3 cells.

    Parameters
    ----------
    df : pd.DataFrame
        Count table with one row per condition and one column per cluster;
        elements are numbers of cells. It must contain the row ``flt3_row``;
        all remaining rows together form the background.
    flt3_row : str, optional
        Index label of the row holding the Flt3 cell counts.

    Returns
    -------
    pd.DataFrame
        float64 frame with rows ``"p"`` and ``"Fisher p"`` and the same
        columns as ``df``.

        * ``"p"`` is the upper-tail hypergeometric probability P(X >= x) of
          seeing at least the observed number of Flt3 cells in the cluster.
        * ``"Fisher p"`` is the (two-sided, scipy default) Fisher exact test
          p-value of the 2x2 table cluster/non-cluster vs Flt3/non-Flt3.

    Raises
    ------
    KeyError
        If ``flt3_row`` is not in ``df.index``.
    """
    # scipy returns float64, so a wider (and platform-specific) dtype such as
    # np.float128 would add no precision and is unavailable on some platforms.
    enrichment_df = pd.DataFrame(index=["p", "Fisher p"],
                                 columns=df.columns, dtype=np.float64)

    # M: total number of cells
    # N: number of Flt3 cells
    # n: number of cells in the cluster
    # x: number of Flt3 cells in the cluster
    M = df.sum().sum()
    N = df.loc[flt3_row].sum()
    for col in df.columns:
        n = df[col].sum()
        x = df.loc[flt3_row, col]

        # Enrichment tail is P(X >= x) = sf(x - 1). The previous
        # 1 - cdf(x) computed P(X > x), which excludes the observed count and
        # cancels to exactly 0 for strongly enriched clusters.
        enrichment_df.loc["p", col] = hypergeom.sf(x - 1, M, n, N)

        _, fish_p = fisher_exact([[x, N - x], [n - x, M - N - (n - x)]])
        enrichment_df.loc["Fisher p", col] = fish_p

    return enrichment_df

In [6]:
from mplh.fig_utils import helper_save

In [7]:
#sns.set_context("talk",rc={"font.size":8,"axes.titlesize":8,"axes.labelsize":5})
def set_pubfig():
    """Switch seaborn to its "paper" context with small font sizes."""
    pub_rc = {"font.size":8,"axes.titlesize":8,"axes.labelsize":5}
    sns.set_context("paper", rc=pub_rc)

def defaultPlotting():
    """Apply the notebook-wide seaborn defaults: white style, 16x9 figures, size-20 fonts."""
    notebook_rc = {
        'figure.figsize': (16, 9),
        "font.size": 20,
        "axes.titlesize": 20,
        "axes.labelsize": 20,
    }
    sns.set(style="white", rc=notebook_rc)


# Activate the defaults right away for every following cell.
defaultPlotting()

In [8]:
# Numbers of clones (k) for which a fitted model pickle is loaded per donor.
n_clone_list = [3, 5, 10, 20, 40]  # previously tried: [2,3,4,5,6,7]

# Row labels of the per-cluster count table (also used as scatter-plot axes).
control_row = "# Control Cells in Cluster"
flt3_row = "# Flt3 Cells in Cluster"

for n in range(N_DONORS):
    # ---- Load the per-donor count matrices and cell labels -----------------
    curr_ad_f = join(OUTDIR, f"cluster{n}.AD.mtx")
    curr_dp_f = join(OUTDIR, f"cluster{n}.DP.mtx")
    print(curr_ad_f)
    print(curr_dp_f)
    curr_ad = mmread(curr_ad_f).tocsc()
    curr_dp = mmread(curr_dp_f).tocsc()
    curr_labels = pd.read_csv(join(OUTDIR, f"cluster{n}.labels.txt"), index_col=0)

    # NOTE: out_f is used as a filename *prefix* below (suffixes are appended
    # after ".png"); the pickled models are looked up under the same prefix.
    out_f = join(OUTDIR, f"cluster{n}_lineage_elbow.png")

    for k in n_clone_list:
        # pickle.load executes arbitrary code from the file: only load pickles
        # produced by this project. The context manager closes the handle.
        with open(out_f+f"clones{k}.modelCA.p", "rb") as model_fh:
            curr_modelCA = pickle.load(model_fh)
        cell_clusters, sample_labels = extract_clusters(curr_modelCA, curr_ad, curr_dp,
                                                        curr_labels)

        # ---- Count Control / Flt3 cells per cluster ------------------------
        clust_counts = pd.DataFrame(index=[control_row, flt3_row],
                                    columns=sample_labels.keys())
        for curr_k in clust_counts.columns:
            clust_counts.at[control_row, curr_k] = (sample_labels[curr_k]["sample ID"] == "Control").sum()
            clust_counts.at[flt3_row, curr_k] = (sample_labels[curr_k]["sample ID"] == "Flt3").sum()
        clust_counts = clust_counts.astype(np.double)

        # log2 fold change with a +1 pseudocount so empty clusters stay finite.
        fold_df = pd.DataFrame(np.log2((clust_counts.loc[flt3_row]+1)/(clust_counts.loc[control_row]+1))).transpose()
        fold_df = fold_df.rename({0:"Flt3 fold enrichment"}, axis=0)

        # ---- Enrichment statistics -----------------------------------------
        enrich_df = run_enrichment(clust_counts)

        # ---- Figure 1: three stacked heatmaps (p-values, fold change, counts)
        f, ax = plt.subplots(nrows=3, ncols=1, figsize=(15,15), dpi=300)
        sns.heatmap(enrich_df.astype(float), annot=True, fmt=".2E", cbar=False,
                    cmap='RdYlGn', ax=ax[0])
        ax[0].set_title("Flt3 enrichment p-value")

        sns.heatmap(fold_df, ax=ax[1], annot=True, fmt=".2f", cbar=False, cmap="RdBu")
        ax[1].set_aspect('equal', adjustable='box')
        ax[1].set_title("Log2 Fold change of flt3/control")
        # Set the label on the fold-change panel explicitly; plt.xlabel would
        # target the current axes (the last panel) and be overwritten by the
        # heatmap drawn there afterwards.
        ax[1].set_xlabel("log2((Flt3+1)/(WT+1))")

        sns.heatmap((clust_counts.astype(int)), vmin=0, ax=ax[2], annot=True, fmt=".1E", cbar=False, cmap="Blues")
        ax[2].set_title("Number of cells in each cluster")

        f.suptitle(f"Donor {n}")
        f.savefig(out_f+f"clones{k}_labelEnrich.png")
        plt.close(f)

        # ---- Figure 2: scatter sized by hypergeometric -log10(p) -----------
        enrich_stats = pd.concat((clust_counts, fold_df, enrich_df)).transpose()
        enrich_stats['p'] = enrich_stats['p'].astype(np.float32)
        # log10(0) is expected for p == 0; silence the divide-by-zero warning.
        with np.errstate(divide="ignore"):
            enrich_stats['-log10p'] = -np.log10(enrich_stats['p'])
        # NOTE(review): clusters whose p is exactly 0 (after the float32 cast)
        # get -log10p == inf, which is reset to 0 here, so they are drawn with
        # the smallest marker — confirm this is the intended display.
        enrich_stats.loc[enrich_stats["-log10p"]==np.inf, "-log10p"] = 0
        f, scatter_ax = plt.subplots(figsize=(10,10))
        sns.scatterplot(data=enrich_stats, x=control_row, y=flt3_row, size="-log10p", s=100,
                        sizes=(20,200), hue="Flt3 fold enrichment", palette="RdBu", ax=scatter_ax)
        f.savefig(out_f+f"clones{k}_scatterEnrich.png")
        plt.close(f)

        # ---- Figure 3: scatter sized by Fisher -log10(p), with diagonal ----
        enrich_stats['Fisher p'] = enrich_stats['Fisher p'].astype(np.float32)
        with np.errstate(divide="ignore"):
            enrich_stats['Fisher -log10p'] = -np.log10(enrich_stats['Fisher p'])
        enrich_stats.loc[enrich_stats["Fisher -log10p"]==np.inf, "Fisher -log10p"] = 0
        f, ax = plt.subplots(figsize=(10,10))
        sns.scatterplot(data=enrich_stats, x=control_row, y=flt3_row, size="Fisher -log10p", s=100,
                        sizes=(20,200), hue="Flt3 fold enrichment", palette="RdBu", ax=ax)
        plt.axis('square')
        # Diagonal in axes coordinates: corner to corner of the square panel.
        ax.plot([0, 1], [0, 1], transform=ax.transAxes, color='black')

        f.savefig(out_f+f"clones{k}_scatterFisherEnrich.png", dpi=300)
        plt.close(f)

data/jan21_2021/chrM/pseudo/minC200_minAF0.01/numC25000_ispropFalse/flt3/cluster0.AD.txt
data/jan21_2021/chrM/pseudo/minC200_minAF0.01/numC25000_ispropFalse/flt3/cluster0.DP.txt
before doublet removal
1590
after doublet removal
1317
Cluster 0: 1317 cells 
before doublet removal
287
after doublet removal
183
Cluster 1: 183 cells 
before doublet removal
2173
after doublet removal
1971
Cluster 2: 1971 cells 
before doublet removal
35
after doublet removal
14
Cluster 0: 14 cells 
before doublet removal
2172
after doublet removal
1977
Cluster 1: 1977 cells 
before doublet removal
98
after doublet removal
53
Cluster 2: 53 cells 
before doublet removal
153
after doublet removal
106
Cluster 3: 106 cells 
before doublet removal
1591
after doublet removal
1328
Cluster 4: 1328 cells 
before doublet removal
89
after doublet removal
51
Cluster 0: 51 cells 
before doublet removal
210
after doublet removal
171
Cluster 1: 171 cells 
before doublet removal
671
after doublet removal
581
Cluster 2: 581 c

  result = getattr(ufunc, method)(*inputs, **kwargs)


before doublet removal
9
after doublet removal
9
Cluster 0: 9 cells 
before doublet removal
44
after doublet removal
41
Cluster 1: 41 cells 
before doublet removal
6
after doublet removal
0
Cluster 2: 0 cells 
before doublet removal
97
after doublet removal
81
Cluster 3: 81 cells 
before doublet removal
66
after doublet removal
33
Cluster 4: 33 cells 
before doublet removal
41
after doublet removal
39
Cluster 5: 39 cells 
before doublet removal
11
after doublet removal
9
Cluster 6: 9 cells 
before doublet removal
8
after doublet removal
8
Cluster 7: 8 cells 
before doublet removal
18
after doublet removal
15
Cluster 8: 15 cells 
before doublet removal
42
after doublet removal
36
Cluster 9: 36 cells 
before doublet removal
73
after doublet removal
58
Cluster 10: 58 cells 
before doublet removal
36
after doublet removal
26
Cluster 11: 26 cells 
before doublet removal
9
after doublet removal
8
Cluster 12: 8 cells 
before doublet removal
3
after doublet removal
3
Cluster 13: 3 cells 
befor

data/jan21_2021/chrM/pseudo/minC200_minAF0.01/numC25000_ispropFalse/flt3/cluster2.AD.txt
data/jan21_2021/chrM/pseudo/minC200_minAF0.01/numC25000_ispropFalse/flt3/cluster2.DP.txt
before doublet removal
1314
after doublet removal
1218
Cluster 0: 1218 cells 
before doublet removal
717
after doublet removal
437
Cluster 1: 437 cells 
before doublet removal
1330
after doublet removal
1232
Cluster 2: 1232 cells 
before doublet removal
166
after doublet removal
72
Cluster 0: 72 cells 
before doublet removal
1036
after doublet removal
956
Cluster 1: 956 cells 
before doublet removal
547
after doublet removal
310
Cluster 2: 310 cells 
before doublet removal
926
after doublet removal
875
Cluster 3: 875 cells 
before doublet removal
387
after doublet removal
344
Cluster 4: 344 cells 
before doublet removal
118
after doublet removal
107
Cluster 0: 107 cells 
before doublet removal
135
after doublet removal
51
Cluster 1: 51 cells 
before doublet removal
121
after doublet removal
117
Cluster 2: 117 c

Cluster 11: 0 cells 
before doublet removal
40
after doublet removal
26
Cluster 12: 26 cells 
before doublet removal
0
after doublet removal
0
Cluster 13: 0 cells 
before doublet removal
85
after doublet removal
62
Cluster 14: 62 cells 
before doublet removal
0
after doublet removal
0
Cluster 15: 0 cells 
before doublet removal
0
after doublet removal
0
Cluster 16: 0 cells 
before doublet removal
0
after doublet removal
0
Cluster 17: 0 cells 
before doublet removal
0
after doublet removal
0
Cluster 18: 0 cells 
before doublet removal
130
after doublet removal
106
Cluster 19: 106 cells 
before doublet removal
0
after doublet removal
0
Cluster 0: 0 cells 
before doublet removal
0
after doublet removal
0
Cluster 1: 0 cells 
before doublet removal
0
after doublet removal
0
Cluster 2: 0 cells 
before doublet removal
45
after doublet removal
26
Cluster 3: 26 cells 
before doublet removal
0
after doublet removal
0
Cluster 4: 0 cells 
before doublet removal
0
after doublet removal
0
Cluster 5: