In [1]:
%matplotlib inline

In [2]:
pwd

'/data2/mito_lineage/Analysis/peaks_expression/jan21_2021'

In [3]:
# Path to the YAML configuration file that is parsed by read_config_file below.
config_f = "config.yaml"
# Output directory for saved figures; an empty string means "use the default",
# which the next cell resolves to "./output".
outdir=""

In [4]:
import os

# Fall back to a local "output" folder when no output directory was given.
if outdir == "":
    outdir = "./output"

# makedirs(..., exist_ok=True) creates any missing parent directories and is a
# no-op when the directory already exists, so there is no check-then-create
# race and nested output paths work as well.
os.makedirs(outdir, exist_ok=True)

In [5]:
from src.utils.parse_config import read_config_file
from os.path import join, dirname
import pandas as pd
from glob import glob
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import mplh.cluster_help as ch
%load_ext autoreload
%autoreload 2

In [6]:
# Parse the YAML config and pull out the list of gene symbols ("chip_genes")
# that the rest of the notebook filters peaks on.
config = read_config_file(config_f)
chip_genes = config["chip_genes"]
# Display the full config for the record.
config


{'global': 'PROJECT',
 'indir': '/data2/isshamie/mito_lineage/data/processed/mtscATAC/jan21_2021/MTblacklist',
 'experiment': 'jan21_2021',
 'genome': 'MTblacklist',
 'chip_genes': ['TP53', 'U2AF1', 'SF3B1', 'SRSF2', 'TET2', 'IDH1', 'IDH2'],
 'samples': ['J2', 'P2']}

### Load:
1. Raw cell barcodes with QC info
2. Filtered cell barcode list
3. Peak-by-cell sparse matrix
4. Peak gene annotations
5. Graph-clustering cluster labels for the filtered cells
6. TF-by-cell sparse count matrix
7. TF labels

In [7]:
#curr_in = join(config["indir"], config['CBs'][0], 'outs')

# Output folder of the aggregated ("reanalysis_aggr") run for this experiment.
curr_in = join(config["indir"], "reanalysis_aggr", "outs")

# Per-barcode condition table: the file has no header row; its first column is
# discarded and the second one is kept as "Condition".
condition_file = join(config["indir"], "barcodes_conditionInfo.csv")
CBs = (
    pd.read_csv(condition_file, header=None)
    .drop(0, axis=1)
    .rename({1: "Condition"}, axis=1)
)
# Shift the default 0-based row index to 1-based so it can be matched against
# the 1-based "Cell" indices used later in the notebook.
CBs.index = CBs.index + 1

# Barcodes of the filtered peak-by-cell matrix (one barcode per line, no header).
cell_inds = pd.read_csv(
    join(curr_in, "filtered_peak_bc_matrix", "barcodes.tsv"), header=None
)

## Load peak annotations
peak_annotations = pd.read_csv(join(curr_in, "peak_annotation.tsv"), sep="\t")
# 1-based row index, later used as the peak identifier ("Peak").
peak_annotations.index = peak_annotations.index + 1
# Upper-case the gene symbols and expose the 1-based index as a "Peak" column.
peak_annotations = peak_annotations.assign(
    gene=peak_annotations["gene"].str.upper(),
    Peak=peak_annotations.index,
)
# peak_annotations


### Load clusters and add to CBs
# Graph-based clustering labels; the table provides the "Barcode" and
# "Cluster" columns that later cells rely on.
cluster_path = join(curr_in, 'analysis', "clustering", "graphclust", "clusters.csv")
cluster_f = glob(cluster_path)
if not cluster_f:
    # Fail with a clear message instead of an IndexError on cluster_f[0].
    raise FileNotFoundError("Cluster file not found: " + cluster_path)
clusters_df = pd.read_csv(cluster_f[0])
# 1-based row index so the rows line up with the 1-based index of CBs.
clusters_df.index = clusters_df.index + 1

# Column-wise concat aligns on the 1-based row index.
# NOTE(review): this assumes barcodes_conditionInfo.csv and clusters.csv list
# the cells in the same order — TODO confirm.
CBs = pd.concat((CBs, clusters_df), axis=1)

## Load peaks matrix
filename = join(curr_in, 'filtered_peak_bc_matrix/matrix.mtx')

# Read the space-separated file as a plain table: the first two lines are
# skipped, and the first remaining row is dropped as well.
# NOTE(review): presumably these are the two comment lines and the
# "rows cols entries" size line of the matrix file — confirm the file always
# has exactly two comment lines.
mtx_table = pd.read_csv(filename, sep=' ', skiprows=2, header=None)
peaks_sparse_mtx = mtx_table.iloc[1:].reset_index(drop=True)

# Long-format triplets: one row per (peak, cell) entry with its count.
peaks_sparse_mtx.columns = ["Peak", "Cell", "Count"]



## Look at marker genes from Lin et al

In [8]:
# chip_markers ={"cKit":"KIT", 
#                  "Sca1": "LY6E", #"Ly6a",  #?? LY6K LY6E LY6H
#                  "CD11c": "ITGAX",
#                  "CD150": "SLAMF1", 
#                  "CD34": "CD34", 
#                  "CD16/32":"FCGR3A", 
#                  "CD45.1": "PTPRC",
#                  "CD45.2":"PTPRC", 
#                  "CD48": "SLAMF2", # Other SLAMFs
#                  "IL7Ra":"IL7R", 
#                  "CD11b":"ITGAM"}
# chip_genes = list(chip_markers.values())
# chip_genes

## Filter for the chip genes 
Uses peak_annotations

In [9]:
# Distinct values of the peak_type annotation. As the output shows, a value can
# be a semicolon-joined list (e.g. 'distal;distal').
# NOTE(review): presumably one entry per annotated gene — TODO confirm.
set(peak_annotations['peak_type'])

{'distal',
 'distal;distal',
 'distal;distal;distal',
 'distal;distal;distal;distal',
 'distal;distal;distal;distal;distal',
 'distal;distal;distal;distal;distal;distal',
 'distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal',
 'distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal',
 'distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal',
 'distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal',
 'distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;promoter',
 'distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;distal;

In [10]:
# Select the peaks whose gene annotation is exactly one of the chip genes and
# index the resulting table by the 1-based peak id.
# NOTE(review): isin() is an exact match, so a peak whose "gene" field lists
# several genes would not be selected even if one of them is a chip gene —
# confirm this is intended.
is_chip_peak = peak_annotations["gene"].isin(chip_genes)
chip_anno = peak_annotations.loc[is_chip_peak].set_index("Peak", drop=True).copy()
chip_anno.head()

Unnamed: 0_level_0,peak,gene,distance,peak_type
Peak,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20333,chr2_197378418_197378819,SF3B1,56208,distal
20334,chr2_197384388_197384470,SF3B1,50557,distal
20335,chr2_197387734_197388234,SF3B1,46793,distal
20336,chr2_197389155_197389922,SF3B1,45105,distal
20337,chr2_197391169_197391551,SF3B1,43476,distal


In [11]:
# Keep only the (Peak, Cell, Count) triplets that belong to chip-gene peaks.
# .copy() makes the filtered frame independent of the original, so the column
# assignments below are well-defined and do not raise SettingWithCopyWarning.
peaks_sparse_mtx = peaks_sparse_mtx.loc[peaks_sparse_mtx["Peak"].isin(chip_anno.index)].copy()

# Annotate each triplet with its gene (looked up by 1-based peak id) and with
# the cluster of its cell (looked up by 1-based cell id).
peaks_sparse_mtx["gene"] = peaks_sparse_mtx["Peak"].map(peak_annotations["gene"])
peaks_sparse_mtx["Cluster"] = peaks_sparse_mtx["Cell"].map(clusters_df["Cluster"])

# Dense peak-by-cell count table; (peak, cell) pairs without an entry become 0.
# Only peaks and cells that occur in at least one retained triplet get a
# row/column here.
peaks_dense = peaks_sparse_mtx.pivot(index="Peak", columns="Cell", values="Count").fillna(0)

## The number of cells within a peak and number of peaks within a cell.

In [12]:
# For every cell (column), count the chip-gene peaks with a non-zero count.
# Cells that have no column in peaks_dense are not part of this distribution.
peaks_per_cell = (peaks_dense > 0).sum(axis=0)
sns.distplot(peaks_per_cell)
plt.title("Distribution of number of peaks seen in a cell across all cells")

Text(0.5, 1.0, 'Distribution of number of peaks seen in a cell across all cells')

In [13]:
# For every peak (row), count the cells in which it has a non-zero count.
cells_per_peak = (peaks_dense > 0).sum(axis=1)
sns.distplot(cells_per_peak)
plt.title("Distribution of the number of cells seen in a peak across all peaks")

Text(0.5, 1, 'Distribution of the number of cells seen in a peak across all peaks')

In [14]:
#chip_anno = peak_annotations[peak_annotations["gene"].isin(chip_genes)].copy()
#chip_inds = peak_annotations.index
# Readable per-peak label "<gene>_<peak_type>_<distance>".
# Built with vectorised string concatenation; astype(str) keeps this working
# when "distance" was parsed as a number rather than text, and the expression
# is also well-defined when chip_anno is empty.
chip_anno["ID"] = (
    chip_anno["gene"].astype(str)
    + "_" + chip_anno["peak_type"].astype(str)
    + "_" + chip_anno["distance"].astype(str)
)

# Peak-by-cell counts for every annotated chip peak, in chip_anno order.
# reindex() is used instead of .loc because peaks_dense only has rows for peaks
# with at least one count; such missing peaks would make .loc raise a KeyError
# on current pandas. They have no counts in any cell, so they are filled with 0.
chip_peaks_dense = peaks_dense.reindex(chip_anno.index, fill_value=0)
#chip_peaks_dense = chip_peaks_dense.rename(chip_anno.set_index("Peak").apply(lambda x: x["gene"]+"_"+x["peak_type"],axis=1), axis=0)
#chip_peaks_dense

## Cluster based on IM genes 

### a. Don't cluster the chip peaks

In [15]:
# #rand_df = chip_peaks_dense.sample(n=500,axis=1)
# ch.plot_cluster(chip_peaks_dense.fillna(0), row_meta=chip_anno[["gene", "peak_type"]], 
#                 col_meta=CBs.drop("Barcode",axis=1), to_row_clust=False, row_names=False,
#                 metric='jaccard', to_legend=True, white_name=None)
# plt.savefig(join(outdir, "CHIP_genes_chromatinFragments.png"), bbox_inches='tight')

### b. Cluster the chip peaks as well
and binarize the counts (0 vs. >0)

In [16]:
#geneType_chip_anno = chip_anno.rename(chip_anno.apply(lambda x: x["gene"]+"_"+x["peak_type"],axis=1), axis=0)
%matplotlib inline
# Plot a random subset of at most 1000 cells (columns).
# - The cap avoids a ValueError when fewer than 1000 cells are available.
# - A fixed random_state makes the sampled cells, and therefore the saved
#   figure, reproducible across runs.
n_cells_to_plot = min(1000, chip_peaks_dense.shape[1])
rand_df = chip_peaks_dense.sample(n=n_cells_to_plot, axis=1, random_state=0)

# Rows are annotated with gene and peak type, columns with the remaining cell
# metadata (everything in CBs except "Barcode" and "Cluster") for the sampled cells.
ch.plot_cluster(rand_df.fillna(0), row_meta=chip_anno[["gene", "peak_type"]],
                col_meta=CBs.drop(["Barcode", "Cluster"], axis=1).loc[rand_df.columns], to_row_clust=True,
                metric='jaccard', to_legend=True, white_name=None, row_names=False)
plt.savefig(join(outdir, "CHIP_genes_chromatinFragments.png"), bbox_inches='tight')

clustering
cmap None


In [17]:
# #geneType_chip_anno = chip_anno.rename(chip_anno.apply(lambda x: x["gene"]+"_"+x["peak_type"],axis=1), axis=0)
# rand_df = chip_peaks_dense.sample(n=1000,axis=1)
# ch.plot_cluster(rand_df.fillna(0), row_meta=chip_anno[["gene", "peak_type"]], 
#                 col_meta=CBs.drop(["Barcode","Cluster"],axis=1).loc[rand_df.columns], to_row_clust=True, 
#                 metric='jaccard', vmax=1, to_legend=True, white_name=None, row_names=False )
# #plt.savefig(join(outdir, "CHIP_genes_chromatinFragments.png"), bbox_inches='tight')

In [18]:
# Inspect the peak-by-cell count table for the chip-gene peaks.
chip_peaks_dense

Cell,1,2,3,4,5,6,8,9,10,13,...,18738,18739,18740,18741,18744,18745,18746,18747,18748,18750
Peak,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20334,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20335,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20336,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20337,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20338,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20339,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20340,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20905,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20906,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# (number of chip-gene peaks, number of cells) in the dense table.
chip_peaks_dense.shape

(51, 11001)

In [20]:
# Column metadata of the sampled cells, i.e. the same table that is passed as
# col_meta to ch.plot_cluster: CBs without the "Barcode" and "Cluster" columns.
CBs.drop(["Barcode","Cluster"],axis=1).loc[rand_df.columns]

Unnamed: 0_level_0,Condition
Cell,Unnamed: 1_level_1
15350,Ctrl
7353,Flt3l
8764,Flt3l
3133,Flt3l
4347,Flt3l
...,...
11041,Flt3l
16116,Flt3l
8265,Flt3l
16616,Flt3l


## Countplots for each gene and peak type

In [21]:
# Number of annotated peaks per chip gene.
# The vector is passed explicitly as `x=`: newer seaborn releases no longer
# accept the data vector as a bare positional argument.
sns.countplot(x=chip_anno["gene"])

  ax.set_xticks(np.arange(len(self.plot_data)))


<matplotlib.axes._subplots.AxesSubplot at 0x7f7092f66668>

In [22]:
# Number of chip-gene peaks per peak type.
# The vector is passed explicitly as `x=`: newer seaborn releases no longer
# accept the data vector as a bare positional argument.
sns.countplot(x=chip_anno["peak_type"])

<matplotlib.axes._subplots.AxesSubplot at 0x7f7092f66668>