In [1]:
import pandas as pd
import os
import re

from ConSReg import ConSReg

File names of input data

In [2]:
# Directory that holds the DAP-seq peak calls (one file per TF experiment).
dap_dir = "data/dap_seq_all_peaks/"

# Dap-seq narrow peak files
# os.listdir() returns entries in arbitrary order, so the listing is sorted to
# keep the order of the input files stable across runs and machines.
dap_file_list = sorted(os.listdir(dap_dir))

# Keep every entry whose name contains "narrowPeak" (the pattern is not
# anchored at the end, so e.g. "x.narrowPeak.bak" would match as well).
# The loop variable is `fname` rather than `file` so the Python 2 built-in
# `file` is not shadowed in the notebook namespace.
dap_file = [
    os.path.join(dap_dir, fname)
    for fname in dap_file_list
    if re.match(r".*narrowPeak", fname) is not None
]

# ATAC-seq peak file
atac_file = "data/atac_seq_all_merged.bed"

# Arabidopsis genome annotation file
gff_file = "data/TAIR10_GFF3_genes.gff"

# Differential contrast result (a list: one CSV per contrast)
# NOTE(review): absolute, machine-specific path; unlike the other inputs it is
# not under data/. Point it at your own copy before re-running the notebook.
sc_diff_file = ["/work/alexsong/project/scConSReg/data/cortext-endodermis.csv"]

Create a ConSReg object for analysis

In [3]:
# Single ConSReg object shared by every following cell; each step below is
# invoked as a method on it.
analysis = ConSReg()

Read and preprocess all data files

In [4]:
# Load all four inputs in one call. Per the log below, this merges the DAP-seq
# peaks, overlaps them with the ATAC-seq peaks and reads the diff tables.
# The call is left as the last expression, so the cell echoes the returned
# ConSReg instance.
analysis.preprocess(
    dap_file=dap_file,
    diff_file=sc_diff_file,
    atac_file=atac_file,
    gff_file=gff_file,
)

Merging DAP-seq peaks...





>> preparing features information...		 2018-10-18 02:49:26 PM 
>> identifying nearest features...		 2018-10-18 02:49:27 PM 
>> calculating distance from peak to TSS...	 2018-10-18 02:50:41 PM 
>> assigning genomic annotation...		 2018-10-18 02:50:41 PM 
>> assigning chromosome lengths			 2018-10-18 02:51:07 PM 
>> done...					 2018-10-18 02:51:07 PM 
Done
Overlapping DAP-seq with ATAC-seq...
Done
Reading diff tables...
Done




<ConSReg.ConSReg instance at 0x7fdea4822710>

Generate feature matrices for each differential contrast

In [5]:
# Builds the feature matrices from the preprocessed data held by `analysis`;
# the call returns the ConSReg instance, which the cell echoes.
analysis.gen_feature_mat()

Generating feature matrices...
Done


<ConSReg.ConSReg instance at 0x7fdea4822710>

Compute AUC-ROC and AUC-PRC from cross-validation (CV) using the LRLASSO method. The mean and standard deviation of AUC-ROC and AUC-PRC are reported over five replicates of CV.

In [6]:
# Cross-validate every feature matrix with the 'lrlasso' engine, repeated
# 5 times; the results are read back from `analysis.auroc` in the next cell.
analysis.eval_by_cv(
    ml_engine='lrlasso',
    rep=5,
)

Performing cross-validation for each feature matrix using lrlasso engine...
Done


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.7s finished


<ConSReg.ConSReg instance at 0x7fdea4822710>

Check the CV results

In [7]:
# AUC-ROC table: one row per differential contrast, with mean and std columns
# for the UR and DR models (see the column names in the output below).
analysis.auroc

Unnamed: 0,diff_name,auroc_mean_UR,auroc_std_UR,auroc_mean_DR,auroc_std_DR
0,cortext-endodermis.csv,0.707462,0.039551,0.7002,0.023088


In [9]:
# Stability selection: compute an importance score for each TF (see the log
# line printed below).
# NOTE(review): n_jobs presumably sets the number of parallel workers; 36 is
# specific to the original machine, so adjust it to the cores available.
analysis.compute_imp_score(
    n_resampling=200,
    n_jobs=36,
    verbose=True,
)

Performing stability selection and compute importance score for each TF...
Done


<ConSReg.ConSReg instance at 0x7fdea4822710>

In [10]:
# Build the networks that the save cell later reads from
# `analysis.networks_UR` / `analysis.networks_DR`.
# NOTE(review): imp_cutoff = 0.5 is presumably the importance-score threshold
# for keeping an edge; confirm against the ConSReg documentation.
analysis.gen_networks(
    imp_cutoff=0.5,
    verbose=True,
)

<ConSReg.ConSReg instance at 0x7fdea4822710>

Save analysis result

In [11]:
# Directory that receives every result file written by this cell.
out_dir = "results/single_cell_analysis"

# Writing fails if the target directory is missing, so create it first.
# The isdir guard is used instead of exist_ok so this also runs on Python 2.
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

# Cross-validation result
analysis.auroc.to_csv(os.path.join(out_dir, "auroc_result.csv"))
analysis.auprc.to_csv(os.path.join(out_dir, "auprc_result.csv"))

# Importance scores
analysis.imp_scores_UR.to_csv(os.path.join(out_dir, "imp_score_UR.csv"))
analysis.imp_scores_DR.to_csv(os.path.join(out_dir, "imp_score_DR.csv"))

# Networks are saved in the format of edge list: one file per differential
# contrast and per network set (UR, then DR). Each set is zipped with
# diff_name_list on its own, exactly as the two original loops did.
for direction, networks in (("UR", analysis.networks_UR),
                            ("DR", analysis.networks_DR)):
    for diff_name, network in zip(analysis.diff_name_list, networks):
        network_file = "{}_{}_network.csv".format(diff_name, direction)
        network.to_csv(os.path.join(out_dir, network_file))