In [1]:
import pandas as pd
import os
import re

from ConSReg import ConSReg

File names of input data

In [2]:
# Input directories, relative to the notebook's working directory.
# The trailing "/" is kept so the resulting paths look exactly as before.
dap_dir = "data/dap_seq_all_peaks/"
diff_dir = "data/diff_evalB/"

# DAP-seq narrow peak files.
# os.listdir() returns entries in arbitrary order, so the listing is sorted to
# make the input order reproducible between runs and machines. Files are
# selected by their actual suffix: the previous unanchored pattern
# ".*narrowPeak" also accepted names such as "x.narrowPeak.bak".
dap_file_list = sorted(os.listdir(dap_dir))
dap_file = [dap_dir + name for name in dap_file_list if name.endswith(".narrowPeak")]

# ATAC-seq peak file
atac_file = "data/atac_seq_all_merged.bed"

# Arabidopsis genome annotation file
gff_file = "data/TAIR10_GFF3_genes.gff"

# Differential contrast results generated by DESeq2.
# Same treatment as above: sorted listing, and a real ".csv" suffix test
# instead of ".*csv", which matched any name merely containing "csv".
diff_file_list = sorted(os.listdir(diff_dir))
diff_file = [diff_dir + name for name in diff_file_list if name.endswith(".csv")]

Create a ConSReg object for analysis

In [3]:
# Instantiate the ConSReg object; every later step in this notebook is a
# method call or attribute access on this single instance.
analysis = ConSReg()

Read and preprocess all data files

In [4]:
# Read and preprocess all inputs in one call; each argument is one of the
# file names / file-name lists defined in the input cell above.
analysis.preprocess(
    dap_file=dap_file,
    diff_file=diff_file,
    atac_file=atac_file,
    gff_file=gff_file,
)

Merging DAP-seq peaks...





>> preparing features information...		 2018-10-17 09:55:57 PM 
>> identifying nearest features...		 2018-10-17 09:55:57 PM 
>> calculating distance from peak to TSS...	 2018-10-17 09:57:11 PM 
>> assigning genomic annotation...		 2018-10-17 09:57:11 PM 
>> assigning chromosome lengths			 2018-10-17 09:57:38 PM 
>> done...					 2018-10-17 09:57:38 PM 
Done
Overlapping DAP-seq with ATAC-seq...
Done
Reading diff tables...
Done




<ConSReg.ConSReg instance at 0x7f21269d1098>

In [6]:
# Generate the feature matrices; the cross-validation step further down
# runs once per feature matrix.
analysis.gen_feature_mat()

Generating feature matrices...
Done


<ConSReg.ConSReg instance at 0x7f21269d1098>

Generate feature matrices for each differential contrast

Compute AUC-ROC and AUC-PRC by cross-validation (CV) using the LRLASSO method. The mean and standard deviation of AUC-ROC and AUC-PRC are reported over five replicates of CV.

In [8]:
# Cross-validate using the 'lrlasso' engine with 5 replicates.
# NOTE(review): n_jobs=44 looks sized for the machine this was first run on;
# lower it to the number of cores available before re-running.
analysis.eval_by_cv(
    ml_engine='lrlasso',
    rep=5,
    n_jobs=44,
)

Performing cross-validation for each feature matrix using lrlasso engine...


[Parallel(n_jobs=44)]: Done   1 tasks      | elapsed:   10.1s
[Parallel(n_jobs=44)]: Done   2 out of  44 | elapsed:   10.2s remaining:  3.6min
[Parallel(n_jobs=44)]: Done   5 out of  44 | elapsed:   14.3s remaining:  1.9min
[Parallel(n_jobs=44)]: Done   8 out of  44 | elapsed:   15.3s remaining:  1.1min
[Parallel(n_jobs=44)]: Done  11 out of  44 | elapsed:   18.6s remaining:   55.7s
[Parallel(n_jobs=44)]: Done  14 out of  44 | elapsed:   19.4s remaining:   41.5s
[Parallel(n_jobs=44)]: Done  17 out of  44 | elapsed:   20.6s remaining:   32.7s
[Parallel(n_jobs=44)]: Done  20 out of  44 | elapsed:   23.9s remaining:   28.7s
[Parallel(n_jobs=44)]: Done  23 out of  44 | elapsed:   27.3s remaining:   24.9s
[Parallel(n_jobs=44)]: Done  26 out of  44 | elapsed:   35.3s remaining:   24.5s
[Parallel(n_jobs=44)]: Done  29 out of  44 | elapsed:   44.7s remaining:   23.1s
[Parallel(n_jobs=44)]: Done  32 out of  44 | elapsed:   47.1s remaining:   17.7s
[Parallel(n_jobs=44)]: Done  35 out of  44 | el

Done


<ConSReg.ConSReg instance at 0x7f21269d1098>

Check the CV results

In [9]:
# Display the AUC-ROC table: one row per differential contrast, with mean and
# standard deviation columns for the UR and DR models.
analysis.auroc

Unnamed: 0,diff_name,auroc_mean_UR,auroc_std_UR,auroc_mean_DR,auroc_std_DR
0,GSE81202_13-GSE81202_11.csv,0.814116,0.011186,0.874437,0.005554
1,GSE72806_3-GSE72806_1.csv,0.816361,0.00998,0.845932,0.006617
2,GSE69510_2-GSE69510_1.csv,0.872991,0.016182,0.861803,0.016924
3,PRJEB10930_4_T-PRJEB10930_4_C.csv,0.845177,0.0044,0.818052,0.014804
4,GSE81202_18-GSE81202_16.csv,0.842078,0.015224,0.828709,0.007584
5,GSE67332_5-GSE67332_4.csv,0.813359,0.016495,0.837403,0.010939
6,GSE81202_17-GSE81202_15.csv,0.848836,0.007454,0.838216,0.005513
7,GSE80565_7-GSE80565_8.csv,0.841519,0.010085,0.810898,0.026635
8,GSE63406_5-GSE63406_3.csv,0.827257,0.006889,0.863418,0.002842
9,GSE80565_9-GSE80565_10.csv,0.835047,0.019036,0.797814,0.025172


Generate importance score for each TF

In [10]:
# Compute a per-TF importance score via stability selection.
# NOTE(review): n_resampling=200 is presumably the number of resampling rounds,
# and n_jobs=44 the number of parallel workers on the original machine —
# confirm both against the ConSReg documentation before changing them.
analysis.compute_imp_score(
    n_resampling=200,
    n_jobs=44,
    verbose=True,
)

Performing stability selection and compute importance score for each TF...
Done


<ConSReg.ConSReg instance at 0x7f21269d1098>

Generate a gene regulatory network (GRN) for each differential contrast

In [11]:
# Build the networks from the importance scores.
# NOTE(review): imp_cutoff=0.5 is presumably the minimum importance score a TF
# needs in order to be kept in a network — confirm against the ConSReg docs.
analysis.gen_networks(
    imp_cutoff=0.5,
    verbose=True,
)

<ConSReg.ConSReg instance at 0x7f21269d1098>

Save analysis result

In [13]:
# Every result of this analysis is written below this directory.
out_dir = "results/bulk_analysis"

# Writing fails if the target directory is missing, so create it first.
# (isdir + makedirs works on both Python 2 and Python 3.)
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

# Cross-validation result
analysis.auroc.to_csv(os.path.join(out_dir, "auroc_result.csv"))
analysis.auprc.to_csv(os.path.join(out_dir, "auprc_result.csv"))

# Importance scores
analysis.imp_scores_UR.to_csv(os.path.join(out_dir, "imp_score_UR.csv"))
analysis.imp_scores_DR.to_csv(os.path.join(out_dir, "imp_score_DR.csv"))

# Networks are saved in the format of edge list, one file per differential
# contrast and direction (UR first, then DR, as before).
# NOTE(review): the AUC-ROC table shows contrast names that already end in
# ".csv"; if diff_name_list holds the same strings, the files are named
# "<contrast>.csv_UR_network.csv" — confirm whether that is intended.
for tag, networks in (("UR", analysis.networks_UR), ("DR", analysis.networks_DR)):
    for diff_name, network in zip(analysis.diff_name_list, networks):
        file_name = "{}_{}_network.csv".format(diff_name, tag)
        network.to_csv(os.path.join(out_dir, file_name))