In [2]:
import pandas as pd
import os
import re

from ConSReg.main import ConSReg
from ConSReg.main import load_obj

File names of input data

In [3]:
# Dap-seq narrow peak files
dap_file_list = os.listdir("data/dap_seq_all_peaks/")
dap_files = [ "data/dap_seq_all_peaks/" + file for file in dap_file_list if re.match(".*narrowPeak",file) is not None]

# ATAC-seq peak file
atac_file = "data/atac_seq_all_peaks/all_merged.bed"

# Arabidopsis genome annotation file
gff_file = "data/gff/TAIR10_GFF3_genes.gff"

# Differential contrast result generated by DESeq2
diff_file_list = os.listdir("data/diff_evalB/")
diff_files = [ "data/diff_evalB/" + file for file in diff_file_list if re.match(".*csv",file) is not None]

## Step 1. Preprocessing the datasets

### 1.1 Read and preprocess all data files

In [4]:
analysis = ConSReg()
analysis.preprocess(dap_files = dap_files, diff_files = diff_files, atac_file = atac_file, gff_file=gff_file)

Merging DAP-seq peaks...





>> preparing features information...		
 
2019-08-25 05:03:35 PM
 


>> identifying nearest features...		
 
2019-08-25 05:03:35 PM
 


>> calculating distance from peak to TSS...	
 
2019-08-25 05:04:40 PM
 


>> assigning genomic annotation...		
 
2019-08-25 05:04:40 PM
 


>> assigning chromosome lengths			
 
2019-08-25 05:05:10 PM
 


>> done...					
 
2019-08-25 05:05:10 PM
 


Done
Overlapping DAP-seq with ATAC-seq...
Done
Reading diff tables...
Done


<ConSReg.main.ConSReg at 0x7fad74622978>

### 1.2 You may save the analysis object as pickle file and load it later to resume analysis

In [None]:
analysis.save_obj("data/analysis_obj/ConSReg_obj_preprocessed.pkl")

### 1.3 Alternatively, you may load a previously saved object which already has the datasets preprocessed. This saves proprocessing time

In [None]:
analysis = load_obj("data/analysis_obj/ConSReg_obj_preprocessed.pkl")

## Step 2. Generate feature matrices

### 2.1 Generate feature matrices for each differential contrast

In [5]:
analysis.gen_feature_mat(neg_type='udg')

Existing feature matrices will be overwritten.
Generating feature matrices...
Done


<ConSReg.main.ConSReg at 0x7fad74622978>

### 2.2 You may export/save different types of feature matrices. 
The three functions, analysis.get_feature_mat_dap(), analysis.get_feature_mat_reweight(), and analysis.get_feature_mat_final() will each returns a named tuple which has three properties:
 - .comp_names: names of differential contrasts. These names were extracted from differential contrast input file names
 - .UR_feature_mat_list: a list of pandas dataframe. Each dataframe is a UR feature matrix for the corresponding differential contrast
 - .DR_feature_mat_list: a list of pandas dataframe. Each dataframe is a DR feature matrix for the corresponding differential contrast

In [6]:
feature_mat_list_dap = analysis.get_feature_mat_dap()

In [7]:
feature_mat_list_reweight = analysis.get_feature_mat_reweight()

In [8]:
feature_mat_list_final = analysis.get_feature_mat_final()

### 2.3 View one feature matrix. analysis._feature_mat_list_final is a list with all feature matrices in it.
- len(analysis.feature_mat_list_final) is equal to number of differential contrasts. And each element itself is a two element list with UR feature matrix as first element and DR feature matrix as second element.
- Each element is a pandas dataframe. You may save the feature matrix by calling .to_csv() function. For example: `analysis._feature_mat_list_final[0][0].to_csv("feature_matrix.csv")` can save one feature matrix as csv file.

In [12]:
analysis._feature_mat_list_final[0][0]

Unnamed: 0,label,AT1G32870,AT4G18890,AT5G47660,AT3G56850,AT1G45249,AT1G75240,AT2G35700,AT5G56840,AT1G72010,...,AT5G58900,AT5G62320,AT1G24250,AT4G31800,AT1G44830,AT3G03200,AT5G02320,AT2G33710,AT1G12630,AT1G79580
AT3G27250,1,0.000000,0.000000,-0.000000,0.076324,0.304791,0.024463,0.265030,-0.000000,0.210088,...,0.000000,-0.000000,-0.000000,-0.000000,-0.055374,0.000000,-0.000000,0.009690,-0.014042,0.183454
AT3G29575,1,0.000000,0.143296,-0.000000,0.152649,0.609583,0.024463,0.000000,-0.796831,0.000000,...,0.000000,-0.000000,-0.000000,-0.189457,-0.000000,0.000000,-0.000000,0.000000,-0.000000,0.000000
AT2G33380,1,0.000000,0.000000,-0.000000,0.076324,0.304791,0.024463,0.000000,-0.000000,0.000000,...,0.000000,-0.000000,-0.000000,-0.000000,-0.000000,0.000000,-0.000000,0.000000,-0.000000,0.000000
AT1G62540,1,0.000000,0.000000,-0.000000,0.076324,0.000000,0.006176,0.000000,-0.295853,0.000000,...,0.000000,-0.000000,-0.000000,-0.000000,-0.000000,0.000000,-0.000000,0.000000,-0.000000,0.000000
AT2G39800,1,0.000000,0.000000,-0.000000,0.152649,0.000000,0.000000,0.000000,-0.000000,0.000000,...,0.259084,-0.000000,-0.000000,-0.000000,-0.000000,0.000000,-0.000000,0.009690,-0.000000,0.000000
AT5G24770,1,0.000000,0.000000,-0.000000,0.000000,0.000000,0.000000,0.000000,-0.000000,0.000000,...,0.000000,-0.000000,-0.000000,-0.000000,-0.000000,0.000000,-0.000000,0.000000,-0.000000,0.000000
AT1G52000,1,0.000000,0.000000,-0.000000,0.076324,0.304791,0.000000,0.000000,-0.796831,0.000000,...,0.000000,-0.000000,-0.000000,-0.189457,-0.000000,0.000000,-0.000000,0.000000,-0.000000,0.000000
AT2G40610,1,0.000000,0.143296,-0.030482,0.000000,0.000000,0.024463,0.000000,-0.000000,0.000000,...,0.000000,-0.043489,-0.000000,-0.000000,-0.000000,0.399359,-0.000000,0.019381,-0.000000,0.000000
AT5G42800,1,0.000000,0.000000,-0.000000,0.000000,0.000000,0.000000,0.000000,-0.000000,0.000000,...,0.000000,-0.035711,-0.000000,-0.000000,-0.000000,0.000000,-0.000000,0.000000,-0.000000,0.000000
AT2G34430,1,0.248632,0.000000,-0.125661,0.228973,0.000000,0.000000,0.000000,-0.000000,0.000000,...,0.000000,-0.000000,-0.000000,-0.000000,-0.000000,0.000000,-0.000000,0.009690,-0.000000,0.000000


### 2.4 Similar to step one. You may also save the analysis object and load the analysis object later to complete other analyses

In [None]:
analysis.save_obj("data/analysis_obj/ConSReg_obj_feature_mat_generated.pkl")

In [None]:
analysis = load_obj("data/analysis_obj/ConSReg_obj_feature_mat_generated.pkl")

## Step 3. Perform evaluation (Note this may take a long time for large dataset. You may skip this step since it is only intended to demonstrate classifier performance)

### 3.1 Compute AUC-ROC and AUC-PRC from corss-validation (CV) using LRLASSO method. 
Here, mean and standard deviation of AUC-ROC and AUC-PRC were reporeted from five replicates of CV 

In [13]:
analysis.eval_by_cv(ml_engine = 'lrlasso',rep = 5, n_jobs = 32)

Performing cross-validation for each feature matrix using lrlasso engine...
Old evaluation results will be ovewritten


[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done   1 tasks      | elapsed:   30.5s
[Parallel(n_jobs=32)]: Done   2 out of  44 | elapsed:   33.3s remaining: 11.6min
[Parallel(n_jobs=32)]: Done   5 out of  44 | elapsed:   34.0s remaining:  4.4min
[Parallel(n_jobs=32)]: Done   8 out of  44 | elapsed:   35.9s remaining:  2.7min
[Parallel(n_jobs=32)]: Done  11 out of  44 | elapsed:   37.5s remaining:  1.9min
[Parallel(n_jobs=32)]: Done  14 out of  44 | elapsed:   40.2s remaining:  1.4min
[Parallel(n_jobs=32)]: Done  17 out of  44 | elapsed:   46.5s remaining:  1.2min
[Parallel(n_jobs=32)]: Done  20 out of  44 | elapsed:   48.6s remaining:   58.3s
[Parallel(n_jobs=32)]: Done  23 out of  44 | elapsed:   51.0s remaining:   46.6s
[Parallel(n_jobs=32)]: Done  26 out of  44 | elapsed:   59.8s remaining:   41.4s
[Parallel(n_jobs=32)]: Done  29 out of  44 | elapsed:  1.2min remaining:   36.0s
[Parallel(n_jobs=32)]: Done  32 out of  44 | elapse

Done


[Parallel(n_jobs=32)]: Done  44 out of  44 | elapsed:  1.7min remaining:    0.0s
[Parallel(n_jobs=32)]: Done  44 out of  44 | elapsed:  1.7min finished


<ConSReg.main.ConSReg at 0x7fad74622978>

Check the CV results

In [14]:
analysis.auroc

Unnamed: 0,diff_name,auroc_mean_UR,auroc_std_UR,auroc_mean_DR,auroc_std_DR
0,PRJEB10930_3_T-PRJEB10930_3_C.csv,0.851986,0.015209,0.807076,0.015445
1,GSE81202_14-GSE81202_12.csv,0.856556,0.0114,0.856728,0.006942
2,GSE81202_18-GSE81202_16.csv,0.841938,0.005814,0.824144,0.00786
3,GSE63406_6-GSE63406_4.csv,0.827699,0.006553,0.860982,0.013506
4,GSE67332_5-GSE67332_4.csv,0.801529,0.005378,0.831494,0.004638
5,PRJNA324514_11-PRJNA314076_1.csv,0.833421,0.0071,0.812786,0.004372
6,GSE69510_4-GSE69510_3.csv,0.880442,0.010548,0.82832,0.018051
7,GSE63406_3-GSE63406_1.csv,0.874188,0.006204,0.840987,0.014558
8,PRJNA324514_6-PRJNA314076_1.csv,0.823324,0.010366,0.83204,0.00515
9,GSE80565_3-GSE80565_4.csv,0.875108,0.008878,0.829168,0.025153


## Step 4 Generate importance score for each TF and GRN for each differential contrast (May take a long time)

### 4.1 Generate importance scores

In [15]:
analysis.compute_imp_score(n_resampling = 200, n_jobs = 32, verbose = True)

Existing importance scores will be overwritten.
Performing stability selection and compute importance score for each TF...


[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done   2 out of  10 | elapsed:    1.9s remaining:    7.5s
[Parallel(n_jobs=32)]: Done  10 out of  10 | elapsed:    2.9s finished
[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    1.7s
[Parallel(n_jobs=32)]: Done 200 out of 200 | elapsed:    2.3s finished
[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done   2 out of  10 | elapsed:    4.7s remaining:   18.7s
[Parallel(n_jobs=32)]: Done  10 out of  10 | elapsed:    5.3s finished
[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    3.1s
[Parallel(n_jobs=32)]: Done 200 out of 200 | elapsed:    4.4s finished
[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done   2 out of  10 | 

[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    2.3s
[Parallel(n_jobs=32)]: Done 200 out of 200 | elapsed:    3.3s finished
[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done   2 out of  10 | elapsed:    1.7s remaining:    7.0s
[Parallel(n_jobs=32)]: Done  10 out of  10 | elapsed:    2.0s finished
[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 137 out of 200 | elapsed:    0.9s remaining:    0.4s
[Parallel(n_jobs=32)]: Done 200 out of 200 | elapsed:    1.3s finished
[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done   2 out of  10 | elapsed:    6.9s remaining:   27.5s
[Parallel(n_jobs=32)]: Done  10 out of  10 | elapsed:    7.9s finished
[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    4.1s
[Parallel(n_jobs=32)]: Done 200 out of 20

[Parallel(n_jobs=32)]: Done   2 out of  10 | elapsed:    4.4s remaining:   17.7s
[Parallel(n_jobs=32)]: Done  10 out of  10 | elapsed:    5.0s finished
[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    1.4s
[Parallel(n_jobs=32)]: Done 200 out of 200 | elapsed:    2.0s finished
[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done   2 out of  10 | elapsed:    1.3s remaining:    5.4s
[Parallel(n_jobs=32)]: Done  10 out of  10 | elapsed:    1.6s finished
[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 200 out of 200 | elapsed:    1.3s finished
[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done   2 out of  10 | elapsed:    8.0s remaining:   32.0s
[Parallel(n_jobs=32)]: Done  10 out of  10 | elapsed:    8.8s finished
[Parallel(n_jobs=32)]: Using bac

[Parallel(n_jobs=32)]: Done  10 out of  10 | elapsed:   10.7s finished
[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    5.8s
[Parallel(n_jobs=32)]: Done 200 out of 200 | elapsed:    8.3s finished
[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done   2 out of  10 | elapsed:    3.3s remaining:   13.4s
[Parallel(n_jobs=32)]: Done  10 out of  10 | elapsed:    3.6s finished
[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    2.2s
[Parallel(n_jobs=32)]: Done 200 out of 200 | elapsed:    3.2s finished
[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done   2 out of  10 | elapsed:    3.6s remaining:   14.2s
[Parallel(n_jobs=32)]: Done  10 out of  10 | elapsed:    3.8s finished
[Parallel(n_jobs=32)]: Using backend LokyBackend wi

[Parallel(n_jobs=32)]: Done   2 out of  10 | elapsed:    4.0s remaining:   16.2s
[Parallel(n_jobs=32)]: Done  10 out of  10 | elapsed:    4.5s finished
[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    1.2s
[Parallel(n_jobs=32)]: Done 200 out of 200 | elapsed:    2.0s finished
[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done   2 out of  10 | elapsed:    6.2s remaining:   24.7s
[Parallel(n_jobs=32)]: Done  10 out of  10 | elapsed:    7.2s finished
[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    3.1s
[Parallel(n_jobs=32)]: Done 200 out of 200 | elapsed:    4.0s finished
[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done   2 out of  10 | elapsed:    3.6s remaining:   14.2s
[Parallel(n_jobs=32)]: Done  10 out of  1

Done


[Parallel(n_jobs=32)]: Done 200 out of 200 | elapsed:   16.1s finished


<ConSReg.main.ConSReg at 0x7fad74622978>

### 4.2 Generate GRN for each differential contrast

In [16]:
analysis.gen_networks(imp_cutoff = 0.5, verbose = True)

Existing networks will be overwritten.
Generating networks...
Done


<ConSReg.main.ConSReg at 0x7fad74622978>

## Step 5. Save analysis result

In [17]:
# Cross-validation result
analysis.auroc.to_csv("results/bulk_analysis/auroc_result.csv")
analysis.auprc.to_csv("results/bulk_analysis/auprc_result.csv")

# Importance scores
analysis.imp_scores_UR.to_csv("results/bulk_analysis/imp_score_UR.csv")
analysis.imp_scores_DR.to_csv("results/bulk_analysis/imp_score_DR.csv")

# Networks were saved in the format of edge list
for diff_name, network in zip(analysis._diff_name_list, analysis.networks_UR):
    network.to_csv("results/bulk_analysis/{}_UR_network.csv".format(diff_name))
    
for diff_name, network in zip(analysis._diff_name_list, analysis.networks_DR):
    network.to_csv("results/bulk_analysis/{}_DR_network.csv".format(diff_name))