In [1]:
import pandas as pd
import os
import re

from ConSReg.ConSReg import ConSReg
from ConSReg.ConSReg import load_obj

File names of input data

In [2]:
# Dap-seq narrow peak files
dap_file_list = os.listdir("data/dap_seq_all_peaks/")
dap_files = [ "data/dap_seq_all_peaks/" + file for file in dap_file_list if re.match(".*narrowPeak",file) is not None]

# ATAC-seq peak file
atac_file = "data/atac_seq_all_merged.bed"

# Arabidopsis genome annotation file
gff_file = "data/TAIR10_GFF3_genes.gff"

# Differential contrast result generated by DESeq2
diff_file_list = os.listdir("data/diff_evalB/")
diff_files = [ "data/diff_evalB/" + file for file in diff_file_list if re.match(".*csv",file) is not None]

## Step 1. Preprocessing the datasets

### 1.1 Read and preprocess all data files

In [3]:
analysis = ConSReg()
analysis.preprocess(dap_files = dap_files, diff_files = diff_files, atac_file = atac_file, gff_file=gff_file)

Merging DAP-seq peaks...





>> preparing features information...		
 
2019-07-17 11:14:08 PM
 


>> identifying nearest features...		
 
2019-07-17 11:14:08 PM
 


>> calculating distance from peak to TSS...	
 
2019-07-17 11:15:24 PM
 


>> assigning genomic annotation...		
 
2019-07-17 11:15:24 PM
 


>> assigning chromosome lengths			
 
2019-07-17 11:15:50 PM
 


>> done...					
 
2019-07-17 11:15:50 PM
 


Done
Overlapping DAP-seq with ATAC-seq...
Done
Reading diff tables...
Done


<ConSReg.ConSReg.ConSReg at 0x7fa81d0a7438>

### 1.2 You may save the analysis object as pickle file and load it later to resume analysis

In [4]:
analysis.save_obj("data/analysis_obj/ConSReg_obj_preprocessed.pkl")

<ConSReg.ConSReg.ConSReg at 0x7fa81d0a7438>

### 1.3 Alternatively, you may load a previously saved object which already has the datasets preprocessed. This saves proprocessing time

In [5]:
analysis = load_obj("data/analysis_obj/ConSReg_obj_preprocessed.pkl")

## Step 2. Generate feature matrices

### 2.1 Generate feature matrices for each differential contrast

In [6]:
analysis.gen_feature_mat(neg_type='udg')

Existing feature matrices will be overwritten.
Generating feature matrices...
Done


<ConSReg.ConSReg.ConSReg at 0x7fa81d0e6c50>

### 2.2 View one feature matrix. analysis._feature_mat_list is a list with all feature matrices in it.
- len(analysis.feature_mat_list) is equal to number of differential contrasts. And each element itself is a two element list with UR feature matrix as first element and DR feature matrix as second element.
- Each element is a pandas dataframe. You may save the feature matrix by calling .to_csv() function. For example: `analysis._feature_mat_list[0][0].to_csv("feature_matrix.csv")` can save one feature matrix as csv file.

In [7]:
analysis._feature_mat_list[0][0]

Unnamed: 0,label,AT1G12260,AT1G43700,AT1G19210,AT1G62300,AT1G34190,AT5G46830,AT2G45420,AT3G01220,AT4G01250,...,AT1G12630,AT4G36620,AT1G18330,AT3G09600,AT2G18350,AT3G20770,AT2G40620,AT5G13910,AT1G03840,AT2G33550
AT2G46970,1,-0.000000,0.000000,-0.000000,-0.000000,0.000000,0.000000,0.0,-0.000000,0.611992,...,0.000000,-0.0,0.208551,0.111396,0.000000,0.000000,0.000000,-0.000000,-0.191872,-0.000000
AT2G23170,1,-0.000000,0.000000,-0.078672,-0.000000,0.000000,0.000000,0.0,-0.000000,0.000000,...,0.000000,-0.0,0.000000,0.000000,0.000000,0.000000,0.000000,-0.000000,-0.000000,-0.000000
AT5G07010,1,-0.000000,0.000000,-0.000000,-0.000000,0.000000,0.000000,0.0,-0.000000,0.000000,...,0.014068,-0.0,0.543578,0.000000,0.000000,0.000000,0.089513,-0.000000,-0.156727,-0.000000
AT3G21330,1,-0.000000,0.000000,-0.000000,-0.000000,0.000000,0.000000,0.0,-0.000000,0.000000,...,0.000000,-0.0,0.220660,0.000000,0.317217,0.000000,0.000000,-0.000000,-0.000000,-0.000000
AT1G75450,1,-0.000000,0.000000,-0.000000,-0.000000,0.000000,0.000573,0.0,-0.024928,0.675532,...,0.000000,-0.0,0.000000,0.000000,0.000000,0.000000,0.000000,-0.000000,-0.000000,-0.000000
AT4G37770,1,-0.104406,0.000000,-0.110359,-0.000000,0.000000,0.000000,0.0,-0.000000,0.000000,...,0.014068,-0.0,0.000000,0.000000,0.000000,0.000000,0.000000,-0.000000,-0.000000,-0.000000
AT4G31380,1,-0.000000,0.000000,-0.000000,-0.000000,0.000000,0.000000,0.0,-0.000000,0.000000,...,0.000000,-0.0,0.000000,0.000000,0.000000,0.000000,0.000000,-0.000000,-0.000000,-0.000000
AT1G04180,1,-0.102855,0.000000,-0.035512,-0.000000,0.000000,0.023157,0.0,-0.251774,0.177244,...,0.000000,-0.0,0.000000,0.000000,0.000000,0.000000,0.000000,-0.000000,-0.000000,-0.000000
AT5G02540,1,-0.000000,0.000000,-0.000000,-0.000000,0.000000,0.000000,0.0,-0.000000,0.000000,...,0.000000,-0.0,0.000000,0.000000,0.000000,0.000000,0.000000,-0.000000,-0.000000,-0.000000
AT3G21320,1,-0.000000,0.000000,-0.220718,-0.276289,0.084531,0.023157,0.0,-0.000000,0.675532,...,0.014068,-0.0,0.391538,0.000000,0.000000,0.000000,0.000000,-0.000000,-0.191872,-0.000000


### 2.3 Similar to step one. You may also save the analysis object and load the analysis object later to complete other analyses

In [8]:
analysis.save_obj("data/analysis_obj/ConSReg_obj_feature_mat_generated.pkl")

<ConSReg.ConSReg.ConSReg at 0x7fa81d0e6c50>

In [9]:
analysis = load_obj("data/analysis_obj/ConSReg_obj_feature_mat_generated.pkl")

## Step 3. Perform evaluation (Note this may take a long time for large dataset. You may skip this step since it is only intended to demonstrate classifier performance)

### 3.1 Compute AUC-ROC and AUC-PRC from corss-validation (CV) using LRLASSO method. 
Here, mean and standard deviation of AUC-ROC and AUC-PRC were reporeted from five replicates of CV 

In [4]:
analysis.eval_by_cv(ml_engine = 'lrlasso',rep = 5, n_jobs = 44)

Performing cross-validation for each feature matrix using lrlasso engine...
Old evaluation results will be ovewritten


[Parallel(n_jobs=44)]: Using backend LokyBackend with 44 concurrent workers.
[Parallel(n_jobs=44)]: Done   1 tasks      | elapsed:   32.9s
[Parallel(n_jobs=44)]: Done   2 out of  44 | elapsed:   33.8s remaining: 11.8min
[Parallel(n_jobs=44)]: Done   5 out of  44 | elapsed:   37.5s remaining:  4.9min
[Parallel(n_jobs=44)]: Done   8 out of  44 | elapsed:   38.4s remaining:  2.9min
[Parallel(n_jobs=44)]: Done  11 out of  44 | elapsed:   40.7s remaining:  2.0min
[Parallel(n_jobs=44)]: Done  14 out of  44 | elapsed:   41.7s remaining:  1.5min
[Parallel(n_jobs=44)]: Done  17 out of  44 | elapsed:   43.2s remaining:  1.1min
[Parallel(n_jobs=44)]: Done  20 out of  44 | elapsed:   43.7s remaining:   52.4s
[Parallel(n_jobs=44)]: Done  23 out of  44 | elapsed:   50.1s remaining:   45.7s
[Parallel(n_jobs=44)]: Done  26 out of  44 | elapsed:   59.8s remaining:   41.4s
[Parallel(n_jobs=44)]: Done  29 out of  44 | elapsed:  1.2min remaining:   36.1s
[Parallel(n_jobs=44)]: Done  32 out of  44 | elapse

Done


[Parallel(n_jobs=44)]: Done  44 out of  44 | elapsed:  1.5min remaining:    0.0s
[Parallel(n_jobs=44)]: Done  44 out of  44 | elapsed:  1.5min finished


<ConSReg.ConSReg.ConSReg at 0x7fee660241d0>

Check the CV results

In [5]:
analysis.auroc

Unnamed: 0,diff_name,auroc_mean_UR,auroc_std_UR,auroc_mean_DR,auroc_std_DR
0,GSE81202_13-GSE81202_11.csv,0.838265,0.011172,0.885731,0.009195
1,GSE72806_3-GSE72806_1.csv,0.823117,0.006926,0.843879,0.003968
2,GSE69510_2-GSE69510_1.csv,0.874383,0.016706,0.852047,0.022058
3,PRJEB10930_4_T-PRJEB10930_4_C.csv,0.832124,0.018862,0.829033,0.016037
4,GSE81202_18-GSE81202_16.csv,0.842126,0.006949,0.827755,0.004292
5,GSE67332_5-GSE67332_4.csv,0.812208,0.010673,0.829531,0.010433
6,GSE81202_17-GSE81202_15.csv,0.844734,0.005367,0.838345,0.00599
7,GSE80565_7-GSE80565_8.csv,0.860174,0.011864,0.799766,0.023943
8,GSE63406_5-GSE63406_3.csv,0.826804,0.00588,0.848921,0.009165
9,GSE80565_9-GSE80565_10.csv,0.846018,0.022634,0.81687,0.02476


## Step 4 Generate importance score for each TF and GRN for each differential contrast (May take a long time)

### 4.1 Generate importance scores

In [6]:
analysis.compute_imp_score(n_resampling = 200, n_jobs = 44, verbose = True)

Existing importance scores will be overwritten.
Performing stability selection and compute importance score for each TF...
Done


<ConSReg.ConSReg.ConSReg at 0x7fee660241d0>

### 4.2 Generate GRN for each differential contrast

In [7]:
analysis.gen_networks(imp_cutoff = 0.5, verbose = True)

Existing networks will be overwritten.
Generating networks...
Done


<ConSReg.ConSReg.ConSReg at 0x7fee660241d0>

## Step 5. Save analysis result

In [None]:
# Cross-validation result
analysis.auroc.to_csv("results/bulk_analysis/auroc_result.csv")
analysis.auprc.to_csv("results/bulk_analysis/auprc_result.csv")

# Importance scores
analysis.imp_scores_UR.to_csv("results/bulk_analysis/imp_score_UR.csv")
analysis.imp_scores_DR.to_csv("results/bulk_analysis/imp_score_DR.csv")

# Networks were saved in the format of edge list
for diff_name, network in zip(analysis._diff_name_list, analysis.networks_UR):
    network.to_csv("results/bulk_analysis/{}_UR_network.csv".format(diff_name))
    
for diff_name, network in zip(analysis._diff_name_list, analysis.networks_DR):
    network.to_csv("results/bulk_analysis/{}_DR_network.csv".format(diff_name))