## Data Preparation 

We use the BEELINE benchmark to evaluate the performance of DeepSEM.
The data preparation process is shown below.
1. Download the raw data provided by the BEELINE benchmark from https://doi.org/10.5281/zenodo.3378975.
2. Use the preprocessing code in https://github.com/Murali-group/Beeline/blob/master/generateExpInputs.py to generate the dataset.

We also provide demo data in `../demo_data/GRN_inference/input`.

# Run DeepSEM using the following commands:
for the cell type non-specific GRN inference task: python main.py --task non_celltype_GRN --data_file demo_data/GRN_inference/input/500_STRING_hESC/data.csv --net_file demo_data/GRN_inference/input/500_STRING_hESC/label.csv  --save_name out


for the cell type specific GRN inference task: python main.py --task celltype_GRN --data_file demo_data/GRN_inference/input/500_ChIP-seq_hESC/data.csv --net_file demo_data/GRN_inference/input/500_ChIP-seq_hESC/label.csv  --save_name out

In [None]:
# Run DeepSEM on the two demo datasets via shell escapes:
#   1) the 500_STRING_hESC input with --task non_celltype_GRN
#   2) the 500_ChIP-seq_hESC input with --task celltype_GRN
# NOTE(review): these commands use demo_data/... while the evaluation cells read
# ../demo_data/..., so the two assume different working directories — confirm
# where the notebook is expected to be launched from.
# NOTE(review): both runs pass --save_name out; confirm the second run does not
# overwrite the results of the first.
! python main.py --task non_celltype_GRN --data_file demo_data/GRN_inference/input/500_STRING_hESC/data.csv --net_file demo_data/GRN_inference/input/500_STRING_hESC/label.csv --save_name out
! python main.py --task celltype_GRN --data_file demo_data/GRN_inference/input/500_ChIP-seq_hESC/data.csv --net_file demo_data/GRN_inference/input/500_ChIP-seq_hESC/label.csv --save_name out

# Calculate EPR values

In [15]:
import pandas as pd
# EPR of the demo prediction against the ground-truth network.
# With k = number of ground-truth edges and n = number of candidate pairs,
# the value computed below is hits / (k**2 / n), i.e. (hits / k) / (k / n):
# the hit rate among the k highest-ranked edges relative to the density of
# true edges among all candidate pairs.
output = pd.read_csv('../demo_data/GRN_inference/output/500_STRING_hESC_demo_output.tsv', sep='\t')
# Edges are ranked by magnitude only; the sign of the weight is discarded.
output['EdgeWeight'] = output['EdgeWeight'].abs()
output = output.sort_values('EdgeWeight', ascending=False)

label = pd.read_csv('../demo_data/GRN_inference/input//500_STRING_hESC/label.csv')
TFs = set(label['Gene1'])             # every Gene1 (regulator) in the label file
Genes = TFs | set(label['Gene2'])     # every gene that appears in the label file
label_set = set(label['Gene1'] + '|' + label['Gene2'])

# Candidate edges: regulator is a labelled TF, target is a labelled gene,
# and self-loops are excluded. `isin` is the vectorised form of a per-row
# membership test.
is_candidate = (
    output['Gene1'].isin(TFs)
    & output['Gene2'].isin(Genes)
    & (output['Gene1'] != output['Gene2'])
)
# Keep the k top-ranked candidates (the frame is already sorted by weight).
output = output[is_candidate].iloc[:len(label_set)]

predicted_set = set(output['Gene1'] + '|' + output['Gene2'])
n_hits = len(predicted_set & label_set)
# All TF x gene pairs minus the one self-pair per TF.
n_candidates = len(TFs) * len(Genes) - len(TFs)
n_hits / (len(label_set) ** 2 / n_candidates)


4.12143991002342

# Calculate AUPR ratio values

In [27]:
from sklearn.metrics import average_precision_score
import numpy as np
import pandas as pd

# AUPR ratio of the demo prediction: average precision over every
# (TF, gene) pair divided by the fraction of pairs that are true edges.
output = pd.read_csv('../demo_data/GRN_inference/output/500_STRING_hESC_demo_output.tsv', sep='\t')
# Edges are ranked by magnitude only; the sign of the weight is discarded.
output['EdgeWeight'] = output['EdgeWeight'].abs()
output = output.sort_values('EdgeWeight', ascending=False)

label = pd.read_csv('../demo_data/GRN_inference/input//500_STRING_hESC/label.csv')
TFs = set(label['Gene1'])             # every Gene1 (regulator) in the label file
Genes = TFs | set(label['Gene2'])     # every gene that appears in the label file
output = output[output['Gene1'].isin(TFs) & output['Gene2'].isin(Genes)]

# Edges are keyed as 'Gene1|Gene2' everywhere in this cell. The label set,
# the prediction lookup and the loop below must all use the same key format;
# otherwise no pair is ever recognised as a true edge and the ratio is 0/0.
label_set = set(label['Gene1'] + '|' + label['Gene2'])
# Predicted weight per edge; if an edge is listed more than once, the last
# occurrence in the sorted frame wins.
res_d = dict(zip(output['Gene1'] + '|' + output['Gene2'], output['EdgeWeight']))

y_true, y_score = [], []
# NOTE(review): self-pairs (tf == gene) are scored here, whereas the EPR cell
# drops them; kept as in the original loop — confirm this is intended.
for tf in TFs:
    for gene in Genes:
        edge = tf + '|' + gene
        y_true.append(1 if edge in label_set else 0)
        # Pairs without a prediction get -1, below any absolute weight,
        # so they rank after every predicted edge.
        y_score.append(res_d.get(edge, -1))

average_precision_score(y_true, y_score) / np.mean(y_true)

2.0942791999205195

# Ensemble DeepSEM result

In [None]:
# Ensemble: average the absolute edge weight of every (Gene1, Gene2) pair
# across the 10 replicate runs.
# The replicate index is formatted into the file name; the original path
# contained the literal text 'rep_i.csv', so the loop variable was never used.
# NOTE(review): assumes replicates are saved as rep_0.csv ... rep_9.csv —
# confirm the index base against the files on disk.
# NOTE(review): the path points outside this repository; adjust it to where
# the cross-validation outputs actually live.
rep_path = '../../scGRN/Upload/GRN_inference_benchmark/cross_validation/500_STRING_hESC/rep_{}.csv'
rep_frames = [pd.read_csv(rep_path.format(i), sep='\t') for i in range(10)]
res = pd.concat(rep_frames)
res['EdgeWeight'] = res['EdgeWeight'].abs()
res.groupby(['Gene1','Gene2']).mean()