##### Generating the transcription factor (TF) - functional module regulation by estimating the enrichment of the target genes of each TF in each functional module. Before running the following pipeline, make sure you have downloaded all essential input data from the shared directory https://osf.io/34xnm/?view_only=5b968aebebe14d4c97ff9d7ce4cb5070, which is described in the manuscript "Functional module states framework reveals cell states for drug and target prediction" by Guangrong Qin et al.

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import scipy 
import scipy.stats as ss
import statsmodels
from statsmodels import stats
from statsmodels.stats import multitest
sys.path.append('../Script/')
import FM_States
import FM_selection
import TF

# Absolute path of the parent of the current working directory; the input and
# output paths used by this notebook are built relative to it.
ROOT_DIR = os.path.abspath("../")

##### Loading the gene expression matrix and selecting the functional modules. The functional modules are from the KEGG pathways.

In [2]:
# Notebook parameters.
#   output_dir      - directory the TF-module pair table is written to
#   input_expr_file - CSV file holding the gene expression matrix
#   out_dir         - second output location (not referenced by the cells shown
#                     here; presumably shared with the other example notebooks)
#   sele_modules    - names of the KEGG functional modules to analyse
para_in = dict(
    output_dir=ROOT_DIR+"/Sample_output/TF_pairs/",
    input_expr_file=os.path.join(ROOT_DIR, "Sample_input/Example1/Sample1_data_MCF7_drugs_CTRP2.csv"),
    out_dir=ROOT_DIR+"/Sample_output/Sample1",
    sele_modules=[
        'Translation',
        'Nucleotide metabolism',
        'Signal transduction',
        'Amino acid metabolism',
        'Folding sorting and degradation',
        'Replication and repair',
        'Carbohydrate metabolism',
        'Membrane transport',
        'Cellular community - eukaryotes',
        'Lipid metabolism',
        'Metabolism of other amino acids',
        'Transcription',
        'Xenobiotics biodegradation and metabolism',
        'Signaling molecules and interaction',
        'Energy metabolism',
        'Transport and catabolism',
        'Glycan biosynthesis and metabolism',
        'Metabolism of cofactors and vitamins',
        'Cell motility',
        'Cell cycle',
        'Apoptosis',
        'Cellular senescence',
        'p53 signaling pathway',
    ],
)

In [3]:
## Create the output directory (including any missing parent directories)

output_dir = para_in['output_dir']

if os.path.isdir(output_dir):
    print ("INfO:  %s already exists!" % output_dir)
else:
    try:
        # makedirs (unlike mkdir) also creates missing parents such as
        # Sample_output/, and exist_ok=True tolerates the directory being
        # created by someone else between the check above and this call.
        os.makedirs(output_dir, exist_ok=True)
    except OSError:
        # Kept as a printed message rather than an exception, as before; the
        # later write into output_dir will fail if the directory is missing.
        print ("Creation of the directory %s failed" % output_dir)
    else:
        print ("Successfully created the directory %s " % output_dir)

INfO:  /project/Sample_output/TF_pairs/ already exists!


##### 1) Load genes from the selected functional modules from KEGG pathways; 2) Load the gene expression matrix; 3) Get the TF-module pairs by estimating the enrichment of target genes for one TF in one functional module.


In [4]:
# Load the functional modules from KEGG pathways and keep the rows whose
# name is one of the selected modules.
selected_modules = para_in['sele_modules']
dic_module, KEGG_level2, KEGG_level3, KEGG_modules = FM_selection.load_function_modules("KEGG")
is_selected = KEGG_modules['name'].isin(selected_modules)
# NOTE(review): module_selected_gmt is not used by the cells shown here.
module_selected_gmt = KEGG_modules.loc[is_selected]

# Load the gene expression matrix; the 'Unnamed: 0' column becomes the index.
data_matrix_MCF7_CTRP2 = pd.read_csv(
    para_in['input_expr_file'],
    index_col='Unnamed: 0',
)

# Get the TF-module pairs by estimating the enrichment of target genes for
# one TF in one functional module.
TF_pairs = TF.get_tfpairs_for_select_pathways(
    data_matrix_MCF7_CTRP2,
    selected_modules,
    dic_module,
)


In [5]:
# Write the TF-module pair table into the output directory. os.path.join is
# used because output_dir already ends with a separator, so plain string
# concatenation with "/TF_pairs.csv" produced a doubled "//" in the path.
TF_pairs.to_csv(os.path.join(para_in['output_dir'], "TF_pairs.csv"))


#### The result files will be used in Example1-generate-FM-matrix.ipynb