## About this notebook
#### This Jupyter notebook generates FM-factors for breast cancer cell lines before drug treatment. The transcriptomic data were derived from cancer cell lines in the GDSC study (https://dx.doi.org/10.1016%2Fj.cell.2016.06.017, https://www.cancerrxgene.org/).

In [4]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import rpy2.robjects as ro
import rpy2.robjects.numpy2ri
rpy2.robjects.numpy2ri.activate()
import scipy 
import scipy.stats as ss
import statsmodels
from statsmodels import stats
from statsmodels.stats import multitest
sys.path.append('../Script/')
import FM_States
import FM_selection
import TF
import rpy2
from rpy2.robjects.packages import importr
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import Query
# Load R packages through rpy2's importr and keep a Python handle to each.
# None of these handles is referenced in the cells visible here; presumably
# the FM_* helper modules rely on the packages being loaded -- TODO confirm.
base = importr('base')
CC = importr('ConsensusClusterPlus')
pheatmap = importr('pheatmap')
bezier = importr('bezier')
# Absolute path of the parent of the current working directory; the para
# configuration builds its input/output paths from it.
ROOT_DIR = os.path.abspath("../")

In [2]:
# Notebook configuration: where inputs are read from, where results are
# written, and which functional modules are selected.
para = {
    'output_dir': ROOT_DIR + "/Sample_output/Example3/",
    'input_dir': ROOT_DIR + "/Sample_input/Example3/",
}

# The FM-factor matrix lives inside output_dir, so derive its path from that
# entry. Keys are added in the same order as before:
# output_dir, input_dir, output_fmf_file, sele_modules.
para['output_fmf_file'] = para['output_dir'] + "matrix_factor_brca_gdsc.csv"

# Names of the functional modules to keep.
para['sele_modules'] = [
    'Translation',
    'Nucleotide metabolism',
    'Signal transduction',
    'Amino acid metabolism',
    'Folding sorting and degradation',
    'Replication and repair',
    'Carbohydrate metabolism',
    'Membrane transport',
    'Cellular community - eukaryotes',
    'Lipid metabolism',
    'Metabolism of other amino acids',
    'Transcription',
    'Xenobiotics biodegradation and metabolism',
    'Signaling molecules and interaction',
    'Energy metabolism',
    'Transport and catabolism',
    'Glycan biosynthesis and metabolism',
    'Metabolism of cofactors and vitamins',
    'Cell motility',
    'Cell cycle',
    'Apoptosis',
    'Cellular senescence',
    'p53 signaling pathway',
]

In [5]:
# SQL selecting (COSMIC_identifier, gene_symbol, RMA_proc_basalExp) rows from
# the GDSC basal expression table, restricted to cell lines whose
# Cell_Line_Details entry has TCGA_Cancer_Type = 'BRCA' and
# Whole_Exome_Sequencing = 'Y'.
query_EXP = '''
select COSMIC_identifier, gene_symbol, RMA_proc_basalExp from `isb-cgc-04-0002.GDSC_v0.Basal_Gene_Exp`
where COSMIC_identifier in 
(select COSMIC_identifier from `isb-cgc-04-0002.GDSC_v0.Cell_Line_Details`
where TCGA_Cancer_Type = 'BRCA' and Whole_Exome_Sequencing = 'Y' )
'''

# Run the query through the project's Query helper; the result is presumably
# a pandas DataFrame (it is written out with .to_csv below) -- TODO confirm.
Expr = Query.Query_FromGCloud(query_EXP)
# NOTE(review): this writes BRCA_GDSC_Expr.csv, but the next cell reads
# matrix_exp_brca_gdsc.csv. No step in this notebook turns one into the
# other -- confirm how the matrix file is produced.
Expr.to_csv(para['input_dir'] + "/BRCA_GDSC_Expr.csv")

In [6]:
# Load the expression matrix from the sample input directory, using the CSV
# column labelled "Unnamed: 0" as the row index.
# NOTE(review): this reads matrix_exp_brca_gdsc.csv, not the
# BRCA_GDSC_Expr.csv written by the query cell; presumably a separately
# prepared file -- confirm.
BRCA_expr = pd.read_csv(para['input_dir'] +"/matrix_exp_brca_gdsc.csv", index_col="Unnamed: 0")

In [7]:
# Load the KEGG functional-module definitions through the project helper.
dic_module, KEGG_level2, KEGG_level3, KEGG_modules = FM_selection.load_function_modules("KEGG")

# Rows of the KEGG module table whose name is listed in para['sele_modules'].
module_selected_gmt = KEGG_modules.loc[KEGG_modules['name'].isin(para['sele_modules'])]

# Selected module names that exist in the KEGG table, de-duplicated and kept
# in table order. This is the same set as the former
# list(set(KEGG_modules['name']).intersection(para['sele_modules'])), but the
# order no longer depends on per-session string hash randomisation, so the
# outputs are reproducible between kernel restarts.
selected_modules = module_selected_gmt['name'].drop_duplicates().tolist()

# Create the output directory up front so the to_csv calls below cannot fail
# on a fresh checkout.
os.makedirs(para['output_dir'], exist_ok=True)

# Transcription-factor pairs for the selected modules (computed by the
# project's TF helper). A copy of the list is passed so each helper gets its
# own list object, as it did before.
TF_pairs = TF.get_tfpairs_for_select_pathways(BRCA_expr, list(selected_modules), dic_module)
TF_pairs.to_csv(para['output_dir'] + "/TF_pairs.csv")

# Build the FM-factor matrix with every factor type switched on; the meaning
# of each flag is defined by FM_States.generate_factor.
matrix_factor = FM_States.generate_factor(
    BRCA_expr,
    list(selected_modules),
    module_selected_gmt,
    TF_pairs,
    UP=True,
    DOWN=True,
    ssGSEA=True,
    TF=True,
    absolute=True,
)
# Use the configured path instead of repeating the file name; it points at
# the same file, matrix_factor_brca_gdsc.csv inside the output directory.
matrix_factor.to_csv(para['output_fmf_file'])

#### The results can be used to explore the FM-factors and their associations with drug sensitivity in Example3_association_analysis_FM-Facotors_drugResponse.ipynb, or to predict drug sensitivity in Example3_predict_drug_response_rf.ipynb.