#### About this notebook: This Jupyter notebook is used to 1) select the FM-factors that differ between the drug-treated group and the non-drug-treated group, 2) cluster the FM-factor matrix into different states, and 3) annotate each state using the FM-factors. The input files are the outputs of Example1-generate-FM-matrix.ipynb.

#### Load libraries

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import rpy2.robjects as ro
import rpy2.robjects.numpy2ri
rpy2.robjects.numpy2ri.activate()
import scipy 
import scipy.stats as ss
import statsmodels
from statsmodels import stats
from statsmodels.stats import multitest
sys.path.append('../Script/')
import FM_States
import FM_selection
import TF
import rpy2
from rpy2.robjects.packages import importr

# Import the R packages through rpy2; ConsensusClusterPlus and pheatmap are
# the packages called from the %%R cells of this notebook.
base = importr('base')
CC = importr('ConsensusClusterPlus')
pheatmap = importr('pheatmap')
# Absolute path of the parent directory; the input/output paths are built from it.
ROOT_DIR = os.path.abspath("../")

#### Load parameters: Load FM-factors with drug treatment and FM-factors without drug treatment

In [None]:
# Paths used by this notebook: the FM-factor matrix, the comparison (control)
# FM-factor matrix, and the directory that receives all outputs.
para_in = dict(
    input_FM_matrix_file=os.path.join(ROOT_DIR, "Sample_output/Example1/matrix_factor_mcf7.csv"),
    input_FM_matrix_file_comp=os.path.join(ROOT_DIR, "Sample_output/Example1/ctrl_factor_mcf7.csv"),
    out_dir=ROOT_DIR + "/Sample_output/Example1/",
)

In [None]:
# Read both FM-factor tables the same way: first row is the header and the
# unnamed first column becomes the index.
ctrl_factor, matrix_factor = (
    pd.read_csv(para_in[path_key], header=0, index_col='Unnamed: 0')
    for path_key in ('input_FM_matrix_file_comp', 'input_FM_matrix_file')
)


#### Select the FM-factors which show difference between the drug treated group and non-drug treated group

In [None]:
Diff_factors = FM_States.Diff_comp(ctrl_factor, matrix_factor)

B = Diff_factors.values
nr,nc = B.shape
factors = Diff_factors.index.values
columns = Diff_factors.columns.values
#result_matrix = ro.r.matrix(B, nrow=nr, ncol=nc, dimnames = [ Diff_factors.index.values,Diff_factors.columns.values])
#result_matrix = ro.r.matrix(B, nrow=nr, ncol=nc, dimnames = list( Diff_factors.index.values, Diff_factors.columns.values))

out_dir = para_in['out_dir']
%load_ext rpy2.ipython



In [None]:
%%R -i B,nr,nc,factors,columns,out_dir  -o select_modules -w 5 -h 5 --units in -r 300

# Select the FM-factors that pass the effect-size and FDR cutoffs and draw
# the effect-size vs. -log10(FDR) plots.
#
# Arguments:
#   B        values of the comparison table, reshaped with matrix(B, nr, nc).
#   nr, nc   number of rows and columns of the table.
#   factors  row names of the table (one per FM-factor).
#   columns  column names; must contain 'SizeEffect' and '-log10(FDR)'.
#   se       effect-size cutoff: "up" when SizeEffect > se, "down" when
#            SizeEffect < -se.
#   fdr      cutoff on the '-log10(FDR)' column (a factor passes when > fdr).
#
# Side effects: writes select_modules.csv, Fig1_sizeEffect_label.pdf and
# Fig1_sizeEffect_nolabel.pdf, using the global `out_dir` as a plain string
# prefix of each file name.
#
# Returns the names of the selected factors, "up" factors first.
plt_sizeeffect <- function(B, nr, nc, factors, columns, se, fdr) {
    result_matrix <- matrix(B, nr, nc, dimnames = list(c(factors), c(columns)))
    axes <- c("SizeEffect", "-log10(FDR)")

    effect <- result_matrix[, "SizeEffect"]
    passes_fdr <- result_matrix[, "-log10(FDR)"] > fdr

    # drop = FALSE keeps a single selected factor as a 1 x 2 matrix. Without
    # it the row collapses to a vector, its row name is lost and the
    # rownames<- assignment below fails.
    up <- result_matrix[which(effect > se & passes_fdr), axes, drop = FALSE]
    down <- result_matrix[which(effect < -1 * se & passes_fdr), axes, drop = FALSE]

    select_modules <- c(rownames(up), rownames(down))
    select_matrix <- rbind(up, down)
    rownames(select_matrix) <- select_modules
    write.csv(select_matrix, file = paste0(out_dir, "select_modules.csv"))

    # Draw one PDF; both figures share everything except the text labels.
    draw_figure <- function(file_name, with_labels) {
        pdf(paste0(out_dir, file_name), width = 6, height = 6)
        # Close the device even when a plotting call fails.
        on.exit(dev.off(), add = TRUE)

        plot(result_matrix[, axes, drop = FALSE])
        # xlim is not a graphical parameter of points() and only produced
        # warnings, so it is no longer passed.
        points(down, pch = 20, col = "blue")
        points(up, pch = 20, col = "red")
        if (with_labels) {
            # text() stops with "zero-length 'labels' specified" when a
            # group is empty, so label only non-empty groups.
            if (nrow(down) > 0) {
                text(down[, 1], down[, 2], rownames(down),
                     col = "blue", pos = 1, cex = 0.3)
            }
            if (nrow(up) > 0) {
                text(up[, 1], up[, 2], rownames(up),
                     col = "red", pos = 1, cex = 0.3)
            }
        }
        abline(h = fdr, col = "gray60", lty = 2)
        abline(v = se, col = "gray60", lty = 2)
        abline(v = -1 * se, col = "gray60", lty = 2)
        invisible(NULL)
    }

    draw_figure("Fig1_sizeEffect_label.pdf", with_labels = TRUE)
    draw_figure("Fig1_sizeEffect_nolabel.pdf", with_labels = FALSE)

    select_modules
}


# Run the selection with an effect-size cutoff of 0.2 and a -log10(FDR)
# cutoff of 6; select_modules is exported back to Python by the -o flag.
select_modules = plt_sizeeffect(B,nr,nc,factors,columns, 0.2, 6) ## These two cutoffs are somewhat arbitrary; adjust them for your own data
#print(select_modules)


#### Normalize the FM-factors with drug treatment using the distribution of FM-factors without drug treatment

In [None]:
reference_factor_select_modules = FM_States.compare_factors_between_experimental_to_reference(ctrl_factor[list(select_modules)], matrix_factor[list(select_modules)])

matrix_factor_selected = reference_factor_select_modules.loc[:,list(select_modules)]
B = matrix_factor_selected.values
nr,nc = B.shape
samples_name = ro.r.c(matrix_factor_selected.index.values)
factor_name  = ro.r.c(matrix_factor_selected.columns.values)

Br = ro.r.matrix(B, nrow=nr, ncol=nc, dimnames = [ matrix_factor_selected.index.values, matrix_factor_selected.columns.values])
%load_ext rpy2.ipython

####  Cluster the FM-factor matrix into different states using Consensus Clustering method

In [None]:
%%R -i Br,samples_name,factor_name  -o Cluster_results -w 6 -h 6 --units in -r 300

# Restore the row and column names on the imported matrix.
rownames(Br) <- samples_name
colnames(Br) <- factor_name

# Consensus clustering with k-means on Euclidean distances, up to K = 8.
# Every column of Br is scaled first, then the matrix is transposed so the
# rows of Br become the columns that are clustered.
# `reps` is spelled out in full; the former `rep = 100` only worked through
# partial argument matching.
Cluster_results <- ConsensusClusterPlus(
    t(apply(Br, 2, scale)),
    maxK = 8,
    reps = 100,
    pItem = 0.9,
    pFeature = 0.9,
    clusterAlg = "km",
    distance = "euclidean",
    seed = 2314
)

#print(Cluster_results)

In [None]:
%%R -i Cluster_results

# Proportion of ambiguous clustering (PAC) for K = 2..8: the share of
# consensus-matrix entries that are neither above `threshold` nor below
# 1 - `threshold`.
threshold <- 0.8
cluster_counts <- seq(2, 8)

cc_score <- vapply(cluster_counts, function(k) {
    consensus <- Cluster_results[[k]]$consensusMatrix
    decided <- length(consensus[consensus > threshold]) +
        length(consensus[consensus < (1 - threshold)])
    1 - decided / length(consensus)
}, numeric(1))

barplot(cc_score,
        ylab = 'proportion of ambiguous clustering (PAC)',
        names.arg = paste0("K=", cluster_counts))


In [None]:
# These settings only reflect the example data set. For another data set the
# number of clusters, their order and their colours must be chosen by the user.

# Number of clusters (K) whose consensus assignment is used.
cluster_sele = 5

# Original cluster labels in display order: the i-th entry is shown as state S<i>.
list_labe = [
    "S2",
    "S3",
    "S1",
    "S4",
    "S5",
]

# Annotation colour of each displayed state, in the same order.
colors = [
    '#ff8080',
    'pink',
    '#fdd5ac',
    '#8895df',
    'cyan',
]

In [None]:
%%R -i Cluster_results,Br,samples_name,factor_name,select_modules,cluster_sele,colors,list_labe,out_dir   -w 5.5 -h 4 --units in -r 300
# Heatmap of the selected FM-factors with samples grouped by state. The state
# order and colours are the user-defined `list_labe` and `colors`.
library(pheatmap)
# Restore the row and column names on the imported matrix.
rownames(Br) = samples_name
colnames(Br) = factor_name

# Draw the state-annotated heatmap of the selected FM-factors.
#
# Arguments:
#   Cluster_results  list whose element [[cl_num_sele]] provides
#                    `consensusClass`, one cluster id per row of Br.
#   Br               matrix with samples in rows and FM-factors in columns.
#   select_modules   names of the columns of Br to display.
#   cl_num_sele      index (number of clusters) of the assignment to use.
#
# Reads the globals `list_labe` (original cluster labels "S<id>" in display
# order; entry i is renamed to "S<i>"), `colors` (one colour per displayed
# state) and `out_dir`.
#
# Writes <out_dir>/annotation_col.csv and returns the same data frame: one
# column 'States', row names are the sample names in plotting order.
plot_heatmap <- function(Cluster_results, Br, select_modules, cl_num_sele) {
    # list_labe / colors may be lists with one entry per state, so take the
    # first value of every entry. seq_along() is safe for empty input.
    state_idx <- seq_along(list_labe)
    list_labe_R <- unlist(lapply(state_idx, function(k) list_labe[[k]][1]))
    color_list <- unlist(lapply(state_idx, function(k) colors[[k]][1]))

    # Original label -> displayed label, and displayed label -> colour.
    Rename_Label <- paste0("S", state_idx)
    names(Rename_Label) <- list_labe_R
    names(color_list) <- Rename_Label
    anno_colors <- list(States = color_list)

    # Original cluster label of every sample, named by sample.
    cl_sam <- Cluster_results[[cl_num_sele]]$consensusClass
    labe <- paste0("S", cl_sam)
    names(labe) <- rownames(Br)

    # Samples of clusters missing from list_labe would vanish silently.
    unmapped <- setdiff(unique(labe), list_labe_R)
    if (length(unmapped) > 0) {
        warning("Samples in cluster(s) ", paste(unmapped, collapse = ", "),
                " are not listed in `list_labe` and are left out of the heatmap.",
                call. = FALSE)
    }

    # Sample names per state, in display order. States with more than two
    # samples are ordered by hierarchical clustering of their rows of Br.
    states <- unique(list_labe_R)
    members_by_state <- lapply(states, function(state) {
        members <- names(labe)[which(labe == state)]
        if (length(members) > 2) {
            members <- members[hclust(dist(Br[members, ]))$order]
        }
        members
    })
    names_1 <- unlist(members_by_state)

    # Displayed state label repeated once per sample of that state.
    annotation_col_1 <- unname(rep(Rename_Label[states],
                                   times = lengths(members_by_state)))
    annotation_col_1 <- data.frame(annotation_col_1)
    rownames(annotation_col_1) <- names_1
    colnames(annotation_col_1) <- c('States')

    write.csv(annotation_col_1, file = paste0(out_dir, "/annotation_col.csv"))

    # Blue -> light grey -> red, 60 steps on each side.
    color_forpheatmap <- c(colorRampPalette(c("#0000ff", "#e1e5e5"))(60),
                           colorRampPalette(c("#e1e5e5", "#ff0000"))(60))

    pheatmap(t(Br[names_1, select_modules]),
             annotation_col = annotation_col_1,
             annotation_colors = anno_colors[1],
             labels_col = '',
             fontsize = 6,
             cluster_cols = FALSE,
             scale = 'row',
             color = color_forpheatmap,
             fontsize_row = 6)

    annotation_col_1
}

# Draw the heatmap for the chosen number of clusters. plot_heatmap also writes
# the returned state annotation to annotation_col.csv under out_dir.
annotation_col_1 = plot_heatmap(Cluster_results, Br, select_modules,cluster_sele)

#print(annotation_col_1)

In [None]:
# Selected FM-factor columns of the control-compared matrix.
matrix_factor_selected = reference_factor_select_modules.loc[:, list(select_modules)]
# Reload the per-sample state labels written to annotation_col.csv by the
# heatmap cell (the duplicated assignment target was removed).
annotation_col_1 = pd.read_csv(out_dir + "/annotation_col.csv", index_col='Unnamed: 0')
#Features = FM_States.Get_features_one_vs_all_others(matrix_factor_selected, 0.01, 0.8, annotation_col_1,select_modules) ##user define threshold

#### Annotation by functional modules

In [None]:
# One-vs-one comparison of the states. dic_state is indexed by the following
# cells as dic_state[state][key], with keys 'up', 'down', 'up_tf' and 'down_tf'.
# NOTE(review): 0.01, 1 and 3 are presumably the p-value cutoff, the effect-size
# cutoff and a count threshold; confirm against FM_States.Get_features_one_vs_one.
Features, dic_state = FM_States.Get_features_one_vs_one(matrix_factor_selected, 0.01, 1, 3, annotation_col_1,select_modules)

#### Heatmap visualization of altered modules

In [None]:
# Collect every module listed under any key of any state (duplicates removed).
modules = []
for state_entry in dic_state.values():
    for module_list in state_entry.values():
        modules.extend(module_list)
modules = list(set(modules))

# One column per state, one row per module:
#    1   module listed only under 'up'
#   -1   module listed only under 'down'
#    0.5 module listed under both
#    0   module listed under neither
# The state names are hard-coded for the five states of this example.
result = pd.DataFrame()
for state in ["S1", "S2", "S3", "S4", "S5"]:
    up_modules = dic_state[state]['up']
    down_modules = dic_state[state]['down']
    cur_state = []
    for module in modules:
        in_up = module in up_modules
        in_down = module in down_modules
        if in_up and in_down:
            cur_state.append(0.5)
        elif in_up:
            cur_state.append(1)
        elif in_down:
            cur_state.append(-1)
        else:
            # Plain fallthrough. The former `elif module not in dic_state[state]`
            # tested the dict keys and could leave the column one entry short.
            cur_state.append(0)
    result[state] = cur_state

result.index = modules
# Drop modules that are 0 in every state.
result = result[~(result == 0).all(axis=1)]

In [None]:
%%R -i result -w 3 -h 3 --units in -r 300

# Heatmap of the per-state module calls (rows = modules, columns = states).
# `result` only holds the codes -1 (down), 0 (neither), 0.5 (both) and 1 (up).
Features <- result

# One colour per code. Explicit breaks pin every code to its colour; with
# automatic breaks the mapping shifts whenever -1 or 1 is absent from the data.
color_forpheatmap2 <- c("blue", "white", "yellow", "red")
code_breaks <- c(-1.5, -0.5, 0.25, 0.75, 1.5)

pheatmap(Features,
         border_color = "grey",
         border_width = 0.05,
         color = color_forpheatmap2,
         breaks = code_breaks,
         cellwidth = 9,
         cellheight = 6,
         cluster_cols = FALSE,
         fontsize = 6,
         fontsize_row = 6,
         fontsize_col = 6,
         legend = FALSE)

#### Heatmap visualization of the alteration of transcriptional regulation strength

In [None]:
# Collect every module listed under any key of any state (duplicates removed).
modules = []
for state_entry in dic_state.values():
    for module_list in state_entry.values():
        modules.extend(module_list)
modules = list(set(modules))

# One column per state, one row per module, using the transcriptional
# regulation lists:
#    1   module listed only under 'up_tf'
#   -1   module listed only under 'down_tf'
#    0.5 module listed under both
#    0   module listed under neither
# The state names are hard-coded for the five states of this example.
result = pd.DataFrame()
for state in ["S1", "S2", "S3", "S4", "S5"]:
    up_modules = dic_state[state]['up_tf']
    down_modules = dic_state[state]['down_tf']
    cur_state = []
    for module in modules:
        in_up = module in up_modules
        in_down = module in down_modules
        if in_up and in_down:
            cur_state.append(0.5)
        elif in_up:
            cur_state.append(1)
        elif in_down:
            cur_state.append(-1)
        else:
            # Plain fallthrough. The former `elif module not in dic_state[state]`
            # tested the dict keys and could leave the column one entry short.
            cur_state.append(0)
    result[state] = cur_state

result.index = modules
# Drop modules that are 0 in every state.
result = result[~(result == 0).all(axis=1)]

In [None]:
%%R -i result -w 3 -h 3 --units in -r 300

# Heatmap of the per-state transcriptional-regulation calls (rows = modules,
# columns = states). `result` only holds the codes -1 (down), 0 (neither),
# 0.5 (both) and 1 (up).
Features <- result

# One colour per code. Explicit breaks pin every code to its colour; with
# automatic breaks the mapping shifts whenever -1 or 1 is absent from the data.
color_forpheatmap2 <- c("blue", "white", "yellow", "red")
code_breaks <- c(-1.5, -0.5, 0.25, 0.75, 1.5)

pheatmap(Features,
         border_color = "grey",
         border_width = 0.05,
         color = color_forpheatmap2,
         breaks = code_breaks,
         cellwidth = 9,
         cellheight = 6,
         fontsize = 6,
         cluster_cols = FALSE,
         fontsize_row = 6,
         fontsize_col = 6,
         legend = FALSE)

#### Heatmap visualization of all altered functional-module factors

In [None]:
Features_r = Features.values
nr,nc = Features.shape
samples_name = ro.r.c(Features.index.values)
factor_name = ro.r.c(Features.columns.values)

Features_r = ro.r.matrix(Features_r, nrow=nr, ncol=nc, dimnames = [ Features.index.values, Features.columns.values])

%load_ext rpy2.ipython

In [None]:
%%R -i Features_r,nr,nc,samples_name,factor_name -w 10 -h 6 --units in -r 300
# Rebuild the feature matrix with its row and column labels.
Features_r <- matrix(Features_r, nr, nc, dimnames = list(samples_name, factor_name))

## Need revision
# Keep the rows named S1..Sn, in that order (n is hard-coded for this example).
# The vector is no longer called `order`, which masked base::order.
n <- 5
state_order <- paste0("S", seq_len(n))
Features <- Features_r[state_order, ]
#print(Features)

# Blue-white-red palette: the number of steps on each side is the rounded
# minimum / maximum of the matrix times 100.
low_steps <- round(min(Features) * 100) * -1
high_steps <- round(max(Features) * 100)
color_max <- max(low_steps, high_steps)
color_forpheatmap2 <- c(
    colorRampPalette(c("#0000ff", "white"))(color_max)[(color_max - low_steps):color_max],
    colorRampPalette(c("white", "#ff0000"))(color_max)[1:high_steps]
)

# cluster_rows is spelled out in full; `cluster_row` only worked through
# partial argument matching. The unused `label` object was removed.
pheatmap(Features,
         border_color = "grey",
         border_width = 0.05,
         color = color_forpheatmap2,
         cellwidth = 12,
         cellheight = 12,
         cluster_cols = TRUE,
         cluster_rows = FALSE,
         fontsize_row = 6,
         fontsize_col = 8,
         legend = FALSE)

In [None]:
# Save the feature table returned by Get_features_one_vs_one to the output directory.
Features.to_csv(para_in['out_dir']+"/Features.csv")

####  FM-factors that show a significant difference between at least one state and the remaining states, using the Wilcoxon rank-sum test (p<0.01 and |Effect size| > 1). These features are selected to predict therapeutic vulnerabilities.

In [None]:
# One-vs-all-others comparison of the states. Per the section heading, 0.01 is
# the p-value cutoff and 1 the |effect size| cutoff.
# Note: this rebinds `Features`, replacing the one-vs-one table.
Features = FM_States.Get_features_one_vs_all_others(matrix_factor_selected, 0.01, 1, annotation_col_1,select_modules)
# Save the table to the output directory.
Features.to_csv(para_in['out_dir']+"/Features_diff_all.csv")

#### For more annotation about the transcription factor regulation, drug concentration or drug targets, please check Example1_annotation_tf, Example1_annotation_drugResponse.ipynb and Example1_annotation_targets.