In [27]:
import os
# Move into the project root so that every relative path used below resolves.
# The location can be overridden with the WMBIO_PROJECT_ROOT environment variable
# when the notebook runs on another machine; the default is the original checkout,
# so existing runs behave exactly as before.
os.chdir(os.environ.get("WMBIO_PROJECT_ROOT", "/home/wmbio/WORK/gitworking/Multi-omics-intergration"))
os.getcwd()

'/home/wmbio/WORK/gitworking/Multi-omics-intergration'

## **Module**

In [28]:
from wmbio import * 

## **Best Sub-Group Selection**

* **FILE PATH**

In [29]:
# PATH
CANCER_TYPE = "COAD"

# All directories hang off the current working directory. Each one keeps its
# trailing "/" because later cells build file names by plain string concatenation.
_CWD = os.getcwd()
GROUP_PHTH = f"{_CWD}/group/"  # (sic) spelling kept: later cells reference this name
PNG_PATH = f"{_CWD}/png/"
GROUP_VALIDATION_PATH = f"{_CWD}/group_validation/"
DEG_PATH = f"{_CWD}/best_deg/"
RDATA_PATH = f"{_CWD}/RAW_DATA/GDC_PREPARE/"
RAW_PATH = f"{_CWD}/RAW_DATA/"

# DEA settings: method name plus the log2 fold-change and FDR cut-offs.
METHOD = 'deseq2'
LOGFC = 1
FDR = 0.05

In [30]:
# Load the validation scores, reading only the columns used below.
col = [
    'FILENAME', 'Log Rank Test', 'Silhouette',
    'RNA_ANOVA_F1', 'RNA_RF_F1',
    'miRNA_ANOVA_F1', 'miRNA_RF_F1',
    'Methylation_ANOVA_F1', 'Methylation_RF_F1',
]
group_score = pd.read_csv(GROUP_VALIDATION_PATH + CANCER_TYPE + '_validation.csv', usecols=col)

# Quantile thresholds: the median for Silhouette, the 70th percentile for each F1 score.
# NOTE(review): the printed labels say "Q3", but none of these is the 0.75 quantile —
# confirm whether the labels or the quantile levels are the intended ones.
SILHOUETTE = group_score['Silhouette'].quantile(.5)
RNA_ANOVA = group_score['RNA_ANOVA_F1'].quantile(.7)
RNA_RF = group_score['RNA_RF_F1'].quantile(.7)
MIRNA_ANOVAR = group_score['miRNA_ANOVA_F1'].quantile(.7)
MIRNA_RF = group_score['miRNA_RF_F1'].quantile(.7)
MT_ANOVAR = group_score['Methylation_ANOVA_F1'].quantile(.7)
MT_RF = group_score['Methylation_RF_F1'].quantile(.7)

print("SILHOUETTE Q3 : ", SILHOUETTE)
print("RNA_ANOVA Q3 : ", RNA_ANOVA)
print("RNA_RF Q3 : ", RNA_RF)
print("MIRNA_ANOVAR Q3 : ", MIRNA_ANOVAR)
print("MIRNA_RF Q3 : ", MIRNA_RF)
print("MT_ANOVAR Q3 : ", MT_ANOVAR)
print("MT_RF Q3 : ", MT_RF)

# Condition for Filtering
# A row passes when Silhouette reaches its threshold, the log-rank value is below 0.05,
# and for each omics layer at least one of the two F1 scores exceeds its threshold.
silhouette_ok = group_score['Silhouette'] >= SILHOUETTE
log_rank_ok = group_score['Log Rank Test'] < 0.05
rna_ok = (group_score['RNA_ANOVA_F1'] > RNA_ANOVA) | (group_score['RNA_RF_F1'] > RNA_RF)
mirna_ok = (group_score['miRNA_ANOVA_F1'] > MIRNA_ANOVAR) | (group_score['miRNA_RF_F1'] > MIRNA_RF)
methylation_ok = (group_score['Methylation_ANOVA_F1'] > MT_ANOVAR) | (group_score['Methylation_RF_F1'] > MT_RF)

filter_cond = silhouette_ok & log_rank_ok & rna_ok & mirna_ok & methylation_ok

SILHOUETTE Q3 :  0.3026
RNA_ANOVA Q3 :  86.1574074074074
RNA_RF Q3 :  86.71717171717171
MIRNA_ANOVAR Q3 :  80.06613756613757
MIRNA_RF Q3 :  78.7979797979798
MT_ANOVAR Q3 :  86.17063492063492
MT_RF Q3 :  87.09090909090908


In [31]:
# Keep only the rows that satisfy filter_cond, highest Silhouette first.
# Note: this rebinds group_score, so re-running this cell alone acts on the already-filtered frame.
group_score = group_score[filter_cond].sort_values(by="Silhouette", ascending=False)

In [32]:
# File names of the retained rows, in the current row order of group_score.
bestSubgroup = group_score["FILENAME"].tolist()

In [33]:
# Cap the number of subgroups at 100: when more are available, draw 100 of them
# at random with a fixed seed so that the selection is repeatable.
if len(bestSubgroup) > 100:
    # Reseeds the module-level `random` generator (affects later `random` calls too).
    random.seed(331)
    bestSubgroup = random.sample(bestSubgroup, k=100)

In [34]:
# Number of subgroups carried into the DEA step.
len(bestSubgroup)

58

* **DEA**

In [38]:
# Differential expression analysis (DEA) for every selected subgroup.
# A DEG table already present on disk is reused; otherwise deg_extract() is run.
# Either way the same log2FC / FDR cut-off is applied once, after the table is obtained.
# NOTE(review): the cut-off reads `log2FoldChange` and `padj`; confirm those columns
# exist in the tables produced when METHOD == 'all'.
dea_result = []
for best_group in bestSubgroup:

    DEG_CHECK = "_".join([CANCER_TYPE, METHOD.upper(), best_group]) + ".txt"
    SAMPLE_GROUP = GROUP_PHTH + CANCER_TYPE + "/" + CANCER_TYPE + "_GROUP_" + best_group + ".txt"
    deg_file = DEG_PATH + CANCER_TYPE + "/" + DEG_CHECK

    if os.path.isfile(deg_file):
        deg_list = pd.read_csv(deg_file, sep="\t")
    else:
        # DEG Extraction
        deg_list = deg_extract(log_fc=LOGFC, fdr=FDR,
                               cancer_type=CANCER_TYPE,
                               sample_group=SAMPLE_GROUP, deg_path=DEG_PATH,
                               file_name=best_group,
                               rdata_path=RDATA_PATH,
                               method=METHOD,
                               batch_removal=True,
                               raw_path=RAW_PATH)

    # cut-off
    deseq_filter = ((deg_list.log2FoldChange <= -(LOGFC)) | (deg_list.log2FoldChange >= LOGFC)) & (deg_list.padj < FDR)
    deg_list = deg_list.loc[deseq_filter, :]

    dea_result.append(deg_list)
    gc.collect()

# Filter DEA
# combine result
if METHOD == 'all':
    dea_combine = [deseq2_edger_combine(deg) for deg in dea_result]
elif METHOD == 'deseq2':
    dea_combine = [deg[["row", "log2FoldChange", "padj"]] for deg in dea_result]
else:
    # Previously an unknown METHOD fell through silently, leaving dea_combine
    # undefined (or stale from an earlier run) for the following cells.
    raise ValueError(f"Unsupported METHOD {METHOD!r}: expected 'all' or 'deseq2'")

if not dea_combine:
    # reduce() on an empty list fails with an unhelpful TypeError; say what happened instead.
    raise ValueError("bestSubgroup is empty: there are no DEA tables to combine")

# col_rename presumably tags each table's columns with its subgroup name and exposes
# the gene identifier as 'gene' (the merge key below) — confirm in wmbio.
dea_combine = [col_rename(deg, index, bestSubgroup) for index, deg in enumerate(dea_combine)]

# Outer-join all per-subgroup tables on 'gene' so that a gene found in any subgroup is kept.
dea_combine = reduce(lambda left, right: pd.merge(left, right, on='gene', how='outer'), dea_combine)

* **Blank Row**

In [39]:
# blank row calculation
blank_row = dea_combine.loc[:, dea_combine.columns.str.contains("[0-9]_log2FoldChange")].isnull().sum(axis=1) # serise
dea_combine['1-blank_ratio'] = blank_row.apply(lambda x : ((1 - (x / len(bestSubgroup))) * 100))

* **NT vs TP DEA (DESeq2)**

In [40]:
# NT vs TP differential expression for the selected cancer type.
nt_tp_deseq2 = deg_extract_normal(
    log_fc=0,
    pvalue=0.1,
    cancer_type=CANCER_TYPE,
    rdata_path=RDATA_PATH,
    deg_path=DEG_PATH,
    batch_removal=True,
)

# Keep gene id, fold change and p-value, relabelled so the table can be joined on 'gene'.
nt_tp_deseq2_col = (
    nt_tp_deseq2[['row', 'log2FoldChange', 'pvalue']]
    .set_axis(['gene', 'NT-TP_log2FoldChange', 'pvalue'], axis=1)
)

Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  

In [41]:
# Attach the NT-vs-TP columns to the combined subgroup table; genes without a match keep NaN.
result_combine = dea_combine.merge(nt_tp_deseq2_col, on='gene', how='left')

In [42]:
# Display the combined table (subgroup DEA columns, 1-blank_ratio, NT-vs-TP columns).
result_combine

Unnamed: 0,gene,SubGroup-20220427-060001_log2FoldChange,SubGroup-20220427-060001_padj,SubGroup-20220430-004133_log2FoldChange,SubGroup-20220430-004133_padj,SubGroup-20220429-185614_log2FoldChange,SubGroup-20220429-185614_padj,SubGroup-20220421-205602_log2FoldChange,SubGroup-20220421-205602_padj,SubGroup-20220421-025501_log2FoldChange,...,SubGroup-20220423-113340_padj,SubGroup-20220422-233617_log2FoldChange,SubGroup-20220422-233617_padj,SubGroup-20220323-043857_log2FoldChange,SubGroup-20220323-043857_padj,SubGroup-20220430-091018_log2FoldChange,SubGroup-20220430-091018_padj,1-blank_ratio,NT-TP_log2FoldChange,pvalue
0,ABAT,1.092802,9.728776e-14,,,1.152848,3.824600e-15,,,1.396301,...,,1.035756,2.840247e-13,,,1.256486,3.441428e-17,72.413793,,
1,ABCA4,-2.232752,8.270729e-14,,,-2.762217,9.557745e-21,,,-2.979125,...,6.629546e-07,-2.664077,7.304443e-19,-2.69202,2.130575e-19,-2.804696,6.799331e-21,84.482759,,
2,ACPP,-1.103564,5.558293e-11,,,-1.317642,1.331414e-16,,,-1.397364,...,2.873277e-10,-1.037415,1.726725e-09,,,-1.070577,6.203052e-10,58.620690,,
3,ACTG2,1.491554,7.568113e-12,,,,,,,,...,,1.157565,9.391733e-08,,,1.650670,1.075936e-16,39.655172,,
4,ADCY5,1.114685,5.324036e-07,,,,,,,,...,,1.274243,5.683762e-10,,,1.386005,8.496522e-12,34.482759,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3059,RDH16,,,,,,,,,,...,,,,,,-1.018076,4.348137e-07,1.724138,,
3060,PCDHB16,,,,,,,,,,...,,,,,,1.035861,9.920237e-07,1.724138,,
3061,COL21A1,,,,,,,,,,...,,,,,,1.043680,1.434225e-08,1.724138,,
3062,COX6B2,,,,,,,,,,...,,,,,,-1.030075,3.934752e-07,1.724138,,


* **Text mining**

In [43]:
# Fetch the text-mining table for this cancer type and left-join it onto the DEA result by gene.
# The table name comes from the notebook constant CANCER_TYPE, not from user input.
sql = 'SELECT * FROM Textmining.' + CANCER_TYPE
tm_df = query_tm_db(sql)
result_combine_tm = result_combine.merge(tm_df, on="gene", how='left')



* **DGIdb**

In [None]:
def dgidb_extract(gene_list):
    """Query DGIdb for ``gene_list`` through the project's R helper.

    Sources ``src/r-function.R`` (path relative to the current working
    directory), calls its ``dgidb_interaction`` function with ``gene_list``
    and converts the R result with rpy2's pandas converter.

    Returns the converted object (presumably a pandas DataFrame — confirm
    against the R helper).
    """
    # Sourcing on every call makes sure `dgidb_interaction` exists in R's global environment.
    ro.r['source']('src/r-function.R')
    dgidb_interaction = ro.globalenv['dgidb_interaction']

    r_result = dgidb_interaction(gene_list)

    with localconverter(ro.default_converter + pandas2ri.converter):
        return ro.conversion.rpy2py(r_result)

In [45]:
# Gene symbols to look up in DGIdb, in table order.
gene_list = result_combine_tm['gene'].tolist()

In [47]:
# Load the project's R helpers and fetch the `dgidb_interaction` R function.
# These are the same steps as the first half of dgidb_extract(), run inline.
r = ro.r
r['source']('src/r-function.R')  # path is relative to the current working directory
dgidb_r = ro.globalenv['dgidb_interaction']

In [48]:
# Run the R DGIdb lookup on the gene list. The result is still an R object here;
# a later cell converts it. The numbered lines in the output are printed by the R
# helper (presumably a progress counter — confirm in src/r-function.R).
dgidb_result = dgidb_r(gene_list)

[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
[1] 11
[1] 12
[1] 13
[1] 14
[1] 15
[1] 16
[1] 17
[1] 18
[1] 19
[1] 20
[1] 21
[1] 22
[1] 23
[1] 24
[1] 25
[1] 26
[1] 27
[1] 28
[1] 29
[1] 30
[1] 31


In [50]:
# Convert the R result to its pandas equivalent using rpy2's pandas converter.
# Note: this rebinds dgidb_result, so the cell expects the unconverted R object as
# input — re-run the lookup cell first before re-running this one.
with localconverter(ro.default_converter + pandas2ri.converter):
    dgidb_result = ro.conversion.rpy2py(dgidb_result)

In [51]:
# Display the converted DGIdb table (gene name, gene category, interaction count, interactions).
dgidb_result

Unnamed: 0,GENE_NAME,DGI_GENE_CATEGORY,DGI_COUNT,DGI(DRUG_NAME;SCORE;TYPE)
1,ABAT,"DRUGGABLE GENOME,ENZYME",7,VIGABATRIN;33.34;inhibitor&CHEMBL378577;4.55;i...
2,ABCA4,"TRANSPORTER,ABC TRANSPORTER,DRUGGABLE GENOME",0,
3,ACP3,"DRUGGABLE GENOME,ENZYME",0,
4,ACTG2,,0,
5,ADCY5,"KINASE,DRUGGABLE GENOME,ENZYME",1,CHEMBL401844;31.83;
...,...,...,...,...
2949,RDH16,"DRUGGABLE GENOME,SHORT CHAIN DEHYDROGENASE RED...",0,
2950,PCDHB16,,0,
2951,COL21A1,DRUGGABLE GENOME,0,
2952,COX6B2,,0,


In [54]:
# Preview the DEA + text-mining table with the DGIdb annotation joined on gene symbol.
# Genes that DGIdb reports under a different symbol get no match: in the displayed
# output ACPP is left empty while the DGIdb table lists ACP3.
result_combine_tm.merge(dgidb_result, left_on='gene', right_on='GENE_NAME', how='left')

Unnamed: 0,gene,SubGroup-20220427-060001_log2FoldChange,SubGroup-20220427-060001_padj,SubGroup-20220430-004133_log2FoldChange,SubGroup-20220430-004133_padj,SubGroup-20220429-185614_log2FoldChange,SubGroup-20220429-185614_padj,SubGroup-20220421-205602_log2FoldChange,SubGroup-20220421-205602_padj,SubGroup-20220421-025501_log2FoldChange,...,pvalue,type,SUPPORT,CONFIDENCE,LIFT,COUNT,GENE_NAME,DGI_GENE_CATEGORY,DGI_COUNT,DGI(DRUG_NAME;SCORE;TYPE)
0,ABAT,1.092802,9.728776e-14,,,1.152848,3.824600e-15,,,1.396301,...,,,,,,,ABAT,"DRUGGABLE GENOME,ENZYME",7.0,VIGABATRIN;33.34;inhibitor&CHEMBL378577;4.55;i...
1,ABCA4,-2.232752,8.270729e-14,,,-2.762217,9.557745e-21,,,-2.979125,...,,Colorectal Neoplasms,0.000035,1.00,1.183219,7.0,ABCA4,"TRANSPORTER,ABC TRANSPORTER,DRUGGABLE GENOME",0.0,
2,ACPP,-1.103564,5.558293e-11,,,-1.317642,1.331414e-16,,,-1.397364,...,,,,,,,,,,
3,ACTG2,1.491554,7.568113e-12,,,,,,,,...,,Colorectal Neoplasms,0.000015,1.00,1.183219,3.0,ACTG2,,0.0,
4,ADCY5,1.114685,5.324036e-07,,,,,,,,...,,Colorectal Neoplasms,0.000015,0.75,0.887415,3.0,ADCY5,"KINASE,DRUGGABLE GENOME,ENZYME",1.0,CHEMBL401844;31.83;
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3059,RDH16,,,,,,,,,,...,,,,,,,RDH16,"DRUGGABLE GENOME,SHORT CHAIN DEHYDROGENASE RED...",0.0,
3060,PCDHB16,,,,,,,,,,...,,,,,,,PCDHB16,,0.0,
3061,COL21A1,,,,,,,,,,...,,,,,,,COL21A1,DRUGGABLE GENOME,0.0,
3062,COX6B2,,,,,,,,,,...,,,,,,,COX6B2,,0.0,


In [55]:
# Write the DGIdb-annotated table to 'temp.csv' in the working directory, without the index column.
(
    result_combine_tm
    .merge(dgidb_result, left_on='gene', right_on='GENE_NAME', how='left')
    .to_csv('temp.csv', index=False)
)

* **Write Result**

In [None]:
# Write the DEA + NT-vs-TP table. `result_combine` already holds exactly this merge
# (dea_combine left-joined with nt_tp_deseq2_col on 'gene'), so it is reused rather
# than recomputed.
# NOTE(review): this targets the same 'temp.csv' as the DGIdb-annotated export earlier
# in the notebook and will overwrite it — confirm the intended output file name.
result_combine.to_csv('temp.csv', index=False)