## Integrate the results of 4 ML models

In [2]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import json
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the DMP cohort table and build a lookup from the last three characters
# of each entry in the 'number' column to the matching entry in 'name'.
DMP_names = pd.read_csv('datafile/DMPCohort-names.csv', sep=',')
name_dict = {
    number[-3:]: name
    for number, name in zip(DMP_names['number'], DMP_names['name'])
}

In [8]:
# Load, for each of the four models, the feature-importance array of the full
# model together with the matching -logP values.
def _load_model_outputs(importance_path, pvalue_path):
    """Read one model's importance array (.npy) and its -logP dict (.json)."""
    importance = np.load(importance_path)
    with open(pvalue_path, 'r') as handle:
        pvalues = json.load(handle)
    return importance, pvalues


# DT
DT_real_array, DT_pvalue_dict = _load_model_outputs(
    'full_model/DT_importance.npy', 'datafile/pvalues-log10-DT.json')

# RF
RF_real_array, RF_pvalue_dict = _load_model_outputs(
    'full_model/RF_importance.npy', 'datafile/pvalues-log10-RF.json')

# XGB
XGB_real_array, XGB_pvalue_dict = _load_model_outputs(
    'full_model/XGB_importance.npy', 'datafile/pvalues-log10-XGB.json')

# lightGBM
lightGBM_real_array, lightGBM_pvalue_dict = _load_model_outputs(
    'full_model/lightGBM_importance.npy', 'datafile/pvalues-log10-lightGBM.json')

In [10]:
# Sanity check of argsort: sorting `a` gives ascending order, sorting `-a`
# gives descending order (this is how the features are ranked).
a = np.array([1, 3, 5, 2, 6])
b = a.argsort()      # indices of `a` from smallest to largest value
c = (-a).argsort()   # indices of `a` from largest to smallest value
print(a, b, c)

[1 3 5 2 6] [0 3 1 2 4] [4 2 1 3 0]


In [10]:
# Select the first 20 features with -logP > 1.3 for every model, separately
# for the pathway features (array positions [0:205]) and the gut features
# (array positions [205:]), and write each selection to a JSON file.
#
# The selection logic is shared by all four models, so it lives in one helper
# instead of being copy-pasted per model and per feature group.

FEATURE_ID_OFFSET = 446    # array position p is stored under key str(p + 446)
N_PATHWAY_FEATURES = 205   # positions [0:205] are pathway, [205:] are gut
MIN_NEG_LOG_P = 1.3        # keep features whose -logP is strictly above this
TOP_N = 20                 # number of features kept per group


def select_top_features(importance, sort_arg, pvalue_dict, names,
                        start=0, stop=None, top_n=TOP_N,
                        min_neg_log_p=MIN_NEG_LOG_P, offset=FEATURE_ID_OFFSET):
    """Pick the highest-ranked significant features inside a position range.

    Parameters
    ----------
    importance : 1-D array of feature importances, indexed by array position.
    sort_arg : iterable of array positions, most important first
        (e.g. ``np.argsort(-importance)``).
    pvalue_dict : dict mapping ``str(position + offset)`` to a -logP value.
    names : dict mapping ``str(position + offset)`` to a feature name.
    start, stop : half-open position range ``[start, stop)`` to select from;
        ``stop=None`` means "no upper bound".
    top_n : maximum number of features to return.
    min_neg_log_p : a feature is kept only if its -logP is strictly greater.
    offset : value added to an array position to obtain the feature id.

    Returns
    -------
    dict
        Feature id -> record with the keys 'rank' (1-based, in selection
        order), 'index', 'name', 'pvalue(-logP)' and 'feature importance'.
        Insertion order follows the rank.

    Raises
    ------
    KeyError
        If an in-range feature id is missing from ``pvalue_dict``, or a
        selected feature id is missing from ``names``.
    """
    selected = {}
    for pos in sort_arg:
        if len(selected) >= top_n:
            break
        if pos < start or (stop is not None and pos >= stop):
            continue
        # int() turns the numpy integer into a plain int before formatting.
        feature_id = str(int(pos) + offset)
        neg_log_p = pvalue_dict[feature_id]
        if neg_log_p > min_neg_log_p:
            # len(selected) is evaluated before the insertion, so the first
            # accepted feature gets rank 1.
            selected[feature_id] = {
                'rank': len(selected) + 1,
                'index': feature_id,
                'name': names[feature_id],
                'pvalue(-logP)': neg_log_p,
                'feature importance': float(importance[pos]),
            }
    return selected


def export_top_features(model_name, importance, pvalue_dict, names):
    """Rank one model's features, select the top pathway/gut ones and save them.

    Writes ``datafile/top20-feature-<model_name>-pathway.json`` and
    ``datafile/top20-feature-<model_name>-gut.json``.

    Returns
    -------
    tuple
        ``(sort_arg, pathway_dict, gut_dict)`` where ``sort_arg`` is
        ``np.argsort(-importance)`` and the two dicts are the selections
        produced by ``select_top_features``.
    """
    sort_arg = np.argsort(-importance)  # descending importance
    pathway = select_top_features(importance, sort_arg, pvalue_dict, names,
                                  stop=N_PATHWAY_FEATURES)
    gut = select_top_features(importance, sort_arg, pvalue_dict, names,
                              start=N_PATHWAY_FEATURES)
    for group, features in (('pathway', pathway), ('gut', gut)):
        out_path = 'datafile/top20-feature-{}-{}.json'.format(model_name, group)
        with open(out_path, 'w') as out:
            json.dump(features, out, indent=2)
    return sort_arg, pathway, gut


# DT
DT_sort_arg, DT_dict_pathway, DT_dict_gut = export_top_features(
    'DT', DT_real_array, DT_pvalue_dict, name_dict)

# RF
RF_sort_arg, RF_dict_pathway, RF_dict_gut = export_top_features(
    'RF', RF_real_array, RF_pvalue_dict, name_dict)

# XGB
XGB_sort_arg, XGB_dict_pathway, XGB_dict_gut = export_top_features(
    'XGB', XGB_real_array, XGB_pvalue_dict, name_dict)

# lightGBM
lightGBM_sort_arg, lightGBM_dict_pathway, lightGBM_dict_gut = export_top_features(
    'lightGBM', lightGBM_real_array, lightGBM_pvalue_dict, name_dict)