# SMP-Mapper

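Metabolites only: this notebook colours the metabolite nodes of enriched PathBank pathway SVGs using the metabolite t-test results.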

In [14]:
import numpy as np
import pandas as pd
import requests
import json

import os
from bs4 import BeautifulSoup as bs
import re

from smp_mapper_func import *


# Set some global params
# Significance thresholds. Alpha values are set rather high so that a
# reasonable number of results passes the filters.
alpha_ttest = 0.05 # FDR threshold for the metabolite t-test results
alpha_pw_maca = 0.05 # threshold for pathway enrichment ("maca" presumably = MetaboAnalyst -- confirm)

# Input CSVs: pathway enrichment (PEA) results and metabolite t-test results.
# Both are read with pandas.read_csv further down.
MACA_PEA_RESULTS_FN = "~/Documents/zprivate-data/maca-pea-shg-hsa/pathway_results.csv"
METABS_TTEST_RESULTS_FN = "~/Documents/zprivate-data/metabs-de-ttest.csv"

# PathBank reference tables; both paths are handed to make_metab_ref_table().
PATHWAY_ID_REF_TBL_FN = "db/pathbank_all_pathways_nodesc.csv"
PATHBANK_METAB_REF_TBL_FN = "db/pathbank_all_metabolites.csv"

# Directory holding one "<PW ID>_simple.svg" file per pathway.
# NOTE(review): absolute, user-specific path -- the notebook will not run on
# another machine without editing it.
SIMPLE_SVG_DIR = "/Users/don/Documents/smp-mapper-metabs/db/pathbank_simple_svg/"
# Value matched against the "Species" column of the reference table.
SPECIES = "Rattus norvegicus"

# node colouring (hex colours written into the SVG circle attributes)
COLOUR_DEFAULT = "#7570b3" # purple: logFC within the cut-off, or missing
COLOUR_UP = "#1b9e77" # green: positive logFC
COLOUR_DOWN = "#d95f02" # orange: negative logFC

# Characters that char_check() treats as "special". The pattern is a raw
# string so that `\|` reaches the regex engine verbatim; in a normal string
# literal it is an invalid escape sequence and triggers a warning.
_SPECIAL_CHARS_RE = re.compile(r'[@_!#$%^&*()<>?/\|}{~:]')


def char_check(my_str):
    """Print ``my_str`` if it contains at least one special character.

    The special characters are ``@ _ ! # $ % ^ & * ( ) < > ? / | } { ~ :``.
    Strings without any of them produce no output.

    Parameters
    ----------
    my_str : str
        Text to inspect.

    Returns
    -------
    None
        The function only prints; it has no return value.
    """
    if _SPECIAL_CHARS_RE.search(my_str) is not None:
        print(my_str)

def isalnum_or_space(char):
    """Return True if ``char`` is alphanumeric or a single space, else False.

    ``char`` is converted with ``str()`` first, so non-string values are
    judged by their string form. A multi-character string is accepted when
    all of its characters are alphanumeric (``str.isalnum`` semantics).

    Always returns a ``bool``; the negative case used to fall off the end of
    the function and yield ``None``.
    """
    text = str(char)
    return text.isalnum() or text == " "

In [3]:
# ========= t-test results =========
# Loaded unchanged from the CSV named in the config cell.
d_metab_ttest = pd.read_csv(METABS_TTEST_RESULTS_FN)

# ========= PathBank reference table =========
# Build the table, keep only the rows for SPECIES, then drop exact duplicates.
d_metab_ref_tbl = (
    make_metab_ref_table(PATHWAY_ID_REF_TBL_FN, PATHBANK_METAB_REF_TBL_FN)
    .loc[lambda tbl: tbl["Species"] == SPECIES]
    .drop_duplicates()
)


In [5]:
# ========= Prep PEA MetaboAnalyst =========
d_pea_maca = pd.read_csv(MACA_PEA_RESULTS_FN)
# Keep only the significantly enriched pathways to make the table smaller.
# NOTE(review): the cut-off is hard-coded to 0.01 instead of using
# `alpha_pw_maca` -- confirm which value is intended.
# `.copy()` makes the result an independent frame, so the column edits below
# never act on a slice of the freshly read table (SettingWithCopyWarning).
d_pea_maca = d_pea_maca.loc[d_pea_maca["Holm adjust"] < 0.01].copy()

# The first column is treated as the pathway name; rename it so that it can be
# used as the join key against the reference table.
colnames_ls = list(d_pea_maca.columns)
colnames_ls[0] = "Pathway Name"
d_pea_maca.columns = colnames_ls

# Normalise the names on both sides so that the join compares like with like.
# This also updates d_metab_ref_tbl in place, as later cells read its
# sanitized "Pathway Name" column.
d_pea_maca["Pathway Name"] = d_pea_maca["Pathway Name"].map(
    lambda nm: sanitize_pathway_names(str(nm))
)
d_metab_ref_tbl["Pathway Name"] = d_metab_ref_tbl["Pathway Name"].map(
    lambda nm: sanitize_pathway_names(str(nm))
)

# Pathway-level lookup. The reference table repeats each pathway once per
# metabolite, so the selected columns must be de-duplicated; otherwise the
# left join below would multiply every PEA row by the number of metabolites
# in its pathway.
d_pw_lookup = d_metab_ref_tbl[
    ["Pathway Name", "PathBank ID", "PW ID", "Species"]
].drop_duplicates()

# Join on Pathway Name to get more IDs
d_pea_maca = pd.merge(d_pea_maca,
                      d_pw_lookup,
                      how="left",
                      on="Pathway Name")

# print rows with no matches found on `Pathway Name`
unmatched_names = d_pea_maca.loc[d_pea_maca["PW ID"].isna(), "Pathway Name"]
if len(unmatched_names) > 0:
    print("Pathways with no matching PathBank entry (no PW ID):")
    for nm in unmatched_names:
        print(nm)

Pathways with no matching PathBank entry (no PW ID):
Catecholamine Biosynthesis


In [6]:
# Create metabs_pw_dict: maps each enriched PW ID to the reference rows of
# that pathway, left-joined with the t-test statistics on "HMDB ID".
maca_dict = {}

# NaN marks PEA rows without a PathBank match; there is no pathway to look up
# for them, so they are skipped instead of producing an empty `nan` entry.
maca_pw_id_ls = list(d_pea_maca["PW ID"].dropna().unique())

# Only these reference columns are carried into the per-pathway tables.
colnames_ls = ['PathBank ID', 'Pathway Name', 'Pathway Subject', 'Species',
               'Metabolite ID', 'Metabolite Name', 'HMDB ID', 'KEGG ID', 'PW ID']

# Statistics to attach. pandas matches null merge keys against each other, so
# t-test rows without an HMDB ID are removed here; otherwise every reference
# metabolite lacking an HMDB ID would be paired with those unrelated rows.
d_ttest_stats = d_metab_ttest.loc[
    d_metab_ttest["HMDB ID"].notna(),
    ["HMDB ID", "p.value", "FDR", "logFC"],
]

for pw_id in maca_pw_id_ls:
    # Reference rows of this pathway, required columns only
    d_t = d_metab_ref_tbl.loc[d_metab_ref_tbl["PW ID"] == pw_id, colnames_ls]

    # Left join: metabolites absent from the t-test keep NaN statistics
    maca_dict[pw_id] = pd.merge(d_t, d_ttest_stats, how="left", on="HMDB ID")

In [16]:
# A node is coloured up/down only when logFC lies outside +/- this cut-off;
# otherwise it gets COLOUR_DEFAULT.
LOGFC_CUTOFF = 0.1
# Directory receiving one "<PW ID>.svg" per pathway.
SVG_OUT_DIR = "./test-out/"

# PW IDs of the enriched pathways. NaN (no PathBank match) is dropped up
# front; removing the string "nan" from the list afterwards raises ValueError
# whenever every pathway did find a match.
pw_id_ls = [str(x) for x in d_pea_maca["PW ID"].dropna().unique()]

os.makedirs(SVG_OUT_DIR, exist_ok=True)

for pw_id in pw_id_ls:
    # Read svg of the enriched pathway
    svg_path = os.path.join(SIMPLE_SVG_DIR, pw_id + "_simple.svg")
    with open(svg_path, "r", encoding="utf-8") as f:
        soup = bs(f.read(), "xml")

    d_pw = maca_dict[pw_id]
    print("Num metabs found in %s = %s" % (pw_id, len(d_pw["Metabolite ID"])))

    # One row per metabolite: the first occurrence supplies logFC and FDR,
    # which is the row a per-ID `.values[0]` lookup would pick as well.
    d_nodes = (
        d_pw.dropna(subset=["Metabolite ID"])
            .drop_duplicates(subset="Metabolite ID")
    )

    # Grab all metab_ids, and associated logfc and q-vals
    for metab_id, lfc, q_val in zip(d_nodes["Metabolite ID"],
                                    d_nodes["logFC"],
                                    d_nodes["FDR"]):
        # No FDR means the metabolite has no t-test result: leave the node
        # exactly as drawn in the source SVG.
        if pd.isna(q_val):
            continue

        # A missing logFC compares False on both sides and keeps the default.
        node_colour = COLOUR_DEFAULT
        if lfc > LOGFC_CUTOFF:
            node_colour = COLOUR_UP
        elif lfc < -LOGFC_CUTOFF:
            node_colour = COLOUR_DOWN

        # q_val is the t-test FDR, so it is judged against the t-test
        # threshold (not the pathway-enrichment one). A value exactly equal
        # to the threshold counts as not significant rather than matching
        # neither branch.
        is_significant = q_val < alpha_ttest

        # grab g tags with the data-element-ids
        for e in soup.find_all(attrs={"data-element-id": metab_id}):
            circle = e.find('circle')
            if circle is None:
                # element carries no <circle> to style
                continue
            if is_significant:
                # significant: filled node, thin outline
                circle['fill'] = node_colour
                circle["stroke-width"] = "5"
            else:
                # not significant: coloured, thicker outline only
                circle['stroke'] = node_colour
                circle["stroke-width"] = "10"

    # set subpathway fill to a faint gray (once per pathway; it does not
    # depend on any metabolite)
    for e in soup.find_all(attrs={"data-element-type": "sub_pathway"}):
        rect = e.find("rect")
        if rect is not None:
            rect["fill"] = "#DFDFDF"

    # write out
    with open(os.path.join(SVG_OUT_DIR, pw_id + ".svg"), "w", encoding="utf-8") as f:
        # Hack to fix some kind of fudged logic which introduces a syntax error in reading the svg
        f.write(soup.prettify().replace('xmlns:="', 'xmlns="'))

Num metabs found in PW088374 = 19
Num metabs found in PW088321 = 20
Num metabs found in PW088382 = 60
Num metabs found in PW088361 = 43
Num metabs found in PW088359 = 59
Num metabs found in PW088293 = 33
Num metabs found in PW088339 = 36
Num metabs found in PW088308 = 20
Num metabs found in PW088315 = 32
Num metabs found in PW088347 = 67
Num metabs found in PW088295 = 34
Num metabs found in PW088368 = 25
Num metabs found in PW088332 = 28
Num metabs found in PW088364 = 30
Num metabs found in PW088345 = 47
Num metabs found in PW088300 = 32
Num metabs found in PW088354 = 34
Num metabs found in PW088317 = 8
Num metabs found in PW088357 = 56
