<a href="https://colab.research.google.com/github/IKKEM-Lin/colab/blob/main/path_search_20230901.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 产物路径搜索，需上传[ttl文件](https://github.com/IKKEM-Lin/colab/blob/main/gen_turtle_20230901.ipynb)

In [20]:
# install dependencies
! pip install rdflib
! pip install requests
! pip install loguru
!pip install pyvis

from pyvis.network import Network

from rdflib import Namespace, Literal, URIRef, Graph
import requests
from loguru import logger

import copy
import json
import uuid
import collections
import hashlib
import os
import re



### 1. 公共函数，用于统一化合物的名称，https://pubchem.deno.dev 会缓存所请求的数据

In [21]:
# common function

def get_IUPAC_name_final(name, mapping_dict):
    """Look up the IUPAC name of a compound through the pubchem.deno.dev service.

    Args:
        name: Compound name to normalise.
        mapping_dict: Not used by this function; kept so existing callers
            keep working.

    Returns:
        The ``data`` field of the service's JSON response when it is
        non-empty; otherwise ``name`` unchanged. ``name`` is also returned
        when the request or the JSON decoding fails, so callers always get a
        usable value instead of ``None``.
    """
    try:
        # Let requests build the query string so characters such as '+', '&'
        # or '#' inside a compound name are percent-encoded correctly.
        r = requests.get(
            "https://pubchem.deno.dev/iupac",
            params={"name": name},
            timeout=30,
        )
        result = json.loads(r.text)
        return result.get("data") or name
    except Exception:
        # Best-effort lookup: any network or parsing problem falls back to
        # the name that was passed in.
        return name

def get_spieces_class_operations(key, mapping_dict, IUPAC_name = ""):
    """Resolve a compound name to an ``(id_str, URIRef)`` pair.

    The IUPAC name (looked up from ``key`` unless supplied) is sent to the
    ChEBI lookup of the pubchem.deno.dev service. If the answer contains a
    ``CHEBI_<digits>`` identifier, that identifier is used with the ``obo:``
    prefix. Otherwise the MD5 hex digest of the full IUPAC name is used with
    the ``spi:`` prefix.

    Args:
        key: Compound name used for the IUPAC lookup when ``IUPAC_name`` is empty.
        mapping_dict: Forwarded to ``get_IUPAC_name_final``; not used here.
        IUPAC_name: Optional already-resolved IUPAC name. Several names may be
            separated by ';', in which case only the first is sent to the
            ChEBI lookup.

    Returns:
        Tuple ``(id_str, uri)`` where ``uri`` is ``URIRef("obo:" + id_str)`` or
        ``URIRef("spi:" + id_str)``.
    """
    def get_md5(name):
        return hashlib.md5(name.encode("UTF-8")).hexdigest()

    def get_CHEBI_ID(name):
        # Best-effort lookup; always returns a value (never None) so the
        # caller can inspect it safely.
        try:
            r = requests.get(
                "https://pubchem.deno.dev/chebi",
                params={"name": name},
                timeout=30,
            )
            result = json.loads(r.text)
            return result.get("data") or name
        except Exception:
            return name

    if not IUPAC_name:
        # Guard against a lookup that yields nothing: fall back to the key.
        IUPAC_name = get_IUPAC_name_final(key, mapping_dict) or key
    name_list = IUPAC_name.split(";")
    name = name_list[0] if name_list else ""
    tag = get_CHEBI_ID(name)
    # Only treat the answer as a ChEBI hit when an actual identifier can be
    # extracted; otherwise an empty "obo:" URI would be produced.
    re_tag = ''.join(re.findall(r"(CHEBI_\d+)", str(tag)))
    if re_tag:
        return re_tag, URIRef("obo:" + re_tag)
    id_str = get_md5(IUPAC_name)
    return id_str, URIRef("spi:"+"{}".format(id_str))


def get_spieces(name):
  """Return the URIRef that identifies the compound called ``name``.

  The identifier comes from ``get_spieces_class_operations``; identifiers
  containing "CHEBI" get the ``obo:`` prefix, all others the ``spi:`` prefix.
  """
  # Resolve the ID for this name; the URI returned alongside it is rebuilt below.
  species_id, _resolved_uri = get_spieces_class_operations(name, {})
  template = "obo:{}" if "CHEBI" in species_id else "spi:{}"
  return URIRef(template.format(species_id))

### 2. 读取ttl文件

In [22]:
# Template for a single-block SELECT query. The braces are doubled so that
# str.format only substitutes the {expression} placeholder.
sparql_prefix = "SELECT * WHERE {{ \n\t{expression}\n}}\n"

# URIs of the substances at which search_one stops expanding a path.
substance_only_product = [
    get_spieces(label)
    for label in ["CO", "H2", "syngas", "H2O", "CO2", "Carbon Dioxide", "Carbon Monoxide"]
]

# Load the uploaded turtle file into an rdflib graph.
rg = Graph()
rg.parse("./10000.ttl", format='turtle')


<Graph identifier=Nf84da9eee617401a9958ba56b3ff66e5 (<class 'rdflib.graph.Graph'>)>

### 3. 定义查询的公共函数

In [23]:
# Sanity check: size of the loaded graph (number of triples) and the resolved
# URIs of the stop-list substances.
print(len(rg), substance_only_product)
def gen_query(pred, obj):
  """Build one SPARQL triple pattern that binds ``?val`` to every subject
  having predicate ``pred`` and object ``obj``."""
  return "?val <{}> <{}> .".format(pred, obj)

def find_reactions_from_product(produce):
  """Return a lazy iterator over the ``?val`` bindings of every subject in
  ``rg`` whose ``react:has_product`` object is ``produce``."""
  pattern = gen_query('react:has_product', produce)
  rows = rg.query(sparql_prefix.format(expression=pattern))
  return map(lambda row: row.val, rows)

def find_reactants_from_reaction(reaction):
  """Return a lazy iterator over the ``?val`` bindings of every subject in
  ``rg`` that is linked to ``reaction`` by ``spi:is_reactant_of``."""
  pattern = gen_query('spi:is_reactant_of', reaction)
  rows = rg.query(sparql_prefix.format(expression=pattern))
  return map(lambda row: row.val, rows)

def get_products_from_reaction(reaction):
  """Return a lazy iterator over the ``?val`` bindings of every subject in
  ``rg`` that is linked to ``reaction`` by ``spi:is_product_of``."""
  pattern = gen_query('spi:is_product_of', reaction)
  rows = rg.query(sparql_prefix.format(expression=pattern))
  return map(lambda row: row.val, rows)

def is_reaction(node):
  """Tell whether the string form of ``node`` contains the substring "react:"."""
  text = str(node)
  return text.find("react:") != -1

def get_substance_name(substance):
  """Return a human-readable label for a node of the reaction graph.

  Reaction nodes (see ``is_reaction``) are returned unchanged. For a
  substance the graph ``rg`` is queried for its IUPAC name and, optionally,
  its name and formula. The label is the formula if present, else the name,
  else the IUPAC name. When the graph has no matching row the original
  ``substance`` value is returned.

  Args:
      substance: A URIRef or string identifying a substance or reaction.

  Returns:
      ``str`` label, or ``substance`` itself as the fallback.
  """
  obo = substance
  if is_reaction(obo):
    return substance
  if not isinstance(substance, URIRef):
    obo = URIRef(obo)
  query = sparql_prefix.format(expression = f"""
    <{obo}> <spi:has_IUPAC_name> ?IUPAC_name.
    OPTIONAL {{<{obo}> <spi:has_name> ?name}}
    OPTIONAL {{<{obo}> <spi:has_formula> ?formula}}
  """) + "LIMIT 5"
  for row in rg.query(query):
    # Both OPTIONAL bindings may be unbound (None); fall back to the IUPAC
    # name instead of producing the literal text "None".
    label = row.formula or row.name or row.IUPAC_name
    if label:
      return str(label)
  return substance

def add_keys_to_dict(keys, dict = None):
  """Set ``dict[key] = {}`` for every key in ``keys`` and return the dict.

  Args:
      keys: Iterable of hashable keys.
      dict: Mapping to update in place. When omitted, a fresh dictionary is
          created for this call (a shared default would leak keys from one
          call into the next).

  Returns:
      The updated mapping: the object passed in, or the newly created one.
  """
  if dict is None:
    dict = {}
  for key in keys:
    dict[key] = {}
  return dict

# str(list(find_reactions_from_product("obo:CHEBI_17790"))[0])

508271 [rdflib.term.URIRef('spi:a6b3f1c744304a3ee2ec20446f690f45'), rdflib.term.URIRef('obo:CHEBI_33608'), rdflib.term.URIRef('spi:a68a857ab0e54f5e63adea6b0bde9b44'), rdflib.term.URIRef('obo:CHEBI_15377'), rdflib.term.URIRef('spi:05f338756c3795e0fe583df923cd6a65'), rdflib.term.URIRef('obo:CHEBI_16526'), rdflib.term.URIRef('obo:CHEBI_17245')]


### 4. 反应聚类

In [24]:
# Maps a reaction's unique key (built by get_reaction_uniq_key) to the list of
# reactions that share that key; filled by the loop further down in this cell.
reaction_group = {}
def get_all_reaction():
  """Return a lazy iterator over the ``?val`` bindings of every subject in
  ``rg`` that has a ``react:id`` property."""
  pattern = "?val <react:id> ?obj"
  rows = rg.query(sparql_prefix.format(expression=pattern))
  return map(lambda row: row.val, rows)

def get_reaction_uniq_key(reaction):
  """Build the grouping key of a reaction.

  The key is the sorted reactant identifiers joined by ',' followed by ';'
  and the sorted product identifiers joined by ','. Reactions with the same
  reactants and products therefore get the same key.
  """
  reactant_ids = sorted(str(item) for item in find_reactants_from_reaction(reaction))
  product_ids = sorted(str(item) for item in get_products_from_reaction(reaction))
  return ";".join([",".join(reactant_ids), ",".join(product_ids)])

all_reactions = list(get_all_reaction())
print(len(all_reactions))
# Bucket every reaction under its reactants/products key so that reactions
# with identical participants end up in the same list.
for reaction in all_reactions:
  key = get_reaction_uniq_key(reaction)
  reaction_group.setdefault(key, []).append(reaction)

print(len(reaction_group))

8264
3255


### 5. 定义路径搜索函数

In [25]:
def search_one(name, max_steps=2):
  """Search backwards from a target substance for reaction paths producing it.

  A path is a list that starts with the target substance and alternates
  substance, reaction, substance, ... Each round extends every unfinished
  path by one element: a substance is extended by the reactions that produce
  it, a reaction by each of its reactants. A path is finished when it has
  ``max_steps * 2`` elements or ends in a member of ``substance_only_product``.
  Rounds repeat until no path is unfinished or the set of paths stops changing.

  Args:
      name: Name of the target substance; resolved with ``get_spieces``.
      max_steps: Maximum number of (reaction, substance) pairs per path.

  Returns:
      List of paths (lists of node identifiers). Paths may end in a reaction
      when that reaction consumes a substance already on the path.
  """
  query_str = get_spieces(name)
  if query_str in substance_only_product:
    return [[query_str]]
  # Reactions already expanded, together with their equivalents from
  # reaction_group; kept as a set for constant-time membership tests.
  duplicated_reactions = set()
  res_path = [[query_str]]
  logger.info(query_str)
  result_hash = None
  condition = lambda path: (len(path) < max_steps*2 and path[-1] not in substance_only_product)
  # Dead-end paths (stored as tuples) that must not be expanded again.
  path_handled_cache = set()
  count = 0
  while any(map(condition, res_path)):
    new_res_paths = []
    wait_process_paths = []
    for path in res_path:
      if condition(path):
        wait_process_paths.append(path)
      else:
        new_res_paths.append(path)
    for path in wait_process_paths:
      path_key = tuple(path)
      if path_key in path_handled_cache:
        new_res_paths.append(path)
        continue
      if is_reaction(path[-1]):
        # Path ends in a reaction: branch into one path per reactant.
        temp_reactants = list(find_reactants_from_reaction(path[-1]))
        substant_in_previous = any(item in path for item in temp_reactants)
        if substant_in_previous:
          # A reactant already occurs on the path (cycle); keep the path as
          # it is, ending with the reaction, and never expand it again.
          new_res_paths.append(path)
          path_handled_cache.add(path_key)
        else:
          new_res_paths.extend([path[:] + ([reactant]) for reactant in temp_reactants])
      else:
        # Path ends in a substance: branch into one path per producing reaction.
        temp_reactions = list(find_reactions_from_product(path[-1]))
        temp_wait_process_reactions = [reaction for reaction in temp_reactions if (reaction not in path and reaction not in duplicated_reactions)]
        if not temp_wait_process_reactions:
          new_res_paths.append(path)
          path_handled_cache.add(path_key)
          continue
        for reaction in temp_wait_process_reactions:
          # Re-check: duplicated_reactions grows while this loop runs.
          if reaction in duplicated_reactions:
            continue
          reaction_key = get_reaction_uniq_key(reaction)
          reaction_match = reaction_group.get(reaction_key, [reaction])
          new_res_paths.append(path[:] + ([reaction]))
          # Mark every reaction with the same reactants/products as used so
          # that equivalent reactions are expanded only once.
          duplicated_reactions.update(reaction_match)
    res_path = new_res_paths[:]
    new_hash = hash(str(res_path))
    count = count + 1
    print(count, new_hash, len(res_path))
    # Stop when a round leaves the set of paths unchanged (fixed point).
    if new_hash == result_hash:
      break
    else:
      result_hash = new_hash
  return res_path

### 6. 测试并打印结果

In [28]:
paths = search_one("Butane", 4)

print("----------------- Result ------------------------", len(paths))
# Fold the flat list of paths into a nested dict: each node of a path is a
# key whose value is the dict of the nodes that follow it.
result = {}
for path in paths:
  cursor = result
  for key in path[:-1]:
    cursor = cursor.setdefault(key, {})
  # The last node becomes a leaf unless another path already created it.
  cursor.setdefault(path[-1], {})

def print_dict(d, indent=0):
    """Print the nested dict ``d`` as an indented outline.

    Every key is printed through ``get_substance_name`` on its own line,
    prefixed with two spaces per ``indent`` unit; nested dicts are printed
    recursively with ``indent`` increased by 2.
    """
    prefix = '  ' * indent
    for key, children in d.items():
        print(prefix + str(get_substance_name(key)))
        if not isinstance(children, dict):
            continue
        print_dict(children, indent + 2)
# print_dict(result)

[32m2023-09-16 00:52:22.164[0m | [1mINFO    [0m | [36m__main__[0m:[36msearch_one[0m:[36m7[0m - [1mobo:CHEBI_37808[0m


1 -8304707064572476474 20
2 -1570976654211278449 34
3 274574934784807317 229
4 7746296649417788691 328
5 -4362271681398093635 962
6 4722737169462819330 1314
7 -4196988408362479118 1601
8 -9049392711047467272 1601
9 -9049392711047467272 1601
----------------- Result ------------------------ 1601


### 7. 可视化构建

In [None]:
# File the interactive network is rendered to.
output_file = "tree.html"
# Rendering switches:
#   hierarchical        - request a hierarchical layout from pyvis
#   substance_same_node - use the raw key as node id, so a key occurring
#                         several times in the tree maps to one shared node
option = {
    "hierarchical": False,
    "substance_same_node": False
}

data = result
net = Network(height="90vh", width="100%", notebook=True, layout= option.get("hierarchical") and {"direction": "LR"} or None )
# Palette indexed by depth for odd-level nodes (see add_node). The last entry
# previously lacked the leading '#', which made it an invalid hex colour.
colors = ["#CC9900", "#999900", "#669900", "#339900", "#009900"]

def add_node(net, node_id, label, parent=None, level = 0):
    """Add one node to ``net`` and connect it to its parent, if any.

    Args:
        net: Network object providing ``add_node`` and ``add_edge``.
        node_id: Identifier of the new node.
        label: Text shown on the node.
        parent: Identifier of the parent node; falsy for a root node.
        level: Depth of the node in the tree. Even levels use a fixed blue;
            odd levels take ``colors[level // 2]``, clamped to the last
            palette entry for trees deeper than the palette.

    Root nodes are drawn with size 40, all other nodes with size 25.
    """
    node_color = "#0099CC"
    if level % 2 == 1:
        # Clamp the index so deep trees reuse the last shade instead of
        # raising IndexError.
        node_color = colors[min(level // 2, len(colors) - 1)]
    net.add_node(node_id, label=label, color=node_color, size=25 if parent else 40)
    if parent:
        net.add_edge(parent, node_id)

def build_tree(net, data, parent=None, level = 0):
    """Recursively add the nested dict ``data`` to ``net`` as a tree.

    Each key becomes a node labelled with ``get_substance_name(key)``. With
    ``option["substance_same_node"]`` set, the key itself is the node id;
    otherwise the id is the parent id + key + the key's 1-based position
    among its siblings. Dict values are processed one level deeper with the
    new node as parent.
    """
    share_nodes = option.get("substance_same_node")
    for position, (key, children) in enumerate(data.items(), start=1):
        if share_nodes and key:
            node_id = key
        else:
            node_id = (parent or "") + key + str(position)
        add_node(net, node_id, get_substance_name(key), parent, level)
        if isinstance(children, dict):
            build_tree(net, children, parent=node_id, level=level + 1)

build_tree(net, data)
net.show(output_file)

# Post-process the generated HTML: strip the <center> wrappers.
with open(output_file) as f:
    content = f.read()

content = re.sub(r'<center>|</center>', '', content)

# The substitution below turns the hierarchical layout from top-down into
# left-to-right; comment it out to restore the default. Reference:
# https://ame.cool/pages/84ec1c/#%E9%85%8D%E7%BD%AE%E9%A1%B9%E8%AF%A6%E6%83%85
if option.get("hierarchical"):
  content = re.sub(r'"hierarchical": {', '"hierarchical": { direction: "LR",', content)

with open(output_file, 'w') as f:
    f.write(content)


# Upload the page and display a link to it.
import IPython
upload_url = "https://reaction-tree.deno.dev"
# Open the file in a context manager so the handle is closed after the upload.
with open(output_file, 'rb') as upload_handle:
    files = {'file': (output_file, upload_handle, 'text/html', {'Expires': '0'})}
    file_url = requests.post(upload_url, files = files).json().get("Key")
# print(file_url)
if file_url:
  html = f"<strong>展示地址：</strong><a target='_blank' href='{upload_url}/{file_url}'>{upload_url}/{file_url}</a>"
else:
  # Without a key there is no link; show a message instead of failing with
  # NameError on the undefined `html`.
  html = "<strong>上传失败：服务未返回展示地址</strong>"
IPython.display.HTML(html)


tree.html
