In [29]:
from Bio.KEGG import REST
import json
import pandas as pd
import os
import requests
import tqdm

In [None]:


# Download the KEGG Compound list and save it to a local text file.
compound_list_handle = REST.kegg_list("compound")
compound_list = compound_list_handle.read()
with open("kegg_compound_list.txt", mode="w") as file:
    file.write(compound_list)

# # Fetch the entry of one specific compound
# compound_entry = REST.kegg_get("cpd:C00001").read()
# print(compound_entry)


In [None]:
# Load the drug identifier mapping table from the tab-separated file.
TSV_FILE = "drug-mappings.tsv"
drug_mapping = pd.read_csv(TSV_FILE, delimiter="\t")


Unnamed: 0,drugbankId,name,ttd_id,pubchem_cid,cas_num,chembl_id,zinc_id,chebi_id,kegg_cid,kegg_id,bindingDB_id,UMLS_cuis,stitch_id
0,DB13088,AZD-0424,D0QG8F,9893171.0,692054-06-1,CHEMBL3545177,,,,,,C4519307,
1,DB13089,Enoxolone,D06EWG,10114.0,471-53-4,CHEMBL230006,ZINC000019203131,30853.0,C02283,,50233538.0,C0017986,
2,DB13082,Nefiracetam,D0KD5P,71157.0,77191-36-7,CHEMBL260829,ZINC000000003788,135004.0,,,,C0165264,
3,DB13083,Talarozole,D0AN7B,9799888.0,201410-53-9,CHEMBL459505,,102167.0,,D09385,50253810.0,C2606129,
4,DB13080,Roluperidone,D0SQ1W,9799284.0,359625-79-9,,,,,,,C4730997,


In [18]:
# Keep only rows that carry both a KEGG compound ID and a DrugBank ID,
# and only the columns used further down.
drug_mapping_filtered = drug_mapping.dropna(subset=["kegg_cid", "drugbankId"])
drug_mapping_filtered = drug_mapping_filtered[["drugbankId", "name", "kegg_cid", "kegg_id"]]
# Data cleaning: drop rows whose kegg_cid is not a well-formed ID.
# - raw string: "\d" in a normal string literal is an invalid escape sequence
# - fullmatch: the whole value must be "C" followed by exactly five digits;
#   an unanchored search would also accept values that merely contain such
#   a substring (e.g. "C123456").
is_valid_kegg_cid = drug_mapping_filtered["kegg_cid"].str.fullmatch(r"C\d{5}")
drug_mapping_filtered = drug_mapping_filtered[is_valid_kegg_cid]
drug_mapping_filtered.to_csv("drug_with_kegg_cid.csv", index=False)
drug_mapping_filtered.head()

Unnamed: 0,drugbankId,name,kegg_cid,kegg_id
1,DB13089,Enoxolone,C02283,
6,DB13087,Miridesap,C10045,D11464
19,DB03700,D-Threonine,C12317,
23,DB03708,Adenosine 5'-phosphosulfate,C00224,
26,DB03703,Cyclohexanol,C00854,


In [12]:
# Read the drug/ATC table and preview its first rows.
data_all_path = "../data_atc.csv"
data_all = pd.read_csv(filepath_or_buffer=data_all_path)
data_all.head(n=5)

Unnamed: 0,dg_id,dg_name,dg_atc_codes,dg_atc_levels
0,DB00001,Lepirudin,['B01AE02'],"[['B01AE', 'B01A', 'B01', 'B']]"
1,DB00002,Cetuximab,['L01FE01'],"[['L01FE', 'L01F', 'L01', 'L']]"
2,DB00003,Dornase alfa,['R05CB13'],"[['R05CB', 'R05C', 'R05', 'R']]"
3,DB00004,Denileukin diftitox,['L01XX29'],"[['L01XX', 'L01X', 'L01', 'L']]"
4,DB00005,Etanercept,['L04AB01'],"[['L04AB', 'L04A', 'L04', 'L']]"


In [31]:
# Build the list of (DrugBank ID, KEGG compound ID) pairs to look up.
# One lookup table is built up front instead of re-scanning the DataFrame
# for every ID; keep="first" preserves the "take the first matching row"
# behaviour when a DrugBank ID appears more than once in the mapping.
kegg_cid_by_drugbank_id = (
    drug_mapping_filtered
    .drop_duplicates(subset="drugbankId", keep="first")
    .set_index("drugbankId")["kegg_cid"]
    .to_dict()
)
drugbank_id_list = data_all["dg_id"].tolist()
# Order follows data_all; IDs without a KEGG compound mapping are skipped.
# (The loop variable is deliberately not called `id`, which would shadow
# the builtin for the rest of the notebook.)
kegg_cid_list = [
    (drugbank_id, kegg_cid_by_drugbank_id[drugbank_id])
    for drugbank_id in drugbank_id_list
    if drugbank_id in kegg_cid_by_drugbank_id
]
kegg_cid_list, len(kegg_cid_list)

([('DB00005', 'C07897'),
  ('DB00007', 'C07612'),
  ('DB00010', 'C08192'),
  ('DB00017', 'C06865'),
  ('DB00030', 'C00723'),
  ('DB00035', 'C06944'),
  ('DB00036', 'C03378'),
  ('DB00040', 'C01501'),
  ('DB00048', 'C00816'),
  ('DB00068', 'C07901'),
  ('DB00083', 'C07946'),
  ('DB00091', 'C05086'),
  ('DB00104', 'C07306'),
  ('DB00107', 'C00746'),
  ('DB00114', 'C00018'),
  ('DB00115', 'C02823'),
  ('DB00118', 'C00019'),
  ('DB00121', 'C00120'),
  ('DB00123', 'C00047'),
  ('DB00125', 'C00062'),
  ('DB00126', 'C00072'),
  ('DB00129', 'C00077'),
  ('DB00130', 'C00064'),
  ('DB00134', 'C00073'),
  ('DB00136', 'C01673'),
  ('DB00140', 'C00255'),
  ('DB00142', 'C00025'),
  ('DB00143', 'C02471'),
  ('DB00145', 'C00037'),
  ('DB00146', 'C01561'),
  ('DB00150', 'C00078'),
  ('DB00152', 'C00378'),
  ('DB00153', 'C05441'),
  ('DB00158', 'C00504'),
  ('DB00162', 'C17276'),
  ('DB00165', 'C00314'),
  ('DB00166', 'C16241'),
  ('DB00169', 'C05443'),
  ('DB00170', 'C05377'),
  ('DB00175', 'C01844'),


In [32]:
# Download KEGG compound mol files: output location and endpoint template.
output_dir = "mol"
os.makedirs(output_dir, exist_ok=True)

# Use the HTTPS endpoint directly: requests sent to the plain-HTTP address
# end up at https://rest.kegg.jp anyway, so every call paid for a redirect.
# The placeholder is filled with a KEGG compound ID.
base_url = "https://rest.kegg.jp/get/{}/mol"

def down_mol(cid, output_path):
    """Fetch the mol file for one KEGG compound and write it to disk.

    Args:
        cid: Compound identifier substituted into ``base_url``.
        output_path: Path of the file the response text is written to.

    Returns:
        True if the request succeeded and the file was written; False if
        any exception was raised along the way (the exception is printed).
    """
    mol_url = base_url.format(cid)
    succeeded = True
    try:
        reply = requests.get(mol_url, timeout=10)
        # Turn HTTP error statuses (e.g. 404) into exceptions.
        reply.raise_for_status()
        with open(output_path, "w") as mol_file:
            mol_file.write(reply.text)
    except Exception as err:
        # Best effort: report the problem and let the caller record the failure.
        print(err)
        succeeded = False
    return succeeded
    
# Download one mol file per (DrugBank ID, KEGG compound ID) pair; files are
# named after the DrugBank ID. Compound IDs whose download failed are
# collected in fail_list.
fail_list = []
# The context manager closes the progress bar when the loop finishes or is
# interrupted; a manually driven bar that is never closed stays dangling.
with tqdm.tqdm(total=len(kegg_cid_list)) as t:
    for dgid, cid in kegg_cid_list:
        output_path = os.path.join(output_dir, f"{dgid}.mol")
        if not down_mol(cid, output_path):
            fail_list.append(cid)
        t.update(1)
fail_list

  5%|▍         | 39/831 [03:14<1:05:53,  4.99s/it]


404 Client Error: Not Found for url: https://rest.kegg.jp/get/C07897/mol




404 Client Error: Not Found for url: https://rest.kegg.jp/get/C08192/mol




404 Client Error: Not Found for url: https://rest.kegg.jp/get/C00723/mol




404 Client Error: Not Found for url: https://rest.kegg.jp/get/C03378/mol




404 Client Error: Not Found for url: https://rest.kegg.jp/get/C01501/mol




404 Client Error: Not Found for url: https://rest.kegg.jp/get/C00816/mol




404 Client Error: Not Found for url: https://rest.kegg.jp/get/C07901/mol




404 Client Error: Not Found for url: https://rest.kegg.jp/get/C07946/mol




404 Client Error: Not Found for url: https://rest.kegg.jp/get/C00505/mol




404 Client Error: Not Found for url: https://rest.kegg.jp/get/C06926/mol




404 Client Error: Not Found for url: https://rest.kegg.jp/get/C02017/mol




404 Client Error: Not Found for url: https://rest.kegg.jp/get/C13594/mol




404 Client Error: Not Found for url: https://rest.kegg.jp/get/C16051/mol




404 Client Error: Not Found for url: https://rest.kegg.jp/get/C16049/mol




['C07897',
 'C08192',
 'C00723',
 'C03378',
 'C01501',
 'C00816',
 'C07901',
 'C07946',
 'C00505',
 'C06926',
 'C02017',
 'C13594',
 'C16051',
 'C16049']

In [39]:
# Update data_all: keep only drugs whose mol file was downloaded successfully.
failed_cids = set(fail_list)  # set gives constant-time membership checks
kegg_cid_list = [(dgid, cid) for dgid, cid in kegg_cid_list if cid not in failed_cids]
dgid_list = [dgid for dgid, _ in kegg_cid_list]
kegg_cid_dict = dict(kegg_cid_list)
# Add the kegg_cid column with .assign on the filtered frame: this returns a
# new DataFrame, so no column is set on a slice of the original
# (which triggers SettingWithCopyWarning).
data_all = (
    data_all[data_all["dg_id"].isin(dgid_list)]
    .assign(kegg_cid=lambda frame: frame["dg_id"].map(kegg_cid_dict))
)
data_all.to_csv("data_with_atc_kegg.csv", index=False)