In [None]:
#module 2.0
#automated PubMed search 

import urllib.request
import pandas as pd
from bs4 import BeautifulSoup
import re
import time


def drug_name_clean(drug):
    """Convert a raw drug name into a "+"-joined search term.

    Processing order:
      1. Every "(abbr) " marker is deleted.
      2. Text from a " (" that is directly followed by two or more capital
         letters up to the last ")" on the same line is deleted
         (e.g. "Vancomycin (VCM)" -> "Vancomycin").
      3. Each remaining space becomes "+".

    Returns the resulting string.
    """
    # Membership test first, exactly as a guard: names without the marker pass through untouched.
    unmarked = drug.replace("(abbr) ", "") if "(abbr) " in drug else drug
    without_code = re.sub(r" \([A-Z]{2,}.*\)", "", unmarked)
    # Splitting on a single space and re-joining with "+" swaps every space 1:1.
    return str("+".join(without_code.split(" ")))

def result_amount(drug_name):
    """Return how many PubMed records match the drug in the TDM query.

    The query is ``therapeutic drug monitoring[MeSH Terms] AND
    <drug_name>[MeSH Terms]`` restricted to publication dates
    1900/1/1 - 2022/12/31.

    Args:
        drug_name: Search term that is inserted verbatim into the URL
            (spaces are expected to be encoded already, e.g. as "+").

    Returns:
        int:
            * 0 if the term is blank (no request is sent),
            * 0 if the page flags the query as altered or blocked, or the
              result counter carries no value,
            * 1 if the page has no result counter at all (presumably PubMed
              showing the single matching record directly - confirm),
            * otherwise the number shown in the result counter.

    Raises:
        ValueError: if the counter text is not a number. This is a parsing
            problem, not a transient one, so it is no longer retried forever.

    Network failures (including HTTP error statuses such as 403) are retried
    every 60 seconds without limit.
    """
    # Local import keeps this block self-contained; covers truncated or
    # malformed HTTP responses, which are not OSError subclasses.
    from http.client import HTTPException

    # A blank term cannot be searched; answer without contacting PubMed.
    if not re.search(r"\S", drug_name):
        return 0

    html = "https://pubmed.ncbi.nlm.nih.gov/?term=%28%28therapeutic+drug+monitoring%5BMeSH+Terms%5D%29+AND+%28{}%5BMeSH+Terms%5D%29%29+AND+%28%28%221900%2F1%2F1%22%5BDate+-+Publication%5D+%3A+%222022%2F12%2F31%22%5BDate+-+Publication%5D%29%29&sort=date".format(drug_name)

    while True:
        try:
            # The context manager closes the connection once parsing is done.
            with urllib.request.urlopen(html) as response:
                soup = BeautifulSoup(response, "html.parser")
            break
        except (OSError, HTTPException):
            # URLError/HTTPError (e.g. HTTP 403), timeouts and dropped
            # connections: wait, then try again. Other exceptions propagate
            # so that programming errors and Ctrl-C are not swallowed.
            time.sleep(60)

    counter = soup.find("div", class_="results-amount")  # default result counter
    if counter is None:
        return 1

    altered = soup.find("em", class_="altered-search-explanation query-error-message")  # query altered
    blocked = soup.find("div", class_="original-query-block")  # original query blocked
    value = counter.find("span", class_="value")
    if value is None or altered is not None or blocked is not None:
        return 0

    # Remove the thousands separator (e.g. "1,000" -> 1000).
    return int(value.get_text().replace(",", ""))

def pmid_set_generate(drug_name):
    """Collect the PMIDs of all PubMed records matching the drug in the TDM query.

    The number of result pages is derived from ``result_amount(drug_name)``
    with 200 records per page. Every page URL is printed before it is fetched.

    Args:
        drug_name: Search term that is inserted verbatim into the URL.

    Returns:
        set[str]: PMID strings as found in the page text. Empty when
        ``result_amount`` reports 0, so that records of a query PubMed
        altered or blocked are not attributed to this drug.

    A page that cannot be downloaded is reported on stdout and skipped; the
    remaining pages are still processed.
    """
    # Local import keeps this block self-contained; covers truncated or
    # malformed HTTP responses, which are not OSError subclasses.
    from http.client import HTTPException

    page_size = 200
    amount = result_amount(drug_name)
    if amount == 0:
        return set()

    # Ceiling division: exactly 200 hits fit on one page, 201 need two.
    page_number = -(-amount // page_size)

    pmid_set = set()
    for page in range(1, page_number + 1):
        html = "https://pubmed.ncbi.nlm.nih.gov/?term=((therapeutic%20drug%20monitoring%5BMeSH%20Terms%5D)%20AND%20({}%5BMeSH%20Terms%5D))%20AND%20((%221900%2F1%2F1%22%5BDate%20-%20Publication%5D%20%3A%20%222022%2F12%2F31%22%5BDate%20-%20Publication%5D))&show_snippets=off&sort=date&size=200&page={}".format(drug_name, page)
        print(html)
        try:
            with urllib.request.urlopen(html) as response:
                soup = BeautifulSoup(response, "html.parser")
        except (OSError, HTTPException) as error:
            # Best effort, as before, but no longer silent.
            print("page skipped; " + str(error))
            continue

        docsum_pmid = soup.find_all("span", class_="docsum-pmid")
        if docsum_pmid:  # result list with one PMID tag per record
            pmid_set.update(pmid_tag.get_text() for pmid_tag in docsum_pmid)
            continue

        # No result list: look for the identifier of a single-record page.
        identifier = soup.find("span", class_="identifier pubmed")
        link = identifier.find("a") if identifier is not None else None
        if link is not None:
            pmid_set.add(link.get_text().replace("\n", ""))  # link text includes "\n"
    return pmid_set


# Row store filled by the publication-count loop: running number -> list of row values.
dic = dict()

# Input table; the first CSV column becomes the index.
df = pd.read_csv("../Files/Figure2/Table1.csv", index_col=0)
display(df)

# Select the columns whose header contains "efficacy".
column_list = list(df.columns.values)  # all column headers
efficacy_list = list(filter(lambda header: "efficacy" in header, column_list))
df_efficacy = df[efficacy_list]

#obtain the publications amount
# For every table row, PubMed is queried once per drug name found in the
# "Name" cell; the largest hit count among those names is stored with the row.
i = 1  # running key of dic
for entry, name, efficacy in zip(df.index, df["Name"], df_efficacy.itertuples()):
    print("===========")
    print(entry)
    # "Name" holds newline-separated drug names. Blank pieces (such as the
    # empty string left by a trailing newline) are dropped explicitly, so a
    # cell without a trailing newline no longer loses its last drug.
    name_list = [piece for piece in name.split("\n") if piece.strip()]
    amount_list = []  # hit count per drug name of this row
    for drug in name_list:
        print("entry; " + str(drug))
        drug_cleaned = drug_name_clean(drug)
        print("search term; " + drug_cleaned)
        amount = result_amount(drug_cleaned)
        print("article amounts; " + str(amount))
        amount_list.append(amount)
    # efficacy[0] is the index label added by itertuples(); only the values follow.
    # default=0 covers a row without any usable name instead of raising ValueError.
    values = [entry, name] + list(efficacy[1:]) + [max(amount_list, default=0)]  # ["Entry","Name",efficacy...,max hits]
    dic[i] = values
    i = i + 1

# Turn the collected rows into a table indexed by "Entry".
column = ["Entry", "Name", *efficacy_list, "Max_amounts"]
print("column:{}".format(column))
df_dic = pd.DataFrame(list(dic.values()), columns=column).set_index("Entry")
display(df_dic)

#obtain PMIDs
# Only rows with at least one hit are queried again, this time for the PMIDs.
df_query = df_dic.query("Max_amounts>0")
dic_amount = {}  # entry -> number of distinct PMIDs over all of its drug names
dic_pmid = {}  # entry -> set of those PMIDs

for entry, name in zip(df_query.index, df_query["Name"]):
    print("===========")
    print(entry)
    # Same splitting rule as for the counts: newline-separated names, blank
    # pieces dropped, so the last name survives without a trailing newline
    # and no search is sent for an empty term.
    name_list = [piece for piece in name.split("\n") if piece.strip()]
    pmid_set_entry = set()
    for drug in name_list:
        drug_cleaned = drug_name_clean(drug)
        print("search term; " + drug_cleaned)
        pmid_set = pmid_set_generate(drug_cleaned)
        print("pmid; " + str(pmid_set))
        pmid_set_entry.update(pmid_set)  # union removes PMIDs shared between names
    dic_amount[entry] = len(pmid_set_entry)
    dic_pmid[entry] = pmid_set_entry

# Assemble the output table from three parts that are joined column-wise:
# the row data, the per-entry PMID count and the PMIDs themselves.
# "Max_amounts" was only needed to select the rows to query; it is removed in place.
df_dic.drop(columns="Max_amounts", inplace=True)
# One "Amounts" value per queried entry; indexed by the same keys as df_pmid.
df_amount = pd.DataFrame(dic_amount.values(),index=dic_pmid.keys(),columns=["Amounts"])
# NOTE(review): every value is a set of PMIDs; presumably pandas spreads each
# set over positional columns in set-iteration order - confirm that the
# column position of a PMID is irrelevant downstream.
df_pmid = pd.DataFrame(dic_pmid.values(),index=dic_pmid.keys())
df_complete = pd.concat([df_dic,df_amount,df_pmid],axis=1)
df_complete.fillna({"Amounts":0},inplace=True)#replace NaN in "Amounts" with 0 (presumably the rows that were not queried for PMIDs)

# Show the result and write it next to the input table.
display(df_complete)
df_complete.to_csv("../Files/Figure2/Table2.csv")






