In [None]:
# import internal .py modules
import file_path_management as fpath
import public_library as plib

In [None]:
# import packages
import csv
import pandas as pd
import PyPDF2
import requests
import time
import os
import random
from requests.auth import HTTPProxyAuth
from bs4 import BeautifulSoup
import numpy as np
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import TimeoutException, WebDriverException, NoSuchElementException
import os
import re
from lxml import etree

In [None]:
# www.ncbi.nlm.nih.gov/pmc/
def www_ncbi_nlm_nih_gov(url):
    """Collect bibliographic fields from a www.ncbi.nlm.nih.gov/pmc/ article page.

    The page is obtained through ``plib.request_webpage`` (presumably a
    BeautifulSoup document -- only ``find_all``, ``get_text`` and item access
    are used on it). Every field is looked up independently; a field whose
    lookup fails for any reason is reported as ``np.nan`` instead of raising.

    Parameters
    ----------
    url : str
        Address of the article page.

    Returns
    -------
    dict
        Keys ``doi``, ``pmid``, ``pmcid``, ``title``, ``abstract``,
        ``keywords``, ``introduction`` and ``pdf_link``.
    """
    soup = plib.request_webpage(url)

    def linked_text(span_class):
        # Text of the first <a> inside the first <span class=span_class>.
        return soup.find_all("span", {"class": span_class})[0].find_all("a")[0].get_text().strip()

    def introduction():
        section = soup.find_all("div", {"id": "S1"})[0]
        # The tag name has to be a real element name to match anything;
        # paragraphs are assumed to carry the section text -- TODO confirm
        # against the page markup.
        return "".join(p.get_text().strip() for p in section.find_all("p"))

    info = {
        "doi": _ncbi_or_nan(lambda: linked_text("doi")),
        "pmid": _ncbi_or_nan(lambda: linked_text("fm-citation-pmid")),
        "pmcid": _ncbi_or_nan(lambda: linked_text("fm-citation-pmcid")),
        "title": _ncbi_or_nan(
            lambda: soup.find_all("h1", {"class": "content-title"})[0].get_text().strip()
        ),
        # A tag is not a sequence of tags: take the first match and read its text.
        "abstract": _ncbi_or_nan(
            lambda: soup.find_all("p", {"class": "p p-first-last"})[0].get_text().strip()
        ),
        "keywords": _ncbi_or_nan(
            lambda: soup.find_all("span", {"class": "kwd-text"})[0].get_text().strip()
        ),
        "introduction": _ncbi_or_nan(introduction),
        # The href is returned exactly as written in the page (it may be relative).
        "pdf_link": _ncbi_or_nan(
            lambda: soup.find_all("li", {"class": "pdf-link other_item"})[0].find_all("a")[0]["href"]
        ),
    }

    return info


def _ncbi_or_nan(extract):
    """Return ``extract()``, or ``np.nan`` when the lookup fails (best effort)."""
    try:
        return extract()
    except Exception:
        # Missing element, missing attribute, or an unusable page object.
        return np.nan
# --------------------start of test code--------------------
# Smoke test: run the PMC extractor defined above on one article URL and print
# the resulting dictionary. `url` and `info` are left in the notebook namespace.
url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10133512/"
info = www_ncbi_nlm_nih_gov(url)
print(info)
# ---------------------end of test code---------------------

In [None]:
# www.frontiersin.org
def www_frontiersin_org(url):
    """Collect bibliographic fields from a www.frontiersin.org article page.

    The page is rendered in a headless Chrome session. Loading is retried
    every five minutes until it succeeds, so this call can block for a long
    time. Every field is looked up independently through an absolute XPath; a
    field whose lookup fails is reported as ``np.nan``. The browser is always
    closed before returning.

    Parameters
    ----------
    url : str
        Address of the article page.

    Returns
    -------
    dict
        Keys ``doi``, ``pmid``, ``pmcid``, ``title``, ``abstract``,
        ``keywords``, ``introduction`` and ``pdf_link``.
    """
    os.environ['WDM_LOG'] = '0'
    options = Options()
    options.add_argument('--headless')

    # load the webpage
    driver = _frontiersin_open(url, options)

    # extract information from loaded webpage
    doi_anchor = "/html/body/main/article/section[3]/div/div[1]/div[1]/div[1]/div[1]/div/div[2]/span[2]/a"
    try:
        info = {
            "doi": _frontiersin_lookup(driver, doi_anchor),
            "pmid": _frontiersin_lookup(
                driver, "/html/body/main/article/section[3]/div/div[1]/div[1]/div[1]/div[2]/div[2]/a"
            ),
            "pmcid": _frontiersin_lookup(
                driver, "/html/body/main/article/section[3]/div/div[1]/div[1]/div[1]/div[2]/div[1]/span[2]"
            ),
            "title": _frontiersin_lookup(
                driver, "/html/body/main/article/section[3]/div/div[1]/div[1]/h1"
            ),
            "abstract": _frontiersin_lookup(
                driver, "/html/body/main/article/section[3]/div/div[1]/div[4]/div[2]/p"
            ),
            "keywords": _frontiersin_lookup(
                driver, "/html/body/main/article/section[3]/div/div[1]/div[4]/div[3]/span"
            ),
            "introduction": _frontiersin_lookup(driver, '//*[@id="S1"]'),
            # NOTE(review): this reads the href of the same anchor used for the
            # DOI, so it presumably yields the DOI link and not a PDF address.
            # Kept as-is because the right XPath cannot be determined from
            # here -- confirm against the live page.
            "pdf_link": _frontiersin_lookup(driver, doi_anchor, attribute='href'),
        }
    finally:
        # Release the browser even if something unexpected is raised above.
        driver.quit()

    return info


def _frontiersin_open(url, options):
    """Start Chrome and load ``url``, retrying every five minutes until it works."""
    while True:
        driver = None
        try:
            # Selenium 4 takes the driver binary through a Service object.
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
            driver.get(url)
            return driver
        except Exception:
            # ``Exception`` (not a bare except) so Ctrl-C can still stop the loop.
            if driver is not None:
                # Do not leave a half-initialised browser behind on each retry.
                try:
                    driver.quit()
                except Exception:
                    pass
            print("Extracting content from:" + url + " failed, retrying... This might take longer than 5 minutes...")
            time.sleep(5*60)


def _frontiersin_lookup(driver, xpath, attribute=None):
    """Return the text (or the given attribute) of the element at ``xpath``, else ``np.nan``."""
    try:
        element = driver.find_element(By.XPATH, xpath)
        if attribute is None:
            return element.text
        return element.get_attribute(attribute)
    except Exception:
        return np.nan
# --------------------start of test code--------------------
# Smoke test: run the Frontiers extractor defined above on a Frontiers article
# page and print the resulting dictionary. Any Frontiers article URL can be
# substituted here.
url = "https://www.frontiersin.org/articles/10.3389/fpsyg.2013.00863/full"
info = www_frontiersin_org(url)
print(info)
# ---------------------end of test code---------------------

In [None]:
def extract_info_from_webpage(url):
    """Dispatch ``url`` to the extractor function written for its website.

    The url is first resolved through ``plib.gei_final_redirected_url``; its
    host part is then compared against every entry of ``plib.websites``. For
    the first entry contained in the host, the module-level function whose
    name is that entry with "." replaced by "_" is called with the resolved
    url.

    Parameters
    ----------
    url : str
        Address of the article page. Must not be NaN.

    Returns
    -------
    dict or None
        Whatever the matching extractor returns, or ``None`` (after printing
        a message) when no listed website matches or no extractor function is
        defined for the matching one.

    Raises
    ------
    Exception
        If ``url`` is NaN.
    """
    # NaN is the only value that is not equal to itself.
    if url != url:
        raise Exception("The given url is np.nan")

    # NOTE(review): the helper name looks like a typo of "get_final_redirected_url";
    # left as spelled because public_library is not visible here -- confirm.
    url = plib.gei_final_redirected_url(url)
    # Host part of the address; a missing "scheme://" prefix is tolerated.
    source = url.split("://", 1)[-1].split("/", 1)[0]

    for website in plib.websites:
        if website not in source:
            # Keep scanning: only give up once every known website was tried.
            continue
        # Get the function name by replacing "." with "_" and look it up in globals()
        func_name = website.replace(".", "_")
        func = globals().get(func_name)
        if callable(func):
            return func(url)
        print("No extractor function named " + func_name + " is defined for the url:" + url)
        return None

    print("The url:" + url + " is not included in our websites database yet!")
    return None
# --------------------start of test code--------------------
# websites = ["PMC", "frontiersin", "europepmc", "biorxiv", "jneurosci", "orca.cardiff", "science", "thejns", "cambridge",
#                 "wiley", "ahajournals", "mdpi", "sciencedirect", "pnas", "nature", "cell", "eneuro", "physiology", "springer",
#                 "ieee", "plos", "jstage.jst", "biomedcentral", "jamanetwork", "psycnet.apa", "jnnp.bmj", "degruyter",
#                 "karger", "pure.mpg", "elifesciences", "neurology", "pubs.asahq", "sagepub", "ekja", "liebertpub", "lww",
#                 "tandfonline", "aspetjournals", "oup", "royalsocietypublishing", "psychiatryonline", "jpn", "open.bu.edu",
#                 "agro.icm", "lib.wfu", "mirasmart", "jstor"]
# if len(websites) == len(set(websites)):
#     print("There are no duplicates in the list.")
# else:
#     print("There are duplicates in the list.")
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# url = "https://www.tandfonline.com/doi/abs/10.1080/01616412.1985.11739692"
# info = extract_info_from_webpage(url)
# print(info)
# ---------------------end of test code---------------------

In [None]:
# get doi from url
def url2doi(url):
    """Return the DOI found on the article page at ``url``.

    The url is converted to ``str`` and stripped of surrounding whitespace,
    then handed to ``extract_info_from_webpage``.

    Returns the ``"doi"`` entry of the extracted dictionary, or ``np.nan``
    when no information could be extracted. Raises ``Exception`` when ``url``
    is NaN.
    """
    # NaN is the only value that compares unequal to itself.
    if url != url:
        raise Exception("The url given is np.nan")

    page_info = extract_info_from_webpage(str(url).strip())  # dictionary or None
    return np.nan if page_info is None else page_info["doi"]
# --------------------start of test code--------------------
# url = "https://www.tandfonline.com/doi/abs/10.1080/01616412.1985.11739692"
# doi = url2doi(url)
# print(doi)
# ---------------------end of test code---------------------