<h2> Searched literature data preprocessing </h2> 

In [1]:
# import internal .py modules
import file_path_management as fpath
import public_library as plib



In [2]:
# import packages
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
import re
import time
import numpy as np
import numpy as np

<h3> Parameters: </h3>

In [3]:
# Column schema of the file potential_related_literature.csv.
# This list is passed as the `columns` argument of the merge_* functions, which
# forward it unchanged to plib.add_row_to_csv together with each row.
columns = ["DOI", "PMID", "PMCID", "Title", "full_text_url", "full_text_source", "pdf_url", "pdf_source"]
# Example row, listed in the same order as `columns`:
# e.g., ["10.1113/JP282626", "35851953", "PMC10087288", 
#        "Cortico-thalamocortical interactions for learning, memory and decision-making",
#        "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10087288/", "PMC",
#        "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10087288/pdf/TJP-601-25.pdf", "PMC"]

<h3> Predefined functions: </h3>

In [4]:
def merge_pubmed(source_path, output_path, columns, start=0, end=None):
    """Normalise PubMed search results and hand them row by row to plib.add_row_to_csv.

    For every row index in ``range(start, end)`` of the PubMed export the
    record's PubMed page is requested. A DOI or PMCID that is missing in the
    export is scraped from that page. The full-text link is built from the
    PMCID when there is one; otherwise the first entry of the page's
    full-text link list is used. Values that cannot be determined are np.nan.

    Parameters
    ----------
    source_path : str
        Comma-separated PubMed export containing the columns
        "DOI", "PMID", "PMCID" and "Title".
    output_path : str
        Destination file passed to plib.add_row_to_csv.
    columns : list of str
        Column order passed to plib.add_row_to_csv.
    start : int, optional
        First row index to process (default 0). Useful to resume a run.
    end : int or None, optional
        One past the last row index to process. None (default) or a value
        larger than the number of rows means "up to the last row".
    """
    print("Starting merging search results from PubMed...")

    df = pd.read_csv(source_path, sep=',')
    df = df[["DOI", "PMID", "PMCID", "Title"]]

    # never index past the data, and allow callers to omit the range entirely
    end = len(df) if end is None else min(end, len(df))

    def first_id_link(soup, identifier_class):
        # text of the first id-link inside the given identifier span, np.nan if absent
        try:
            span = soup.find_all("span", {"class": identifier_class})[0]
            return span.find_all("a", {"class": "id-link"})[0].get_text().strip()
        except Exception:
            return np.nan

    def first_full_text_link(soup):
        # (href, data-ga-action) of the first full-text link, (np.nan, np.nan) if absent
        try:
            link = soup.find_all("div", {"class": "full-text-links-list"})[0].find_all("a", {"class": "link-item dialog-focus"})[0]
            return link["href"].strip(), link["data-ga-action"].strip()
        except Exception:
            return np.nan, np.nan

    for ind in range(start, end):
        # sleep to avoid being blocked
        time.sleep(random.randint(3,6))

        # request the webpage; the columns PMID and Title don't contain missing values
        pmid = str(df["PMID"][ind]).strip()
        url = "https://pubmed.ncbi.nlm.nih.gov/" + pmid + "/"
        proxies = plib.get_proxies()
        soup = plib.request_webpage(url, proxies)

        # pd.isna is used instead of `is np.nan`: a NaN coming out of a
        # float-typed column is a different object than np.nan, so the identity
        # test would silently treat it as a real value ("nan").
        raw_pmcid = df["PMCID"][ind]
        if pd.isna(raw_pmcid):
            pmcid = first_id_link(soup, "identifier pmc")
        else:
            pmcid = str(raw_pmcid).strip()

        raw_doi = df["DOI"][ind]
        if pd.isna(raw_doi):
            doi = first_id_link(soup, "identifier doi")
        else:
            doi = str(raw_doi).strip()

        # get full_text_url, full_text_source
        if not pd.isna(pmcid):
            full_text_url = "https://www.ncbi.nlm.nih.gov/pmc/articles/" + pmcid + "/"
            full_text_source = "PMC"
        else:
            # PMC does not include this paper
            full_text_url, full_text_source = first_full_text_link(soup)

        # pdf_url and pdf_source are not resolved at this stage
        pdf_url = np.nan
        pdf_source = np.nan

        row = {
            "DOI": [doi],
            "PMID": [pmid],
            "PMCID": [pmcid],
            "Title": [str(df["Title"][ind]).strip()],
            "full_text_url": [full_text_url],
            "full_text_source": [full_text_source],
            "pdf_url": [pdf_url],
            "pdf_source": [pdf_source]
        }

        if not plib.add_row_to_csv(output_path, row, columns):
            print("Error detected when adding a row to csv!")

        print(ind)
# --------------------start of test code--------------------
# source_path = fpath.poten_litera_pubmed
# output_path = fpath.poten_litera_pubmed_processed
# plib.clear_file(output_path)

# df = pd.read_csv(source_path, sep=',')
# print(df.shape)
# df = df[["DOI", "PMID", "PMCID", "Title"]]
# print(df.head(3))

# print(df["DOI"].isnull().values.any())
# print(df["PMID"].isnull().values.any())
# print(df["PMCID"].isnull().values.any())
# print(df["Title"].isnull().values.any())
# # True, False, True, False
# # PMID, Title don't contain np.nan
# # DOI, PMCID contain np.nan
# # we need to fill in what are missing
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# merge_pubmed(source_path, output_path, columns)
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# df = pd.read_csv(output_path, header=None, sep=',')
# print(df.head(3))
# ---------------------end of test code---------------------

In [5]:
def merge_webofscience(source_path, output_path, columns):
    """Normalise Web of Science search results and hand them row by row to plib.add_row_to_csv.

    For each record the DOI and PMID are taken from the export; when exactly
    one of them is missing it is looked up from the other through
    plib.doi2pmid / plib.pmid2doi. With a PMID available, the PMCID is
    obtained from plib.id_converter_in_pubmed and the full-text link is either
    the PMC article URL or the first full-text link on the PubMed page.
    Without a PMID the full-text link falls back to the DOI resolver URL.
    Values that cannot be determined are np.nan.

    Exceptions raised by the plib helpers are not caught here and abort the run.

    Parameters
    ----------
    source_path : str
        Semicolon-separated Web of Science export containing the columns
        "DOI", "Pubmed Id" and "Article Title".
    output_path : str
        Destination file passed to plib.add_row_to_csv.
    columns : list of str
        Column order passed to plib.add_row_to_csv.
    """
    print("Starting merging search results from Web of Science...")

    df = pd.read_csv(source_path, sep=";")
    df = df[["DOI", "Pubmed Id", "Article Title"]]
    df = df.rename(columns={"Pubmed Id": "PMID", "Article Title": "Title"}, errors="raise")

    def first_full_text_link(soup):
        # (href, data-ga-action) of the first full-text link, (np.nan, np.nan) if absent
        try:
            link = soup.find_all("div", {"class": "full-text-links-list"})[0].find_all("a", {"class": "link-item dialog-focus"})[0]
            return link["href"].strip(), link["data-ga-action"].strip()
        except Exception:
            return np.nan, np.nan

    for ind in df.index:
        # sleep to avoid being blocked
        time.sleep(random.randint(5,10))

        # The column Article Title doesn't contain missing values; DOI and PMID may.
        # "Pubmed Id" is parsed as a float column, so its missing entries are
        # np.float64 NaN objects: `is np.nan` never matches them and
        # `== np.nan` is always False. pd.isna handles every NaN flavour.
        raw_pmid = df["PMID"][ind]
        raw_doi = df["DOI"][ind]

        # get pmid, doi
        doi = np.nan if pd.isna(raw_doi) else str(raw_doi).strip()
        if pd.isna(raw_pmid):
            pmid = np.nan if pd.isna(doi) else plib.doi2pmid(doi)
        else:
            # int() removes the ".0" of the float representation
            pmid = str(int(raw_pmid)).strip()
            if pd.isna(doi):
                doi = plib.pmid2doi(pmid)

        # get pmcid, full_text_url, full_text_source
        if pd.isna(pmid):
            pmcid = np.nan
            if pd.isna(doi):
                full_text_url = np.nan
                full_text_source = np.nan
            else:
                full_text_url = "https://doi.org/" + str(doi).strip()
                full_text_source = "DOI"
        else:
            # request the webpage
            url = "https://pubmed.ncbi.nlm.nih.gov/" + str(pmid).strip() + "/"
            soup = plib.request_webpage(url)

            pmcid = plib.id_converter_in_pubmed(pmid, "pmcid")

            if not pd.isna(pmcid):
                full_text_url = "https://www.ncbi.nlm.nih.gov/pmc/articles/" + pmcid + "/"
                full_text_source = "PMC"
            else:
                full_text_url, full_text_source = first_full_text_link(soup)

        # pdf_url and pdf_source are not resolved at this stage
        pdf_url = np.nan
        pdf_source = np.nan

        row = {
            "DOI": [doi],
            "PMID": [pmid],
            "PMCID": [pmcid],
            "Title": [str(df["Title"][ind]).strip()],
            "full_text_url": [full_text_url],
            "full_text_source": [full_text_source],
            "pdf_url": [pdf_url],
            "pdf_source": [pdf_source]
        }

        if not plib.add_row_to_csv(output_path, row, columns):
            print("Error detected when adding a row to csv!")

        print(ind)
# --------------------start of test code--------------------
# Select the first Web of Science export as input and the processed-results
# file as output. plib.clear_file is an external helper; presumably it empties
# the output file before the run -- confirm in public_library.
source_path = fpath.poten_litera_wos_1
output_path = fpath.poten_litera_wos_processed
plib.clear_file(output_path)
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# Preview the three columns that merge_webofscience consumes.
# Note: this rebinds the notebook-level name `df`.
df = pd.read_csv(source_path, sep=';')
df = df[["DOI", "Pubmed Id", "Article Title"]]
print(df.head(3))
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# Report, per column, whether any value is missing.
print(df["DOI"].isnull().values.any())
print(df["Pubmed Id"].isnull().values.any())
print(df["Article Title"].isnull().values.any())
# Recorded result: True, True, False
# Article Title doesn't contain missing values
# DOI and Pubmed Id contain missing values
# we need to fill in what is missing
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# Runs the full merge over the export (one sleep plus web requests per record).
# NOTE: the recorded output of this cell ends after index 9 with
# "Exception: Results are different, check your functions." printed after the
# lines "IDC Converter: nan" / "Metapub: 10.1007/BF00231845".
merge_webofscience(source_path, output_path, columns)
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# Optional check of the written rows (disabled):
# df = pd.read_csv(output_path, header=None, sep=',')
# print(df.head(3))
# ---------------------end of test code---------------------

                            DOI  Pubmed Id  \
0  10.1016/0304-3940(96)12319-3  8929980.0   
1  10.1016/0304-3940(93)90180-S  7689715.0   
2         10.1002/cne.903440403  7523458.0   

                                       Article Title  
0  Crossed thalamo-cortical and cortico-thalamic ...  
1  THE RETICULAR THALAMIC NUCLEUS PROJECTS TO THE...  
2  CONTRALATERAL THALAMIC PROJECTIONS PREDOMINANT...  
True
True
False
Starting merging search results from Web of Science...
0
1
2
3
4
5
6


2023-07-31 14:53:23 Didis-MacBook-Pro.local metapub.findit[41501] INFO FindIt Cache initialized at /Users/didihou/.cache/findit.db


7
8
9


2023-07-31 14:53:53 Didis-MacBook-Pro.local metapub.DxDOI[41501] INFO cached results for key 10.1007/BF00231845 (10.1007/BF00231845) 


IDC Converter: nan
Metapub: 10.1007/BF00231845


Exception: Results are different, check your functions.

In [None]:
def merge_eupmc(source_path, output_path, columns):
    """Normalise Europe PMC search results and hand them row by row to plib.add_row_to_csv.

    EXTERNAL_ID is treated as a PMID only when it contains no letters. For
    such records the PubMed page is requested and a missing PMCID / DOI is
    scraped from it; for all other records only the identifiers present in the
    export are used and the PMID is left empty. The full-text link is the PMC
    article URL when a PMCID is known, otherwise (PMID records only) the first
    full-text link on the PubMed page.

    Each row carries the keys DOI, PMID, PMCID, Title, full_text_url,
    full_text_source, pdf_url and pdf_source, and unknown values are np.nan,
    matching the rows produced by merge_pubmed and merge_webofscience.

    Parameters
    ----------
    source_path : str
        Comma-separated Europe PMC export containing the columns
        "DOI", "EXTERNAL_ID", "PMCID" and "TITLE".
    output_path : str
        Destination file passed to plib.add_row_to_csv.
    columns : list of str
        Column order passed to plib.add_row_to_csv.
    """
    print("Starting merging search results from Europe PMC...")

    df = pd.read_csv(source_path, sep=",")
    df = df[["DOI", "EXTERNAL_ID", "PMCID", "TITLE"]]
    df = df.rename(columns={"EXTERNAL_ID": "PMID", "TITLE": "Title"}, errors="raise")

    def first_id_link(soup, identifier_class):
        # text of the first id-link inside the given identifier span, np.nan if absent
        try:
            span = soup.find_all("span", {"class": identifier_class})[0]
            return span.find_all("a", {"class": "id-link"})[0].get_text().strip()
        except Exception:
            return np.nan

    def first_full_text_link(soup):
        # (href, data-ga-action) of the first full-text link, (np.nan, np.nan) if absent
        try:
            link = soup.find_all("div", {"class": "full-text-links-list"})[0].find_all("a", {"class": "link-item dialog-focus"})[0]
            return link["href"].strip(), link["data-ga-action"].strip()
        except Exception:
            return np.nan, np.nan

    for ind in df.index:
        print(ind)
        # long pause every tenth record to avoid being blocked
        if ind % 10 == 0:
            time.sleep(random.randint(3,6)*10)

        # identifiers as given in the export; pd.isna covers every NaN flavour,
        # unlike an identity test against np.nan
        raw_doi = df["DOI"][ind]
        raw_pmcid = df["PMCID"][ind]
        doi = np.nan if pd.isna(raw_doi) else str(raw_doi).strip()
        pmcid = np.nan if pd.isna(raw_pmcid) else str(raw_pmcid).strip()
        pmid = str(df["PMID"][ind]).strip()

        if re.search("[a-zA-Z]", pmid) is None:
            # EXTERNAL_ID has no letters: use it as a PMID and consult PubMed
            soup = plib.request_webpage("https://pubmed.ncbi.nlm.nih.gov/" + pmid + "/")
            if pd.isna(pmcid):
                pmcid = first_id_link(soup, "identifier pmc")
            if pd.isna(doi):
                doi = first_id_link(soup, "identifier doi")

            if not pd.isna(pmcid):
                full_text_url = "https://www.ncbi.nlm.nih.gov/pmc/articles/" + pmcid + "/"
                full_text_source = "PMC"
            else:
                full_text_url, full_text_source = first_full_text_link(soup)
        else:
            # EXTERNAL_ID is not a PMID: there is no PubMed page to consult
            pmid = np.nan
            if pd.isna(pmcid):
                full_text_url = np.nan
                full_text_source = np.nan
            else:
                full_text_url = "https://www.ncbi.nlm.nih.gov/pmc/articles/" + pmcid + "/"
                full_text_source = "PMC"

        row = {
            "DOI": [doi],
            "PMID": [pmid],
            "PMCID": [pmcid],
            "Title": [str(df["Title"][ind]).strip()],
            "full_text_url": [full_text_url],
            "full_text_source": [full_text_source],
            # pdf_url and pdf_source are not resolved at this stage
            "pdf_url": [np.nan],
            "pdf_source": [np.nan]
        }

        if not plib.add_row_to_csv(output_path, row, columns):
            print("Error detected when adding a row to csv!")
# --------------------start of test code--------------------
# source_path = fpath.poten_litera_eupmc
# output_path = fpath.poten_litera_eupmc_processed
# plib.clear_file(output_path)
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# df = pd.read_csv(source_path, sep=',')
# df = df[["SOURCE", "DOI", "EXTERNAL_ID", "PMCID", "TITLE"]]
# print(df.head(3))
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# col_one_list = set(df['SOURCE'].tolist())
# print(col_one_list)
# 
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# print(df["SOURCE"].isnull().values.any())
# print(df["DOI"].isnull().values.any())
# print(df["EXTERNAL_ID"].isnull().values.any())
# print(df["PMCID"].isnull().values.any())
# print(df["TITLE"].isnull().values.any())
# # PMID, Title don't contain np.nan
# # DOI, PMCID contain np.nan
# # we need to fill in what are missing
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# merge_eupmc(source_path, output_path, columns)
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# df = pd.read_csv(output_path, header=None, sep=',')
# print(df.head(3))
# ---------------------end of test code---------------------

In [None]:
def merge_google_shcolar(source_path, output_path, columns):
    """Placeholder for merging Google Scholar results.

    Only announces the step on stdout; none of the arguments is used yet.
    Always returns True.
    """
    banner = "Starting merging search results from Google Scholar..."
    print(banner)
    return True
# --------------------start of test code--------------------
# source_path = fpath.poten_litera_gs_test
# output_path = fpath.poten_litera_gs_processed
# plib.clear_file(output_path)
# df = pd.read_csv(source_path, header = None, sep=',')
# df = df[["SOURCE", "DOI", "EXTERNAL_ID", "PMCID", "TITLE"]]
# print(df.head(5))
# col_one_list = set(df['SOURCE'].tolist())
# print(col_one_list)
# print(df["SOURCE"].isnull().values.any())
# print(df["DOI"].isnull().values.any())
# print(df["EXTERNAL_ID"].isnull().values.any())
# print(df["PMCID"].isnull().values.any())
# print(df["TITLE"].isnull().values.any())
# # the columns PMID, Title don't contain np.nan
# # the columns DOI, PMCID contain np.nan, we need to fill in what are missing
# # we also need to reenter the full name of the first author
# merge_google_shcolar(source_path, output_path, columns)

# df = pd.read_csv(output_path, header=None, sep=',')
# print(df.head(3))
# ---------------------end of test code---------------------

In [None]:
def merge_seed_paper_spanning(source_path, output_path, columns):
    """Placeholder for merging results from spanning citations of seed papers.

    Only announces the step on stdout; none of the arguments is used yet.
    Always returns True.
    """
    banner = "Starting merging search results from spanning citations of seed paper..."
    print(banner)
    return True
# --------------------start of test code--------------------
# test code
# ---------------------end of test code---------------------

In [None]:
def merge_cocomac_paper(source_path, output_path, columns):
    """Placeholder for merging results from CoCoMac papers.

    Only announces the step on stdout; none of the arguments is used yet.
    Always returns True.
    """
    banner = "Starting merging search results from CoCoMac papers..."
    print(banner)
    return True
# --------------------start of test code--------------------
# test code
# ---------------------end of test code---------------------

In [None]:
# make sure at least PMID and PMCID are present as two of the four identifiers, otherwise fill in manually
def fill_in_elements(file_path, output_path=None):
    """Fill in DOIs that PubMed lists for records which have a PMID but no DOI.

    PMID -> PMCID and PMCID -> PMID are handled by earlier steps; this
    function only performs PMID -> DOI. For every such record the PubMed page
    is requested and the text of the first ``<a class="id-link"
    data-ga-action="DOI">`` element, if any, is stored as the DOI.

    Parameters
    ----------
    file_path : str
        Comma-separated file with at least the columns "PMID" and "DOI".
    output_path : str or None, optional
        Where the completed table is written (with header, without index).
        None (default) writes to fpath.poten_litera_csv.

    Raises
    ------
    Exception
        If a PubMed request does not return HTTP status 200.
    """
    if output_path is None:
        output_path = fpath.poten_litera_csv

    df = pd.read_csv(file_path, sep=",")
    # a DOI column without any value is parsed as float; strings need object dtype
    df["DOI"] = df["DOI"].astype(object)

    for ind in df.index:
        raw_pmid = df.at[ind, "PMID"]
        # pd.isna instead of `is np.nan`: NaN read from a numeric column is
        # not the np.nan object, so the identity test misclassifies it
        if pd.isna(raw_pmid) or not pd.isna(df.at[ind, "DOI"]):
            continue

        # a PMID column containing blanks is parsed as float (e.g. 8929980.0)
        if isinstance(raw_pmid, float) and raw_pmid.is_integer():
            pmid = str(int(raw_pmid))
        else:
            pmid = str(raw_pmid).strip()

        url = "https://pubmed.ncbi.nlm.nih.gov/" + pmid + "/"
        print(url)
        response = requests.get(url, headers=plib.headers)
        if response.status_code != 200:
            raise Exception("Error when request webpages!")
        soup = BeautifulSoup(response.content, "lxml")

        # both attribute filters belong in ONE attrs dict; find_all's third
        # positional parameter is `recursive`, not a second filter
        doi_links = soup.find_all("a", {"class": "id-link", "data-ga-action": "DOI"})
        if len(doi_links) != 0:
            df.at[ind, "DOI"] = doi_links[0].get_text().strip()
        # otherwise the DOI stays missing

    df.to_csv(output_path, header=True, index=False)
    print("All 3 identifiers: DOI, PMID, and PMCID filled in when possible.")
# --------------------start of test code--------------------
# test code
# ---------------------end of test code---------------------

In [None]:
# remove duplications based on identifiers in the potential related literature
def remove_dupli(file_path, identifiers=("DOI", "PMID", "PMCID")):
    """Drop records that repeat an identifier of an earlier record.

    The identifier columns are processed in the given order; within each
    column the first occurrence of a value is kept. A missing identifier never
    marks a record as a duplicate, so records that merely lack a DOI (or PMID,
    or PMCID) are all retained. The row count before and after is printed.

    The file itself is not rewritten; persist the returned frame explicitly
    if the de-duplicated table should replace it.

    Parameters
    ----------
    file_path : str
        Comma-separated file containing every column named in ``identifiers``.
    identifiers : sequence of str, optional
        Identifier columns to de-duplicate on, default ("DOI", "PMID", "PMCID").

    Returns
    -------
    pandas.DataFrame
        The de-duplicated records.
    """
    df = pd.read_csv(file_path, sep=",")
    print(len(df))

    for identifier in identifiers:
        # duplicated() regards NaN as equal to NaN; mask those rows out so that
        # only genuinely repeated identifier values are removed
        repeated = df[identifier].notna() & df.duplicated(subset=[identifier], keep="first")
        df = df[~repeated]

    print(len(df))
    print("Duplication in the potential related literature removed.")
    # len() is an int and must be converted before string concatenation
    print("Found " + str(len(df)) + " potential related literature in total.")
    return df
# --------------------start of test code--------------------
# test code
# ---------------------end of test code---------------------

<h3> Main program: </h3> 

In [None]:
# clear the file
# source_path = fpath.poten_litera_pubmed
# output_path = fpath.poten_litera_pubmed_processed
# plib.clear_file(output_path)

In [None]:
# # merge search results from PubMed
# # 2606 results
# merge_pubmed(source_path, output_path, columns, 2542, 2606)
# print("Merging results from PubMed succeeded!")
# # print("Attention! Something went wrong when merging results from PubMed!")

In [None]:
# # clear the file
# source_path = fpath.poten_litera_wos_1
# output_path = fpath.poten_litera_wos_processed
# plib.clear_file(output_path)

# # merge search results from Web of Science
# # 1000 results
# merge_webofscience(source_path, output_path, columns)
# print("Merging results from Web of Science part 1 succeeded!")
# # print("Attention! Something went wrong when merging results from Web of Science part 1!")

In [None]:
# # clear the file
# source_path = fpath.poten_litera_wos_2
# output_path = fpath.poten_litera_wos_processed

# # merge search results from Web of Science
# # 976 results
# merge_webofscience(source_path, output_path, columns)
# print("Merging results from Web of Science part 2 succeeded!")
# # print("Attention! Something went wrong when merging results from Web of Science part 2!")

In [None]:
# # clear the file
# source_path = fpath.poten_litera_eupmc
# output_path = fpath.poten_litera_eupmc_processed
# plib.clear_file(output_path)

# # merge search results from Europe PMC
# merge_eupmc(source_path, output_path, columns)
# # 9139 results
# print("Merging results from Europe PMC succeeded!")
# # print("Attention! Something went wrong when merging results from Europe PMC!")

In [None]:
# # clear the file
# source_path = fpath.poten_litera_gs_test
# output_path = fpath.poten_litera_gs_processed
# plib.clear_file(output_path)

# # merge search results from Google Scholar
# merge_google_shcolar(source_path, output_path, columns)
# print("Merging results from Google Scholar succeeded!")
# # print("Attention! Something went wrong when merging results from Google Scholar!")

In [None]:
# # merge search results from spanning citations of seed paper
# merge_seed_paper_spanning(source_path, output_path, columns)
# print("Merging results from spanning citations of seed papers succeeded!")
# # print("Attention! Something went wrong when merging results from spanning citations of seed papers!")

In [None]:
# # merge search results from CoCoMac papers
# merge_cocomac_paper(source_path, output_path, columns)
# print("Merging results from CoCoMac papers succeeded!")
# # print("Attention! Something went wrong when merging results from CoCoMac papers!")

In [None]:
# # fill in all elements in the columns when possible, if not, fill in "not found"
# fill_in_elements(fpath.poten_litera_csv, columns)

In [None]:
# identifier = ["DOI", "PMID", "PMCID"]
# remove_dupli(fpath.poten_litera_csv, identifier)

<h3> Next step: automatic filtering </h3>