<h2> Searched literature data preprocessing </h2> 

In [1]:
# import internal .py modules
import file_path_management as fpath
import public_library as plib



In [2]:
# import packages
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
import re
import time
import numpy as np
import numpy as np

<h3> Parameters: </h3>

In [3]:
# Column layout of the processed literature file: potential_related_literature.csv.
# The preprocess_* functions below define their own local `columns` lists with
# these names (step 1 of the Google Scholar processing uses only the last five).
columns = ["DOI", "PMID", "PMCID", "Title", "full_text_url", "full_text_source", "pdf_url", "pdf_source"]
# Example row, in the column order above:
# e.g., ["10.1113/JP282626", "35851953", "PMC10087288", 
#        "Cortico-thalamocortical interactions for learning, memory and decision-making",
#        "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10087288/", "PMC",
#        "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10087288/pdf/TJP-601-25.pdf", "PMC"]

<h3> Predefined functions: </h3> 

In [4]:
def preprocess_pubmed(source_path, output_path, start, end):
    """Complete PubMed search results with identifiers and a full-text link.

    For every row index in ``range(start, end)`` of the CSV at ``source_path``
    the PubMed page of the record is requested, a missing PMCID or DOI is
    read from that page, a full-text link is derived, and the resulting row
    is handed to ``plib.add_row_to_csv`` together with ``output_path``.

    Args:
        source_path: comma-separated PubMed export containing at least the
            columns "DOI", "PMID", "PMCID" and "Title".
        output_path: path of the CSV file passed to ``plib.add_row_to_csv``.
        start: first row index to process (inclusive).
        end: row index to stop at (exclusive).
    """
    print("Starting preprocessing search results from PubMed...")

    df = pd.read_csv(source_path, sep=',')
    df = df[["DOI", "PMID", "PMCID", "Title"]]

    # the output layout is the same for every row, so build it once
    columns = ["DOI", "PMID", "PMCID", "Title", "full_text_url", "full_text_source", "pdf_url", "pdf_source"]

    for ind in range(start, end):
        # sleep to avoid being blocked
        time.sleep(random.randint(3, 5))

        # request the webpage
        # the columns PMID, Title don't contain np.nan
        pmid = str(df["PMID"][ind]).strip()
        url = "https://pubmed.ncbi.nlm.nih.gov/" + pmid + "/"
        soup = plib.request_webpage(url)

        # get pmcid
        if df["PMCID"][ind] != df["PMCID"][ind]: # PMCID is np.nan (NaN != NaN)
            try:
                pmcid = soup.find_all("span", {"class": "identifier pmc"})[0].find_all("a", {"class": "id-link"})[0].get_text().strip()
            except Exception:
                # identifier not on the page (or page unusable); unlike a bare
                # `except:` this does not swallow KeyboardInterrupt
                pmcid = np.nan
        else: # PMCID is not np.nan
            pmcid = str(df["PMCID"][ind]).strip()

        # get doi
        if df["DOI"][ind] != df["DOI"][ind]: # DOI is np.nan
            try:
                doi = soup.find_all("span", {"class": "identifier doi"})[0].find_all("a", {"class": "id-link"})[0].get_text().strip()
            except Exception:
                doi = np.nan
        else: # DOI is not np.nan
            doi = str(df["DOI"][ind]).strip()

        # get full_text_url, full_text_source
        if pmcid == pmcid: # pmcid is not np.nan
            full_text_url = "https://www.ncbi.nlm.nih.gov/pmc/articles/" + pmcid + "/"
            full_text_source = "PMC"
        else: # pmcid is np.nan
            # PMC does not include this paper: use the first full-text link on the page
            try:
                link = soup.find_all("div", {"class": "full-text-links-list"})[0].find_all("a", {"class": "link-item dialog-focus"})[0]
                full_text_url = link["href"].strip()
                full_text_source = link["data-ga-action"].strip()
            except Exception:
                # keep url and source consistent: either both are set or both are NaN
                full_text_url = np.nan
                full_text_source = np.nan

        # pdf_url, pdf_source are not resolved in this step
        pdf_url = np.nan
        pdf_source = np.nan

        row = {
            "DOI": [doi],
            "PMID": [pmid],
            "PMCID": [pmcid],
            "Title": [str(df["Title"][ind]).strip()],
            "full_text_url": [full_text_url],
            "full_text_source": [full_text_source],
            "pdf_url": [pdf_url],
            "pdf_source": [pdf_source]
        }

        if not plib.add_row_to_csv(output_path, row, columns):
            print("Error detected when adding a row to csv!")

        # progress indicator: index of the row just processed
        print(ind)
# --------------------start of test code--------------------
# source_path = fpath.poten_litera_pubmed
# output_path = fpath.poten_litera_pubmed_processed
# # plib.clear_file(output_path)

# df = pd.read_csv(source_path, sep=',')
# print(df.shape)
# df = df[["DOI", "PMID", "PMCID", "Title"]]
# print(df.head(3))
# print(df.shape)

# print(df["DOI"].isnull().values.any())
# print(df["PMID"].isnull().values.any())
# print(df["PMCID"].isnull().values.any())
# print(df["Title"].isnull().values.any())
# # True, False, True, False
# # PMID, Title don't contain np.nan
# # DOI, PMCID contain np.nan
# # we need to fill in what are missing
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# preprocess_pubmed(source_path, output_path, start, end)
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# df = pd.read_csv(output_path, header=None, sep=',')
# print(df.head(3))
# ---------------------end of test code---------------------

In [5]:
def preprocess_webofscience(source_path, output_path, start, end):
    """Complete Web of Science search results with identifiers and a full-text link.

    For every row index in ``range(start, end)`` of the CSV at ``source_path``
    the missing one of DOI / PMID is looked up through ``plib.doi2pmid`` or
    ``plib.pmid2doi``. When a PMID is available, the PubMed page is requested
    to read the PMCID and a full-text link; otherwise the DOI resolver URL is
    used. The resulting row is handed to ``plib.add_row_to_csv``.

    Args:
        source_path: semicolon-separated Web of Science export containing at
            least the columns "DOI", "Pubmed Id" and "Article Title".
        output_path: path of the CSV file passed to ``plib.add_row_to_csv``.
        start: first row index to process (inclusive).
        end: row index to stop at (exclusive).
    """
    print("Starting preprocessing search results from Web of Science...")

    df = pd.read_csv(source_path, sep=";")
    df = df[["DOI", "Pubmed Id", "Article Title"]]

    # the output layout is the same for every row, so build it once
    columns = ["DOI", "PMID", "PMCID", "Title", "full_text_url", "full_text_source", "pdf_url", "pdf_source"]

    for ind in range(start, end):
        # sleep to avoid being blocked
        time.sleep(random.randint(3, 5))

        # the column Article Title doesn't contain np.nan
        # the columns DOI and Pubmed Id might contain np.nan
        # get pmid, doi
        if df["Pubmed Id"][ind] != df["Pubmed Id"][ind]: # Pubmed Id is np.nan (NaN != NaN)
            if df["DOI"][ind] != df["DOI"][ind]: # DOI is np.nan
                doi = np.nan
                pmid = np.nan
            else: # DOI is not np.nan
                doi = str(df["DOI"][ind]).strip()
                pmid = plib.doi2pmid(doi)
        else: # Pubmed Id is not np.nan
            # int() drops a ".0" suffix if the cell value is a float
            pmid = str(int(df["Pubmed Id"][ind])).strip()
            if df["DOI"][ind] != df["DOI"][ind]: # DOI is np.nan
                doi = plib.pmid2doi(pmid)
            else: # DOI is not np.nan
                doi = str(df["DOI"][ind]).strip()

        # get pmcid, full_text_url, full_text_source
        if pmid != pmid: # pmid is np.nan
            pmcid = np.nan
            if doi != doi: # doi is np.nan
                full_text_url = np.nan
                full_text_source = np.nan
            else:
                full_text_url = "https://doi.org/" + str(doi).strip()
                full_text_source = "DOI"
        else: # pmid is not np.nan
            # request the webpage
            url = "https://pubmed.ncbi.nlm.nih.gov/" + pmid + "/"
            soup = plib.request_webpage(url)

            # get pmcid
            try:
                pmcid = soup.find_all("span", {"class": "identifier pmc"})[0].find_all("a", {"class": "id-link"})[0].get_text().strip()
            except Exception:
                # identifier not on the page (or page unusable); unlike a bare
                # `except:` this does not swallow KeyboardInterrupt
                pmcid = np.nan

            # get full_text_url, full_text_source
            if pmcid == pmcid: # pmcid is not np.nan
                full_text_url = "https://www.ncbi.nlm.nih.gov/pmc/articles/" + pmcid + "/"
                full_text_source = "PMC"
            else: # pmcid is np.nan: use the first full-text link on the page
                try:
                    link = soup.find_all("div", {"class": "full-text-links-list"})[0].find_all("a", {"class": "link-item dialog-focus"})[0]
                    full_text_url = link["href"].strip()
                    full_text_source = link["data-ga-action"].strip()
                except Exception:
                    # keep url and source consistent: either both are set or both are NaN
                    full_text_url = np.nan
                    full_text_source = np.nan

        # pdf_url, pdf_source are not resolved in this step
        pdf_url = np.nan
        pdf_source = np.nan

        row = {
            "DOI": [doi],
            "PMID": [pmid],
            "PMCID": [pmcid],
            "Title": [str(df["Article Title"][ind]).strip()],
            "full_text_url": [full_text_url],
            "full_text_source": [full_text_source],
            "pdf_url": [pdf_url],
            "pdf_source": [pdf_source]
        }

        if not plib.add_row_to_csv(output_path, row, columns):
            print("Error detected when adding a row to csv!")

        # progress indicator: index of the row just processed
        print(ind)
# --------------------start of test code--------------------
# # source_path = fpath.poten_litera_wos
# # output_path = fpath.poten_litera_wos_processed
# plib.clear_file(output_path)

# df = pd.read_csv(source_path, sep=';')
# df = df[["DOI", "Pubmed Id", "Article Title"]]
# print(df.head(3))
# print(df.shape)

# print(df["DOI"].isnull().values.any())
# print(df["Pubmed Id"].isnull().values.any())
# print(df["Article Title"].isnull().values.any())
# # True, True, False
# # Article Title don't contain np.nan
# # DOI, Pubmed Id contain np.nan
# # we need to fill in what are missing
# ---------------------end of test code--------------------- 

# --------------------start of test code--------------------
# preprocess_webofscience(source_path, output_path, 0, 10)
# ---------------------end of test code--------------------- 

# --------------------start of test code--------------------
# df = pd.read_csv(output_path, header=None, sep=';')
# print(df.head(3))
# ---------------------end of test code---------------------  

In [6]:
def preprocess_eupmc(source_path, output_path, start, end):
    """Complete Europe PMC search results with identifiers and a full-text link.

    For every row index in ``range(start, end)`` of the CSV at ``source_path``:
    the PMID is taken from "EXTERNAL_ID" when "SOURCE" is "MED", otherwise it
    is looked up from the DOI through ``plib.doi2pmid``; a missing DOI is
    looked up through ``plib.pmid2doi``. When a PMID is available the PubMed
    page is requested to read the PMCID and a full-text link; otherwise the
    PMCID from the file or the DOI resolver URL is used. The resulting row is
    handed to ``plib.add_row_to_csv``.

    Args:
        source_path: comma-separated Europe PMC export containing at least the
            columns "SOURCE", "DOI", "EXTERNAL_ID", "PMCID" and "TITLE".
        output_path: path of the CSV file passed to ``plib.add_row_to_csv``.
        start: first row index to process (inclusive).
        end: row index to stop at (exclusive).
    """
    print("Starting preprocessing search results from Europe PMC...")

    df = pd.read_csv(source_path, sep=",")
    df = df[["SOURCE", "DOI", "EXTERNAL_ID", "PMCID", "TITLE"]]

    # the output layout is the same for every row, so build it once
    columns = ["DOI", "PMID", "PMCID", "Title", "full_text_url", "full_text_source", "pdf_url", "pdf_source"]

    for ind in range(start, end):
        # sleep to avoid being blocked
        time.sleep(random.randint(1, 3))

        # NaN is the only value that differs from itself
        doi_missing = df["DOI"][ind] != df["DOI"][ind]
        external_id_missing = df["EXTERNAL_ID"][ind] != df["EXTERNAL_ID"][ind]

        # get pmid, doi
        # SOURCE = {'PMC', 'MED', 'ETH', 'PPR'}; only for "MED" is EXTERNAL_ID used as the PMID
        if df["SOURCE"][ind] == "MED" and not external_id_missing:
            pmid = str(df["EXTERNAL_ID"][ind]).strip()
            if doi_missing:
                doi = plib.pmid2doi(pmid)
            else:
                doi = str(df["DOI"][ind]).strip()
        elif doi_missing:
            # neither a usable PMID nor a DOI in the file
            # (the original MED branch called the cell value here, `df["DOI"][ind]()`,
            # which raised TypeError)
            doi = np.nan
            pmid = np.nan
        else:
            doi = str(df["DOI"][ind]).strip()
            pmid = plib.doi2pmid(doi)

        # get pmcid, full_text_url, full_text_source
        if pmid != pmid: # pmid is np.nan
            pmcid = df["PMCID"][ind]
            if pmcid == pmcid: # pmcid is not np.nan
                full_text_url = "https://www.ncbi.nlm.nih.gov/pmc/articles/" + pmcid + "/"
                full_text_source = "PMC"
            elif doi == doi: # doi is not np.nan
                full_text_url = "https://doi.org/" + str(doi).strip()
                full_text_source = "DOI"
            else:
                full_text_url = np.nan
                full_text_source = np.nan
        else: # pmid is not np.nan
            # request the webpage
            url = "https://pubmed.ncbi.nlm.nih.gov/" + pmid + "/"
            soup = plib.request_webpage(url)

            # get pmcid
            try:
                pmcid = soup.find_all("span", {"class": "identifier pmc"})[0].find_all("a", {"class": "id-link"})[0].get_text().strip()
            except Exception:
                # identifier not on the page (or page unusable); unlike a bare
                # `except:` this does not swallow KeyboardInterrupt
                pmcid = np.nan

            # get full_text_url, full_text_source
            if pmcid == pmcid: # pmcid is not np.nan
                full_text_url = "https://www.ncbi.nlm.nih.gov/pmc/articles/" + pmcid + "/"
                full_text_source = "PMC"
            else: # pmcid is np.nan: use the first full-text link on the page
                try:
                    link = soup.find_all("div", {"class": "full-text-links-list"})[0].find_all("a", {"class": "link-item dialog-focus"})[0]
                    full_text_url = link["href"].strip()
                    full_text_source = link["data-ga-action"].strip()
                except Exception:
                    # keep url and source consistent: either both are set or both are NaN
                    full_text_url = np.nan
                    full_text_source = np.nan

        # pdf_url, pdf_source are not resolved in this step
        pdf_url = np.nan
        pdf_source = np.nan

        row = {
            "DOI": [doi],
            "PMID": [pmid],
            "PMCID": [pmcid],
            "Title": [str(df["TITLE"][ind]).strip()],
            "full_text_url": [full_text_url],
            "full_text_source": [full_text_source],
            "pdf_url": [pdf_url],
            "pdf_source": [pdf_source]
        }

        if not plib.add_row_to_csv(output_path, row, columns):
            print("Error detected when adding a row to csv!")

        # progress indicator: index of the row just processed
        print(ind)
# --------------------start of test code--------------------
# source_path = fpath.poten_litera_eupmc
# output_path = fpath.poten_litera_eupmc_processed
# # plib.clear_file(output_path)

# df = pd.read_csv(source_path, sep=',')
# df = df[["SOURCE", "DOI", "EXTERNAL_ID", "PMCID", "TITLE"]]
# print(df.head(3))
# print(df.shape)

# col_one_list = set(df['SOURCE'].tolist())
# print(col_one_list)
# # ['PMC', 'MED', 'ETH', 'PPR']

# print(df["SOURCE"].isnull().values.any())
# print(df["DOI"].isnull().values.any())
# print(df["EXTERNAL_ID"].isnull().values.any())
# print(df["PMCID"].isnull().values.any())
# print(df["TITLE"].isnull().values.any())
# # False, True, False, True, False
# # SOURCE, EXTERNAL_ID, Title don't contain np.nan
# # DOI, PMCID contain np.nan
# # we need to fill in what are missing
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# preprocess_eupmc(source_path, output_path, 0, 10)
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# df = pd.read_csv(output_path, header=None, sep=',')
# print(df.head(3))
# ---------------------end of test code---------------------

In [7]:
def preprocess_google_shcolar_step1(source_path, output_path, start, end):
    """Turn raw Google Scholar results into title / full-text / PDF link rows.

    Rows in ``range(start, end)`` of the header-less CSV at ``source_path`` are
    processed one by one. Citations, books and rows without a title or url are
    skipped. For the rest, the links are passed through
    ``plib.get_final_redirected_url`` and the host of the final URL is stored
    as the source. Each resulting row is handed to ``plib.add_row_to_csv``.

    Args:
        source_path: comma-separated file with six unnamed columns (title,
            url, url_type, full_text_url, full_text_type, full_text_source).
        output_path: path of the CSV file passed to ``plib.add_row_to_csv``.
        start: first row index to process (inclusive).
        end: row index to stop at (exclusive).
    """
    def resolve(raw_link):
        # Returns (final_url, host); host is NaN when the final url is NaN.
        final_url = plib.get_final_redirected_url(str(raw_link).strip())
        if final_url != final_url:
            return final_url, np.nan
        return final_url, final_url.split("://")[1].split("/")[0]

    print("Starting merging search results from Google Scholar...")

    df = pd.read_csv(source_path, header=None, sep=',')
    df.columns = ["title", "url", "url_type", "full_text_url", "full_text_type", "full_text_source"]

    for ind in range(start, end):
        # url_type is one of {'[CITATION][C]', '[PDF][PDF]', '[BOOK][B]', nan, '[HTML][HTML]'}
        # citations and books are dropped: they are not likely to include connectivity information
        url_type = df["url_type"][ind]
        if url_type in ("[CITATION][C]", "[BOOK][B]"):
            continue

        # rows lacking either the url or the title are dropped as well
        landing_link = df["url"][ind]
        title = df["title"][ind]
        if landing_link != landing_link or title != title:
            continue

        # from here on: title and url exist, url_type is "[PDF][PDF]", nan or "[HTML][HTML]"
        # full_text_type is one of {'[HTML]', nan, '[PDF]', 'UB'}
        full_text_type = df["full_text_type"][ind]
        if url_type == "[PDF][PDF]":
            # the main link is the PDF; the side link counts as full text only when it is HTML
            if full_text_type == "[HTML]":
                full_text_url, full_text_source = resolve(df["full_text_url"][ind])
            else:
                full_text_url, full_text_source = np.nan, np.nan
            pdf_url, pdf_source = resolve(landing_link)
        else:
            # the main link is the full text; the side link counts as PDF only when it is a PDF
            full_text_url, full_text_source = resolve(landing_link)
            if full_text_type == "[PDF]":
                pdf_url, pdf_source = resolve(df["full_text_url"][ind])
            else:
                pdf_url, pdf_source = np.nan, np.nan

        columns = ["Title", "full_text_url", "full_text_source", "pdf_url", "pdf_source"]
        row = {
            "Title": [str(title).strip()],
            "full_text_url": [full_text_url],
            "full_text_source": [full_text_source],
            "pdf_url": [pdf_url],
            "pdf_source": [pdf_source]
        }

        written = plib.add_row_to_csv(output_path, row, columns)
        if not written:
            print("Error detected when adding a row to csv!")

        # progress indicator: index of the row just processed
        print(ind)
# --------------------start of test code--------------------
# source_path = fpath.poten_litera_gs
# output_path = fpath.poten_litera_gs_processed_step1
# # plib.clear_file(output_path)

# df = pd.read_csv(source_path, header=None, sep=',')
# df.columns = ["title", "url", "url_type", "full_text_url", "full_text_type", "full_text_source"]
# print(df.head(3))
# print(df.head)

# url_type = set(df['url_type'].tolist())
# print(url_type)
# # {'[CITATION][C]', '[PDF][PDF]', '[BOOK][B]', nan, '[HTML][HTML]'}
# full_text_type = set(df['full_text_type'].tolist())
# print(full_text_type)
# # {nan, 'UB', '[HTML]', '[PDF]'}
# full_text_source = set(df['full_text_source'].tolist())
# print(full_text_source)
# # {'ahajournals.org', 'lww.com', 'springer.com', 'academia.edu', 'plos.org', 'ieee.org', 'nature.com', 
# # 'mdpi.com', 'jpn.ca', 'uottawa.ca', nan, 'northwestern.edu', 'bmj.com', 'ekja.org', 'RWTH-Link', 'wiley.com', 
# # 'escholarship.org', 'nyu.edu', 'frontiersin.org', 'sciencedirect.com', 'eneuro.org', 'jneurosci.org', 
# # 'royalsocietypublishing.org', 'karger.com', 'harvard.edu', 'annualreviews.org', 'mcgill.ca', 
# # 'elifesciences.org', 'mirasmart.com', 'duke.edu', 'ucdavis.edu', 'physiology.org', 'cell.com', 
# # 'wustl.edu', 'epfl.ch', 'udc.es', 'psychiatryonline.org', 'jst.go.jp', 'core.ac.uk', 'rero.ch', 
# # 'zsp.com.pk', 'sagepub.com', 'europepmc.org', 'tandfonline.com', 'asahq.org', 'sonar.ch', 'koreamed.org', 
# # 'oup.com', 'science.org', 'scholarpedia.org', 'psu.edu', 'jordanbpeterson.com', 'pnas.org', 'uzh.ch', 'biorxiv.org', 
# # 'biomedcentral.com', 'umich.edu', 'ahuman.org', 'researchgate.net', 'ijpp.com', 'unav.edu', 'nih.gov', 'bu.edu'}
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# # ["title", "url", "url_type", "full_text_url", "full_text_type", "full_text_source"]
# print(df["title"].isnull().any().any())
# print(df["url"].isnull().any().any())
# print(df["url_type"].isnull().any().any())
# print(df["full_text_url"].isnull().any().any())
# print(df["full_text_type"].isnull().any().any())
# print(df["full_text_source"].isnull().any().any())
# # True, True, True, True, True, True
# # title, url, url_type, full_text_url, full_text_type, full_text_source contain np.nan
# # we need to fill in what are missing
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# preprocess_google_shcolar_step1(source_path, output_path, 0, 1000)
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# df = pd.read_csv(output_path, header=None, sep=',')
# print(df.head(3))
# ---------------------end of test code---------------------

In [8]:
def preprocess_google_shcolar_step2(source_path, output_path, start, end):
    """Add a DOI column to the step-1 Google Scholar rows.

    For every row index from ``start`` up to ``end`` (limited to the number of
    rows in the file) the DOI is derived from the full-text url through
    ``plib.url2doi``. PMID and PMCID are not resolved in this step and are
    written as NaN. Each resulting row is handed to ``plib.add_row_to_csv``.

    Args:
        source_path: header-less, comma-separated step-1 output with the five
            columns Title, full_text_url, full_text_source, pdf_url, pdf_source.
        output_path: path of the CSV file passed to ``plib.add_row_to_csv``.
        start: first row index to process (inclusive).
        end: row index to stop at (exclusive); values beyond the last row are
            clamped instead of raising a KeyError.
    """
    print("Starting merging search results from Google Scholar...")

    df = pd.read_csv(source_path, header=None, sep=',')
    df.columns = ["Title", "full_text_url", "full_text_source", "pdf_url", "pdf_source"]

    # the output layout is the same for every row, so build it once
    columns = ["DOI", "PMID", "PMCID", "Title", "full_text_url", "full_text_source", "pdf_url", "pdf_source"]

    # callers pass a rounded-up `end` (e.g. 1000 for a 905-row file); stop at the last row
    stop = min(end, len(df))

    for ind in range(start, stop):
        # get doi from url
        full_text_url = df["full_text_url"][ind]
        if full_text_url == full_text_url: # there's a full_text_url (not NaN)
            doi = plib.url2doi(str(full_text_url).strip())
        else:
            doi = np.nan

        # PMID and PMCID are not looked up here
        pmid = np.nan
        pmcid = np.nan
        row = {
            "DOI": [doi],
            "PMID": [pmid],
            "PMCID": [pmcid],
            "Title": [df["Title"][ind]],
            "full_text_url": [full_text_url],
            "full_text_source": [df["full_text_source"][ind]],
            "pdf_url": [df["pdf_url"][ind]],
            "pdf_source": [df["pdf_source"][ind]]
        }

        if not plib.add_row_to_csv(output_path, row, columns):
            print("Error detected when adding a row to csv!")

        # progress output: the DOI, the url when no DOI could be derived, then the row index
        print(doi)
        if doi != doi:
            print([full_text_url])
        print(ind)
# --------------------start of test code--------------------
# Step 2 reads the step-1 output and writes to the step-2 output file.
source_path = fpath.poten_litera_gs_processed_step1
output_path = fpath.poten_litera_gs_processed_step2
# NOTE(review): presumably empties the step-2 output file on every run of this cell,
# discarding rows from an earlier run -- confirm against plib.clear_file.
plib.clear_file(output_path)
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# df = pd.read_csv(source_path, header=None, sep=',')
# df.columns = ["Title", "full_text_url", "full_text_source", "pdf_url", "pdf_source"]
# print(df.head(3))
# print(df.shape)
# # # (905, 5)
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# full_text_source = set(df['full_text_source'].tolist())
# print(full_text_source)
# # {'www.frontiersin.org', 'www.elibrary.ru', 'orca.cardiff.ac.uk', 'www.jneurosci.org', 
# # 'europepmc.org', 'www.theses.fr', 'www.biorxiv.org', 'submissions.mirasmart.com', 
# # 'royalsocietypublishing.org', 'www.science.org', 'thejns.org', 
# # 'escholarship.mcgill.ca', 'www.cambridge.org', 'movementdisorders.onlinelibrary.wiley.com', 
# # 'www.ahajournals.org', 'books.google.de', 'www.mdpi.com', 'www.sciencedirect.com', 
# # 'ieeexplore.ieee.org', 'academic.oup.com', 'www.pnas.org', 'physoc.onlinelibrary.wiley.com', 
# # 'www.jstage.jst.go.jp', 'wakespace.lib.wfu.edu', 'elibrary.ru', 'www.cabdirect.org', 
# # 'www.tandfonline.com', 'www.jpn.ca', 'jpet.aspetjournals.org', 'onlinelibrary.wiley.com', 
# # 'open.bu.edu', 'tbiomed.biomedcentral.com', 'www.liebertpub.com', 'journals.lww.com', 
# # 'agro.icm.edu.pl', 'ekja.org', 'analyticalsciencejournals.onlinelibrary.wiley.com', 
# # 'n.neurology.org', 'pubs.asahq.org', 'journals.sagepub.com', 'neuro.psychiatryonline.org', 
# # 'karger.com', 'nyaspubs.onlinelibrary.wiley.com', 'pure.mpg.de', 'elifesciences.org', 
# # 'link.springer.com', 'psycnet.apa.org', 'jnnp.bmj.com', 'www.degruyter.com', 'ajp.psychiatryonline.org', 
# # 'journals.physiology.org', 'www.nature.com', 'www.jstor.org', 'var.scholarpedia.org', 'www.eneuro.org', 
# # 'journals.plos.org', 'www.cell.com', 'www.ncbi.nlm.nih.gov', 'www.taylorfrancis.com', 
# # 'bmcneurosci.biomedcentral.com', nan, 'jamanetwork.com'}
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# # ["Title", "full_text_url", "full_text_source", "pdf_url", "pdf_source"]
# print(df["Title"].isnull().any().any())
# print(df["full_text_url"].isnull().any().any())
# print(df["full_text_source"].isnull().any().any())
# print(df["pdf_url"].isnull().any().any())
# print(df["pdf_source"].isnull().any().any())
# # False, True, True, True, True
# # full_text_url, full_text_source, pdf_url, pdf_source contain np.nan
# # we need to fill in what are missing
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# Runs step 2 over rows 0..999 and writes to output_path.
# NOTE(review): the commented shape check earlier in this cell reports (905, 5) for the
# step-1 file, so end=1000 lies beyond the last row -- verify the function handles that.
preprocess_google_shcolar_step2(source_path, output_path, 0, 1000)
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# df = pd.read_csv(output_path, header=None, sep=',')
# print(df.head(3))
# ---------------------end of test code---------------------

Starting merging search results from Google Scholar...
10.1152/jn.2001.85.1.219
0
nan
['https://www.cabdirect.org/cabdirect/welcome/?target=%2fcabdirect%2fabstract%2f19522203025']
1
10.1016/j.neuroimage.2006.07.032
2
10.1523/JNEUROSCI.14-05-02485.1994
3
10.1002/cne.903130106
4
10.1038/372770a0
5
nan
[nan]
6
10.1002/(SICI)1096-9861(19981019)400:2<271::AID-CNE8>3.0.CO;2-6
7
10.1002/(SICI)1096-9861(19960812)372:1<59::AID-CNE6>3.0.CO;2-L
8
10.1016/0165-0173(96)00003-3
9
10.1038/nn.4423
10
nan
['https://onlinelibrary.wiley.com/doi/abs/10.1002/cne.902570211']
11
nan
['https://onlinelibrary.wiley.com/doi/abs/10.1002/cne.902620207']
12
nan
['https://onlinelibrary.wiley.com/doi/abs/10.1002/cne.902440208']
13
10.1007/s00429-022-02463-4
14
10.1016/0006-8993(77)90536-4
15
10.1007/BF00236173
16
10.1002/cne.902620303
17
nan
['https://onlinelibrary.wiley.com/doi/abs/10.1002/cne.902360304']
18
10.1152/jn.1989.61.1.1
19
10.1016/S0168-0102(98)00021-2
20
10.1007/BF00250573
21
10.1002/cne.902520305
22
10.

KeyboardInterrupt: 

In [None]:
def preprocess_seed_paper_spanning(source_path, output_path):
    """Placeholder for preprocessing the citations spanned from seed papers.

    Neither argument is used yet: the function only prints a start message.

    Returns:
        True, always.
    """
    start_message = "Starting preprocessing search results from spanning citations of seed paper..."
    print(start_message)
    return True
# --------------------start of test code--------------------
# test code
# ---------------------end of test code---------------------

In [None]:
def preprocess_cocomac_paper(source_path, output_path):
    """Placeholder for preprocessing the papers taken from CoCoMac.

    Neither argument is used yet: the function only prints a start message.

    Returns:
        True, always.
    """
    start_message = "Starting preprocessing search results from CoCoMac papers..."
    print(start_message)
    return True
# --------------------start of test code--------------------
# test code
# ---------------------end of test code---------------------

In [None]:
# make sure at least PMID and PMCID is present as two of the four identifiers, otherwise manually fill in
def fill_in_elements(file_path):
    """Fill in missing DOIs for records that already have a PMID.

    Reads the CSV at ``file_path``; for every row with a PMID but no DOI the
    PubMed page of that PMID is requested and the text of the DOI link is
    stored in the "DOI" column. The whole table is then written to
    ``fpath.poten_litera_csv``.

    Args:
        file_path: comma-separated file with at least the columns "PMID" and "DOI".

    Raises:
        Exception: if a PubMed page does not answer with HTTP status 200.
    """
    # PMID -> PMCID
    # done already
    # PMCID -> PMID
    # done already
    # PMID -> DOI
    df = pd.read_csv(file_path, sep = ",")
    # a DOI column holding only NaN is parsed as float; make it accept strings
    df["DOI"] = df["DOI"].astype(object)

    for ind in df.index:
        pmid_value = df.at[ind, "PMID"]
        doi_value = df.at[ind, "DOI"]
        # NaN is the only value that differs from itself:
        # skip rows without a PMID and rows that already have a DOI
        if pmid_value != pmid_value or doi_value == doi_value:
            continue

        # PMIDs are usually parsed as numbers; int() drops a ".0" suffix so the
        # url is well-formed (concatenating the raw number raised TypeError)
        if isinstance(pmid_value, str):
            pmid = pmid_value.strip()
        else:
            pmid = str(int(pmid_value))

        url = "https://pubmed.ncbi.nlm.nih.gov/" + pmid + "/"
        print(url)
        response = requests.get(url, headers = plib.headers)
        if response.status_code != 200:
            raise Exception("Error when request webpages!")
        soup = BeautifulSoup(response.content, "lxml")

        # both attribute filters belong in ONE dict; the original passed a set
        # and a second positional dict (bound to `recursive`), which never matched
        doi_links = soup.find_all("a", attrs={"class": "id-link", "data-ga-action": "DOI"})
        if len(doi_links) != 0:
            df.at[ind, "DOI"] = doi_links[0].get_text().strip()
        # otherwise the DOI stays NaN

    # NOTE(review): the result goes to fpath.poten_litera_csv, not back to
    # file_path -- confirm this is the intended target.
    df.to_csv(fpath.poten_litera_csv, header = True, index = False)
    print("All 3 identifiers: DOI, PMID, and PMCID filled in when possible.")
# --------------------start of test code--------------------
# test code
# ---------------------end of test code---------------------

In [None]:
# remove duplications based on identifiers in the potential related literature
def merge_and_remove_dupli(file_path):
    """Report how many records remain after de-duplicating by identifier.

    Reads the CSV at ``file_path`` and removes rows whose DOI, PMID or PMCID
    repeats an earlier row. A missing identifier (NaN) never counts as a
    duplicate, so records lacking an identifier are kept. The row counts
    before and after are printed; the de-duplicated table is not written back.

    Args:
        file_path: comma-separated file with at least the columns "DOI",
            "PMID" and "PMCID".
    """
    df = pd.read_csv(file_path, sep = ",")
    print(len(df))

    for identifier in ("DOI", "PMID", "PMCID"):
        # pandas treats NaN values as equal to each other, so a plain
        # drop_duplicates() would discard all but one record lacking this
        # identifier; restrict the removal to rows that have a value
        repeated = df[identifier].notna() & df.duplicated(subset=[identifier], keep="first")
        df = df[~repeated]

    print(len(df))
    # plib.clear_file(fpath.poten_litera_csv)
    # df.to_csv(fpath.poten_litera_csv, index=False)
    print("Duplication in the potential related literature removed.")
    # len() returns an int; convert it before concatenating
    print("Found " + str(len(df)) + " potential related literature in total.")
# --------------------start of test code--------------------
# test code
# ---------------------end of test code---------------------

<h3> Main program: </h3> 

In [None]:
# # preprocess search results from PubMed

# source_path = fpath.poten_litera_pubmed
# output_path = fpath.poten_litera_pubmed_processed

# # clear the file
# plib.clear_file(output_path)

# # preprocess search results from PubMed
# # 2606 results
# preprocess_pubmed(source_path, output_path, columns, 2565, 2606)
# print("preprocessing results from PubMed succeeded!")
# # print("Attention! Something went wrong when preprocessing results from PubMed!")

In [None]:
# # clear the file
# plib.clear_file(fpath.poten_litera_wos)

# # combine the 2 files of search results from web of science
# source_path_1 = fpath.poten_litera_wos_1
# source_path_2 = fpath.poten_litera_wos_2
# df_1 = pd.read_csv(source_path_1, sep=';')
# df_2 = pd.read_csv(source_path_2, sep=';')
# df_1.to_csv(fpath.poten_litera_wos, header=True, index=False, sep=";")
# df_2.to_csv(fpath.poten_litera_wos, mode="a", header=False, index=False, sep=";")
# --------------------start of test code--------------------
# df = pd.read_csv(fpath.poten_litera_wos, sep=';')
# print(df.head(3))
# print(df.shape)
# (1976, 72)
# ---------------------end of test code---------------------

In [None]:
# # preprocess search results from Web of Science

# source_path = fpath.poten_litera_wos
# output_path = fpath.poten_litera_wos_processed

# # clear the file
# # plib.clear_file(output_path)

# # preprocess search results from Web of Science
# # 1976 results
# preprocess_webofscience(source_path, output_path, columns, 0, 1976)
# print("preprocessing results from Web of Science succeeded!")
# # print("Attention! Something went wrong when preprocessing results from Web of Science!")

In [None]:
# # preprocess search results from Europe PMC

# source_path = fpath.poten_litera_eupmc
# output_path = fpath.poten_litera_eupmc_processed

# # clear the file
# # plib.clear_file(output_path)

# # preprocess search results from Europe PMC
# preprocess_eupmc(source_path, output_path, columns, 0, 9140)
# # 9140 results
# print("preprocessing results from Europe PMC succeeded!")
# # print("Attention! Something went wrong when preprocessing results from Europe PMC!")

In [None]:
# # preprocess search results from Google Scholar step 1

# source_path = fpath.poten_litera_gs
# output_path = fpath.poten_litera_gs_processed_step1

# # clear the file
# # plib.clear_file(output_path)

# # preprocess search results from Google Scholar
# preprocess_google_shcolar_step1(source_path, output_path, 0, 1000)
# # 905 results
# print("step 1 of preprocessing results from Google Scholar succeeded!")
# # print("Attention! Something went wrong when preprocessing results from Google Scholar step 1!")

In [None]:
# # preprocess search results from Google Scholar step 2

# source_path = fpath.poten_litera_gs_processed_step1
# output_path = fpath.poten_litera_gs_processed_step2

# # clear the file
# # plib.clear_file(output_path)

# # preprocess search results from Google Scholar
# preprocess_google_shcolar_step2(source_path, output_path, 0, 905)
# # 905 results
# print("step 2 of preprocessing results from Google Scholar succeeded!")
# # print("Attention! Something went wrong when preprocessing results from Google Scholar step 2!")

In [None]:
# # preprocess search results from spanning citations of seed paper

# preprocess_seed_paper_spanning(source_path, output_path, columns):
# print("preprocessing results from spanning citations of seed papers succeeded!")
# # print("Attention! Something went wrong when preprocessing results from spanning citations of seed papers!")

In [None]:
# # preprocess search results from CoCoMac papers

# preprocess_cocomac_paper(source_path, output_path, columns)
# print("preprocessing results from CoCoMac papers succeeded!")
# # print("Attention! Something went wrong when preprocessing results from CoCoMac papers!")

In [None]:
# # fill in all identifiers in the columns when possible

# file_path = fpath.poten_litera
# fill_in_elements(file_path, columns)

In [None]:
# # merge all search results and remove duplication by identifiers

# # identifier = ["DOI", "PMID", "PMCID"]
# file_path = fpath.poten_litera
# merge_and_remove_dupli(file_path)

<h3> Next step: automatic filtering </h3>