<h2> Searched literature data preprocessing </h2> 

In [202]:
# import internal .py modules
import file_path_management as fpath
import public_library as plib

In [203]:
# import packages
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
import re
import time
import numpy as np
import numpy as np

<h3> Parameters: </h3>

In [204]:
# columns of file: potential_related_literature.csv
columns = ["DOI", "PMID", "PMCID", "Title", "First_Author", "full_text_url", "full_text_source"]

<h3> Predefined functions: </h3> 

In [205]:
def request_wegpage(url, proxies, timeout=30):
    """Download ``url`` and return its body parsed with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to request.
    proxies : dict or None
        Forwarded unchanged as the ``proxies`` argument of ``requests.get``.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests`` waits forever on a stalled connection,
        which blocks the whole notebook.

    Returns
    -------
    bs4.BeautifulSoup
        The response content parsed with the "lxml" parser.

    Raises
    ------
    Exception
        If the server answers with a status code other than 200.
    requests.exceptions.RequestException
        If the connection fails or the timeout expires.
    """
    response = requests.get(url, headers=plib.headers, proxies=proxies, timeout=timeout)
    if response.status_code != 200:
        # print("Error when requesting:", url)
        # print(response.status_code)
        raise Exception("Your request was declined, again!")
    return BeautifulSoup(response.content, "lxml")

In [206]:
def merge_pmc(columns):
    """Merge the PubMed Central (PMC) search export into the potential-literature file.

    Reads ``fpath.poten_litera_pmc`` and, for every record, requests its PubMed
    page (``https://pubmed.ncbi.nlm.nih.gov/<PMID>/``) to scrape the first
    author's full name, the PMCID and DOI when the export lacks them, and a
    full-text link.  Each record is then handed to ``plib.add_row_to_csv``
    together with ``fpath.poten_litera``.  Values that cannot be determined are
    stored as the string "not found".

    Parameters
    ----------
    columns : list of str
        Column names forwarded to ``plib.add_row_to_csv``.

    Raises
    ------
    Exception
        Propagated from ``request_wegpage`` when a PubMed page cannot be fetched.
    """
    print("Starting merging search results from PubMed Central PMC...")
    # process pmc search results
    df = pd.read_csv(fpath.poten_litera_pmc, sep=',')
    df = df[["DOI", "PMID", "PMCID", "Title", "First Author"]]
    for ind in df.index:
        pmid = str(df.at[ind, "PMID"])
        url = "https://pubmed.ncbi.nlm.nih.gov/" + pmid + "/"
        if ind % 5 == 0:
            # every fifth record: pause 1-10 s and ask plib for proxies again
            time.sleep(random.randint(1, 10))
            # NOTE(review): merge_eupmc uses the return value of plib.get_proxies()
            # without unpacking; one of the two call sites is probably wrong -
            # confirm against plib.
            proxies, _ = plib.get_proxies()
        soup = request_wegpage(url, proxies)

        # get full name of first author
        try:
            first_author = soup.find_all("span", {"class": "authors-list-item"})[0].find_all("a", {"class": "full-name"})[0].get_text().strip()
        except Exception:
            # "except Exception" (not a bare except) so Ctrl-C still interrupts the run
            first_author = "not found"

        # get PMCID; pd.isna() detects any NaN, whereas "is np.nan" only matches
        # the np.nan singleton and misses NaN values read from float columns
        if pd.isna(df.at[ind, "PMCID"]):
            try:
                pmcid = soup.find_all("span", {"class": "identifier pmc"})[0].find_all("a", {"class": "id-link"})[0].get_text().strip()
            except Exception:
                pmcid = "not found"
        else:
            pmcid = str(df.at[ind, "PMCID"])

        # get DOI
        if pd.isna(df.at[ind, "DOI"]):
            try:
                doi = soup.find_all("span", {"class": "identifier doi"})[0].find_all("a", {"class": "id-link"})[0].get_text().strip()
            except Exception:
                doi = "not found"
        else:
            doi = str(df.at[ind, "DOI"])

        # get full_text_url: a known PMCID gives the PMC article page directly,
        # otherwise fall back to the first full-text link on the PubMed page
        if pmcid != "not found":
            full_text_url = "https://www.ncbi.nlm.nih.gov/pmc/articles/" + pmcid + "/"
            full_text_source = "PMC"
        else:
            try:
                link = soup.find_all("div", {"class": "full-text-links-list"})[0].find_all("a", {"class": "link-item dialog-focus"})[0]
                full_text_url = link["href"].strip()
                full_text_source = link["data-ga-action"].strip()
            except Exception:
                full_text_url = "not found"
                full_text_source = "not found"

        # columns = ["DOI", "PMID", "PMCID", "Title", "First_Author", "full_text_url", "full_text_source"]
        row = {
            "DOI": [doi],
            "PMID": [pmid],
            "PMCID": [pmcid],
            "Title": [str(df.at[ind, "Title"])],
            "First_Author": [first_author],
            "full_text_url": [full_text_url],
            "full_text_source": [full_text_source]
        }
        if not plib.add_row_to_csv(fpath.poten_litera, row, columns):
            print("Error detected when adding a row to csv!")
# --------------------start of test code--------------------
# reset the merged-output file, then inspect the raw PMC export before merging
plib.clear_file(fpath.poten_litera)
df = pd.read_csv(fpath.poten_litera_pmc, sep=',')[["DOI", "PMID", "PMCID", "Title", "First Author"]]
print(df.head(5))
# one line per column, in column order: does the column contain any missing value?
print(*(df[name].isnull().values.any() for name in df.columns), sep="\n")
# the columns PMID, Title, First Author don't contain np.nan
# the columns DOI, PMCID contain np.nan, we need to fill in what are missing
# we also need to reenter the full name of the first author
merge_pmc(columns)
# ---------------------end of test code---------------------

                          DOI      PMID        PMCID  \
0  10.1007/s00429-021-02377-7  34524542   PMC8541979   
1            10.1113/JP282626  35851953  PMC10087288   
2           10.1111/ejn.13910  29542210          NaN   
3           10.1002/cne.24389  29322527          NaN   
4  10.31083/j.jin.2021.01.334  33834704          NaN   

                                               Title  First Author  
0  Vision for action: thalamic and cortical input...   Gamberini M  
1  Cortico-thalamocortical interactions for learn...     Perry BAL  
2  Corticothalamic axon morphologies and network ...   Rockland KS  
3  Thalamo-cortical projections to the macaque su...     Impieri D  
4  The superior parietal lobule of primates: a se...  Passarelli L  
True
False
True
False
False
Starting merging search results from PubMed Central PMC...


KeyboardInterrupt: 

In [None]:
def merge_webofscience(columns):
    """Convert the Web of Science export into the potential-literature layout.

    Reads ``fpath.poten_litera_wos_1`` (semicolon separated), keeps DOI,
    PubMed id and title, looks up the PMCID of every record that has a PubMed
    id on its PubMed page, and writes the result to ``fpath.poten_litera``.

    Parameters
    ----------
    columns : list of str
        Column names (and order) of the written file.  Columns this source
        does not provide are written empty.

    Raises
    ------
    Exception
        If a PubMed page answers with a status code other than 200.
    """
    print("Starting merging search results from Web of Science...")
    df = pd.read_csv(fpath.poten_litera_wos_1, sep = ";")
    df = df[["DOI", "Pubmed Id", "Article Title"]].rename(
        columns={"Pubmed Id": "PMID", "Article Title": "Title"}
    )
    # missing PubMed ids become the sentinel "0"; going through int strips the
    # ".0" that pandas adds when a numeric column contains NaN
    df["PMID"] = df["PMID"].fillna(0).astype(int).astype(str)

    pmcid = []
    for ind in df.index:
        pmid = df.at[ind, "PMID"]
        if pmid == "0":
            # no PubMed id, so there is no PubMed page to look the PMCID up on
            pmcid.append(np.nan)
            continue
        url = "https://pubmed.ncbi.nlm.nih.gov/" + pmid + "/"
        time.sleep(random.randint(5, 20))
        # timeout so a stalled connection cannot hang the notebook
        response = requests.get(url, headers = plib.headers, timeout = 30)
        if response.status_code != 200:
            raise Exception("Error when request webpages!")
        soup = BeautifulSoup(response.content, "lxml")
        pmcid_links = soup.find_all("a", {"data-ga-action": "PMCID"})
        pmcid.append(pmcid_links[0].get_text().strip() if pmcid_links else np.nan)

    df["PMCID"] = pmcid
    # turn the "0" sentinel back into a missing value before writing
    df[["PMID", "PMCID"]] = df[["PMID", "PMCID"]].replace("0", np.nan)
    # reindex (rather than df[columns]) so columns this source lacks are
    # created empty instead of raising KeyError
    df = df.reindex(columns=columns)
    # NOTE(review): to_csv overwrites fpath.poten_litera, whereas merge_pmc and
    # merge_eupmc add rows through plib.add_row_to_csv - confirm that replacing
    # the file is intended here.
    df.to_csv(fpath.poten_litera, header = True, index = None)
# --------------------start of test code--------------------
# plib.clear_file(fpath.poten_litera)
# df = pd.read_csv(fpath.poten_litera_wos_1, sep=';')
# df = df[["DOI", "Pubmed Id", "Article Title"]]
# print(df.head(5))
# print(df["DOI"].isnull().values.any())
# print(df["Pubmed Id"].isnull().values.any())
# print(df["Article Title"].isnull().values.any())
# # the columns Article Title don't contain np.nan
# # the columns DOI, Pubmed Id contain np.nan, we need to fill in what are missing
# # we also need to reenter the full name of the first author
# merge_webofscience(columns)
# ---------------------end of test code---------------------  

In [None]:
def merge_eupmc(columns):
    """Merge the Europe PMC search export into the potential-literature file.

    Reads ``fpath.poten_litera_eupmc`` (``EXTERNAL_ID`` is used as the PMID and
    ``TITLE`` as the title) and, for every record, requests its PubMed page to
    scrape the first author's full name, the PMCID and DOI when the export
    lacks them, and a full-text link.  Each record is then handed to
    ``plib.add_row_to_csv`` together with ``fpath.poten_litera``.  Values that
    cannot be determined are stored as the string "not found".  When the
    PubMed page cannot be processed at all, only the DOI and title from the
    export are kept.

    Parameters
    ----------
    columns : list of str
        Column names forwarded to ``plib.add_row_to_csv``.
    """
    print("Starting merging search results from Europe PMC...")
    # process eupmc search results
    df = pd.read_csv(fpath.poten_litera_eupmc, sep = ",")
    df = df[["DOI", "EXTERNAL_ID", "PMCID", "TITLE"]]
    df = df.rename(columns={"EXTERNAL_ID": "PMID", "TITLE": "Title"}, errors = "raise")
    for ind in df.index:
        pmid = str(df.at[ind, "PMID"])
        url = "https://pubmed.ncbi.nlm.nih.gov/" + pmid + "/"
        if ind % 5 == 0:
            # every fifth record: pause 1-10 s and ask plib for proxies again
            time.sleep(random.randint(1, 10))
            # NOTE(review): merge_pmc unpacks plib.get_proxies() into
            # (proxies, auth); one of the two call sites is probably wrong -
            # confirm against plib.
            proxies = plib.get_proxies()
        try:
            soup = request_wegpage(url, proxies)

            # get full name of first author
            try:
                first_author = soup.find_all("span", {"class": "authors-list-item"})[0].find_all("a", {"class": "full-name"})[0].get_text().strip()
            except Exception:
                first_author = "not found"

            # get PMCID; pd.isna() detects any NaN, whereas "is np.nan" only
            # matches the np.nan singleton and misses NaN from float columns
            if pd.isna(df.at[ind, "PMCID"]):
                try:
                    pmcid = soup.find_all("span", {"class": "identifier pmc"})[0].find_all("a", {"class": "id-link"})[0].get_text().strip()
                except Exception:
                    pmcid = "not found"
            else:
                pmcid = str(df.at[ind, "PMCID"])

            # get DOI
            if pd.isna(df.at[ind, "DOI"]):
                try:
                    doi = soup.find_all("span", {"class": "identifier doi"})[0].find_all("a", {"class": "id-link"})[0].get_text().strip()
                except Exception:
                    doi = "not found"
            else:
                doi = str(df.at[ind, "DOI"])

            # get full_text_url: a known PMCID gives the PMC article page
            # directly, otherwise use the first full-text link on the page
            if pmcid != "not found":
                full_text_url = "https://www.ncbi.nlm.nih.gov/pmc/articles/" + pmcid + "/"
                full_text_source = "PMC"
            else:
                try:
                    link = soup.find_all("div", {"class": "full-text-links-list"})[0].find_all("a", {"class": "link-item dialog-focus"})[0]
                    full_text_url = link["href"].strip()
                    full_text_source = link["data-ga-action"].strip()
                except Exception:
                    full_text_url = "not found"
                    full_text_source = "not found"
            # columns = ["DOI", "PMID", "PMCID", "Title", "First_Author", "full_text_url", "full_text_source"]
        except Exception:
            # The PubMed page could not be fetched or processed: keep only the
            # DOI from the export.  "except Exception" (not a bare except) lets
            # Ctrl-C interrupt the run instead of being recorded as a failed row.
            # NOTE(review): the PMID is discarded here too, presumably because
            # EXTERNAL_ID is not always a PubMed id - confirm.
            if pd.isna(df.at[ind, "DOI"]):
                doi = "not found"
            else:
                doi = str(df.at[ind, "DOI"])
            full_text_url = "not found"
            full_text_source = "not found"
            pmid = "not found"
            pmcid = "not found"
            first_author = "not found"
        row = {
            "DOI": [doi],
            "PMID": [pmid],
            "PMCID": [pmcid],
            "Title": [str(df.at[ind, "Title"])],
            "First_Author": [first_author],
            "full_text_url": [full_text_url],
            "full_text_source": [full_text_source]
        }
        if not plib.add_row_to_csv(fpath.poten_litera, row, columns):
            print("Error detected when adding a row to csv!")
# --------------------start of test code--------------------
# reset the merged-output file, then inspect the raw Europe PMC export before merging
plib.clear_file(fpath.poten_litera)
df = pd.read_csv(fpath.poten_litera_eupmc, sep=',')[["DOI", "EXTERNAL_ID", "PMCID", "TITLE"]]
print(df.head(5))
# one line per column, in column order: does the column contain any missing value?
print(*(df[name].isnull().values.any() for name in df.columns), sep="\n")
# the columns PMID, Title don't contain np.nan
# the columns DOI, PMCID contain np.nan, we need to fill in what are missing
# we also need to reenter the full name of the first author
merge_eupmc(columns)
# ---------------------end of test code---------------------

In [None]:
def merge_google_shcolar(columns):
    """Placeholder for merging the Google Scholar search results.

    Only announces the step on stdout; ``columns`` is accepted but not used.
    Always returns True.
    """
    start_message = "Starting merging search results from Google Scholar..."
    print(start_message)
    return True

In [None]:
def merge_seed_paper_spanning(columns):
    """Placeholder for merging results from spanning citations of the seed paper.

    Only announces the step on stdout; ``columns`` is accepted but not used.
    Always returns True.
    """
    start_message = "Starting merging search results from spanning citations of seed paper..."
    print(start_message)
    return True

In [None]:
def merge_cocomac_paper(columns):
    """Placeholder for merging the CoCoMac paper list.

    Only announces the step on stdout; ``columns`` is accepted but not used.
    Always returns True.
    """
    start_message = "Starting merging search results from CoCoMac papers..."
    print(start_message)
    return True

In [None]:
# make sure at least PMID and PMCID is present as two of the four identifiers, otherwise manually fill in
def fill_in_elements(file_path):
    """Fill in missing DOIs by looking them up on PubMed.

    Reads the CSV at ``file_path``; for every row that has a PMID but no DOI,
    requests ``https://pubmed.ncbi.nlm.nih.gov/<PMID>/`` and copies the DOI
    from the page when one is listed.  A value counts as missing when it is
    NaN or the placeholder string "not found".  Rows whose DOI cannot be
    found are left unchanged.  The result is written to
    ``fpath.poten_litera_csv``.

    Parameters
    ----------
    file_path : str or path-like
        CSV file with at least the columns "PMID" and "DOI".

    Raises
    ------
    Exception
        If a PubMed page answers with a status code other than 200.
    """
    def _is_missing(value):
        # NaN from pandas, or the placeholder the merge functions write
        return pd.isna(value) or str(value).strip() == "not found"

    def _pmid_text(value):
        # a numeric PMID column that contains NaN is read as float
        # (e.g. 29542210.0); drop the ".0" so the URL is valid
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value).strip()

    # PMID -> PMCID
    # done already
    # PMCID -> PMID
    # done already
    # PMID -> DOI
    df = pd.read_csv(file_path, sep = ",")
    # an all-empty DOI column is read as float; make it object so that DOI
    # strings can be stored in it
    df["DOI"] = df["DOI"].astype(object)
    for ind in df.index:
        if _is_missing(df.at[ind, "PMID"]) or not _is_missing(df.at[ind, "DOI"]):
            continue
        url = "https://pubmed.ncbi.nlm.nih.gov/" + _pmid_text(df.at[ind, "PMID"]) + "/"
        print(url)
        # timeout so a stalled connection cannot hang the notebook
        response = requests.get(url, headers = plib.headers, timeout = 30)
        if response.status_code != 200:
            raise Exception("Error when request webpages!")
        soup = BeautifulSoup(response.content, "lxml")
        # both conditions belong in ONE attrs dict; a third positional argument
        # would be interpreted by find_all as "recursive"
        doi_links = soup.find_all("a", {"class": "id-link", "data-ga-action": "DOI"})
        if doi_links:
            df.at[ind, "DOI"] = doi_links[0].get_text().strip()
    # NOTE(review): the output path is fixed to fpath.poten_litera_csv and does
    # not follow file_path - confirm this is intended.
    df.to_csv(fpath.poten_litera_csv, header = True, index = False)
    print("All 3 identifiers: DOI, PMID, and PMCID filled in when possible.")
# --------------------start of test code--------------------
# test code
# ---------------------end of test code---------------------

In [None]:
# remove duplicates based on identifiers in the potential related literature
def remove_dupli(file_path, identifiers=("DOI", "PMID", "PMCID")):
    """Count the records that remain after removing duplicate identifiers.

    Reads the CSV at ``file_path`` and, for each identifier column in turn,
    drops rows whose identifier already occurred in an earlier row.  Rows
    whose identifier is missing (NaN or the placeholder "not found") are never
    treated as duplicates of each other, so records that merely lack a DOI or
    PMCID are not collapsed into one.  The row count is printed before and
    after de-duplication.

    Parameters
    ----------
    file_path : str or path-like
        CSV file containing the identifier columns.
    identifiers : sequence of str, optional
        Columns to de-duplicate on, applied in order.
    """
    df = pd.read_csv(file_path, sep = ",")
    print(len(df))
    for column_name in identifiers:
        values = df[column_name]
        missing = values.isna() | values.astype(str).str.strip().eq("not found")
        # a row is a duplicate only when it carries a real identifier that
        # already appeared in an earlier row
        repeated = values.duplicated(keep="first") & ~missing
        df = df[~repeated]
    print(len(df))
    # NOTE(review): the de-duplicated frame is only counted, not saved; write
    # it back (e.g. df.to_csv(<path>, index=False)) once the output path is
    # decided.
    print("Duplication in the potential related literature removed.")
    # len() returns an int, so it must be formatted rather than concatenated
    print("Found " + str(len(df)) + " potential related literature in total.")
# --------------------start of test code--------------------
# test code
# ---------------------end of test code---------------------

<h3> Main program: </h3> 

In [None]:
# Clear the merged-output file (fpath.poten_litera) before the merge steps below run.
# NOTE(review): presumably this discards rows left over from a previous run -
# confirm against plib.clear_file.
plib.clear_file(fpath.poten_litera)

In [None]:
# merge search results from PubMed Central PMC
# if merge_pmc(columns):
#     print("Merging results from PubMed Central PMC succeeded!")
# else:
#     print("Attention! Something went wrong when merging results from PubMed Central PMC!")

In [None]:
# # merge search results from Web of Science
# if merge_webofscience():
#     print("Merging results from Web of Science succeeded!")
# else:
#     print("Attention! Something went wrong when merging results from Web of Science!")

In [None]:
# # merge search results from Europe PMC
# if merge_eupmc():
#     print("Merging results from Europe PMC succeeded!")
# else:
#     print("Attention! Something went wrong when merging results from Europe PMC!")

In [None]:
# # merge search results from Google Scholar
# if merge_google_shcolar():
#     print("Merging results from Google Scholar succeeded!")
# else:
#     print("Attention! Something went wrong when merging results from Google Scholar!")

In [None]:
# # merge search results from spanning citations of seed paper
# if merge_seed_paper_spanning():
#     print("Merging results from spanning citations of seed papers succeeded!")
# else:
#     print("Attention! Something went wrong when merging results from spanning citations of seed papers!")

In [None]:
# # merge search results from CoCoMac papers
# if merge_cocomac_paper():
#     print("Merging results from CoCoMac papers succeeded!")
# else:
#     print("Attention! Something went wrong when merging results from CoCoMac papers!")

In [None]:
# # fill in all elements in the columns when possible, if not, fill in "not found"
# fill_in_elements(fpath.poten_litera_csv, columns)

In [None]:
# identifier = ["DOI", "PMID", "PMCID"]
# remove_dupli(fpath.poten_litera_csv, identifier)

<h3> Next step: automatic filtering </h3>