<h2> Searched literature data preprocessing </h2> 

In [1]:
# import internal modules
import file_path_management as fpath
import public_library as plib
import extract_info_from_webpage as extra_info
import parameters as params



In [2]:
# import packages
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
import re
import time
import numpy as np
import numpy as np

<h3> Parameters: </h3>

In [3]:
# column layout of the file potential_related_literature.csv
columns = [
    "DOI",
    "PMID",
    "PMCID",
    "full_text_url",
    "pdf_url",
    "Title",
    "Abstract",
    "Keywords",
]

<h3> Predefined functions: </h3> 

In [4]:
def preprocess_pubmed(source_path, output_path, start, end):
    """Convert PubMed search results into the shared literature row layout.

    Reads the PubMed export at ``source_path`` and, for every row index in
    ``range(start, end)``, hands one row with the columns DOI, PMID, PMCID,
    full_text_url, pdf_url, Title, Abstract, Keywords to
    ``plib.add_row_to_csv(output_path, ...)``.  A DOI or PMCID that is missing
    in the export is looked up on the article's PubMed page; if it cannot be
    found there either, it stays NaN.

    Parameters
    ----------
    source_path : path of the PubMed CSV export; it must provide the columns
        "DOI", "PMID", "PMCID" and "Title".
    output_path : CSV file passed to ``plib.add_row_to_csv``.
    start, end : half-open range of row indices to process.  ``end`` is
        clamped to the number of rows so an over-long range no longer ends
        in a KeyError.
    """
    print("Starting preprocessing search results from PubMed...")

    df = pd.read_csv(source_path, sep=',')
    df = df[["DOI", "PMID", "PMCID", "Title"]]

    # the output layout is the same for every row, so build it once
    columns = ["DOI", "PMID", "PMCID", "full_text_url", "pdf_url", "Title", "Abstract", "Keywords"]

    for ind in range(start, min(end, len(df))):
        # the columns PMID, Title don't contain np.nan
        pmid = str(df.at[ind, "PMID"]).strip()
        raw_pmcid = df.at[ind, "PMCID"]
        raw_doi = df.at[ind, "DOI"]

        # The PubMed page is only needed to fill in a missing identifier, so
        # skip the request (and the pause that protects it) otherwise.
        soup = None
        if pd.isna(raw_pmcid) or pd.isna(raw_doi):
            # pause between requests to avoid being blocked
            time.sleep(random.randint(1, 3))
            url = "https://pubmed.ncbi.nlm.nih.gov/" + pmid + "/"
            soup = plib.request_webpage(url)

        # get pmcid
        if pd.isna(raw_pmcid):
            try:
                pmcid = soup.find_all("span", {"class": "identifier pmc"})[0].find_all("a", {"class": "id-link"})[0].get_text().strip()
            except Exception:
                # page unavailable or no PMCID listed; `Exception` (instead of
                # a bare except) lets KeyboardInterrupt/SystemExit through
                pmcid = np.nan
        else:
            pmcid = str(raw_pmcid).strip()

        # get doi
        if pd.isna(raw_doi):
            try:
                doi = soup.find_all("span", {"class": "identifier doi"})[0].find_all("a", {"class": "id-link"})[0].get_text().strip()
            except Exception:
                doi = np.nan
        else:
            doi = str(raw_doi).strip()

        # full-text and PDF links, abstract and keywords are not resolved
        # here; they are written as NaN
        full_text_url = np.nan
        pdf_url = np.nan
        title = (df.at[ind, "Title"]).strip()
        abstract = np.nan
        keywords = np.nan

        row = {
            "DOI": [doi],
            "PMID": [pmid],
            "PMCID": [pmcid],
            "full_text_url": [full_text_url],
            "pdf_url": [pdf_url],
            "Title": [title],
            "Abstract": [abstract],
            "Keywords": [keywords]
        }

        if not plib.add_row_to_csv(output_path, row, columns):
            print("Error detected when adding a row to csv!")

        # progress indicator
        print(ind)
# --------------------start of test code--------------------
# source_path = fpath.poten_litera_pubmed
# output_path = fpath.poten_litera_pubmed_processed
# plib.clear_file(output_path)

# df = pd.read_csv(source_path, sep=',')
# print(df.shape)
# # (2612, 11)
# df = df[["DOI", "PMID", "PMCID", "Title"]]
# print(df.head(3))
# print(df.shape)

# print(df["DOI"].isnull().values.any())
# print(df["PMID"].isnull().values.any())
# print(df["PMCID"].isnull().values.any())
# print(df["Title"].isnull().values.any())
# # True, False, True, False
# # PMID, Title don't contain np.nan
# # DOI, PMCID contain np.nan
# # we need to fill in what are missing
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# preprocess_pubmed(source_path, output_path, start, end)
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# df = pd.read_csv(output_path, header=None, sep=',')
# print(df.head(3))
# ---------------------end of test code---------------------

In [5]:
def preprocess_webofscience(source_path, output_path, start, end):
    """Convert Web of Science search results into the shared literature row layout.

    Reads the Web of Science export at ``source_path`` and, for every row
    index in ``range(start, end)``, hands one row with the columns DOI, PMID,
    PMCID, full_text_url, pdf_url, Title, Abstract, Keywords to
    ``plib.add_row_to_csv(output_path, ...)``.

    Identifier handling per row:
      * PMID missing, DOI present -> PMID via ``plib.doi2pmid``.
      * PMID present, DOI missing -> DOI via ``plib.pmid2doi_pmcid``.
      * PMCID is scraped from the article's PubMed page when a PMID is known.

    Missing abstracts and keyword fields are written as NaN instead of the
    literal text "nan"; the two keyword columns are joined with "; " using
    only the parts that are actually present.

    Parameters
    ----------
    source_path : path of the Web of Science CSV export; it must provide the
        columns "DOI", "Pubmed Id", "Article Title", "Abstract",
        "Author Keywords" and "Keywords Plus".
    output_path : CSV file passed to ``plib.add_row_to_csv``.
    start, end : half-open range of row indices to process.  ``end`` is
        clamped to the number of rows.
    """
    print("Starting preprocessing search results from Web of Science...")

    df = pd.read_csv(source_path, sep=",")
    df = df[["DOI", "Pubmed Id", "Article Title", "Abstract", "Author Keywords", "Keywords Plus"]]

    # the output layout is the same for every row, so build it once
    columns = ["DOI", "PMID", "PMCID", "full_text_url", "pdf_url", "Title", "Abstract", "Keywords"]

    for ind in range(start, min(end, len(df))):
        # pause between requests to avoid being blocked
        time.sleep(random.randint(1, 3))

        # the column Article Title doesn't contain np.nan
        # the columns DOI and Pubmed Id might contain np.nan
        raw_pmid = df.at[ind, "Pubmed Id"]
        raw_doi = df.at[ind, "DOI"]

        # get pmid, doi
        doi = np.nan if pd.isna(raw_doi) else str(raw_doi).strip()
        if pd.isna(raw_pmid):
            # without a DOI there is nothing to look the PMID up with
            pmid = np.nan if pd.isna(raw_doi) else plib.doi2pmid(doi)
        else:
            # the column is read as float because it contains NaN
            pmid = str(int(raw_pmid)).strip()
            if pd.isna(raw_doi):  # DOI is np.nan -> look it up from the PMID
                doi, _ = plib.pmid2doi_pmcid(pmid)

        # get pmcid
        if pd.isna(pmid):
            pmcid = np.nan
        else:
            # request the PubMed page of the article
            url = "https://pubmed.ncbi.nlm.nih.gov/" + str(pmid).strip() + "/"
            soup = plib.request_webpage(url)
            try:
                pmcid = soup.find_all("span", {"class": "identifier pmc"})[0].find_all("a", {"class": "id-link"})[0].get_text().strip()
            except Exception:
                # page unavailable or no PMCID listed; `Exception` (instead of
                # a bare except) lets KeyboardInterrupt/SystemExit through
                pmcid = np.nan

        # full-text and PDF links are not resolved here; written as NaN
        full_text_url = np.nan
        pdf_url = np.nan
        title = str(df.at[ind, "Article Title"]).strip()

        # keep missing values as NaN rather than the string "nan"
        raw_abstract = df.at[ind, "Abstract"]
        abstract = np.nan if pd.isna(raw_abstract) else str(raw_abstract).strip()

        keyword_parts = [
            str(value).strip()
            for value in (df.at[ind, "Author Keywords"], df.at[ind, "Keywords Plus"])
            if not pd.isna(value)
        ]
        keywords = "; ".join(keyword_parts) if keyword_parts else np.nan

        row = {
            "DOI": [doi],
            "PMID": [pmid],
            "PMCID": [pmcid],
            "full_text_url": [full_text_url],
            "pdf_url": [pdf_url],
            "Title": [title],
            "Abstract": [abstract],
            "Keywords": [keywords]
        }

        if not plib.add_row_to_csv(output_path, row, columns):
            print("Error detected when adding a row to csv!")

        # progress indicator
        print(ind)
# --------------------start of test code--------------------
# # source_path = fpath.poten_litera_wos
# # output_path = fpath.poten_litera_wos_processed
# plib.clear_file(output_path)

# df = pd.read_csv(source_path, sep=';')
# df = df[["DOI", "Pubmed Id", "Article Title", "Abstract", "Author Keywords", "Keywords Plus"]]
# print(df.head(3))
# print(df.shape)

# print(df["DOI"].isnull().values.any())
# print(df["Pubmed Id"].isnull().values.any())
# print(df["Article Title"].isnull().values.any())
# print(df["Abstract"].isnull().values.any())
# print(df["Author Keywords"].isnull().values.any())
# print(df["Keywords Plus"].isnull().values.any())
# # True, True, False
# # Article Title doesn't contain np.nan
# # DOI, Pubmed Id contain np.nan
# # we need to fill in what are missing
# ---------------------end of test code--------------------- 

# --------------------start of test code--------------------
# preprocess_webofscience(source_path, output_path, 0, 10)
# ---------------------end of test code--------------------- 

# --------------------start of test code--------------------
# df = pd.read_csv(output_path, header=None, sep=';')
# print(df.head(3))
# ---------------------end of test code---------------------  

In [6]:
def preprocess_eupmc(source_path, output_path, start, end):
    """Convert Europe PMC search results into the shared literature row layout.

    Reads the Europe PMC export at ``source_path`` and, for every row index
    in ``range(start, end)``, hands one row with the columns DOI, PMID, PMCID,
    full_text_url, pdf_url, Title, Abstract, Keywords to
    ``plib.add_row_to_csv(output_path, ...)``.

    Identifier handling per row:
      * SOURCE == "MED" with an EXTERNAL_ID: EXTERNAL_ID is used as the PMID;
        a missing DOI is looked up via ``plib.pmid2doi_pmcid``.
      * otherwise: the PMID is looked up from the DOI via ``plib.doi2pmid``,
        or left NaN when there is no DOI either.
      * PMCID: scraped from the article's PubMed page when a PMID is known;
        the exported PMCID is used when there is no PMID or scraping fails.

    Parameters
    ----------
    source_path : path of the Europe PMC CSV export; it must provide the
        columns "SOURCE", "DOI", "EXTERNAL_ID", "PMCID" and "TITLE".
    output_path : CSV file passed to ``plib.add_row_to_csv``.
    start, end : half-open range of row indices to process.  ``end`` is
        clamped to the number of rows.
    """
    print("Starting preprocessing search results from Europe PMC...")

    df = pd.read_csv(source_path, sep=",")
    df = df[["SOURCE", "DOI", "EXTERNAL_ID", "PMCID", "TITLE"]]

    # the output layout is the same for every row, so build it once
    columns = ["DOI", "PMID", "PMCID", "full_text_url", "pdf_url", "Title", "Abstract", "Keywords"]

    for ind in range(start, min(end, len(df))):
        # pause between requests to avoid being blocked
        time.sleep(random.randint(1, 3))

        raw_doi = df.at[ind, "DOI"]
        raw_external_id = df.at[ind, "EXTERNAL_ID"]

        # get pmid, doi
        # SOURCE = {'PMC', 'MED', 'ETH', 'PPR'}
        doi = np.nan if pd.isna(raw_doi) else str(raw_doi).strip()
        if df.at[ind, "SOURCE"] == "MED" and not pd.isna(raw_external_id):
            # for MED records the EXTERNAL_ID is taken as the PMID
            pmid = str(raw_external_id).strip()
            if pd.isna(raw_doi):
                doi, _ = plib.pmid2doi_pmcid(pmid)
        elif pd.isna(raw_doi):
            # neither a usable EXTERNAL_ID nor a DOI. The original code called
            # the cell value here (`df["DOI"][ind]()`), which raised TypeError.
            pmid = np.nan
        else:
            pmid = plib.doi2pmid(doi)

        # get pmcid
        exported_pmcid = df.at[ind, "PMCID"]
        if pd.isna(pmid):
            pmcid = exported_pmcid
        else:
            # request the PubMed page of the article
            url = "https://pubmed.ncbi.nlm.nih.gov/" + str(pmid).strip() + "/"
            soup = plib.request_webpage(url)
            try:
                pmcid = soup.find_all("span", {"class": "identifier pmc"})[0].find_all("a", {"class": "id-link"})[0].get_text().strip()
            except Exception:
                # page unavailable or no PMCID listed: fall back to the value
                # from the export (which may itself be NaN) instead of
                # discarding it; `Exception` lets KeyboardInterrupt through
                pmcid = exported_pmcid

        # full-text and PDF links, abstract and keywords are not resolved
        # here; they are written as NaN
        full_text_url = np.nan
        pdf_url = np.nan
        title = (df.at[ind, "TITLE"]).strip()
        abstract = np.nan
        keywords = np.nan

        row = {
            "DOI": [doi],
            "PMID": [pmid],
            "PMCID": [pmcid],
            "full_text_url": [full_text_url],
            "pdf_url": [pdf_url],
            "Title": [title],
            "Abstract": [abstract],
            "Keywords": [keywords]
        }

        if not plib.add_row_to_csv(output_path, row, columns):
            print("Error detected when adding a row to csv!")

        # progress indicator
        print(ind)
# --------------------start of test code--------------------
# source_path = fpath.poten_litera_eupmc
# output_path = fpath.poten_litera_eupmc_processed
# # plib.clear_file(output_path)

# df = pd.read_csv(source_path, sep=',')
# df = df[["SOURCE", "DOI", "EXTERNAL_ID", "PMCID", "TITLE"]]
# print(df.head(3))
# print(df.shape)

# col_one_list = set(df['SOURCE'].tolist())
# print(col_one_list)
# # ['PMC', 'MED', 'ETH', 'PPR']

# print(df["SOURCE"].isnull().values.any())
# print(df["DOI"].isnull().values.any())
# print(df["EXTERNAL_ID"].isnull().values.any())
# print(df["PMCID"].isnull().values.any())
# print(df["TITLE"].isnull().values.any())
# # False, True, False, True, False
# # SOURCE, EXTERNAL_ID, Title don't contain np.nan
# # DOI, PMCID contain np.nan
# # we need to fill in what are missing
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# preprocess_eupmc(source_path, output_path, 0, 10)
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# df = pd.read_csv(output_path, header=None, sep=',')
# print(df.head(3))
# ---------------------end of test code---------------------

In [7]:
def _gs_host_of(url):
    """Return the host part of ``url`` (text between "://" and the next "/"),
    or NaN when ``url`` is missing or has no scheme separator."""
    if pd.isna(url):
        return np.nan
    parts = str(url).split("://", 1)
    if len(parts) < 2:
        return np.nan
    return parts[1].split("/")[0]


def preprocess_google_shcolar_step1(source_path, output_path, start, end):
    """Resolve the links of Google Scholar search results (step 1).

    Reads the header-less Google Scholar result file at ``source_path``
    (columns: title, url, url_tag, full_text_url, full_text_tag) and, for
    every row index in ``range(start, end)``, hands one row with the columns
    Title, full_text_url, full_text_source, pdf_url to
    ``plib.add_row_to_csv(output_path, ...)``.  Links are passed through
    ``plib.get_final_redirected_url``.

    Rows tagged "[BOOK][B]" and rows without a url or title are skipped.

    Parameters
    ----------
    source_path : path of the header-less Google Scholar CSV.
    output_path : CSV file passed to ``plib.add_row_to_csv``.
    start, end : half-open range of row indices to process.  ``end`` is
        clamped to the number of rows.

    Raises
    ------
    Exception
        If a row lacks url or title although its full_text_tag is "[PDF]" or
        "[HTML]" (an inconsistency in the scraped data).
    """
    print("Starting merging search results from Google Scholar...")

    df = pd.read_csv(source_path, header=None, sep=',')
    df.columns = ["title", "url", "url_tag", "full_text_url", "full_text_tag"]

    # the output layout is the same for every row, so build it once
    columns = ["Title", "full_text_url", "full_text_source", "pdf_url"]

    for ind in range(start, min(end, len(df))):
        # url_tag: {'[CITATION][C]', '[HTML][HTML]', '[PDF][PDF]', '[BOOK][B]', nan}
        # full_text_tag: {'[PDF]', '[HTML]', nan}
        url_tag = df.at[ind, "url_tag"]
        full_text_tag = df.at[ind, "full_text_tag"]

        # we don't need books, as they are not likely to include connectivity information
        if url_tag == "[BOOK][B]":
            continue

        lacks_url_or_title = pd.isna(df.at[ind, "url"]) or pd.isna(df.at[ind, "title"])
        if lacks_url_or_title and full_text_tag in ("[PDF]", "[HTML]"):
            # one formatted message instead of a tuple of args
            raise Exception(f"{ind}: url or title is nan, but full_text_tag is [PDF] or [HTML]!")

        # url or title doesn't exist (and there is no side link either): skip
        if lacks_url_or_title:
            continue

        # from here on every row has at least title and url
        title = str(df.at[ind, "title"]).strip()
        main_link = str(df.at[ind, "url"]).strip()
        side_link = str(df.at[ind, "full_text_url"]).strip()

        if url_tag == "[PDF][PDF]":
            # the main link is the PDF; the side link may be the HTML full text
            if full_text_tag == "[HTML]":
                full_text_url, _ = plib.get_final_redirected_url(side_link)
                full_text_source = _gs_host_of(full_text_url)
            else:
                full_text_url = np.nan
                full_text_source = np.nan
            pdf_url, _ = plib.get_final_redirected_url(main_link)
        else:  # url_tag in {'[CITATION][C]', '[HTML][HTML]', nan}
            # the main link is the full text; the side link may be the PDF
            full_text_url, _ = plib.get_final_redirected_url(main_link)
            full_text_source = _gs_host_of(full_text_url)
            if full_text_tag == "[PDF]":
                pdf_url, _ = plib.get_final_redirected_url(side_link)
            else:
                pdf_url = np.nan

        row = {
            "Title": [title],
            "full_text_url": [full_text_url],
            "full_text_source": [full_text_source],
            "pdf_url": [pdf_url]
        }

        if not plib.add_row_to_csv(output_path, row, columns):
            print("Error detected when adding a row to csv!")

        # progress indicator
        print(ind)
# --------------------start of test code--------------------
# source_path = fpath.poten_litera_gs
# output_path = fpath.poten_litera_gs_processed_step1
# plib.clear_file(output_path)

# df = pd.read_csv(source_path, header=None, sep=',')
# df.columns = ["title", "url", "url_tag", "full_text_url", "full_text_tag"]
# # print(df.head(3))
# print(df.shape)
# # (980, 5)

# url_type = set(df['url_tag'].tolist())
# print(url_type)
# # {'[CITATION][C]', '[HTML][HTML]', '[PDF][PDF]', '[BOOK][B]', nan}
# full_text_tag = set(df['full_text_tag'].tolist())
# print(full_text_tag)
# # {'[PDF]', '[HTML]', nan}
# # ---------------------end of test code---------------------

# # --------------------start of test code--------------------
# # ["title", "url", "url_tag", "full_text_url", "full_text_tag"]
# print(df["title"].isnull().any().any())
# print(df["url"].isnull().any().any())
# print(df["url_tag"].isnull().any().any())
# print(df["full_text_url"].isnull().any().any())
# print(df["full_text_tag"].isnull().any().any())
# # True, True, True, True, True
# # title, url, url_tag, full_text_url, full_text_tag, all contain np.nan
# # we need to fill in what are missing
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# preprocess_google_shcolar_step1(source_path, output_path, 0, 1000)
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# df = pd.read_csv(output_path, header=None, sep=',')
# print(df.head(3))
# ---------------------end of test code---------------------

In [8]:
def preprocess_google_shcolar_step2(source_path, output_path, start, end):
    """Attach identifiers to the Google Scholar results produced by step 1.

    Reads the header-less step-1 file at ``source_path`` (columns: Title,
    full_text_url, full_text_source, pdf_url).  For each row index in
    ``range(start, end)``:

      * a row with a full_text_url whose host matches none of
        ``params.websites_gs`` is skipped;
      * a row with a matching full_text_url gets doi/pmid/pmcid from
        ``extra_info.extract_info_from_webpage``;
      * a row without full_text_url keeps NaN identifiers.

    Every row that is not skipped is handed to
    ``plib.add_row_to_csv(output_path, ...)`` in the layout DOI, PMID, PMCID,
    full_text_url, pdf_url, Title, Abstract, Keywords.
    """
    print("Starting merging search results from Google Scholar...")

    df = pd.read_csv(source_path, header=None, sep=',')
    df.columns = ["Title", "full_text_url", "full_text_source", "pdf_url"]

    for ind in range(start, end):
        link = df["full_text_url"][ind]

        if link != link:
            # NaN never equals itself: this row has no full_text_url
            url = np.nan
            doi = np.nan
            pmid = np.nan
            pmcid = np.nan
        else:
            url = str(link).strip()
            host = url.split("://")[1].split("/")[0]
            # only pages from the websites listed in params.websites_gs are processed
            if not any(website in host for website in params.websites_gs):
                continue
            info = extra_info.extract_info_from_webpage(url, params.websites_gs)
            doi = info["doi"]
            pmid = info["pmid"]
            pmcid = info["pmcid"]

        # assemble the output record; abstract and keywords are written as NaN
        columns = ["DOI", "PMID", "PMCID", "full_text_url", "pdf_url", "Title", "Abstract", "Keywords"]
        row = {
            "DOI": [doi],
            "PMID": [pmid],
            "PMCID": [pmcid],
            "full_text_url": [url],
            "pdf_url": [df.at[ind, "pdf_url"]],
            "Title": [df.at[ind, "Title"]],
            "Abstract": [np.nan],
            "Keywords": [np.nan]
        }

        written = plib.add_row_to_csv(output_path, row, columns)
        if not written:
            print("Error detected when adding a row to csv!")

        # report the doi; when none was found also show the link that was tried
        print(doi)
        if doi != doi:
            print([df["full_text_url"][ind]])
        print(ind)
# --------------------start of test code--------------------
# source_path = fpath.poten_litera_gs_processed_step1
# output_path = fpath.poten_litera_gs_processed_step2
# plib.clear_file(output_path)
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# df = pd.read_csv(source_path, header=None, sep=',')
# df.columns = ["Title", "full_text_url", "full_text_source", "pdf_url"]
# # print(df.head(3))
# print(df.shape)
# # (926, 4)
# full_text_source = set(df['full_text_source'].tolist())
# print(full_text_source)
# # {'www.elibrary.ru', 'n.neurology.org', 'jnnp.bmj.com', 'anatomypubs.onlinelibrary.wiley.com', 'academic.oup.com', 
# #  'nyaspubs.onlinelibrary.wiley.com', 'cir.nii.ac.jp', 'link.springer.com', 'www.mdpi.com', 'pure.mpg.de', 
# #  'bmcneurosci.biomedcentral.com', 'elibrary.ru', 'journals.sagepub.com', 'tbiomed.biomedcentral.com', 
# #  'onlinelibrary.wiley.com', 'www.cambridge.org', 'wakespace.lib.wfu.edu', nan, 'www.cell.com', 'europepmc.org', 
# #  'var.scholarpedia.org', 'jpet.aspetjournals.org', 'journal.psych.ac.cn', 'www.biorxiv.org', 'ieeexplore.ieee.org', 
# #  'www.jstor.org', 'www.cabdirect.org', 'royalsocietypublishing.org', 'analyticalsciencejournals.onlinelibrary.wiley.com', 
# #  'open.bu.edu', 'journals.lww.com', 'www.eneuro.org', 'www.jstage.jst.go.jp', 'journals.plos.org', 'www.ncbi.nlm.nih.gov', 
# #  'www.liebertpub.com', 'neuro.psychiatryonline.org', 'www.sciencedirect.com', 'psycnet.apa.org', 'www.taylorfrancis.com', 
# #  'www.degruyter.com', 'www.nature.com', 'jamanetwork.com', 'karger.com', 'www.tandfonline.com', 'journals.physiology.org', 
# #  'movementdisorders.onlinelibrary.wiley.com', 'www.pnas.org', 'www.jneurosci.org', 'thejns.org', 'pascal-francis.inist.fr', 
# #  'physoc.onlinelibrary.wiley.com', 'agro.icm.edu.pl', 'elifesciences.org', 'www.frontiersin.org', 'escholarship.mcgill.ca', 
# #  'ajp.psychiatryonline.org', 'www.science.org', 'books.google.de'}

# # {'elibrary.ru', 'neurology.org', 'bmj.com', 'wiley.com', 'oup.com', 'cir.nii.ac.jp', 'springer.com', 'mdpi.com', 'mpg.de', 
# #  'biomedcentral.com', 'sagepub.com', 'cambridge.org', 'wfu.edu', nan, 'cell.com', 'europepmc.org', 'scholarpedia.org', 
# #  'aspetjournals.org', 'psych.ac.cn', 'biorxiv.org', 'ieee.org', 'jstor.org', 'cabdirect.org', 'royalsocietypublishing.org', 
# #  'bu.edu', 'lww.com', 'eneuro.org', 'jst.go.jp', 'plos.org', 'ncbi.nlm.nih.gov', 'liebertpub.com', 'psychiatryonline.org', 
# #  'sciencedirect.com', 'psycnet.apa.org', 'taylorfrancis.com', 'degruyter.com', 'nature.com', 'jamanetwork.com', 
# #  'karger.com', 'www.tandfonline.com', 'physiology.org', 'www.pnas.org', 'jneurosci.org', 'thejns.org', 
# #  'pascal-francis.inist.fr', 'agro.icm.edu.pl', 'elifesciences.org', 'frontiersin.org', 'mcgill.ca', 
# #  'science.org', 'books.google.de'}

# # websites_gs = {
# #     'neurology.org', 'bmj.com', 'wiley.com', 'oup.com', 'springer.com', 'mdpi.com', 
# #     'biomedcentral.com', 'sagepub.com', 'cambridge.org', 'wfu.edu', 'cell.com', 'europepmc.org', 
# #     'aspetjournals.org', 'psych.ac.cn', 'biorxiv.org', 'ieee.org', 'jstor.org', 'royalsocietypublishing.org', 
# #     'bu.edu', 'lww.com', 'eneuro.org', 'jst.go.jp', 'plos.org', 'ncbi.nlm.nih.gov', 'liebertpub.com', 
# #     'psychiatryonline.org', 'sciencedirect.com', 'psycnet.apa.org', 'degruyter.com', 'nature.com', 'jamanetwork.com', 
# #     'karger.com', 'tandfonline.com', 'physiology.org', 'pnas.org', 'jneurosci.org', 'thejns.org', 
# #     'agro.icm.edu.pl', 'elifesciences.org', 'frontiersin.org', 'science.org'}
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# # ["Title", "full_text_url", "full_text_source", "pdf_url"]
# print(df["Title"].isnull().any().any())
# print(df["full_text_url"].isnull().any().any())
# print(df["full_text_source"].isnull().any().any())
# print(df["pdf_url"].isnull().any().any())
# # False, True, True, True
# # full_text_url, full_text_source, pdf_url contain np.nan
# # we need to fill in what are missing
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# preprocess_google_shcolar_step2(source_path, output_path, 0, 905)
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# df = pd.read_csv(output_path, header=None, sep=',')
# print(df.head(3))
# ---------------------end of test code---------------------

In [9]:
def preprocess_seed_paper_spanning(source_path, output_path):
    """Placeholder for preprocessing the spanning citations of the seed paper.

    Only announces the step and reports success; ``source_path`` and
    ``output_path`` are accepted but not used yet.
    """
    announcement = "Starting preprocessing search results from spanning citations of seed paper..."
    print(announcement)
    return True
# --------------------start of test code--------------------
# test code
# ---------------------end of test code---------------------

In [10]:
def preprocess_cocomac_paper(source_path, output_path):
    """Placeholder for preprocessing the CoCoMac papers.

    Only announces the step and reports success; ``source_path`` and
    ``output_path`` are accepted but not used yet.
    """
    announcement = "Starting preprocessing search results from CoCoMac papers..."
    print(announcement)
    return True
# --------------------start of test code--------------------
# test code
# ---------------------end of test code---------------------

In [11]:
def combine(input, output_path):
    """Concatenate the processed search-result files into one CSV.

    Parameters
    ----------
    input : iterable of paths to header-less CSV files that all use the
        layout DOI, PMID, PMCID, full_text_url, pdf_url, Title, Abstract,
        Keywords.  (The name shadows the builtin ``input``; it is kept so
        existing keyword callers keep working.)
    output_path : destination CSV; written without header and without index.

    Notes
    -----
    * Every file is read as text (``dtype=str``).  With type inference an
      integer PMID column is promoted to float as soon as another file has a
      missing PMID, and the identifier is then written as e.g. "123.0".
    * The files are concatenated in a single ``pd.concat`` call rather than
      being appended one by one to an initially empty frame.
    * An empty ``input`` produces an output without rows instead of raising.
    """
    columns = ["DOI", "PMID", "PMCID", "full_text_url", "pdf_url", "Title", "Abstract", "Keywords"]

    frames = [
        pd.read_csv(search_result, header=None, sep=",", dtype=str)
        for search_result in input
    ]

    if frames:
        # ignore_index=True already yields a fresh 0..n-1 index
        df = pd.concat(frames, ignore_index=True, sort=False)
        df.columns = columns
    else:
        df = pd.DataFrame(columns=columns)

    df.to_csv(output_path, header=False, index=False)
# --------------------start of test code--------------------
# gos = fpath.poten_litera_gs_processed_step2
# wos = fpath.poten_litera_wos_processed
# pubmed = fpath.poten_litera_pubmed_processed
# eupmc = fpath.poten_litera_eupmc_processed
# input = [gos, wos, pubmed, eupmc]
# output_path = fpath.poten_litera_combined
# # plib.clear_file(output_path)
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# combine(input, output_path)
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# df = pd.read_csv(output_path, header=None, sep=',')
# print(df.head(3))
# print(df.shape)
# # (14627, 8)
# ---------------------end of test code---------------------

In [12]:
def _normalize_pmid(value):
    """Return a PMID as a plain digit string, e.g. 12345.0 -> "12345".

    Values that cannot be read as a number are returned as stripped text.
    """
    text = str(value).strip()
    try:
        return str(int(float(text)))
    except (ValueError, OverflowError):
        return text


def _append_identifier_row(output_path, columns, values):
    """Append one record (values given in column order) via plib.add_row_to_csv."""
    row = {name: [value] for name, value in zip(columns, values)}
    if not plib.add_row_to_csv(output_path, row, columns):
        print("Error detected when adding a row to csv!")


def fill_in_identifiers(input_path, output_path, start, end):
    """Complete the DOI / PMID / PMCID of each record and append it to a CSV.

    The header-less table at ``input_path`` is read with the columns
    DOI, PMID, PMCID, full_text_url, pdf_url, Title, Abstract, Keywords.
    For every row index in ``range(start, end)``:

    * rows with no identifier and no URL at all are skipped (nothing written);
    * rows with no identifier but at least one URL are written unchanged;
    * otherwise the missing identifiers are looked up through ``plib``
      (DOI -> PMID, PMID -> DOI/PMCID, PMCID -> DOI/PMID) and the completed
      row is written.

    The URL, title, abstract and keywords columns are copied through as read.
    The index of every written row is printed as a progress indicator.

    Args:
        input_path: path of the combined, header-less CSV file.
        output_path: path of the CSV file the rows are appended to through
            ``plib.add_row_to_csv``.
        start: first row index to process (inclusive).
        end: row index to stop at (exclusive); values beyond the table
            length are clamped to the table length.
    """
    columns = ["DOI", "PMID", "PMCID", "full_text_url", "pdf_url", "Title", "Abstract", "Keywords"]
    df = pd.read_csv(input_path, header=None, sep=",")
    df.columns = columns

    # Clamp so that an over-estimated `end` does not end in a KeyError.
    for ind in range(start, min(end, len(df))):
        raw_doi = df.at[ind, "DOI"]
        raw_pmid = df.at[ind, "PMID"]
        raw_pmcid = df.at[ind, "PMCID"]
        full_text_url = df.at[ind, "full_text_url"]
        pdf_url = df.at[ind, "pdf_url"]
        title = df.at[ind, "Title"]
        abstract = df.at[ind, "Abstract"]
        keywords = df.at[ind, "Keywords"]

        has_doi = not pd.isna(raw_doi)
        has_pmid = not pd.isna(raw_pmid)
        has_pmcid = not pd.isna(raw_pmcid)

        if not (has_doi or has_pmid or has_pmcid):
            # nothing identifies this record at all: drop it
            if pd.isna(full_text_url) and pd.isna(pdf_url):
                continue
            # only a URL is known: nothing to look up, keep the row as it is
            _append_identifier_row(
                output_path, columns,
                [np.nan, np.nan, np.nan, full_text_url, pdf_url, title, abstract, keywords],
            )
            print(ind)
            continue

        # initialize; at least one of the 3 identifiers is available from here on
        doi = np.nan
        pmid = np.nan
        pmcid = np.nan
        pmcid_looked_up = False

        if has_doi:  # DOI -> PMID
            doi = str(raw_doi).strip().lower()
            if has_pmid:
                # the PMID column is read as float when it contains gaps, so
                # str() alone would yield "12345.0"
                pmid = _normalize_pmid(raw_pmid)
            else:
                pmid = plib.doi2pmid(doi)
                if pd.isna(pmid):
                    # fall back to a title search, accepted only when the
                    # candidate's DOI matches the DOI of this record
                    pmid_candidate = plib.title2pmid(title)
                    if not pd.isna(pmid_candidate):
                        doi_validate, _ = plib.pmid2doi_pmcid(pmid_candidate)
                        if not pd.isna(doi_validate) and str(doi_validate).strip().lower() == doi:
                            pmid = pmid_candidate
        elif has_pmid:  # PMID -> DOI (and PMCID)
            pmid = _normalize_pmid(raw_pmid)
            doi, pmcid = plib.pmid2doi_pmcid(pmid)
            pmcid_looked_up = True
        else:  # PMCID -> DOI, PMID
            pmcid = str(raw_pmcid).strip()
            try:
                doi, pmid = plib.pmcid2doi_pmid(pmcid)
            except Exception:
                # best effort: keep the record even when the lookup fails
                doi = np.nan
                pmid = np.nan

        # keep DOIs in one spelling (lower case), also when they come from a lookup
        if isinstance(doi, str):
            doi = doi.strip().lower()

        # pmcid: the value from the file wins; otherwise derive it from the PMID
        if has_pmcid:
            pmcid = str(raw_pmcid).strip()
        elif not pmcid_looked_up:
            # skipped when the PMID -> DOI step above already asked for the PMCID
            if not pd.isna(pmid):
                _, pmcid = plib.pmid2doi_pmcid(pmid)
            else:
                pmcid = np.nan

        _append_identifier_row(
            output_path, columns,
            [doi, pmid, pmcid, full_text_url, pdf_url, title, abstract, keywords],
        )
        print(ind)
# --------------------start of test code--------------------
# input_path = fpath.poten_litera_combined
# # output_path = fpath.poten_litera_filled
# # plib.clear_file(output_path)
# df = pd.read_csv(input_path, header=None, sep = ",")
# print(df.shape)
# df.columns = ["DOI", "PMID", "PMCID", "full_text_url", "pdf_url", "Title", "Abstract", "Keywords"]
# print(df["DOI"].isnull().any().any())
# print(df["PMID"].isnull().any().any())
# print(df["PMCID"].isnull().any().any())
# print(df["full_text_url"].isnull().any().any())
# print(df["pdf_url"].isnull().any().any())
# print(df["Title"].isnull().any().any())
# print(df["Abstract"].isnull().any().any())
# print(df["Keywords"].isnull().any().any())
# True, True, True, True, True, False, True, True
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# fill_in_identifiers(input_path, output_path, 0, 14690)
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# df = pd.read_csv(output_path, header=None, sep=',')
# print(df.head(3))
# ---------------------end of test code---------------------

In [13]:
def merge_remove_dupli(input_path, output_path, identifiers):
    """Drop records that repeat an identifier already seen earlier in the table.

    The header-less CSV at ``input_path`` is read with the columns
    DOI, PMID, PMCID, full_text_url, pdf_url, Title, Abstract, Keywords.
    The identifier columns are handled one after another; within each, only
    the first row carrying a given value is kept, while rows whose value is
    missing are never treated as duplicates. The surviving rows are written
    to ``output_path`` without header and without index.

    Args:
        input_path: path of the header-less input CSV.
        output_path: path of the de-duplicated CSV to write.
        identifiers: iterable of column names to de-duplicate by, applied
            in the given order.
    """
    column_names = ["DOI", "PMID", "PMCID", "full_text_url", "pdf_url", "Title", "Abstract", "Keywords"]
    records = pd.read_csv(input_path, header=None, sep=",")
    records.columns = column_names

    for id_column in identifiers:
        lacks_identifier = records[id_column].isna()
        # Missing values are flagged as repeats of each other here, but those
        # rows are kept anyway through `lacks_identifier`.
        seen_before = records.duplicated(subset=id_column, keep="first")
        records = records[lacks_identifier | ~seen_before]

    # renumber the surviving rows before writing
    records = records.reset_index(drop=True)
    records.to_csv(output_path, header=False, index=False)
    print("Duplication in the potential related literature removed.")
# --------------------start of test code--------------------
# source_path = fpath.poten_litera_ids_filled
# output_path = fpath.poten_litra_filtered
# plib.clear_file(output_path)
# ---------------------end of test code---------------------

# --------------------start of test code--------------------
# merge all search results
# identifiers = ["DOI", "PMID", "PMCID"]
# merge_remove_dupli(source_path, output_path, identifiers)
# ---------------------end of test code---------------------

In [14]:
def _pubmed_article_url(pmid):
    """Build the PubMed article page URL for a numeric PMID (123.0 -> .../123/)."""
    return "https://pubmed.ncbi.nlm.nih.gov/" + str(int(pmid)).strip() + "/"


def _pubmed_full_text_link(page_url, link_class):
    """Return the href of the `<a class=link_class>` found in the page's "full-text-links-list" div.

    Raises whatever the page request or the lookup raises when the page or
    the link is not available.
    """
    soup = plib.request_webpage(page_url)
    link_box = soup.find("div", {"class": "full-text-links-list"})
    return link_box.find("a", {"class": link_class})["href"]


def _resolve_redirect_or_raise(url):
    """Follow `url` through plib.get_final_redirected_url, wrapping any failure.

    The wrapped exception keeps the (status, message, url) argument layout;
    the status is None because the lookup itself failed.
    """
    try:
        return plib.get_final_redirected_url(url)
    except Exception as err:
        raise Exception(None, "Error when trying to get final redirected url from", url) from err


def _url_host(url):
    """Return the text between "://" and the next "/" of `url`, or NaN if there is no "://"."""
    parts = str(url).split("://")
    if len(parts) < 2:
        return np.nan
    return parts[1].split("/")[0]


def full_text_url_filling(input_path, output_path, start, end):
    """Resolve a full-text URL (and the URL hosts) for each record and append it to a CSV.

    The header-less table at ``input_path`` is read with the columns
    DOI, PMID, PMCID, full_text_url, pdf_url, Title, Abstract, Keywords.
    For every row index in ``range(start, end)`` the full-text URL is
    resolved by trying, in this order, until one yields a URL:

    1. the PMC article page built from the PMCID (on HTTP status 403, the
       full-text link on the PubMed page is used instead);
    2. the full-text links on the PubMed page of the PMID (best effort);
    3. the ``https://doi.org/`` redirect of the DOI;
    4. the ``full_text_url`` already stored in the table;
    5. the PubMed page of the PMID itself.

    Rows that end up with neither a full-text URL nor a PDF URL are skipped.
    For all other rows the final PDF URL and the hosts of both URLs are
    determined, and a row with the columns DOI, PMID, PMCID, full_text_url,
    full_text_source, pdf_url, pdf_source, Title, Abstract, Keywords is
    appended through ``plib.add_row_to_csv``. The index of every written row
    is printed as a progress indicator.

    Args:
        input_path: path of the header-less input CSV.
        output_path: path of the CSV file the rows are appended to.
        start: first row index to process (inclusive).
        end: row index to stop at (exclusive); values beyond the table
            length are clamped to the table length.

    Raises:
        Exception: when following a redirect fails in steps 1, 3, 4 or 5;
            the arguments are (status code or None, message, url) and the
            original error is chained as ``__cause__``.
    """
    input_columns = ["DOI", "PMID", "PMCID", "full_text_url", "pdf_url", "Title", "Abstract", "Keywords"]
    output_columns = ["DOI", "PMID", "PMCID", "full_text_url", "full_text_source", "pdf_url", "pdf_source", "Title", "Abstract", "Keywords"]

    df = pd.read_csv(input_path, header=None, sep=",")
    df.columns = input_columns

    # Clamp so that an over-estimated `end` does not end in a KeyError.
    for ind in range(start, min(end, len(df))):
        doi = df.at[ind, "DOI"]
        pmid = df.at[ind, "PMID"]
        pmcid = df.at[ind, "PMCID"]
        stored_full_text_url = df.at[ind, "full_text_url"]
        pdf_url = df.at[ind, "pdf_url"]

        full_text_url = np.nan
        # reset per row so an error report never shows the status of an earlier request
        status_code = None

        # 1. PMC article page
        if not pd.isna(pmcid):
            url = "https://www.ncbi.nlm.nih.gov/pmc/articles/" + str(pmcid).strip() + "/"
            try:
                full_text_url, status_code = plib.get_final_redirected_url(url)
                if status_code == 403:
                    if not pd.isna(pmid):
                        link = _pubmed_full_text_link(_pubmed_article_url(pmid), "link-item dialog-focus")
                        full_text_url, status_code = plib.get_final_redirected_url(link)
                    else:
                        print("full_text_url is not found for pmid and pmcid", pmid, pmcid)
                        full_text_url = np.nan
            except Exception as err:
                raise Exception(status_code, "Error when trying to get final redirected url from", url) from err

        # 2. full-text links on the PubMed page (best effort: failures fall through)
        if pd.isna(full_text_url) and not pd.isna(pmid):
            try:
                page_url = _pubmed_article_url(pmid)
                try:
                    link = _pubmed_full_text_link(page_url, "link-item pmc")
                    full_text_url, status_code = plib.get_final_redirected_url(link)
                except Exception:
                    link = _pubmed_full_text_link(page_url, "link-item dialog-focus")
                    full_text_url, status_code = plib.get_final_redirected_url(link)
            except Exception:
                full_text_url = np.nan

        # 3. DOI redirect
        if pd.isna(full_text_url) and not pd.isna(doi):
            full_text_url, status_code = _resolve_redirect_or_raise("https://doi.org/" + str(doi).strip().lower())

        # 4. URL already stored in the table
        if pd.isna(full_text_url) and not pd.isna(stored_full_text_url):
            full_text_url, status_code = _resolve_redirect_or_raise(stored_full_text_url)

        # 5. the PubMed page itself
        if pd.isna(full_text_url) and not pd.isna(pmid):
            full_text_url, status_code = _resolve_redirect_or_raise(_pubmed_article_url(pmid))

        if pd.isna(full_text_url):
            # report the record for manual inspection
            print(doi)
            print(pmid)
            print(pmcid)
            print(stored_full_text_url)
            print(pdf_url)
            if pd.isna(pdf_url):
                # neither full text nor PDF is known: drop the record
                continue

        # get full text source
        full_text_source = np.nan if pd.isna(full_text_url) else _url_host(full_text_url)

        # get pdf url, pdf source
        if not pd.isna(pdf_url):
            try:
                pdf_url, status_code = plib.get_final_redirected_url(pdf_url)
            except Exception:
                print("error when getting final redirected url from: ", pdf_url)
                pdf_url = np.nan
        pdf_source = np.nan if pd.isna(pdf_url) else _url_host(pdf_url)

        if not pd.isna(doi):
            doi = str(doi).lower()

        values = [
            doi, pmid, pmcid,
            full_text_url, full_text_source,
            pdf_url, pdf_source,
            df.at[ind, "Title"], df.at[ind, "Abstract"], df.at[ind, "Keywords"],
        ]
        row = {name: [value] for name, value in zip(output_columns, values)}

        if not plib.add_row_to_csv(output_path, row, output_columns):
            print("Error detected when adding a row to csv!")

        print(ind)

<h3> Main program: </h3> 

In [15]:
# # preprocess search results from PubMed

# source_path = fpath.poten_litera_pubmed
# output_path = fpath.poten_litera_pubmed_processed

# # clear the file
# # plib.clear_file(output_path)

# # preprocess search results from PubMed
# # 2612 results
# preprocess_pubmed(source_path, output_path, 0, 2612)
# print("preprocessing results from PubMed succeeded!")
# # print("Attention! Something went wrong when preprocessing results from PubMed!")

In [16]:
# combine 2 files of search results from web of science
# # clear the file
# plib.clear_file(fpath.poten_litera_wos)

# # combine the 2 files of search results from web of science
# source_path_1 = fpath.poten_litera_wos_1
# source_path_2 = fpath.poten_litera_wos_2
# df_1 = pd.read_csv(source_path_1, sep=',')
# df_2 = pd.read_csv(source_path_2, sep=',')
# df_1.to_csv(fpath.poten_litera_wos, header=True, index=False, sep=",")
# df_2.to_csv(fpath.poten_litera_wos, mode="a", header=False, index=False, sep=",")
# # --------------------start of test code--------------------
# df = pd.read_csv(fpath.poten_litera_wos, sep=',')
# print(df.head(3))
# print(df.shape)
# # (1993, 72)
# # ---------------------end of test code---------------------

In [17]:
# # preprocess search results from Web of Science

# source_path = fpath.poten_litera_wos
# output_path = fpath.poten_litera_wos_processed

# # clear the file
# # plib.clear_file(output_path)

# # preprocess search results from Web of Science
# # 1993 results
# preprocess_webofscience(source_path, output_path, 0, 1993)
# print("preprocessing results from Web of Science succeeded!")
# # print("Attention! Something went wrong when preprocessing results from Web of Science!")

In [18]:
# # preprocess search results from Europe PMC

# source_path = fpath.poten_litera_eupmc
# output_path = fpath.poten_litera_eupmc_processed

# # clear the file
# # plib.clear_file(output_path)

# # preprocess search results from Europe PMC
# preprocess_eupmc(source_path, output_path, 2980, 9178)
# # 9178 results
# print("preprocessing results from Europe PMC succeeded!")
# # print("Attention! Something went wrong when preprocessing results from Europe PMC!")

# # 2980

In [19]:
# # preprocess search results from Google Scholar step 1

# source_path = fpath.poten_litera_gs
# # 980 results
# output_path = fpath.poten_litera_gs_processed_step1

# # clear the file
# # plib.clear_file(output_path)

# # preprocess search results from Google Scholar
# preprocess_google_shcolar_step1(source_path, output_path, 0, 980)
# # 926 results
# print("step 1 of preprocessing results from Google Scholar succeeded!")
# # print("Attention! Something went wrong when preprocessing results from Google Scholar step 1!")

In [20]:
# # reset index for poten_litera_gs_processed_step1
# input_path = fpath.poten_litera_gs_processed_step1
# output_path = fpath.poten_litera_gs_processed_step1
# df = pd.read_csv(input_path, header=None, sep = ",")
# df.reset_index(drop=True, inplace=True)
# df.to_csv(output_path, header=False, index=False)

# input_path = fpath.poten_litera_gs_processed_step1
# df = pd.read_csv(input_path, header=None, sep = ",")
# print(df.shape)
# # (926, 4)

In [21]:
# # preprocess search results from Google Scholar step 2

# source_path = fpath.poten_litera_gs_processed_step1
# # (926, 4)
# output_path = fpath.poten_litera_gs_processed_step2


# # clear the file
# # plib.clear_file(output_path)

# # preprocess search results from Google Scholar
# preprocess_google_shcolar_step2(source_path, output_path, 0, 926)
# print("step 2 of preprocessing results from Google Scholar succeeded!")
# # print("Attention! Something went wrong when preprocessing results from Google Scholar step 2!")

In [22]:
# # reset index for poten_litera_gs_processed_step2
# input_path = fpath.poten_litera_gs_processed_step2
# output_path = fpath.poten_litera_gs_processed_step2
# df = pd.read_csv(input_path, header=None, sep = ",")
# df.reset_index(drop=True, inplace=True)
# df.to_csv(output_path, header=False, index=False)

# input_path = fpath.poten_litera_gs_processed_step2
# df = pd.read_csv(input_path, header=None, sep = ",")
# print(df.shape)
# # (926, 4)

In [23]:
# # preprocess search results from spanning citations of seed paper

# preprocess_seed_paper_spanning(source_path, output_path, columns):
# print("preprocessing results from spanning citations of seed papers succeeded!")
# # print("Attention! Something went wrong when preprocessing results from spanning citations of seed papers!")

In [24]:
# # preprocess search results from CoCoMac papers

# preprocess_cocomac_paper(source_path, output_path, columns)
# print("preprocessing results from CoCoMac papers succeeded!")
# # print("Attention! Something went wrong when preprocessing results from CoCoMac papers!")

In [25]:
# # take a look at all the preprocessed search results
# gos = fpath.poten_litera_gs_processed_step2
# wos = fpath.poten_litera_wos_processed
# pubmed = fpath.poten_litera_pubmed_processed
# eupmc = fpath.poten_litera_eupmc_processed

# df_gs = pd.read_csv(gos, header=None, sep=',')
# print(df_gs.shape)
# # (907, 6)
# df_wos = pd.read_csv(wos, header=None, sep=',')
# print(df_wos.shape)
# # (1993, 8)
# df_pubmed = pd.read_csv(pubmed, header=None, sep=',')
# print(df_pubmed.shape)
# # (2612, 8)
# df_eupmc = pd.read_csv(eupmc, header=None, sep=',')
# print(df_eupmc.shape)
# # (9178, 8)

In [26]:
# # combine all search results

# gos = fpath.poten_litera_gs_processed_step2
# wos = fpath.poten_litera_wos_processed
# pubmed = fpath.poten_litera_pubmed_processed
# eupmc = fpath.poten_litera_eupmc_processed
# input = [gos, wos, pubmed, eupmc]
# output_path = fpath.poten_litera_combined

# # clear the file
# plib.clear_file(output_path)

# combine(input, output_path)
# # (14627, 8)
# print("Combining all search results succeeded!")
# # print("Attention! Something went wrong when combining all search results!")

# df_combined = pd.read_csv(output_path, header=None, sep=',')
# print(df_combined.shape)
# # (14690, 8)

In [27]:
# # fill in missing identifiers
# input_path = fpath.poten_litera_combined
# output_path = fpath.poten_litera_ids_filled

# # clear file
# # plib.clear_file(output_path)

# fill_in_identifiers(input_path, output_path, 0, 14690)
# print("Filling in missing elements succeeded!")

In [28]:
# # Manually check the missing elements
# source_path = fpath.poten_litera_ids_filled
# df = pd.read_csv(source_path, header=None, sep=',')
# df.columns = ["DOI", "PMID", "PMCID", "full_text_url", "pdf_url", "Title", "Abstract", "Keywords"]
# # for ind in df.index:
# #     if df.at[ind, "DOI"] != df.at[ind, "DOI"] and df.at[ind, "PMID"] != df.at[ind, "PMID"] and df.at[ind, "PMCID"] != df.at[ind, "PMCID"] and df.at[ind, "full_text_url"] != df.at[ind, "full_text_url"] and df.at[ind, "pdf_url"] != df.at[ind, "pdf_url"]:
# #         print(ind)
# #         print(df.at[ind, "Title"])
# #         print(df.at[ind, "full_text_url"])
# #         print(df.at[ind, "pdf_url"])

# # for ind in df.index:
# #     if df.at[ind, "DOI"] != df.at[ind, "DOI"] and df.at[ind, "PMID"] != df.at[ind, "PMID"] and df.at[ind, "PMCID"] != df.at[ind, "PMCID"] and df.at[ind, "full_text_url"] != df.at[ind, "full_text_url"]:
# #         print(ind)
# #         print(df.at[ind, "Title"])
# #         print(df.at[ind, "full_text_url"])
# #         print(df.at[ind, "pdf_url"])

In [29]:
# # merge all search results and remove duplication by identifiers

# source_path = fpath.poten_litera_ids_filled
# output_path = fpath.poten_litra_filtered

# # clear the file
# plib.clear_file(output_path)

# # merge all search results
# identifiers = ["DOI", "PMID", "PMCID"]
# merge_remove_dupli(source_path, output_path, identifiers)

# --------------------start of test code--------------------
# source_path = fpath.poten_litra_filtered
# df = pd.read_csv(source_path, header=None, sep=',')
# print(df.shape)
# # (10982, 8)
# ---------------------end of test code---------------------

In [30]:
# # Manually check the missing elements
# source_path = fpath.poten_litra_filtered
# df = pd.read_csv(source_path, header=None, sep=',')
# df.columns = ["DOI", "PMID", "PMCID", "full_text_url", "pdf_url", "Title", "Abstract", "Keywords"]

# # for ind in df.index:
# #     if df.at[ind, "DOI"] != df.at[ind, "DOI"] and df.at[ind, "PMID"] != df.at[ind, "PMID"] and df.at[ind, "PMCID"] != df.at[ind, "PMCID"] and df.at[ind, "full_text_url"] != df.at[ind, "full_text_url"] and df.at[ind, "pdf_url"] != df.at[ind, "pdf_url"]:
# #         print(ind)
# #         print(df.at[ind, "Title"])
# #         print(df.at[ind, "full_text_url"])
# #         print(df.at[ind, "pdf_url"])

# for ind in df.index:
#     if df.at[ind, "DOI"] != df.at[ind, "DOI"] and df.at[ind, "PMID"] != df.at[ind, "PMID"] and df.at[ind, "PMCID"] != df.at[ind, "PMCID"] and df.at[ind, "full_text_url"] != df.at[ind, "full_text_url"]:
#         print(ind)
#         print(df.at[ind, "Title"])
#         print(df.at[ind, "full_text_url"])
#         print(df.at[ind, "pdf_url"])

In [31]:
# Fill in full_text_url (plus the full-text / PDF source hosts) for the
# records of the de-duplicated literature table.

source_path = fpath.poten_litra_filtered
output_path = fpath.poten_litera_ids_ftl_filled

# Clearing the output file is left disabled here.
# NOTE(review): presumably it is only enabled when starting a run from scratch — confirm.
# plib.clear_file(output_path)

# Process rows 0..10981; 10982 matches the row count recorded for the
# filtered file in the de-duplication check earlier in this notebook.
full_text_url_filling(source_path, output_path, 0, 10982)
# 1344  (NOTE(review): unexplained marker, possibly the row index an earlier run reached; confirm)

0
1
2
3
4
5
nan
nan
nan
nan
https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=086111ccf8db5585f16a54ba754ea75ebac97d6c
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
nan
nan
nan
nan
https://journals.physiology.org/doi/pdf/10.1152/jn.1977.40.6.1339
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
nan
nan
nan
nan
https://journals.physiology.org/doi/pdf/10.1152/jn.1944.7.3.171
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
nan
nan
nan
nan
https://www.hifo.uzh.ch/dam/jcr:00000000-2999-c151-ffff-ffffd9509b89/paper_CorticalArea8.pdf
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
nan
nan
nan
nan
https:/

1344


In [None]:
# # reset the index of poten_litera_ids_ftl_filled
# source_path = fpath.poten_litera_ids_ftl_filled
# output_path = fpath.poten_litera_ids_ftl_filled
# df = pd.read_csv(source_path, header=None, sep=',')
# df.reset_index(drop=True, inplace=True)
# df.to_csv(output_path, header=False, index=False)
# --------------------start of test code--------------------
# source_path = fpath.poten_litera_ids_ftl_filled
# df = pd.read_csv(source_path, header=None, sep=',')
# # print(df.head(5))
# print(df.shape)
# # (10769, 7)
# ---------------------end of test code---------------------

In [None]:
# # check all possible full_text_source
# input_path = fpath.poten_litera_ids_ftl_filled
# df = pd.read_csv(input_path, header=None, sep=",")
# columns = ["DOI", "PMID", "PMCID", "Title", "full_text_url", "full_text_source", "pdf_url"]
# df.columns = columns

# print(df.head(3))
# print(df.shape)
# # (10769, 7)

# full_text_source = set(df['full_text_source'].tolist())
# print(full_text_source)
# # {'pure.mpg.de', 'direct.mit.edu', 'pubs.acs.org', 'thejns.org', 'www.microbiologyresearch.org', 'www.ncbi.nlm.nih.gov', 
# #  'journals.sagepub.com', 'journals.physiology.org', 'www.thieme-connect.de', 'www.jstage.jst.go.jp', 'www.rbojournal.org', 
# #  'www.annualreviews.org', 'var.scholarpedia.org', 'ujms.net', 'papers.ssrn.com', 'www.degruyter.com', 'jamanetwork.com', 
# #  'escholarship.mcgill.ca', 'www.tandfonline.com', 'wakespace.lib.wfu.edu', 'www.taylorfrancis.com', 'content.iospress.com:443', 
# #  'www.cambridge.org', 'n.neurology.org', 'journals.biologists.com', 'www.nature.com', 'pubs.aip.org', 'books.google.de', 
# #  'linkinghub.elsevier.com', 'academic.oup.com', 'link.springer.com', 'karger.com', 'neurologia.com', 'onlinelibrary.wiley.com', 
# #  'www.ajtmh.org', 'iovs.arvojournals.org', 'elibrary.ru', 'psycnet.apa.org:443', 'journals.aps.org', 'royalsocietypublishing.org', 
# #  'jpet.aspetjournals.org', 'www.biorxiv.org', 'ieeexplore.ieee.org', 'journals.lww.com', 'ekja.org', 'open.bu.edu', 
# #  'www.cabdirect.org', 'www.elibrary.ru', 'jnm.snmjournals.org', 'www.architalbiol.org', 'www.imrpress.com', 
# #  'neuro.psychiatryonline.org', 'submissions.mirasmart.com', 'pubs.asahq.org', 'europepmc.org', 'www.ahajournals.org', 
# #  'www.science.org', 'nrc-prod.literatumonline.com', 'pharmrev.aspetjournals.org', 'www.liebertpub.com', 'opg.optica.org', 
# #  'www.ingentaconnect.com', 'symposium.cshlp.org', 'ajp.psychiatryonline.org', 'webview.isho.jp', 'www.theses.fr', 
# #  'www.worldscientific.com'}

# # ["DOI", "PMID", "PMCID", "Title", "full_text_url", "full_text_source", "pdf_url"]
# print(df["DOI"].isnull().any().any()) # True
# print(df["PMID"].isnull().any().any()) # True
# print(df["PMCID"].isnull().any().any()) # True
# print(df["Title"].isnull().any().any()) # False
# print(df["full_text_url"].isnull().any().any()) # False
# print(df["full_text_source"].isnull().any().any()) # False
# print(df["pdf_url"].isnull().any().any()) # True

# print(df["DOI"].dtypes) # object
# print(df["PMID"].dtypes) # float64
# print(df["PMCID"].dtypes) # object
# print(df["Title"].dtypes) # object
# print(df["full_text_url"].dtypes) # object
# print(df["full_text_source"].dtypes) # object
# print(df["pdf_url"].dtypes) # object

In [None]:
# # websites_hosts
# # {'pure.mpg.de', 'direct.mit.edu', 'pubs.acs.org', 'thejns.org', 'www.microbiologyresearch.org', 'www.ncbi.nlm.nih.gov', 
# #  'journals.sagepub.com', 'journals.physiology.org', 'www.thieme-connect.de', 'www.jstage.jst.go.jp', 'www.rbojournal.org', 
# #  'www.annualreviews.org', 'var.scholarpedia.org', 'ujms.net', 'papers.ssrn.com', 'www.degruyter.com', 'jamanetwork.com', 
# #  'escholarship.mcgill.ca', 'www.tandfonline.com', 'wakespace.lib.wfu.edu', 'www.taylorfrancis.com', 'content.iospress.com:443', 
# #  'www.cambridge.org', 'n.neurology.org', 'journals.biologists.com', 'www.nature.com', 'pubs.aip.org', 'books.google.de', 
# #  'linkinghub.elsevier.com', 'academic.oup.com', 'link.springer.com', 'karger.com', 'neurologia.com', 'onlinelibrary.wiley.com', 
# #  'www.ajtmh.org', 'iovs.arvojournals.org', 'elibrary.ru', 'psycnet.apa.org:443', 'journals.aps.org', 'royalsocietypublishing.org', 
# #  'jpet.aspetjournals.org', 'www.biorxiv.org', 'ieeexplore.ieee.org', 'journals.lww.com', 'ekja.org', 'open.bu.edu', 
# #  'www.cabdirect.org', 'www.elibrary.ru', 'jnm.snmjournals.org', 'www.architalbiol.org', 'www.imrpress.com', 
# #  'neuro.psychiatryonline.org', 'submissions.mirasmart.com', 'pubs.asahq.org', 'europepmc.org', 'www.ahajournals.org', 
# #  'www.science.org', 'nrc-prod.literatumonline.com', 'pharmrev.aspetjournals.org', 'www.liebertpub.com', 'opg.optica.org', 
# #  'www.ingentaconnect.com', 'symposium.cshlp.org', 'ajp.psychiatryonline.org', 'webview.isho.jp', 'www.theses.fr', 
# #  'www.worldscientific.com'}
# websites_hosts = [
#     'karger.com', 'rbojournal.org', 'sagepub.com', 'neurology.org', 'asahq.org', 'aspetjournals.org', 'thieme-connect.de', 
#     'taylorfrancis.com', 'lww.com', 'neurologia.com', 'ekja.org', 'www.imrpress.com', 'europepmc.org', 'springer.com', 
#     'theses.fr', 'ieee.org', 'ssrn.com', 'nature.com', 'liebertpub.com', 'oup.com', 'open.bu.edu', 'journals.biologists.com', 
#     'aip.org', 'mpg.de', 'lib.wfu.edu', 'cambridge.org', 'literatumonline.com', 'acs.org', 'scholarpedia.org', 'isho.jp', 
#     'mirasmart.com', 'jstage.jst.go.jp', 'psychiatryonline.org', 'psycnet.apa.org', 'thejns.org', 'microbiologyresearch.org', 
#     'wiley.com', 'snmjournals.org', 'degruyter.com', 'worldscientific.com', 'opg.optica.org', 'science.org', 'aps.org', 
#     'ujms.net', 'mit.edu', 'biorxiv.org','annualreviews.org', 'elibrary.ru', 'www.ingentaconnect.com', 'mcgill.ca', 
#     'symposium.cshlp.org', 'architalbiol.org', 'arvojournals.org', 'jamanetwork.com', 'elsevier.com', 'ncbi.nlm.nih.gov', 
#     'cabdirect.org', 'books.google.de', 'iospress.com', 'tandfonline.com', 'ajtmh.org', 'royalsocietypublishing.org', 
#     'ahajournals.org', 'physiology.org']
# # --------------------start of test code--------------------
# if len(websites_hosts) == len(set(websites_hosts)):
#     print("There are no duplicates in the list.")
# else:
#     print("There are duplicates in the list.")
# # ---------------------end of test code---------------------

In [None]:
# # sort the websites by the number of articles they have
# input_path = fpath.poten_litera_ids_ftl_filled
# df = pd.read_csv(input_path, header=None, sep=",")
# columns = ["DOI", "PMID", "PMCID", "Title", "full_text_url", "full_text_source", "pdf_url"]
# df.columns = columns
# func_dict = {website: 0 for website in websites_hosts}
# # print(func_dict)

# for ind in df.index:
#     for website in websites_hosts:
#         if website in df.loc[ind, "full_text_source"]:
#             func_dict[website] += 1
#             break

# # Sort dictionary by values
# sorted_dict = dict(sorted(func_dict.items(), key=lambda item: item[1], reverse=True))
# print(sorted_dict)
# # {'ncbi.nlm.nih.gov': 7886, 'elsevier.com': 1019, 'wiley.com': 696, 'springer.com': 285, 'physiology.org': 205, 
# #  'oup.com': 152, 'cambridge.org': 74, 'karger.com': 53, 'lww.com': 49, 'nature.com': 44, 'science.org': 30, 
# #  'tandfonline.com': 29, 'sagepub.com': 21, 'jamanetwork.com': 20, 'neurology.org': 16, 'biorxiv.org': 15, 
# #  'royalsocietypublishing.org': 13, 'psycnet.apa.org': 12, 'arvojournals.org': 12, 'jstage.jst.go.jp': 11, 
# #  'psychiatryonline.org': 11, 'europepmc.org': 10, 'mit.edu': 10, 'thejns.org': 8, 'annualreviews.org': 8, 
# #  'snmjournals.org': 7, 'aspetjournals.org': 6, 'elibrary.ru': 5, 'books.google.de': 5, 'architalbiol.org': 4, 
# #  'ahajournals.org': 4, 'liebertpub.com': 3, 'acs.org': 3, 'degruyter.com': 3, 'worldscientific.com': 3, 
# #  'iospress.com': 3, 'asahq.org': 2, 'thieme-connect.de': 2, 'neurologia.com': 2, 'mpg.de': 2, 'opg.optica.org': 2, 
# #  'mcgill.ca': 2, 'rbojournal.org': 1, 'taylorfrancis.com': 1, 'ekja.org': 1, 'www.imrpress.com': 1, 'theses.fr': 1, 
# #  'ieee.org': 1, 'ssrn.com': 1, 'open.bu.edu': 1, 'journals.biologists.com': 1, 'aip.org': 1, 'lib.wfu.edu': 1, 
# #  'literatumonline.com': 1, 'scholarpedia.org': 1, 'isho.jp': 1, 'mirasmart.com': 1, 'microbiologyresearch.org': 1, 
# #  'aps.org': 1, 'ujms.net': 1, 'www.ingentaconnect.com': 1, 'symposium.cshlp.org': 1, 'cabdirect.org': 1, 'ajtmh.org': 1}

# non_zero_keys = [key for key, value in sorted_dict.items() if value != 0]
# print(non_zero_keys)
# # ['ncbi.nlm.nih.gov', 'elsevier.com', 'wiley.com', 'springer.com', 'physiology.org', 'oup.com', 'cambridge.org', 
# #  'karger.com', 'lww.com', 'nature.com', 'science.org', 'tandfonline.com', 'sagepub.com', 'jamanetwork.com', 
# #  'neurology.org', 'biorxiv.org', 'royalsocietypublishing.org', 'psycnet.apa.org', 'arvojournals.org', 'jstage.jst.go.jp', 
# #  'psychiatryonline.org', 'europepmc.org', 'mit.edu', 'thejns.org', 'annualreviews.org', 'snmjournals.org', 
# #  'aspetjournals.org', 'elibrary.ru', 'books.google.de', 'architalbiol.org', 'ahajournals.org', 'liebertpub.com', 
# #  'acs.org', 'degruyter.com', 'worldscientific.com', 'iospress.com', 'asahq.org', 'thieme-connect.de', 'neurologia.com', 
# #  'mpg.de', 'opg.optica.org', 'mcgill.ca', 'rbojournal.org', 'taylorfrancis.com', 'ekja.org', 'www.imrpress.com', 
# #  'theses.fr', 'ieee.org', 'ssrn.com', 'open.bu.edu', 'journals.biologists.com', 'aip.org', 'lib.wfu.edu', 
# #  'literatumonline.com', 'scholarpedia.org', 'isho.jp', 'mirasmart.com', 'microbiologyresearch.org', 'aps.org', 
# #  'ujms.net', 'www.ingentaconnect.com', 'symposium.cshlp.org', 'cabdirect.org', 'ajtmh.org']

In [None]:
# NOTE(review): every line in this cell is commented out, so running the cell does nothing.
# If re-enabled, it would:
#   1. define `websites`, a hard-coded list of website host names; the entries appear to be
#      the same as (and in the same order as) the `non_zero_keys` printout recorded in the
#      comments of the preceding cell;
#   2. compare len(websites) with len(websites_hosts) and print a confirmation when they match.
#      Nothing is reported when the lengths differ.
# `websites_hosts` is not defined in this cell -- presumably it comes from an earlier cell or
# an imported module; confirm that it is in scope before re-enabling.
# # websites
# websites = [
#     'ncbi.nlm.nih.gov', 'elsevier.com', 'wiley.com', 'springer.com', 'physiology.org', 'oup.com', 
#     'cambridge.org', 'karger.com', 'lww.com', 'nature.com', 'science.org', 'tandfonline.com', 
#     'sagepub.com', 'jamanetwork.com', 'neurology.org', 'biorxiv.org', 'royalsocietypublishing.org', 
#     'psycnet.apa.org', 'arvojournals.org', 'jstage.jst.go.jp', 'psychiatryonline.org', 'europepmc.org', 
#     'mit.edu', 'thejns.org', 'annualreviews.org', 'snmjournals.org', 'aspetjournals.org', 'elibrary.ru', 
#     'books.google.de', 'architalbiol.org', 'ahajournals.org', 'liebertpub.com', 'acs.org', 'degruyter.com', 
#     'worldscientific.com', 'iospress.com', 'asahq.org', 'thieme-connect.de', 'neurologia.com', 'mpg.de', 
#     'opg.optica.org', 'mcgill.ca', 'rbojournal.org', 'taylorfrancis.com', 'ekja.org', 'www.imrpress.com', 
#     'theses.fr', 'ieee.org', 'ssrn.com', 'open.bu.edu', 'journals.biologists.com', 'aip.org', 'lib.wfu.edu', 
#     'literatumonline.com', 'scholarpedia.org', 'isho.jp', 'mirasmart.com', 'microbiologyresearch.org', 
#     'aps.org', 'ujms.net', 'www.ingentaconnect.com', 'symposium.cshlp.org', 'cabdirect.org', 'ajtmh.org'
# ]
# # --------------------start of test code--------------------
# if len(websites) == len(websites_hosts):
#     print('The number of websites is correct')
# # ---------------------end of test code---------------------

<h3> Next step: automatic filtering </h3>