<h2> Searched literature data preprocessing </h2> 

In [1]:
# import internal .py modules
import file_path_management as fpath
import public_library as plib

In [2]:
# import packages
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
import re
import time
import numpy as np
import numpy as np

<h3> Parameters: </h3>

In [3]:
# header of potential_related_literature.csv, in the order the fields are written
columns = [
    "DOI",
    "PMID",
    "PMCID",
    "Title",
    "full_text_url",
    "full_text_source",
]

<h3> Predefined functions: </h3> 

In [4]:
def request_wegpage(url, proxies, timeout=30):
    """Download a web page and return it parsed as a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to request.
    proxies : dict or None
        Proxy mapping handed straight to ``requests.get``.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a single stalled connection would block the whole
        scraping loop indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the "lxml" parser.

    Raises
    ------
    Exception
        If the server answers with a status code other than 200.
    requests.exceptions.RequestException
        If the connection fails or the timeout expires.
    """
    response = requests.get(url, headers = plib.headers, proxies = proxies, timeout = timeout)
    if response.status_code != 200:
        print("Error when requesting:", url)
        raise Exception("Your request was declined, again!")
    return BeautifulSoup(response.content, "lxml")

In [5]:
def merge_pubmed(source_path, output_path, columns, start, end):
    """Complete the PubMed search results and append them to ``output_path``.

    For every row with index in ``[start, end)`` the PubMed page of the record
    is requested. Missing PMCID / DOI values are scraped from that page and a
    full-text link is determined (the PMC article page when a PMCID is known,
    otherwise the first link of the page's full-text link list).

    Parameters
    ----------
    source_path : str
        Comma-separated PubMed export with a header row that contains at
        least the columns "DOI", "PMID", "PMCID" and "Title".
    output_path : str
        File that receives one row per processed record.
    columns : list of str
        Column layout passed on to ``plib.add_row_to_csv``.
    start, end : int
        Half-open range of row indices to process, so a long run can be
        split into batches. ``end`` is clamped to the number of rows.

    Notes
    -----
    Values that cannot be determined are stored as ``np.nan``.
    A falsy result of ``plib.add_row_to_csv`` is reported on stdout but does
    not stop the run.
    """
    print("Starting merging search results from PubMed...")
    # The column names are taken from the header row. Reading with
    # header=None would label the columns 0..n and make the selection
    # by name below fail with a KeyError.
    df = pd.read_csv(source_path, sep=',')
    df = df[["DOI", "PMID", "PMCID", "Title"]]

    # never run past the last row of the file
    end = min(end, len(df))
    for ind in range(start, end):
        # sleep to avoid being blocked
        time.sleep(random.randint(3,6))

        # request the webpage
        proxies = plib.get_proxies()
        pmid = str(df["PMID"][ind])
        url = "https://pubmed.ncbi.nlm.nih.gov/" + pmid + "/"
        soup = request_wegpage(url, proxies)

        # get PMCID
        # pd.isna also recognises NaN values coming from float columns,
        # for which an identity test against np.nan is False
        if pd.isna(df["PMCID"][ind]):
            try:
                pmcid = soup.find_all("span", {"class": "identifier pmc"})[0].find_all("a", {"class": "id-link"})[0].get_text().strip()
            except (IndexError, KeyError, AttributeError):
                # the page lists no PMCID
                pmcid = np.nan
        else:
            pmcid = str(df["PMCID"][ind])

        # get DOI
        if pd.isna(df["DOI"][ind]):
            try:
                doi = soup.find_all("span", {"class": "identifier doi"})[0].find_all("a", {"class": "id-link"})[0].get_text().strip()
            except (IndexError, KeyError, AttributeError):
                # the page lists no DOI
                doi = np.nan
        else:
            doi = str(df["DOI"][ind])

        # get full_text_url
        if not pd.isna(pmcid):
            full_text_url = "https://www.ncbi.nlm.nih.gov/pmc/articles/" + pmcid + "/"
            full_text_source = "PMC"
        else:
            try:
                # look the link up once and read both attributes from it
                link = soup.find_all("div", {"class": "full-text-links-list"})[0].find_all("a", {"class": "link-item dialog-focus"})[0]
                full_text_url = link["href"].strip()
                full_text_source = link["data-ga-action"].strip()
            except (IndexError, KeyError, AttributeError):
                full_text_url = np.nan
                full_text_source = np.nan

        # columns = ["DOI", "PMID", "PMCID", "Title", "full_text_url", "full_text_source"]
        row = {
            "DOI": [doi],
            "PMID": [pmid],
            "PMCID": [pmcid],
            "Title": [str(df["Title"][ind])],
            "full_text_url": [full_text_url],
            "full_text_source": [full_text_source]
        }
        if not plib.add_row_to_csv(output_path, row, columns):
            print("Error detected when adding a row to csv!")
        print(ind)
# --------------------start of test code--------------------
# source_path = fpath.poten_litera_pubmed
# output_path = fpath.poten_litera_pubmed_processed
# plib.clear_file(output_path)
# df = pd.read_csv(source_path, sep=',')
# df.shape
# df = df[["DOI", "PMID", "PMCID", "Title"]]
# print(df.head(5))
# print(df["DOI"].isnull().values.any())
# print(df["PMID"].isnull().values.any())
# print(df["PMCID"].isnull().values.any())
# print(df["Title"].isnull().values.any())
# # the columns PMID, Title don't contain np.nan
# # the columns DOI, PMCID contain np.nan, we need to fill in what are missing
# # we also need to reenter the full name of the first author
# merge_pubmed(source_path, output_path, columns)

# df = pd.read_csv(output_path, sep=',')
# print(df.head(5))
# ---------------------end of test code---------------------

In [6]:
def merge_webofscience(source_path, output_path, columns):
    """Complete the Web of Science search results and append them to ``output_path``.

    Only the columns "DOI", "Pubmed Id" and "Article Title" of the export are
    used. Records with a PubMed id are looked up on PubMed to obtain the first
    author, the PMCID, a missing DOI and a full-text link. Records without a
    PubMed id cannot be looked up, so everything except DOI and title is
    stored as "not found".

    Parameters
    ----------
    source_path : str
        Semicolon-separated Web of Science export with a header row.
    output_path : str
        File that receives one row per record.
    columns : list of str
        Column layout passed on to ``plib.add_row_to_csv``.

    Notes
    -----
    Values that cannot be determined are stored as the string "not found".
    The row handed to ``plib.add_row_to_csv`` carries a "First_Author" entry.
    NOTE(review): whether that entry is written depends on ``columns`` and on
    how ``plib.add_row_to_csv`` treats extra keys - confirm.
    """
    print("Starting merging search results from Web of Science...")
    df = pd.read_csv(source_path, sep = ";")
    df = df[["DOI", "Pubmed Id", "Article Title"]]
    # rename on a fresh frame instead of in place on a column slice
    df = df.rename(columns={"Pubmed Id": "PMID", "Article Title": "Title"})
    # missing values are marked with 0.0 and tested against it below
    df = df.fillna(0.0)
    for ind in df.index:
        print(ind)
        # long pause on every 10th index to avoid being blocked
        if(ind%10 == 0):
            time.sleep(random.randint(3,6)*10)
        proxies = plib.get_proxies()
        if df["PMID"][ind] == 0.0:
            # No PubMed record to consult. The frame holds no PMCID column
            # (only DOI, PMID and Title were kept above), so neither a PMCID
            # nor a PMC full-text link can be produced for this record.
            if df["DOI"][ind] == 0.0:
                doi = "not found"
            else:
                doi = str(df["DOI"][ind])
            pmcid = "not found"
            full_text_url = "not found"
            full_text_source = "not found"
            first_author = "not found"
            pmid = "not found"
        else:
            # the id is stored as a float after fillna; drop the ".0"
            pmid = str(int(df["PMID"][ind]))
            url = "https://pubmed.ncbi.nlm.nih.gov/" + pmid + "/"
            soup = request_wegpage(url, proxies)
            # get full name of first author
            try:
                first_author = soup.find_all("span", {"class": "authors-list-item"})[0].find_all("a", {"class": "full-name"})[0].get_text().strip()
            except (IndexError, KeyError, AttributeError):
                first_author = "not found"
            # get PMCID
            try:
                pmcid = soup.find_all("span", {"class": "identifier pmc"})[0].find_all("a", {"class": "id-link"})[0].get_text().strip()
            except (IndexError, KeyError, AttributeError):
                pmcid = "not found"
            # get DOI
            if df["DOI"][ind] == 0.0:
                try:
                    doi = soup.find_all("span", {"class": "identifier doi"})[0].find_all("a", {"class": "id-link"})[0].get_text().strip()
                except (IndexError, KeyError, AttributeError):
                    doi = "not found"
            else:
                doi = str(df["DOI"][ind])
            # get full_text_url
            if pmcid != "not found":
                full_text_url = "https://www.ncbi.nlm.nih.gov/pmc/articles/" + pmcid + "/"
                full_text_source = "PMC"
            else:
                try:
                    # look the link up once and read both attributes from it
                    link = soup.find_all("div", {"class": "full-text-links-list"})[0].find_all("a", {"class": "link-item dialog-focus"})[0]
                    full_text_url = link["href"].strip()
                    full_text_source = link["data-ga-action"].strip()
                except (IndexError, KeyError, AttributeError):
                    full_text_url = "not found"
                    full_text_source = "not found"

        # columns = ["DOI", "PMID", "PMCID", "Title", "First_Author", "full_text_url", "full_text_source"]
        row = {
            "DOI": [doi],
            "PMID": [pmid],
            "PMCID": [pmcid],
            "Title": [str(df["Title"][ind])],
            "First_Author": [first_author],
            "full_text_url": [full_text_url],
            "full_text_source": [full_text_source]
        }
        if not plib.add_row_to_csv(output_path, row, columns):
            print("Error detected when adding a row to csv!")
# --------------------start of test code--------------------
# source_path = fpath.poten_litera_wos_1
# output_path = fpath.poten_litera_wos_processed
# plib.clear_file(output_path)
# df = pd.read_csv(source_path, sep=';')
# df = df[["DOI", "Pubmed Id", "Article Title"]]
# print(df.head(5))
# print(df["DOI"].isnull().values.any())
# print(df["Pubmed Id"].isnull().values.any())
# print(df["Article Title"].isnull().values.any())
# # the columns Article Title don't contain np.nan
# # the columns DOI, Pubmed Id contain np.nan, we need to fill in what are missing
# # we also need to reenter the full name of the first author
# merge_webofscience(fpath.poten_litera_wos_1, columns)
# merge_webofscience(source_path, output_path, columns)

# df = pd.read_csv(output_path, sep=',')
# print(df.head(5))
# ---------------------end of test code---------------------  

In [7]:
def merge_eupmc(source_path, output_path, columns):
    """Complete the Europe PMC search results and append them to ``output_path``.

    The export's "EXTERNAL_ID" column is treated as the PMID. When that id
    contains no letters, the matching PubMed page is requested to obtain the
    first author, a missing PMCID / DOI and a full-text link. An id that
    contains letters is not a PMID; such records are completed from the
    export alone and their PMID is stored as "not found".

    Parameters
    ----------
    source_path : str
        Comma-separated Europe PMC export with a header row containing at
        least "DOI", "EXTERNAL_ID", "PMCID" and "TITLE".
    output_path : str
        File that receives one row per record.
    columns : list of str
        Column layout passed on to ``plib.add_row_to_csv``.

    Notes
    -----
    Values that cannot be determined are stored as the string "not found".
    The row handed to ``plib.add_row_to_csv`` carries a "First_Author" entry.
    NOTE(review): whether that entry is written depends on ``columns`` and on
    how ``plib.add_row_to_csv`` treats extra keys - confirm.
    """
    print("Starting merging search results from Europe PMC...")
    # process eupmc search results
    df = pd.read_csv(source_path, sep = ",")
    df = df[["DOI", "EXTERNAL_ID", "PMCID", "TITLE"]]
    df = df.rename(columns={"EXTERNAL_ID": "PMID", "TITLE": "Title"}, errors = "raise")
    for ind in df.index:
        print(ind)
        # long pause on every 10th index to avoid being blocked
        if(ind%10 == 0):
            time.sleep(random.randint(3,6)*10)
        proxies = plib.get_proxies()
        pmid = str(df["PMID"][ind])
        url = "https://pubmed.ncbi.nlm.nih.gov/" + pmid + "/"
        regex = "[a-zA-Z]"
        if len(re.findall(regex, pmid)) == 0:
            # the id has no letters, so it is used as a PMID
            soup = request_wegpage(url, proxies)
            # get full name of first author
            try:
                first_author = soup.find_all("span", {"class": "authors-list-item"})[0].find_all("a", {"class": "full-name"})[0].get_text().strip()
            except (IndexError, KeyError, AttributeError):
                first_author = "not found"
            # get PMCID
            # pd.isna also recognises NaN values coming from float columns,
            # for which an identity test against np.nan is False
            if pd.isna(df["PMCID"][ind]):
                try:
                    pmcid = soup.find_all("span", {"class": "identifier pmc"})[0].find_all("a", {"class": "id-link"})[0].get_text().strip()
                except (IndexError, KeyError, AttributeError):
                    pmcid = "not found"
            else:
                pmcid = str(df["PMCID"][ind])
            # get DOI
            if pd.isna(df["DOI"][ind]):
                try:
                    doi = soup.find_all("span", {"class": "identifier doi"})[0].find_all("a", {"class": "id-link"})[0].get_text().strip()
                except (IndexError, KeyError, AttributeError):
                    doi = "not found"
            else:
                doi = str(df["DOI"][ind])
            # get full_text_url
            if pmcid != "not found":
                full_text_url = "https://www.ncbi.nlm.nih.gov/pmc/articles/" + pmcid + "/"
                full_text_source = "PMC"
            else:
                try:
                    # look the link up once and read both attributes from it
                    link = soup.find_all("div", {"class": "full-text-links-list"})[0].find_all("a", {"class": "link-item dialog-focus"})[0]
                    full_text_url = link["href"].strip()
                    full_text_source = link["data-ga-action"].strip()
                except (IndexError, KeyError, AttributeError):
                    full_text_url = "not found"
                    full_text_source = "not found"
        else:
            # the id is not a PMID: rely on the export only
            if pd.isna(df["DOI"][ind]):
                doi = "not found"
            else:
                doi = str(df["DOI"][ind])
            if pd.isna(df["PMCID"][ind]):
                pmcid = "not found"
                full_text_url = "not found"
                full_text_source = "not found"
            else:
                # assign pmcid first: the URL below is built from it
                pmcid = str(df["PMCID"][ind])
                full_text_url = "https://www.ncbi.nlm.nih.gov/pmc/articles/" + pmcid + "/"
                full_text_source = "PMC"
            first_author = "not found"
            pmid = "not found"
        # columns = ["DOI", "PMID", "PMCID", "Title", "First_Author", "full_text_url", "full_text_source"]
        row = {
            "DOI": [doi],
            "PMID": [pmid],
            "PMCID": [pmcid],
            "Title": [str(df["Title"][ind])],
            "First_Author": [first_author],
            "full_text_url": [full_text_url],
            "full_text_source": [full_text_source]
        }
        if not plib.add_row_to_csv(output_path, row, columns):
            print("Error detected when adding a row to csv!")
# --------------------start of test code--------------------
# source_path = fpath.poten_litera_eupmc
# output_path = fpath.poten_litera_eupmc_processed
# plib.clear_file(output_path)
# df = pd.read_csv(source_path, sep=',')
# df = df[["SOURCE", "DOI", "EXTERNAL_ID", "PMCID", "TITLE"]]
# print(df.head(5))
# col_one_list = set(df['SOURCE'].tolist())
# print(col_one_list)
# print(df["SOURCE"].isnull().values.any())
# print(df["DOI"].isnull().values.any())
# print(df["EXTERNAL_ID"].isnull().values.any())
# print(df["PMCID"].isnull().values.any())
# print(df["TITLE"].isnull().values.any())
# # the columns PMID, Title don't contain np.nan
# # the columns DOI, PMCID contain np.nan, we need to fill in what are missing
# # we also need to reenter the full name of the first author
# merge_eupmc(source_path, output_path, columns)

# df = pd.read_csv(output_path, sep=',')
# print(df.head(5))
# ---------------------end of test code---------------------

In [8]:
def merge_google_shcolar(source_path, output_path, columns):
    """Placeholder for merging the Google Scholar search results.

    The arguments are accepted but not used yet; the function only announces
    the step on stdout and reports success by returning True.
    """
    announcement = "Starting merging search results from Google Scholar..."
    print(announcement)
    succeeded = True
    return succeeded
# --------------------start of test code--------------------
# source_path = fpath.poten_litera_gs_test
# output_path = fpath.poten_litera_gs_processed
# plib.clear_file(output_path)
# df = pd.read_csv(source_path, header = None, sep=',')
# df = df[["SOURCE", "DOI", "EXTERNAL_ID", "PMCID", "TITLE"]]
# print(df.head(5))
# col_one_list = set(df['SOURCE'].tolist())
# print(col_one_list)
# print(df["SOURCE"].isnull().values.any())
# print(df["DOI"].isnull().values.any())
# print(df["EXTERNAL_ID"].isnull().values.any())
# print(df["PMCID"].isnull().values.any())
# print(df["TITLE"].isnull().values.any())
# # the columns PMID, Title don't contain np.nan
# # the columns DOI, PMCID contain np.nan, we need to fill in what are missing
# # we also need to reenter the full name of the first author
# merge_google_shcolar(source_path, output_path, columns)

# df = pd.read_csv(output_path, header = None, sep=',')
# print(df.head(5))
# ---------------------end of test code---------------------

In [9]:
def merge_seed_paper_spanning(source_path, output_path, columns):
    """Placeholder for merging the citations spanned by the seed papers.

    The arguments are accepted but not used yet; the function only announces
    the step on stdout and reports success by returning True.
    """
    announcement = "Starting merging search results from spanning citations of seed paper..."
    print(announcement)
    succeeded = True
    return succeeded
# --------------------start of test code--------------------
# test code
# ---------------------end of test code---------------------

In [10]:
def merge_cocomac_paper(source_path, output_path, columns):
    """Placeholder for merging the CoCoMac papers.

    The arguments are accepted but not used yet; the function only announces
    the step on stdout and reports success by returning True.
    """
    announcement = "Starting merging search results from CoCoMac papers..."
    print(announcement)
    succeeded = True
    return succeeded
# --------------------start of test code--------------------
# test code
# ---------------------end of test code---------------------

In [11]:
# make sure at least PMID and PMCID is present as two of the four identifiers, otherwise manually fill in
def fill_in_elements(file_path, timeout=30):
    """Fill in missing DOIs of records that have a PMID.

    Every row of ``file_path`` that has a PMID but no DOI is looked up on
    PubMed; when the page shows a DOI link its text is stored in the "DOI"
    column. Rows whose PMID is not purely numeric (for example a textual
    placeholder) are skipped because no PubMed page can be addressed for them.

    Parameters
    ----------
    file_path : str
        Comma-separated file with a header row containing at least the
        columns "PMID" and "DOI".
    timeout : float or tuple, optional
        Seconds to wait for each PubMed response (default 30).

    Raises
    ------
    Exception
        If PubMed answers a request with a status code other than 200.

    Notes
    -----
    NOTE(review): the result is written to ``fpath.poten_litera_csv`` and not
    to ``file_path`` - kept as in the original, confirm this is intended.
    """
    # PMID -> PMCID
    # done already
    # PMCID -> PMID
    # done already
    # PMID -> DOI
    df = pd.read_csv(file_path, sep = ",")
    # a DOI column without any value is parsed as float; make it hold strings
    df["DOI"] = df["DOI"].astype(object)
    for ind in df.index:
        # pd.isna / pd.notna also recognise NaN values of float columns,
        # for which an identity test against np.nan is False
        if pd.isna(df["PMID"][ind]) or pd.notna(df["DOI"][ind]):
            continue
        # the PMID may have been parsed as a number (possibly as a float)
        pmid = str(df["PMID"][ind]).strip()
        if pmid.endswith(".0"):
            pmid = pmid[:-2]
        if not pmid.isdigit():
            continue
        url = "https://pubmed.ncbi.nlm.nih.gov/" + pmid + "/"
        print(url)
        response = requests.get(url, headers = plib.headers, timeout = timeout)
        if response.status_code != 200:
            raise Exception("Error when request webpages!")
        soup = BeautifulSoup(response.content, "lxml")
        # both conditions belong in ONE attribute dict; a second positional
        # argument would be taken as find_all's `recursive` flag
        doi_links = soup.find_all("a", {"class": "id-link", "data-ga-action": "DOI"})
        if len(doi_links) != 0:
            df.at[ind, "DOI"] = doi_links[0].get_text().strip()
        # otherwise the DOI stays missing
    df.to_csv(fpath.poten_litera_csv, header = True, index = False)
    print("All 3 identifiers: DOI, PMID, and PMCID filled in when possible.")
# --------------------start of test code--------------------
# test code
# ---------------------end of test code---------------------

In [12]:
# remove duplications based on identifiers in the potential related literature
def remove_dupli(file_path, identifiers=("DOI", "PMID", "PMCID")):
    """Remove records that repeat an identifier of an earlier record.

    For each identifier column in turn, a row is dropped when its value has
    already occurred in an earlier row. Rows in which the identifier is
    missing are never treated as duplicates of one another, so records that
    simply lack a DOI or PMCID are kept.

    Parameters
    ----------
    file_path : str
        Comma-separated file with a header row containing the identifier
        columns.
    identifiers : sequence of str, optional
        Columns to deduplicate on, applied in the given order.

    Returns
    -------
    pandas.DataFrame
        The deduplicated records. The file itself is not modified.
    """
    df = pd.read_csv(file_path, sep = ",")
    print(len(df))
    for identifier in identifiers:
        # only a value that is present can mark a row as a repetition
        repeated = df[identifier].notna() & df.duplicated(subset=[identifier])
        df = df[~repeated]
    print(len(df))
    # plib.clear_file(fpath.poten_litera_csv)
    # df.to_csv(fpath.poten_litera_csv, index = None)
    print("Duplication in the potential related literature removed.")
    print("Found " + str(len(df)) + " potential related literature in total.")
    return df
# --------------------start of test code--------------------
# test code
# ---------------------end of test code---------------------

<h3> Main program: </h3> 

In [13]:
# input: PubMed search results, output: processed PubMed results
# (the clearing step below is currently commented out)
source_path, output_path = fpath.poten_litera_pubmed, fpath.poten_litera_pubmed_processed
# plib.clear_file(output_path)

In [24]:
# merge search results from PubMed
# 2606 results; this run handles the rows with index 200 up to, but not including, 500
merge_pubmed(source_path, output_path, columns, start=200, end=500)
print("Merging results from PubMed succeeded!")
# print("Attention! Something went wrong when merging results from PubMed!")

Starting merging search results from PubMed...
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
4

In [15]:
# # clear the file
# source_path = fpath.poten_litera_wos_1
# output_path = fpath.poten_litera_wos_processed
# plib.clear_file(output_path)

# # merge search results from Web of Science
# # 1000 results
# merge_webofscience(source_path, output_path, columns)
# print("Merging results from Web of Science part 1 succeeded!")
# # print("Attention! Something went wrong when merging results from Web of Science part 1!")

In [16]:
# # clear the file
# source_path = fpath.poten_litera_wos_2
# output_path = fpath.poten_litera_wos_processed

# # merge search results from Web of Science
# # 976 results
# merge_webofscience(source_path, output_path, columns)
# print("Merging results from Web of Science part 2 succeeded!")
# # print("Attention! Something went wrong when merging results from Web of Science part 2!")

In [17]:
# # clear the file
# source_path = fpath.poten_litera_eupmc
# output_path = fpath.poten_litera_eupmc_processed
# plib.clear_file(output_path)

# # merge search results from Europe PMC
# merge_eupmc(source_path, output_path, columns)
# # 9139 results
# print("Merging results from Europe PMC succeeded!")
# # print("Attention! Something went wrong when merging results from Europe PMC!")

In [18]:
# # clear the file
# source_path = fpath.poten_litera_gs_test
# output_path = fpath.poten_litera_gs_processed
# plib.clear_file(output_path)

# # merge search results from Google Scholar
# merge_google_shcolar(source_path, output_path, columns)
# print("Merging results from Google Scholar succeeded!")
# # print("Attention! Something went wrong when merging results from Google Scholar!")

In [19]:
# # merge search results from spanning citations of seed paper
# merge_seed_paper_spanning(source_path, output_path, columns)
# print("Merging results from spanning citations of seed papers succeeded!")
# # print("Attention! Something went wrong when merging results from spanning citations of seed papers!")

In [20]:
# # merge search results from CoCoMac papers
# merge_cocomac_paper(source_path, output_path, columns)
# print("Merging results from CoCoMac papers succeeded!")
# # print("Attention! Something went wrong when merging results from CoCoMac papers!")

In [21]:
# # fill in all elements in the columns when possible, if not, fill in "not found"
# fill_in_elements(fpath.poten_litera_csv, columns)

In [22]:
# identifier = ["DOI", "PMID", "PMCID"]
# remove_dupli(fpath.poten_litera_csv, identifier)

<h3> Next step: automatic filtering </h3>