<h2> Semi-automatic literature search </h2> 

<h3> Notebook description: </h3>
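This notebook collects potentially related literature on macaque thalamocortical connections: it searches academic databases (Google Scholar, Web of Science, PubMed, Europe PMC) with a fixed keyword query and merges the search results into a single list for later filtering.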

In [9]:
# import internal .py modules
import file_path_management as fpath
import public_library as plib

In [10]:
# import packages
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
import re
import time
import numpy as np
from numpy import NaN

<h3> Parameters: </h3>
In the next cell, we present all parameters that might have an effect on the search results, including:<br>
1. searching keyword lexicon<br>
2. academic databases<br>
3. initial urls when searching academic databases<br>
4. seed paper list for spanning citations<br>
5. connectome database<br>
6. search queries for the connectome database<br>
7. on-topic keyword lexicon<br>
8. weights of on-topic keywords when calculating relatedness of a literature<br>
9. ChatGPT queries for relatedness of topic<br>
10. meta categories when extracting information of related literature<br>
11. keywords for searching meta categories<br>
12. ChatGPT queries for extracting information of meta categories of related literature<br>

In [4]:
# searching keywords lexicon
# NOTE(review): within the notebook as shown, this string is not referenced by any
# function; the queries actually sent are hard-coded in `init_urls` below, so both
# must be edited together when the search terms change.
search_kws_lexicon = "macaque AND (thalamus OR thalamocortical OR thalamo-cortical)" # in all fields

# academic databases
# For each database: the search page, the query as typed into that page, and the
# number of results that query returned when it was recorded.
# Google Scholar: "https://scholar.google.com/"
# macaque thalamus OR thalamocortical OR thalamo-cortical
# 78100 results
# Web of Science: "https://www.webofscience.com/wos/woscc/advanced-search" # can be exported to excel file
# (ALL=(thalamus) OR ALL=(thalamocortical) OR ALL=(thalamo-cortical)) AND ALL=(macaque)
# 880 results
# PubMed Central PMC: "https://pubmed.ncbi.nlm.nih.gov/advanced/" # can be exported to .csv file
# ((thalamus) OR (thalamocortical) OR (thalamo-cortical)) AND (macaque)
# 2448 results
# Europe PMC: "https://europepmc.org/advancesearch" # search results can be exported to .csv file
# ("macaque") AND ("thalamus" OR "thalamocortical" OR "thalamo-cortical") AND (LANG:"eng" OR LANG:"en" OR LANG:"us")
# 5130 results
acad_dbs = ["Google Scholar", "Web of Science", "PubMed Central PMC", "Europe PMC"]

# initial urls for specified searching keyword lexicon and all academic databases
# keys: "gs" = Google Scholar, "wos" = Web of Science, "pmc" = PubMed Central PMC, "eupmc" = Europe PMC
# NOTE(review): the "wos" url appears to embed an identifier of a specific saved
# search/session - TODO confirm it stays valid across sessions.
init_urls = {
    "gs": "https://scholar.google.com/scholar?start=0&q=macaque+thalamus+OR+thalamocortical+OR+thalamo-cortical&hl=en&as_sdt=1,5",
    "wos": "https://www.webofscience.com/wos/woscc/summary/3a00a41f-3135-4142-a950-c8d6eb3b20a7-99be93b8/relevance/1",
    "pmc": "https://pubmed.ncbi.nlm.nih.gov/?term=((thalamus)%20OR%20(thalamocortical)%20OR%20(thalamo-cortical))%20AND%20(macaque)&sort=relevance&page=1",
    "eupmc": "https://europepmc.org/search?query=%28%22macaque%22%29%20AND%20%28%22thalamus%22%20OR%20%22thalamocortical%22%20OR%20%22thalamo-cortical%22%29%20AND%20%28LANG%3A%22eng%22%20OR%20LANG%3A%22en%22%20OR%20LANG%3A%22us%22%29&page=1"
}

# seed literature list (empty for now; intended as input for citation spanning)
seed_litera_list = []

# cocomac literature list (empty for now)
cocomac_litera_list = []

<h3> Predefined functions: </h3>

In [None]:
# search academic database google scholar given an initial url and record the search results
# the given initial url is manually obtained by entering the search query in google scholar
# the recorded results include title, url, full_text_url
def _gs_fetch_soup(url):
    """Request `url` and return the parsed page; raise if the status code is not 200."""
    response = requests.get(url, headers = plib.headers, proxies = plib.get_proxies())
    if response.status_code != 200:
        raise Exception("Error when request webpages!")
    return BeautifulSoup(response.content, "lxml")


def _gs_count_results(soup):
    """Extract the total number of hits from a Google Scholar result page.

    The count is read from the first text of the form "<number> result(s)", looking
    first in the `div.gs_ab_mdw` header elements and then, as a fallback, in the
    whole page text. Raises ValueError if no count can be found.
    """
    count_pattern = re.compile(r"(\d[\d.,]*)\s+results?")
    candidates = [header.get_text() for header in soup.select("div.gs_ab_mdw")]
    candidates.append(soup.get_text())
    for text in candidates:
        match = count_pattern.search(text)
        if match:
            # drop thousands separators such as "78,100"
            return int(re.sub(r"\D", "", match.group(1)))
    raise ValueError("Could not find the number of results on the Google Scholar page!")


def _gs_page_url(init_url, start):
    """Return `init_url` with its `start` query parameter replaced by `start`."""
    page_url, replaced = re.subn(
        r"([?&]start=)\d*",
        lambda match: match.group(1) + str(start),
        init_url,
        count = 1,
    )
    if replaced == 0:
        raise ValueError("The initial url must contain a 'start=' query parameter: " + init_url)
    return page_url


def _gs_parse_result(item):
    """Return (title, url, full_text_url) of one result entry; "not found" for missing parts."""
    title = "not found"
    url = "not found"
    full_text_url = "not found"

    heading = item.select_one("h3")
    title_link = heading.select_one("a") if heading is not None else None
    if title_link is not None:
        title = title_link.get_text()
        url = title_link.get("href", "not found")

    full_text_link = item.select_one("div.gs_or_ggsm a[href]")
    if full_text_link is not None:
        full_text_url = full_text_link["href"]

    return title, url, full_text_url


def search_google_scholar(init_url):
    """Crawl all Google Scholar result pages for a query and record every hit.

    Parameters
    ----------
    init_url : str
        Url of the first result page. It must contain a `start=` query parameter,
        which is rewritten to step through the result pages in steps of 10.

    Returns
    -------
    None
        Results are not returned; each hit is appended as a row with the columns
        "title", "url" and "full_text_url" to the file `fpath.poten_litera_gs`
        via `plib.add_row_to_csv`. The file is emptied before the crawl starts.

    Raises
    ------
    Exception
        If a request does not return status code 200.
    ValueError
        If the number of results cannot be read from the first page or
        `init_url` has no `start=` parameter.
    """
    columns = ["title", "url", "full_text_url"]

    # create the result file, or clear it if it already exists
    with open(fpath.poten_litera_gs, "w"):
        pass

    # request the first page and extract the number of pages of the search results
    num_results = _gs_count_results(_gs_fetch_soup(init_url))
    # round up so that a final, partially filled page is not skipped
    pages = (num_results + 9) // 10
    print("Google Scholar searched " + str(num_results) + " results" + " displayed in " + str(pages) + " pages.")
    # NOTE(review): Google Scholar may not serve every page of a very large result
    # set; a non-200 response aborts the crawl, rows written so far stay in the file.

    # iterate all pages and record the results
    for page in range(pages):
        # random pauses (a short one plus 1-10 minutes) to avoid hammering the server
        time.sleep(random.randint(1, 10))
        time.sleep(random.randint(1*60, 10*60))
        soup = _gs_fetch_soup(_gs_page_url(init_url, page * 10))
        # every result entry carries a data-lid attribute
        for item in soup.select("[data-lid]"):
            add_title, add_url, add_full_text_link = _gs_parse_result(item)
            row = {
                "title": [add_title],
                "url": [add_url],
                "full_text_url": [add_full_text_link]
            }
            plib.add_row_to_csv(fpath.poten_litera_gs, row, columns)
    print("Searching Google Scholar complated!")
# --------------------start of test code--------------------
# init_url = init_urls["gs"]
# headers = plib.headers
# search_google_scholar(init_url)
# ---------------------end of test code---------------------
# end of search_google_scholar

In [None]:
def search_webofscience(init_url):
    """Placeholder for the Web of Science search.

    The search is done manually on the website and the results are exported
    there; this function only prints a completion message. `init_url` is unused.
    """
    done_message = "Searching Web of Science complated!"
    print(done_message)

In [None]:
def search_pmc(init_url):
    """Placeholder for the PubMed Central PMC search.

    The search is done manually on the website and the results are exported
    there; this function only prints a completion message. `init_url` is unused.
    """
    done_message = "Searching PubMedd Central PMC complated!"
    print(done_message)

In [None]:
def search_eupmc(init_url):
    """Placeholder for the Europe PMC search.

    The search is done manually on the website and the results are exported
    there; this function only prints a completion message. `init_url` is unused.
    """
    done_message = "Searching Europe PMC complated!"
    print(done_message)

In [None]:
# search the given academic databases one after another
def search_acad_dbs(acad_dbs, init_urls):
    """Run the search function of every database named in `acad_dbs`.

    Parameters
    ----------
    acad_dbs : iterable of str
        Database names; supported are "Google Scholar", "Web of Science",
        "PubMed Central PMC" and "Europe PMC". An unsupported name is reported
        on stdout and skipped.
    init_urls : mapping
        Initial search urls, looked up with the keys "gs", "wos", "pmc" and
        "eupmc" for the databases above.

    Returns
    -------
    bool
        True if the loop finished, False if any error occurred (the error is
        printed). KeyboardInterrupt and SystemExit are deliberately not caught,
        so a long-running search can still be interrupted.
    """
    # database name -> key of its initial url in `init_urls`
    url_keys = {
        "Google Scholar": "gs",
        "Web of Science": "wos",
        "PubMed Central PMC": "pmc",
        "Europe PMC": "eupmc",
    }
    try:
        for acad_db in acad_dbs:
            url_key = url_keys.get(acad_db)
            if url_key is None:
                print("Searching the specified academic database: " + acad_db + " is not supported by this function.")
                print("Plese choose one of the following databases:",)
                for db in url_keys:
                    print(db)
                continue

            print("Searching " + acad_db + "...")
            init_url = init_urls[url_key]
            if acad_db == "Google Scholar":
                search_google_scholar(init_url)
            elif acad_db == "Web of Science":
                search_webofscience(init_url)
            elif acad_db == "PubMed Central PMC":
                search_pmc(init_url)
            else:
                search_eupmc(init_url)
        return True
    except Exception as error:
        # report why the search failed instead of silently returning False
        print("Searching academic databases failed: " + repr(error))
        return False
# --------------------start of test code--------------------
# if search_acad_dbs(acad_dbs, init_urls):
#     print("Searching academic databases completed!")
# else:
#     print("Attention! Something went wrong when searching academic databases completed!")
# ---------------------end of test code---------------------
# end of search_acad_dbs

In [None]:
def span_citations(seed_litera_list, num_span_time):
    """Placeholder for spanning the citations of the seed literature.

    Not implemented yet: both arguments are ignored and None is returned.
    """
    return None

In [None]:
def search_conne_db():
    """Placeholder for searching the connectome database.

    Not implemented yet: takes no arguments and returns None.
    """
    return None

In [11]:
# merge all searched literature results
def _pmid_from_pmc_article(pmcid):
    """Look up the PMID shown on the PMC article page of `pmcid`.

    Prints the requested url, raises Exception if the response status is not 200
    and returns np.nan if the page carries no PMID link.
    """
    url = "https://www.ncbi.nlm.nih.gov/pmc/articles/" + pmcid + "/"
    print(url)
    response = requests.get(url, headers = plib.headers, proxies = plib.get_proxies())
    if response.status_code != 200:
        print(response.status_code)
        raise Exception("Error when request webpages!")
    soup = BeautifulSoup(response.content, "lxml")
    pmid_link = soup.select_one("div.fm-citation-pmid a[href]")
    if pmid_link is None:
        # always yield a value so that the PMID column stays aligned with the rows
        return np.nan
    return pmid_link.get_text().strip()


def merge_search_results(merged_file_path):
    """Write the literature search results to `merged_file_path` as csv.

    Currently only the Europe PMC export (`fpath.poten_litera_eupmc`) is processed:
    its DOI, PMCID and TITLE columns are kept, a PMID is looked up online for every
    row that has a PMCID, and the rows are written in the column order
    DOI, PMID, PMCID, Title without header and without index.
    The Google Scholar, Web of Science and PubMed steps are disabled (kept below
    as commented-out code).

    Parameters
    ----------
    merged_file_path : str or path-like
        Output file; it is emptied first and then overwritten.

    Raises
    ------
    Exception
        If a PMC article page does not return status code 200; errors raised by
        `requests` itself (e.g. proxy errors) propagate unchanged.
    """
    # clear the file
    with open(merged_file_path, "w"):
        pass

    # DOI, PMID, PMCID, title
    columns = ["DOI", "PMID", "PMCID", "Title"]

    # google scholar search results
    # df_gs = pd.read_csv(fpath.poten_litera_gs, sep = ",")

    # # process web of science search results
    # df_wos = pd.read_csv(fpath.poten_litera_wos, sep = ";")
    # df_wos = df_wos[["DOI", "Pubmed Id", "Article Title"]]
    # df_wos.rename(columns={"DOI": "DOI", "Pubmed Id": "PMID", "Article Title": "Title"}, inplace = True)
    # df_wos["PMID"] = df_wos["PMID"].fillna(0)
    # df_wos["PMID"] = df_wos["PMID"].astype(int)
    # df_wos["PMID"] = df_wos["PMID"].astype(str)
    # pmcid = []
    # for ind in df_wos.index:
    #     if df_wos["PMID"][ind] != "0":
    #         pmid = df_wos["PMID"][ind]
    #         df_wos["PMID"][ind] = pmid
    #         url = "https://pubmed.ncbi.nlm.nih.gov/" + pmid + "/"
    #         response = requests.get(url, headers = plib.headers, proxies = plib.get_proxies())
    #         if response.status_code != 200:
    #             raise Exception("Error when request webpages!")
    #         soup = BeautifulSoup(response.content, "lxml")
    #         l = soup.find_all("a", {"data-ga-action": "PMCID"})
    #         if(len(l) != 0):
    #             pmcid.append(l[0].get_text().strip())
    #         else:
    #             pmcid.append(NaN)
    #     else:
    #         pmcid.append(NaN)
    # df_wos["PMCID"] = pmcid
    # df_wos["PMCID"].replace("0", NaN)
    # df_wos = df_wos[columns]
    # df_wos.to_csv(merged_file_path, header = True, index = None)

    # # process pmc search results
    # df_pmc = pd.read_csv(fpath.poten_litera_pmc, sep=',')
    # doi = df_pmc[["DOI", "PMID", "PMCID", "Title"]]
    # doi.to_csv(merged_file_path, mode = "a", header = None, index = None)

    # process eupmc search results
    df_eupmc = pd.read_csv(fpath.poten_litera_eupmc, sep = ",")
    df_eupmc = df_eupmc[["DOI", "PMCID", "TITLE"]].rename(columns={"TITLE": "Title"}, errors = "raise")
    # one PMID per row: missing PMCIDs are detected with pd.isna (an identity
    # comparison against NaN is not reliable) and yield a missing PMID
    df_eupmc["PMID"] = [
        np.nan if pd.isna(pmcid) else _pmid_from_pmc_article(str(pmcid))
        for pmcid in df_eupmc["PMCID"]
    ]
    df_eupmc = df_eupmc[columns]
    # NOTE(review): written without a header row, as before; the disabled
    # de-duplication step below reads the file with pandas' default header handling.
    df_eupmc.to_csv(merged_file_path, header = False, index = False)

    # process seed literature citation spanning results
    # not processed for now

    # process connectome database literature results
    # not processed for now

    # eliminate duplicates
    # df_merged = pd.read_csv(merged_file_path)
    # print(len(df_merged))
    # doi_df = df_merged.drop_duplicates(subset = "DOI")
    # print(len(df_merged))
    # df_merged.to_csv(merged_file_path, index=False)
# end of merge_search_results
# --------------------start of test code--------------------
# NOTE: this call runs whenever the cell is executed; it overwrites
# fpath.poten_litera_csv and sends one web request per Europe PMC row with a PMCID.
merge_search_results(fpath.poten_litera_csv)
# ---------------------end of test code---------------------

https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10133512/


ProxyError: HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Max retries exceeded with url: /pmc/articles/PMC10133512/ (Caused by ProxyError('Cannot connect to proxy.', OSError('Tunnel connection failed: 400 Bad Request')))

<h3> Main program: </h3> 

In [None]:
# first we need to search all related literature that might include data or information of thalamocortical connections
# search for potentially related literature using the listed 3 methods

# method 1: search academic databases using keywords
# if search_acad_dbs(acad_dbs, init_urls):
#     print("Searching academic databases completed!")
# else:
#     print("Attention! Something went wrong when searching academic databases completed!")

In [None]:
# # method 2: spanning citations of seed papers
# if span_citations(seed_papers, num_span_time):
#     print("Spanning citations of seed literature list completed!")
# else:
#     print("Attention! Something went wrong when spanning citations of seed literature list!")


In [None]:
# # method 3: search existing connectome databases
# if search_conne_db(connec_db, connec_db_quries):
#     print("Searching connectome databases completed!")
# else:
#     print("Attention! Something went wrong when searching connectome databases!")

In [None]:
# # merge all results
# if merge_search_results():
#     print("Merging all results completed!")
# else:
#     print("Attention! Something went wrong when merging all results completed!")

Next step: automatically filter the potentially related literature.

<h3> Some test code, please ignore: </h3> 

In [None]:
# # test request
# url = 'https://journals.physiology.org/doi/pdf/10.1152/jn.2001.85.1.219'

# response = requests.get(url, headers = plib.headers)
# soup = BeautifulSoup(response.content,"lxml")
# print(soup)