<h2> Semi-automatic literature search </h2> 

In [23]:
# import internal .py modules
import file_path_management as fpath
import public_library as plib

In [24]:
# import packages
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
import re
import time
import numpy as np
import numpy as np

<h3> Parameters: </h3>

In [25]:
# searching keywords lexicon
# search in all fields
# "" means exact match, otherwise the search engine will treat every word separately

# search_kws_lexicon = (macaque OR macaca OR "rhesus monkey") AND (thalamus OR thalamic OR thalamocortical OR "thalamo-cortical")

# academic databases
# 27.07.2023
# Google Scholar: "https://scholar.google.com/"
# (macaque OR macaca OR "rhesus monkey") (thalamus OR thalamic OR thalamocortical OR "thalamo-cortical")
# 132000 results
# https://scholar.google.com/scholar?start=0&q=(macaque+OR+macaca+OR+%22rhesus+monkey%22)+(thalamus+OR+thalamic+OR+thalamocortical+OR+%22thalamo-cortical%22)&hl=en&as_sdt=0,5

# Web of Science: "https://www.webofscience.com/wos/woscc/advanced-search" # can be exported to excel file
# (ALL=(macaque) OR ALL=(macaca) OR All=("rhesus monkey")) AND (ALL=(thalamus) OR ALL=(thalamic) OR ALL=(thalamocortical) OR ALL=("thalamo-cortical"))
# 1976 results
# https://www.webofscience.com/wos/woscc/summary/cbcda45c-f1a5-45d2-bc24-ff8c17e0c083-9a6726cd/relevance/1

# PubMed Central PMC: "https://pubmed.ncbi.nlm.nih.gov/advanced/" # can be exported to .csv file
# (macaque OR macaca OR "rhesus monkey") AND (thalamus OR thalamic OR thalamocortical OR "thalamo-cortical")
# 2606 results
# https://pubmed.ncbi.nlm.nih.gov/?term=(macaque%20OR%20macaca%20OR%20%22rhesus%20monkey%22)%20AND%20(thalamus%20OR%20thalamic%20OR%20thalamocortical%20OR%20%22thalamo-cortical%22)&page=1

# Europe PMC: "https://europepmc.org/advancesearch" # search results can be exported to .csv file
# (macaque OR macaca OR "rhesus monkey") AND (thalamus OR thalamic OR thalamocortical OR "thalamo-cortical") AND (LANG:"eng" OR LANG:"en" OR LANG:"us")
# 9140 results
# https://europepmc.org/search?query=%28macaque%20OR%20macaca%20OR%20%22rhesus%20monkey%22%29%20AND%20%28thalamus%20OR%20thalamic%20OR%20thalamocortical%20OR%20%22thalamo-cortical%22%29%20AND%20%28LANG%3A%22eng%22%20OR%20LANG%3A%22en%22%20OR%20LANG%3A%22us%22%29&page=1

# display names of the academic databases queried with the keyword lexicon
acad_dbs = [
    "Google Scholar",
    "Web of Science",
    "PubMed Central PMC",
    "Europe PMC",
]

# first results page of the keyword search on each database, keyed by a short database code
init_urls = dict([
    ("gs", "https://scholar.google.com/scholar?start=0&q=(macaque+OR+macaca+OR+%22rhesus+monkey%22)+(thalamus+OR+thalamic+OR+thalamocortical+OR+%22thalamo-cortical%22)&hl=en&as_sdt=0,5"),
    ("wos", "https://www.webofscience.com/wos/woscc/summary/cbcda45c-f1a5-45d2-bc24-ff8c17e0c083-9a6726cd/relevance/1"),
    ("pmc", "https://pubmed.ncbi.nlm.nih.gov/?term=(macaque%20OR%20macaca%20OR%20%22rhesus%20monkey%22)%20AND%20(thalamus%20OR%20thalamic%20OR%20thalamocortical%20OR%20%22thalamo-cortical%22)&page=1"),
    ("eupmc", "https://europepmc.org/search?query=%28macaque%20OR%20macaca%20OR%20%22rhesus%20monkey%22%29%20AND%20%28thalamus%20OR%20thalamic%20OR%20thalamocortical%20OR%20%22thalamo-cortical%22%29%20AND%20%28LANG%3A%22eng%22%20OR%20LANG%3A%22en%22%20OR%20LANG%3A%22us%22%29&page=1"),
])

# seed literature list (empty at this point)
seed_litera_list = list()

# cocomac literature list (empty at this point)
cocomac_litera_list = list()

<h3> Predefined functions: </h3> 

In [26]:
def search_webofscience(init_url):
    """Announce the Web of Science keyword search step.

    No request is sent from here; the body only prints a status line.
    Presumably the search and the export of its results are done by hand on
    the website -- TODO confirm and automate if desired.

    Args:
        init_url: URL of the first results page (currently unused).

    Returns:
        True if the step ran without raising, False otherwise.
    """
    try:
        print("Searching Web of Science...")
        # search on the website and export the search results
        return True
    except Exception:
        # deliberately not a bare except: a kernel interrupt (KeyboardInterrupt)
        # or SystemExit must still propagate instead of being reported as "False"
        return False

In [27]:
def search_pmc(init_url):
    """Announce the PubMed Central PMC keyword search step.

    No request is sent from here; the body only prints a status line.
    Presumably the search and the export of its results are done by hand on
    the website -- TODO confirm and automate if desired.

    Args:
        init_url: URL of the first results page (currently unused).

    Returns:
        True if the step ran without raising, False otherwise.
    """
    try:
        print("Searching PubMed Central PMC...")
        # search on the website and export the search results
        return True
    except Exception:
        # deliberately not a bare except: a kernel interrupt (KeyboardInterrupt)
        # or SystemExit must still propagate instead of being reported as "False"
        return False

In [28]:
def search_eupmc(init_url):
    """Announce the Europe PMC keyword search step.

    No request is sent from here; the body only prints a status line.
    Presumably the search and the export of its results are done by hand on
    the website -- TODO confirm and automate if desired.

    Args:
        init_url: URL of the first results page (currently unused).

    Returns:
        True if the step ran without raising, False otherwise.
    """
    try:
        print("Searching Europe PMC...")
        # search on the website and export the search results
        return True
    except Exception:
        # deliberately not a bare except: a kernel interrupt (KeyboardInterrupt)
        # or SystemExit must still propagate instead of being reported as "False"
        return False

In [29]:
def _gs_fetch_soup(url, proxies, label):
    """Download one Google Scholar page and return it parsed; raise if the request is declined."""
    response = requests.get(url, headers = plib.headers, proxies = proxies)
    if response.status_code != 200:
        print("Error when searching page:", label)
        print(response.status_code)
        raise Exception("Your request was declined, again!")
    return BeautifulSoup(response.content, "lxml")


def _gs_count_results(soup):
    """Return the hit count from the results header ("About 132,000 results ..."), or None if absent."""
    for header in soup.find_all("div", {"class": "gs_ab_mdw"}):
        # take the number that directly precedes the word "results" instead of
        # relying on a fixed token position ("5 results" has no leading "About")
        match = re.search(r"([0-9][0-9.,]*)\s+results", header.get_text())
        if match:
            return int(re.sub(r"[^0-9]", "", match.group(1)))
    return None


def _gs_parse_item(item):
    """Turn one search-result element into a csv row; fields that are missing become "not found"."""
    missing = "not found"

    # title and landing page come from the first link inside the first <h3>
    add_title = missing
    add_url = missing
    heading = item.find("h3")
    title_link = heading.find("a") if heading is not None else None
    if title_link is not None:
        add_title = title_link.get_text().strip()
        add_url = title_link.get("href", missing)

    # the optional full-text box holds a link whose label starts with two
    # tokens that are stored as type and source (e.g. "[PDF] example.org")
    add_full_text_link = missing
    full_text_type = missing
    full_text_source = missing
    full_text_box = item.find("div", {"class": "gs_or_ggsm"})
    full_text_link = full_text_box.find("a", href = True) if full_text_box is not None else None
    if full_text_link is not None:
        add_full_text_link = full_text_link["href"]
        label_parts = full_text_link.get_text().strip().split()
        if len(label_parts) >= 2:
            full_text_type, full_text_source = label_parts[0], label_parts[1]

    return {
        "title": [add_title],
        "url": [add_url],
        "full_text_url": [add_full_text_link],
        "full_text_type": [full_text_type],
        "full_text_source": [full_text_source]
    }


def search_google_scholar(first_page, max_pages = 100):
    """Scrape Google Scholar result pages and store one csv row per result.

    The first page is requested once to read the total number of results,
    then the result pages are requested one by one (10 results per page) with
    a random 1-10 s pause before each request and a fresh proxy every 5 pages.
    Each result is written to ``fpath.poten_litera_gs`` through
    ``plib.add_row_to_csv``.

    Args:
        first_page: URL of the first results page; it must contain
            "?start=" and "&q=" because the page URLs are derived from it.
        max_pages: upper bound on the number of result pages to request.
            Defaults to 100, the page count that was previously hard-coded.

    Raises:
        Exception: if any request returns a status code other than 200.
    """
    print("Searching Google Scholar...")

    # request the first page 1
    proxies = plib.get_proxies()
    soup = _gs_fetch_soup(first_page, proxies, first_page)

    # empty the output file only after the first request went through, so a
    # declined request does not wipe the results of an earlier run
    with open(fpath.poten_litera_gs, "w"):
        pass

    num_results = _gs_count_results(soup)
    if num_results is None:
        print("Could not read the number of results from the first page, searching up to " + str(max_pages) + " pages.")
        pages = max_pages
    else:
        # round up so that a last, partially filled page is counted too
        total_pages = (num_results + 9) // 10
        print("Google Scholar searched " + str(num_results) + " results" + " displayed in " + str(total_pages) + " pages.")
        pages = min(total_pages, max_pages)

    # page urls only differ in the value of the "start" parameter
    url_prefix = first_page.split("?start=")[0] + "?start="
    url_suffix = "&q=" + first_page.split("&q=")[1]
    columns = ["title", "url", "full_text_url", "full_text_type", "full_text_source"]

    # iterate all pages and record the results
    for page in range(pages):
        print("Searching page ", page)
        time.sleep(random.randint(1, 10))

        page_url = url_prefix + str(page * 10) + url_suffix

        # switch to a new proxy every 5 pages
        if page % 5 == 0:
            proxies = plib.get_proxies()
        print(proxies)

        soup = _gs_fetch_soup(page_url, proxies, page)
        for item in soup.select('[data-lid]'):
            row = _gs_parse_item(item)
            if not plib.add_row_to_csv(fpath.poten_litera_gs, row, columns):
                print("Error detected when adding a row to csv!")
# --------------------start of test code--------------------
# init_url = init_urls["gs"]
# search_google_scholar(init_url)
# ---------------------end of test code---------------------

In [30]:
def span_citations(seed_litera_list, num_span_time):
    """Placeholder for expanding the seed literature through its citations.

    Nothing is implemented yet: both arguments are ignored and the function
    reports success.

    Args:
        seed_litera_list: seed literature to start from (currently unused).
        num_span_time: presumably the number of expansion rounds
            (currently unused) -- TODO confirm once implemented.

    Returns:
        True if the step ran without raising, False otherwise.
    """
    try:
        return True
    except Exception:
        # deliberately not a bare except: a kernel interrupt (KeyboardInterrupt)
        # or SystemExit must still propagate instead of being reported as "False"
        return False

In [31]:
def search_conne_db():
    """Placeholder for searching existing connectome databases.

    Nothing is implemented yet: the function takes no arguments and reports
    success.

    Returns:
        True if the step ran without raising, False otherwise.
    """
    try:
        return True
    except Exception:
        # deliberately not a bare except: a kernel interrupt (KeyboardInterrupt)
        # or SystemExit must still propagate instead of being reported as "False"
        return False

<h3> Main program: </h3> 

In [32]:
# first we need to search all related literature that might include data or information of thalamocortical connections
# search for potentially related literature using the following listed 3 methods

In [33]:
# method 1: search academic databases using keywords
# each entry: (search function, key into init_urls, message on success, message on failure)
keyword_searches = [
    (
        search_webofscience,
        "wos",
        "Searching Web of Science succeeded!",
        "Attention! Something went wrong when searching Web of Science!",
    ),
    (
        search_pmc,
        "pmc",
        "Searching PubMed Central PMC succeeded!",
        "Attention! Something went wrong when searching PubMed Central PMC!",
    ),
    (
        search_eupmc,
        "eupmc",
        "Searching Europe PMC succeeded!",
        "Attention! Something went wrong when searching Europe PMC!",
    ),
]

# run the searches in the listed order and report the outcome of each one
for run_search, url_key, success_msg, failure_msg in keyword_searches:
    succeeded = run_search(init_urls[url_key])
    print(success_msg if succeeded else failure_msg)

# Google Scholar signals failure by raising, so reaching the print means success
search_google_scholar(init_urls["gs"])
print("Searching Google Scholar succeeded!")

Searching Web of Science...
Searching Web of Science succeeded!
Searching PubMed Central PMC...
Searching PubMed Central PMC succeeded!
Searching Europe PMC...
Searching Europe PMC succeeded!
Searching Google Scholar...
Error when searching page: https://scholar.google.com/scholar?start=0&q=(macaque+OR+macaca+OR+%22rhesus+monkey%22)+(thalamus+OR+thalamic+OR+thalamocortical+OR+%22thalamo-cortical%22)&hl=en&as_sdt=0,5
429


Exception: Your request was declined, again!

In [None]:
# # method 2: spanning citations of seed papers
# if span_citations(seed_papers, num_span_time):
#     print("Spanning citations of seed literature list succeeded!")
# else:
#     print("Attention! Something went wrong when spanning citations of seed literature list!")

In [None]:
# # method 3: search existing connectome databases
# if search_conne_db(connec_db, connec_db_quries):
#     print("Searching connectome databases succeeded!")
# else:
#     print("Attention! Something went wrong when searching connectome databases!")

<h3> Next step: searched literature data preprocessing </h3>