<h2> Semi-automatic literature search </h2> 

In [31]:
# import internal .py modules
import file_path_management as fpath
import public_library as plib

In [32]:
# import packages
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
import re
import time
import numpy as np
import numpy as np

<h3> Parameters: </h3>

In [33]:
# searching keywords lexicon
# search in all fields
# "" means exact match, otherwise the search engine will treat every word separately

# search_kws_lexicon = (macaque OR macaca OR "rhesus monkey") AND (thalamus OR thalamic OR thalamocortical OR "thalamo-cortical")

# academic databases
# Google Scholar: "https://scholar.google.com/"
# (macaque OR macaca OR "rhesus monkey") (thalamus OR thalamic OR thalamocortical OR "thalamo-cortical")
# 132000 results
# https://scholar.google.com/scholar?start=0&q=(macaque+OR+macaca+OR+%22rhesus+monkey%22)+(thalamus+OR+thalamic+OR+thalamocortical+OR+%22thalamo-cortical%22)&hl=en&as_sdt=0,5

# Web of Science: "https://www.webofscience.com/wos/woscc/advanced-search" # can be exported to excel file
# (ALL=(macaque) OR ALL=(macaca) OR All=("rhesus monkey")) AND (ALL=(thalamus) OR ALL=(thalamic) OR ALL=(thalamocortical) OR ALL=("thalamo-cortical"))
# 1976 results
# https://www.webofscience.com/wos/woscc/summary/cbcda45c-f1a5-45d2-bc24-ff8c17e0c083-9a6726cd/relevance/1

# PubMed Central PMC: "https://pubmed.ncbi.nlm.nih.gov/advanced/" # can be exported to .csv file
# (macaque OR macaca OR "rhesus monkey") AND (thalamus OR thalamic OR thalamocortical OR "thalamo-cortical")
# 2606 results
# https://pubmed.ncbi.nlm.nih.gov/?term=(macaque%20OR%20macaca%20OR%20%22rhesus%20monkey%22)%20AND%20(thalamus%20OR%20thalamic%20OR%20thalamocortical%20OR%20%22thalamo-cortical%22)&page=1

# Europe PMC = "https://europepmc.org/advancesearch" # search resuts can be exported to .csv file
# (macaque OR macaca OR "rhesus monkey") AND (thalamus OR thalamic OR thalamocortical OR "thalamo-cortical") AND (LANG:"eng" OR LANG:"en" OR LANG:"us")
# 9140 results
# https://europepmc.org/search?query=%28macaque%20OR%20macaca%20OR%20%22rhesus%20monkey%22%29%20AND%20%28thalamus%20OR%20thalamic%20OR%20thalamocortical%20OR%20%22thalamo-cortical%22%29%20AND%20%28LANG%3A%22eng%22%20OR%20LANG%3A%22en%22%20OR%20LANG%3A%22us%22%29&page=1

acad_dbs = ["Google Scholar", "Web of Science", "PubMed Central PMC", "Europe PMC"]

# initial urls for specified searching keyword lexicon and all academic databases
init_urls = {
    "gs": "https://scholar.google.com/scholar?start=0&q=(macaque+OR+macaca+OR+%22rhesus+monkey%22)+(thalamus+OR+thalamic+OR+thalamocortical+OR+%22thalamo-cortical%22)&hl=en&as_sdt=0,5",
    "wos": "https://www.webofscience.com/wos/woscc/summary/cbcda45c-f1a5-45d2-bc24-ff8c17e0c083-9a6726cd/relevance/1",
    "pmc": "https://pubmed.ncbi.nlm.nih.gov/?term=(macaque%20OR%20macaca%20OR%20%22rhesus%20monkey%22)%20AND%20(thalamus%20OR%20thalamic%20OR%20thalamocortical%20OR%20%22thalamo-cortical%22)&page=1",
    "eupmc": "https://europepmc.org/search?query=%28macaque%20OR%20macaca%20OR%20%22rhesus%20monkey%22%29%20AND%20%28thalamus%20OR%20thalamic%20OR%20thalamocortical%20OR%20%22thalamo-cortical%22%29%20AND%20%28LANG%3A%22eng%22%20OR%20LANG%3A%22en%22%20OR%20LANG%3A%22us%22%29&page=1"
}

# seed literature list
seed_litera_list = []

# cocomac literature list
cocomac_litera_list = []

<h3> Predefined fucntions: </h3> 

In [38]:
# search academic database google scholar given a initial url and return the search results
# the given initial url is manually obtained by input search query in the google scholar
def search_google_scholar(first_page):
    print("Searching Google Scholar...")
    f = open(fpath.poten_litera_gs, "w")
    f.truncate()
    f.close()

    # request the webpage unitl the status code is 200
    proxies, auth = plib.get_proxies()
    response = requests.get(first_page, headers = plib.headers, proxies = proxies, auth = auth)
    print(response.status_code)
    if response.status_code != 200:
        print("not 200")
        raise Exception("Your request was declined, again!")
    # parse the webpage
    soup = BeautifulSoup(response.content, "lxml")
    # print(soup)
    num_results_str_list = soup.find_all("div", {"class": "gs_ab_mdw"})
    # print(num_results_str_list)
    for item in num_results_str_list:
        num_results_str = []
        if "results" in item.get_text():
            num_results_str = item.get_text().split()
    num_results = num_results_str[1]
    # print(num_results)
    num_results = int(re.sub(r"[^0-9]", "", num_results))
    pages = int(num_results/10)
    print("Google Scholar searched " + str(num_results) + " results" + " displayed in " + str(pages) + " pages.")
    
    # iterate all pages and record the results
    pages = 20
    for page in range(pages):
        time.sleep(random.randint(2, 5))
        # time.sleep(random.randint(1*60, 10*60))
        start = page * 10
        # https://scholar.google.com/scholar?start=0&q=(macaque+OR+macaca+OR+%22rhesus+monkey%22)+(thalamus+OR+thalamic+OR+thalamocortical+OR+%22thalamo-cortical%22)&hl=en&as_sdt=0,5
        page_url = first_page.split("?start=")[0] + "?start=" + str(start) + "&q=" + first_page.split("&q=")[1]
        # print(page_url)
        # search a page
        proxies, auth = plib.get_proxies()
        response = requests.get(page_url, headers = plib.headers, proxies = proxies, auth = auth)
        if response.status_code != 200:
            raise Exception("Your request was declined, again!")
        # while(response.status_code != 200):
        #     # sleep for 5 minutes
        #     time.sleep(300)
        #     response = requests.get(page_url, headers = plib.headers)
        soup = BeautifulSoup(response.content, "lxml")
        # print(soup)
        # print(soup.select("[data-lid]")) 
        for item in soup.select("[data-lid]"):
            # print(item)
            try:
                add_title = item.select("h3")[0].select("a", href = True)[0].get_text().strip()
            except:
                add_title = "not found"
            # print(add_title)
            try:
                add_url = item.select("h3")[0].select("a", href = True)[0]["href"]
            except:
                add_url = "not found"
            # print(add_url)
            try:
                add_full_text_link = item.find_all("div", {'class': "gs_or_ggsm"})[0].find_all("a", href = True)[0]["href"]
            except:
                add_full_text_link = "not found"
            # print(add_full_text_link)
            try:
                full_text_type = item.find_all("div", {'class': "gs_or_ggsm"})[0].find_all("a", href = True)[0].get_text().strip().split()[0]
                full_text_source = item.find_all("div", {'class': "gs_or_ggsm"})[0].find_all("a", href = True)[0].get_text().strip().split()[1]
                # print(full_text_source)
            except:
                full_text_type = "not found"
                full_text_source = "not found"
            # print(full_text_source)
            row = {
                "url": [add_url],
                "title": [add_title],
                "full_text_url": [add_full_text_link],
                "full_text_type": [full_text_type],
                "full_text_source": [full_text_source]
            }
            columns = ["url", "title", "full_text_url", "full_text_type", "full_text_source"]
            plib.add_row_to_csv(fpath.poten_litera_gs, row, columns)
    #     return True
    # except:
    #     return False
# --------------------start of test code--------------------
init_url = init_urls["gs"]
search_google_scholar(init_url)
# ---------------------end of test code---------------------

Searching Google Scholar...


TypeError: 'str' object is not callable

In [35]:
from proxy_seller_user_api import Api
api = Api({'key':'c223ca2f84843d299ce2f3c5c0aa486d'})
print(api.proxyCheck())

TypeError: Api.proxyCheck() missing 1 required positional argument: 'proxy'

In [None]:
from requests.auth import HTTPProxyAuth

init_url = init_urls["gs"]
# proxies, auth=plib.get_proxies()
proxies = { 
        "http": "http://23.165.240.230:14409",
        "http": "https://23.165.240.230:14414"
    }
auth = HTTPProxyAuth("didihou", "KqpKtsynqI")
response = requests.get(init_url, headers = plib.headers, proxies=proxies, auth=auth)
print(response.status_code)
soup = BeautifulSoup(response.content, "lxml")
print(soup)

200
<!DOCTYPE html>
<html><head><title>Google Scholar</title><meta content="text/html;charset=utf-8" http-equiv="Content-Type"/><meta content="IE=Edge" http-equiv="X-UA-Compatible"/><meta content="always" name="referrer"/><meta content="width=device-width,initial-scale=1,minimum-scale=1,maximum-scale=2" name="viewport"/><meta content="telephone=no" name="format-detection"/><link href="/favicon.ico" rel="shortcut icon"/><style>html,body,form,table,div,h1,h2,h3,h4,h5,h6,img,ol,ul,li,button{margin:0;padding:0;border:0;}table{border-collapse:collapse;border-width:0;empty-cells:show;}html,body{height:100%}#gs_top{position:relative;box-sizing:border-box;min-height:100%;min-width:964px;-webkit-tap-highlight-color:rgba(0,0,0,0);}#gs_top>*:not(#x){-webkit-tap-highlight-color:rgba(204,204,204,.5);}.gs_el_ph #gs_top,.gs_el_ta #gs_top{min-width:320px;}#gs_top.gs_nscl{position:fixed;width:100%;}body,td,input,button{font-size:13px;font-family:Arial,sans-serif;line-height:1.24;}body{background:#fff;c

In [None]:
def search_webofscience(init_url):
    try:
        print("Searching Web of Science...")
        # search on the website and export the search results
        return True
    except:
        return False

In [None]:
def search_pmc(init_url):
    try:
        print("Searching PubMed Central PMC...")
        # search on the website and export the search results
        return True
    except:
        return False

In [None]:
def search_eupmc(init_url):
    try:
        print("Searching Europe PMC...")
        # search on the website and export the search results
        return True
    except:
        return False

In [None]:
def span_citations(seed_litera_list, num_span_time):
    try:
        return True
    except:
        return False

In [None]:
def search_conne_db():
    try:    
        return True
    except:
        return False

In [None]:
def merge_webofscience(columns):
    print("Starting merging search results from Web of Science...")
    try:
        df_wos = pd.read_csv(fpath.poten_litera_wos, sep = ";")
        df_wos = df_wos[["DOI", "Pubmed Id", "Article Title"]]
        df_wos.rename(columns={"DOI": "DOI", "Pubmed Id": "PMID", "Article Title": "Title"}, inplace = True)
        df_wos["PMID"] = df_wos["PMID"].fillna(0)
        df_wos["PMID"] = df_wos["PMID"].astype(int)
        df_wos["PMID"] = df_wos["PMID"].astype(str)
        # print(df_wos.head(5))
        # print(df_wos.dtypes)
        pmcid = []
        for ind in df_wos.index:
            # print(df_wos["PMID"][ind])
            if df_wos["PMID"][ind] != "0":
                pmid = df_wos["PMID"][ind]
                # print(pmid)
                df_wos["PMID"][ind] = pmid
                url = "https://pubmed.ncbi.nlm.nih.gov/" + pmid + "/"
                # print(url)
                time.sleep(random.randint(5, 20))
                response = requests.get(url, headers = plib.headers)
                if response.status_code != 200:
                    raise Exception("Error when request webpages!")
                soup = BeautifulSoup(response.content, "lxml")
                l = soup.find_all("a", {"data-ga-action": "PMCID"})
                if(len(l) != 0):
                    # print(l[0].get_text().strip())
                    pmcid.append(l[0].get_text().strip())
                else:
                    pmcid.append(np.nan)
            else:
                pmcid.append(np.nan)
            # print(df_wos[ind])
        df_wos["PMCID"] = pmcid
        df_wos["PMCID"].replace("0", np.nan)
        # print(df_wos.head(5))
        df_wos = df_wos[columns]
        df_wos.to_csv(fpath.poten_litera, header = True, index = None)
        return True
    except:
        return False   

In [None]:
def merge_pmc(columns):
    print("Starting merging search results from PubMed Central PMC...")
    try:
        # process pmc search results
        df_pmc = pd.read_csv(fpath.poten_litera_pmc, sep=',')
        doi = df_pmc[["DOI", "PMID", "PMCID", "Title"]]
        doi.to_csv(fpath.poten_litera, mode = "a", header = None, index = None)
        return True
    except:
        return False

In [None]:
def merge_eupmc(columns):
    print("Starting merging search results from Europe PMC...")
    try:
        # process eupmc search results
        df_eupmc = pd.read_csv(fpath.poten_litera_eupmc, sep = ",")
        df_eupmc = df_eupmc[["DOI", "PMCID", "TITLE"]]
        df_eupmc = df_eupmc.rename(columns={"TITLE": "Title"}, errors = "raise")
        # print(df_eupmc.head(5))
        pmid = []
        for ind in df_eupmc.index:
            # print(df_eupmc["PMCID"][ind])
            if df_eupmc["PMCID"][ind] is not np.nan:
                pmcid = str(df_eupmc["PMCID"][ind])
                # print(pmcid)
                df_eupmc["PMCID"][ind] = pmcid
                url = "https://www.ncbi.nlm.nih.gov/pmc/articles/" + pmcid + "/"
                # print(url)
                time.sleep(random.randint(5, 20))
                response = requests.get(url, headers = plib.headers)
                if response.status_code != 200:
                    print(response.status_code)
                    raise Exception("Error when request webpages!")
                soup = BeautifulSoup(response.content, "lxml")
                l = soup.find_all("div", {"class": "fm-citation-pmid"})
                if (len(l)) != 0:
                    ll = l[0].find_all("a", href = True)
                    if(len(ll) != 0):
                        # print(ll[0].get_text().strip())
                        pmid.append(ll[0].get_text().strip())
                else:
                    pmid.append(np.nan)
            else:
                pmid.append(np.nan)
            # print(df_wos[ind])
        df_eupmc["PMID"] = pmid
        df_eupmc = df_eupmc[columns]
        df_eupmc.to_csv(fpath.poten_litera, header = None, index = None)
        return True
    except:
        return False

In [None]:
def merge_google_shcolar(columns):
    print("Starting merging search results from Google Scholar...")
    return True

In [None]:
def merge_seed_paper_spanning(columns):
    print("Starting merging search results from spanning citations of seed paper...")
    return True

In [None]:
def merge_cocomac_paper(columns):
    print("Starting merging search results from CoCoMac papers...")
    return True

<h3> Main program: </h3> 

In [None]:
# first we need to search all related literature that might include data or information of thalamocortical connections
# search for potentially related literature using the following listed 3 methods

In [None]:
# method 1: search acdemic databases using keywords
if search_webofscience(init_urls["wos"]):
    print("Searching Web of Science succeeded!")
else:
    print("Attention! Something went wrong when searching Web of Science!")

if search_pmc(init_urls["pmc"]):
    print("Searching PubMed Central PMC succeeded!")
else:
    print("Attention! Something went wrong when searching PubMed Central PMC!")

if search_eupmc(init_urls["eupmc"]):
    print("Searching Europe PMC succeeded!")
else:
    print("Attention! Something went wrong when searching Europe PMC!")

if search_google_scholar(init_urls["gs"]):
    print("Searching Google Scholar succeeded!")
else:
    print("Attention! Something went wrong when searching Google Scholar!")

In [None]:
# # method 2: spanning citations of seed papers
# if span_citations(seed_papers, num_span_time):
#     print("Spanning citations of seed literature list succeeded!")
# else:
#     print("Attention! Something went wrong when spanning citations of seed literature list!")

In [None]:
# # method 3: search existing connectome databases
# if search_conne_db(connec_db, connec_db_quries):
#     print("Searching connectome databases succeeded!")
# else:
#     print("Attention! Something went wrong when searching connectome databases!")

In [None]:
# # merge all searched literature results
# # clear the file
# f = open(fpath.poten_litera, "w")
# f.truncate()
# f.close()

# # column: ["DOI", "PMID", "PMCID", "Title"]
# columns = ["DOI", "PMID", "PMCID", "Title"]

In [None]:
# # merge search results from Web of Science
# if merge_webofscience(columns):
#     print("Merging results from Web of Science succeeded!")
# else:
#     print("Attention! Something went wrong when merging results from Web of Science!")

In [None]:
# # merge search results from PubMed Central PMC
# if merge_pmc(columns):
#     print("Merging results from PubMed Central PMC succeeded!")
# else:
#     print("Attention! Something went wrong when merging results from PubMed Central PMC!")

In [None]:
# # merge search results from Europe PMC
# if merge_eupmc(columns):
#     print("Merging results from Europe PMC succeeded!")
# else:
#     print("Attention! Something went wrong when merging results from Europe PMC!")

In [None]:
# # merge search results from Google Scholar
# if merge_google_shcolar(columns):
#     print("Merging results from Google Scholar succeeded!")
# else:
#     print("Attention! Something went wrong when merging results from Google Scholar!")

In [None]:
# # merge search results from spanning citations of seed paper
# if merge_seed_paper_spanning(columns):
#     print("Merging results from spanning citations of seed papers succeeded!")
# else:
#     print("Attention! Something went wrong when merging results from spanning citations of seed papers!")

In [None]:
# # merge search results from CoCoMac papers
# if merge_cocomac_paper(columns):
#     print("Merging results from CoCoMac papers succeeded!")
# else:
#     print("Attention! Something went wrong when merging results from CoCoMac papers!")

Next step: automatic filtering the potential related literature

<h3> Some test code, please ignore: </h3> 

In [None]:
# # test request
# # webpage that require subscription
# # url = "https://www.sciencedirect.com/science/article/abs/pii/S0079612305490019?via%3Dihub"
# url = "https://www.sciencedirect.com/science/article/pii/S095943880900049X?via%3Dihub"
# # webpage that does not require subscription
# # url = "https://pubmed.ncbi.nlm.nih.gov/34524542/"
# # url = "https://link.springer.com/article/10.1007/s00429-021-02377-7"
# response = requests.get(url, headers = plib.headers)
# soup = BeautifulSoup(response.content,"html.parser")
# # print(soup.find_all("a"))
# print(soup)

In [None]:
# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.common.keys import Keys
# # from selenium.webdriver.chrome.options import Options

# # options = Options()
# # options.add_argument('--headless')
# browser = webdriver.Firefox()

# browser.get(url)
# time.sleep(10)
# # assert 'Yahoo' in browser.title

# # ele = browse?r.find_element("a")
# # elems = browser.find_element(By.CLASS_NAME, "title-text")
# # pdf_links = browser.find_elements(By.XPATH, "//a[contains(@href, '.pdf')]")
# pdf_links = browser.find_elements(By.XPATH, "//h1")[0].get_attribute('innerHTML')
# print(pdf_links)
# # for elem in elems:
# #     print(elem)
# # print(elems)
# # elem = browser.find_element(By.NAME, 'p')  # Find the search box
# # elem.send_keys('seleniumhq' + Keys.RETURN)
# browser.quit()