In [10]:
# import internal .py files
import file_path_management as fpath
import public_library as plib

In [11]:
# all the file paths in file_path_management.py
# project_folder
# gs_poten_urls
# wos_poten_urls
# pubmed_pmc_poten_urls
# eupmc_poten_urls
# path_poten_csv
# path_related_urls
# path_related_csv
# pdf_folder_path
# seed_paper_urls
# connec_db_urls

In [12]:
# import packages
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
import re
import time

In the next cell, we present all parameters that might have an effect on the search results, including:<br>
1. searching keyword lexicon
2. academic databases
3. initial urls when searching academic databases
4. seed paper list for spanning citations
5. conenctome database
6. seaching queries of the connectome database
7. on-topic keyword lexicon
8. weights of on-topic keywords when calculating relatedness of a literature
9. ChatGPT queries for relatedness of topic
10. meta categories when extracting information of related literature
11. keywords for searching meta categories
12. ChatGPT queries for extracting information of meta categories of related literature

In [13]:
# parameters

# searching keyword lexicon
search_kws_lexicon = 'macaque AND (thalamus OR thalamocortical OR thalamo-cortical)' # in all fields

# academic databases
# Google Scholar: 'https://scholar.google.com/'
# 78300 results
# Web of Science: 'https://www.webofscience.com/wos/woscc/advanced-search' # can be exported to excel file
# 961 results
# PubMed Central PMC: 'https://pubmed.ncbi.nlm.nih.gov/advanced/' # can be exported to .csv file and abstract.txt file
# 2448 results
# Europe PMC = 'https://europepmc.org/advancesearch' # can be exported to .csv file or abstract and full open access file .xml
# 5129 results
acad_dbs = ['Google Scholar', 'Web of Science', 'PubMed_Central_PMC', 'Europe_PMC']

# initial urls for specified searching keyword lexicon and all academic databases
init_urls = {
    'gs': 'https://scholar.google.com/scholar?start=0&q=macaque+thalamus+OR+thalamocortical+OR+thalamo-cortical&hl=en&as_sdt=1,5',
    'wos': 'https://www.webofscience.com/wos/woscc/summary/79530a3c-47d5-4dd0-9b7d-b1d92fd11882-98d8472a/relevance/1',
    'pubmed': 'https://pubmed.ncbi.nlm.nih.gov/?term=(((thalamus)%20OR%20(thalamocortical))%20OR%20(thalamo-cortical))%20AND%20(macaque)&sort=&page=1',
    'eupmc': 'https://europepmc.org/search?query=%28%22macaque%22%20AND%20%28%22thalamus%22%20OR%20%22thalamocortical%22%20OR%20%22thalamo-cortical%22%29%20%29%20AND%20%28LANG%3A%22eng%22%20OR%20LANG%3A%22en%22%20OR%20LANG%3A%22us%22%29&page=1'
}

# seed papers specification
seed_papers = []

# connectome database and queries specification
# we search the CoCoMac
connec_db = ''
connec_db_quries = []

In [14]:
def search_google_scholar(init_url, headers):
    # create a .txt file to record the urls of google scholar search results, clear the file if already exists
    f = open(fpath.gs_poten_urls, 'w')
    f.truncate()
    f.close()

    # request the first page and extract the number of pages of the search results
    first_page = init_url
    response = requests.get(first_page, headers = headers)
    soup = BeautifulSoup(response.content,'lxml')
    # print(soup)
    num_results_str = soup.find_all('div', {'class': 'gs_ab_mdw'})[1].get_text().split()[1]
    # print(num_results_str)
    # print(int(num_results_str))
    num_results = int(re.sub(r'[^\w\s]', '', num_results_str))
    pages = int(num_results/10)
    print(pages)
    
    # iterate all pages and record the results
    pages = 5
    for page in range(pages):
        time.sleep(random.randint(1, 10))
        start = page * 10
        # google scholar
        page_url = init_url.split('?start=')[0] + '?start=' + str(start) + '&q=' + init_url.split('?start=')[1].split('&q=')[1]
        # print(page_url)
        # search a page
        response = requests.get(page_url, headers = headers)
        soup = BeautifulSoup(response.content,'lxml')
        # print(soup)
        # print(soup.select('[data-lid]')) 
        for item in soup.find_all('[data-lid]'):
            # print(item)
            add_url = item.find_all('h3')[0].find_all('a', href = True)[0]['href']
            print(add_url)
            try: 
                with open(fpath.gs_poten_urls, 'a') as url_file:
                    # append text at the end of file
                    url_file.write(f'{add_url}\n')
            except Exception as e: 
                print("Error when trying to write in google_scholar_poten_urls.txt")
                raise e
    print("Searching Google Scholar complated!")

def search_webofscience(init_url, headers):
    print("Searching Web of Science complated!")

def search_PubMed_Central_PMC(iinit_url, headers):
    print("Searching PubMedd Central PMC complated!")

def search_Europe_PMC(init_url, headers):
    print("Searching Europe PMC complated!")

In [15]:
# search academic databases, record the urls as a line in a .txt file from the webpages
def search_acad_dbs(acad_dbs, init_urls, headers, proxy):
    for acad_db in acad_dbs:
        if acad_db == 'Google Scholar':
            print("Searching Google Scholar...")
            search_google_scholar(init_urls['gs'], headers)
        elif acad_db == 'Web of Science':
            print("Searching Web of Science...")
            search_webofscience(init_urls['wos'], headers)
        elif acad_db == 'PubMed_Central_PMC':
            print("Searching PubMed Central PMC...")
            search_PubMed_Central_PMC(init_urls['pubmed'], headers)
        elif acad_db == 'Europe_PMC':
            print("Searching Europe PMC...")
            search_Europe_PMC(init_urls['pubmed'], headers)
        else:
            print("The specified academic database: " + acad_db + " is not supported by this function.")
            print("Plese choose one of the following databases:",)
            for db in ['Google Scholar', 'Web of Science', 'PubMed_Central_PMC', 'Europe_PMC']:
                print(db)
        
        # for page in range(pages):
        #     time.sleep(2)
        #     start = page * 10
        #     # google scholar
        #     url = 'https://scholar.google.com/scholar?start=' + str(start) + '&q=macaque+thalamus+OR+thalamocortical+OR+thalamo-cortical&hl=en&as_sdt=1,5'
        #     # pubmed
        #     url = 'https://pubmed.ncbi.nlm.nih.gov/?term=macaque%20AND%20(thalamus%20OR%20cortex%20OR%20thalamocortical%20OR%20thalamo-cortical%20or%20corticothalamic%20OR%20cortico-thalamic)&page=1
        #     response = requests.get(url,headers = headers)
        #     # print(url)
        #     soup = BeautifulSoup(response.content,'lxml') 
        #     #print(soup.select('[data-lid]')) 
        #     for item in soup.select('[data-lid]'): 
        #         try: 
        #             # print('----------------------------------------') 
        #             # print(item)  
        #             # print(item.select('h3')[0])
        #             with open(path_urls, 'a+') as url_file:
        #                 url_file.seek(0)
        #                 # If file is not empty then append '\n'
        #                 data = url_file.read(100)
        #                 if len(data) > 0 :
        #                     url_file.write('\n')
        #                     # Append text at the end of file
        #                 url_file.write('----------------------------------------\n')
        #                 url_file.write(item.select('h3')[0].get_text())
        #                 url_file.write('\n')
        #                 # print(item.select('h3')[0].get_text())
        #                 for a in item.select('h3')[0].find_all('a', href=True):
        #                     # print(a['href'])
        #                     url_file.write(a['href'])
        #                     url_file.write('\n')
        #                     # print(item.select('a'))
        #                     # print("PDF link:")
        #                 url_file.write(item.select('a')[0]['href'])
        #                 url_file.write('\n')
        #                 # print(item.select('a')[0]['href'])
        #                 # print(item.select('.gs_rs')[0].get_text()) 
        #                 # print('----------------------------------------') 
        #         except Exception as e: 
        #             #raise e 
        #             print('')

def span_citations(seed_papers, num_span_time, headers, proxy):
    None
    
def search_conne_db(connec_db, connec_db_quries):
    None

In [16]:
def merge_search_results(headers):
    # process gs_poten_urls
    with open(fpath.gs_poten_urls, 'r') as file:
        lines = []
        for line in file:
            print(line)
            line = line.strip()
            lines.append(line)
    print(len(lines))
    doi_list = []
    for url in lines:
        response = requests.get(url, headers = headers)
        soup = BeautifulSoup(response.content,'lxml')
        # print(soup)
        num_results_str = soup.find_all('a', href = True)
        for href in num_results_str:
            if '//doi.org/' in href:
                doi_list.append(href)
    doi_df = pd.DataFrame({'DOI': doi_list})
    plib.clear_file(fpath.path_poten_csv)
    doi_df.to_csv(fpath.path_poten_csv, index=False)
    
    # process wos_poten_urls
    doi_df = pd.read_csv(fpath.wos_poten_urls, sep=';')
    # print(doi_df.columns)
    # print(doi_df.head())
    doi_df = doi_df[['DOI']]
    doi_df.to_csv(fpath.path_poten_csv, mode='a', index=False, header=False)
    
    # process pubmed_pmc_poten_urls
    doi_df = pd.read_csv(fpath.pubmed_pmc_poten_urls)
    doi_df = doi_df[['DOI']]
    doi_df.to_csv(fpath.path_poten_csv, mode='a', index=False, header=False)
    
    # process eupmc_poten_urls
    doi_df = pd.read_csv(fpath.eupmc_poten_urls)
    doi_df = doi_df[['DOI']]
    doi_df.to_csv(fpath.path_poten_csv, mode='a', index=False, header=False)
    
    # eliminate duplicates
    doi_df = pd.read_csv(fpath.path_poten_csv)
    print(len(doi_df))
    doi_df = doi_df.drop_duplicates(subset = 'DOI')
    print(len(doi_df))
    doi_df.to_csv(fpath.path_poten_csv, index=False)
    
    # end of merge_search_results

In [17]:
# main program
# first we need to search all related literature that might include data or information of thalamocortical connections
# search for potentially related literature using the listed 3 methods

# setting headers and proxies
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9'}
http_proxy  = "http://103.148.39.50:83"
https_proxy = "https://47.254.158.115:20201"
proxy = {
    "http": http_proxy, 
    "https": https_proxy
}
# end of setting header and proxies

# method 1: search acdemic databases using keywords
search_acad_dbs(acad_dbs, init_urls, headers, proxy)

# # method 2: spanning citations of seed papers
# span_citations(seed_papers, num_span_time, headers, proxy)

# # method 3: search existing connectome databases
# search_conne_db(connec_db, connec_db_quries)

Searching Google Scholar...


7830
[<div class="gs_r gs_or gs_scl" data-aid="6fVxOhTnsSsJ" data-cid="6fVxOhTnsSsJ" data-did="6fVxOhTnsSsJ" data-lid="" data-rp="0"><div class="gs_ggs gs_fl"><div class="gs_ggsd"><div class="gs_or_ggsm" ontouchstart="gs_evt_dsp(event)" tabindex="-1"><a data-clk="hl=en&amp;sa=T&amp;oi=gga&amp;ct=gga&amp;cd=0&amp;d=3148551688574596585&amp;ei=0jK5ZL2iJd-Ty9YP4fG9-Ak" data-clk-atid="6fVxOhTnsSsJ" href="https://journals.physiology.org/doi/pdf/10.1152/jn.2001.85.1.219"><span class="gs_ctg2">[PDF]</span> physiology.org</a><a href="/scholar?output=instlink&amp;q=info:6fVxOhTnsSsJ:scholar.google.com/&amp;hl=en&amp;as_sdt=1,5&amp;scillfp=1846943134921136726&amp;oi=lle">Full View</a></div></div></div><div class="gs_ri"><h3 class="gs_rt" ontouchstart="gs_evt_dsp(event)"><a data-clk="hl=en&amp;sa=T&amp;ct=res&amp;cd=0&amp;d=3148551688574596585&amp;ei=0jK5ZL2iJd-Ty9YP4fG9-Ak" data-clk-atid="6fVxOhTnsSsJ" href="https://journals.physiology.org/doi/abs/10.1152/jn.2001.85.1.219" id="6fVxOhTnsSsJ">Effec

In [18]:
# merge all search results
merge_search_results(headers)

# send .PDF publication of all potential related literatures to ChatPDF.con and ask for relatedness 
# then record the answer to the list_of_potential_related_literature.csv as well
# ChatPDF_relatedness(path_urls, chatpdf_related_queries)

# now we have a list of potential related literature and the information about relatedness 
# stored in the file "list_of_potential_related_literature.csv"
# now we may perform a automatic filtering and manual filtering of the literature

# automatic filtering
#auto_filter(path_potential)

# manual filtering
# manual_filter(path_potential, path_related_urls)

0
8538
7058


This is the end of semi-automated literature search.

Now we have a list of actually related literature stored in list_of_related_literature.txt

Next step: we perform a information search on the list of related literature
We have a list of actually related literature at the moment, now we need to extract information we need from the literature. We intend to achieve this with a combination of automated searching and manual extraction

In [19]:
# some test code, should comment-out

In [20]:
'''
# test the redirect of the urls
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9'} 

response_pdf = requests.get('https://doi.org/10.1016/j.neuron.2020.01.005', allow_redirects=True, headers = headers)
print(response_pdf.history)
print(response_pdf.url)
response_pdf_1 = requests.get('https://linkinghub.elsevier.com/retrieve/pii/S0896627320300052', allow_redirects=True, headers = headers)
print(response_pdf_1.history)
print(response_pdf_1.url)


response_pdf = requests.get('https://onlinelibrary.wiley.com/doi/10.1111/ejn.13910', headers = headers)
soup_pdf = BeautifulSoup(response_pdf.content,'lxml')
print(soup_pdf)
'''

"\n# test the redirect of the urls\nheaders = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9'} \n\nresponse_pdf = requests.get('https://doi.org/10.1016/j.neuron.2020.01.005', allow_redirects=True, headers = headers)\nprint(response_pdf.history)\nprint(response_pdf.url)\nresponse_pdf_1 = requests.get('https://linkinghub.elsevier.com/retrieve/pii/S0896627320300052', allow_redirects=True, headers = headers)\nprint(response_pdf_1.history)\nprint(response_pdf_1.url)\n\n\nresponse_pdf = requests.get('https://onlinelibrary.wiley.com/doi/10.1111/ejn.13910', headers = headers)\nsoup_pdf = BeautifulSoup(response_pdf.content,'lxml')\nprint(soup_pdf)\n"

In [21]:
import re
if '//doi.org/' in 'https://doi.org/10.1016/0165-0173(96)00003-3':
    print("yes")

yes


In [22]:
# test 
with open(fpath.gs_poten_urls, 'r') as file:
    lines = []
    for line in file:
        print(line)
        line = line.strip()
        lines.append(line)
print(len(lines))
doi_list = []
for url in lines:
    response = requests.get(url, headers = headers)
    soup = BeautifulSoup(response.content,'lxml')
    # print(soup)
    num_results_str = soup.find_all('a', href = True)
    print(num_results_str)
    for href in num_results_str:
        if '//doi.org/' in href['href']:
            doi_list.append(href['href'])
            print(href['href'])
        else:
            print("Ops! Did't find DOI on this page!")

0


In [23]:
# find DOI
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9'} 
# this link does not have 'DOI' in href form but text from
url = 'https://www.jneurosci.org/content/28/43/11042.short'
# url = 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2613515/'
response = requests.get(url, headers = headers)
soup = BeautifulSoup(response.content,'lxml')
# print(soup)
doi_list = []
num_results_str = soup.find_all('a', href = True)
# print(num_results_str)
for item in num_results_str:
    if '//doi.org/' in item['href']:
        print(item['href'])
        doi_list.append(item['href'].split('//doi.org/')[1])

print(doi_list)
        
if len(doi_list) == 0:
    print("Ops! Did't find DOI on this page!")

[]
Ops! Did't find DOI on this page!


In [24]:
# redirect when access the doi link
from elsapy.elsdoc import FullDoc, AbsDoc
from elsapy.elsclient import ElsClient
import json
headers = {
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9', 
    'X-ELS-APIKEY': "310946e6e005957982c2c9cad6833ad3",
    'Accept': 'application/pdf',
    'X-ELS-Insttoken': "instToken",
    'view': 'FULL'
} 
# url = 'https://www.jneurosci.org/content/28/43/11042.short'
 #url = 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2613515/'

# Journal of Neurophysiology
# url = 'https://doi.org/10.1152/jn.2001.85.1.219'
# url = 'https://journals.physiology.org/doi/10.1152/jn.2001.85.1.219'

# science direct
# url = 'https://doi.org/10.1016/j.biopsych.2004.10.014'
# url = 'https://linkinghub.elsevier.com/retrieve/pii/S0006322304010947'
# url = 'https://www.sciencedirect.com/science/article/pii/S0006322304010947?via%3Dihub'
url = 'https://api.elsevier.com/content/article/doi/{10.1016/j.biopsych.2004.10.014}'
'''
response = requests.get(url, headers = headers)
soup = BeautifulSoup(response.content,'lxml')
print(soup)
print(response.history)
print(response.url)
# Load configuration
con_file = open("config.json")
config = json.load(con_file)
con_file.close()
'''
response = requests.get(url, headers = headers)
print(response)
'''
## Initialize client
client = ElsClient(config['apikey'])

## ScienceDirect (full-text) document example using DOI
doi_doc = FullDoc(doi = '10.1016/j.biopsych.2004.10.014')
print(doi_doc)
if doi_doc.read(client):
    print ("doi_doc.title: ", doi_doc.title)
    doi_doc.write("doi_doc")   
else:
    print ("Read document failed.")
'''

<Response [500]>


'\n## Initialize client\nclient = ElsClient(config[\'apikey\'])\n\n## ScienceDirect (full-text) document example using DOI\ndoi_doc = FullDoc(doi = \'10.1016/j.biopsych.2004.10.014\')\nprint(doi_doc)\nif doi_doc.read(client):\n    print ("doi_doc.title: ", doi_doc.title)\n    doi_doc.write("doi_doc")   \nelse:\n    print ("Read document failed.")\n'

In [25]:
# importing required modules
from PyPDF2 import PdfReader
  
# creating a pdf reader object
reader = PdfReader("/Users/didihou/Downloads/fncir-09-00079.pdf")
  
# printing number of pages in pdf file
print(len(reader.pages))
  
# getting a specific page from the pdf file
page = reader.pages[0]
  
# extracting text from page
text = ''.join(page.extract_text().splitlines())
print(text)

8
MINI REVIEWpublished: 02 December 2015doi: 10.3389/fncir.2015.00079Vestibular Interactions in theThalamusRajiv Wijesinghe1,Dario A. Protti2andAaron J. Camp1*1Sensory Systems and Integration Laboratory, Sydney Medical School, Discipline of Biomedical Science, University ofSydney, Sydney, NSW, Australia,2Vision Laboratory, Sydney Medical School, Discipline of Physiology, University of Sydney,Sydney, NSW, AustraliaEdited by:W. Martin Usrey,University of California, Davis, USAReviewed by:Elizabeth Quinlan,University of Maryland College Park,USAMarianne Dieterich,Ludwig-Maximilians-University,Germany*Correspondence:Aaron J. Campaaron.camp@sydney.edu.auReceived: 28 August 2015Accepted: 10 November 2015Published: 02 December 2015Citation:Wijesinghe R, Protti DA and Camp AJ(2015) Vestibular Interactions inthe Thalamus.Front. Neural Circuits 9:79.doi: 10.3389/fncir.2015.00079It has long been known that the vast majority of all information en route to the cerebralcortex must ﬁrst pass through 