In [2]:
'''
# import external packages
import pandas as pd
import re
from bs4 import BeautifulSoup 
import requests
import time
import os
import os.path
import csv
'''

In [3]:
# import internal packages
import semi_automated_literature_search as LS
import semi_automated_information_search as IS
import file_path_management
import public_library as PL
import automatic_filtering as AF

NameError: name 'pd' is not defined

In the next cell, we present all parameters that might have an effect on the search results, including:<br>
1. searching keyword lexicon
2. on-topic keyword lexicon
3. academic databases
4. seed papers
5. connectome database
6. connectome database queries
7. ChatPDF queries for relatedness of topic

In [None]:
# parameters
# Every input that can change the search results is collected in this cell.
# Lists and strings left empty below are placeholders that still need values.

# searching keyword lexicon
# earlier variant, kept for reference:
# (macaque OR Macaque) AND (thalamocortical OR corticothalamic OR 'cortico-thalamic' OR thalamus OR cortex)
search_kws_lexicon = 'macaque AND (thalamus OR cortex OR thalamocortical OR thalamo-cortical OR corticothalamic OR cortico-thalamic)'

# on-topic keyword lexicon
on_topic_kws = ['thalamocortical', 'thalamo-cortical', 'corticothalamic', 'cortico-thalamic',
                'tracing', 'tracer', 'tract tracing', 'tract-tracing', 'axonal tracing', 'neural anatomical tracing',
                'connection', 'projection', 'connectivity', 'connectome',
                'thalamus', 'cortex']

# academic databases
# Semantic Scholar: 'https://www.semanticscholar.org/'
# Google Scholar: 'https://scholar.google.com/'
# Web of Science: 'https://www.webofscience.com/wos/woscc/basic-search' # can be exported to an excel file
# PubMed Central PMC: 'https://www.ncbi.nlm.nih.gov/pmc/' # can be exported to .csv file and abstract.txt file
# Europe PMC: 'https://europepmc.org/' # can be exported to .csv file or abstract and full open access file .xml
acad_dbs = ['Semantic Scholar', 'Google Scholar', 'Web of Science', 'PubMed_Central_PMC', 'Europe_PMC']

# initial urls for the specified searching keyword lexicon, one entry per academic database
# (dict entries use `key: value`; the databases without a start URL yet hold
# an empty-string placeholder so the dict is valid and the gap stays visible)
init_url = {
    'gs': 'https://scholar.google.com/scholar?start=0&q=%22thalamus%22+OR+%22thalamocortical%22+OR+%22thalamo-cortical%22+%22macaque%22&hl=en&as_sdt=1,5',
    'wos': '',     # TODO: add the Web of Science start URL
    'pubmed': '',  # TODO: add the PubMed Central start URL
    'eupmc': '',   # TODO: add the Europe PMC start URL
}
# plural alias of the same dict object, so both spellings of the name resolve
init_urls = init_url

# seed papers specification
seed_papers = []

# connectome database and queries specification
# we search the CoCoMac
connec_db = ''
connec_db_quries = []

# ChatGPT, queries for relatedness of topic
ChatGPT_related_queries = ['Does the given text include information of thalamocortical connection?',
                           'Does the given text include information of connection between thalamus and cortex?']

# meta categories, keywords, and corresponding queries
meta_categ = ['DOI', 'Publication_link', 'Authors', 'Year', 'Country', 'Affiliation',
              'Title', 'Abstract', 'Keywords', 'Thalamic_parcellation_scheme', 'Cortical_parcellation_scheme',
              'Thalamic_area_focused', 'Cortical_area_focused',
              'Steriotactic_axis', 'Type_of_data']
meta_categ_kws = []
meta_categ_quries = []

In [None]:
# all the file paths in file_path_management.py
# path_poten_urls
# path_poten_csv
# path_related_urls
# path_related_csv
# pdf_folder_path
# seed_paper_urls
# connec_db_urls

In [None]:
# main program
# Runs the literature-search stage: only method 1 (keyword search of the
# academic databases) is enabled; the later steps are kept below as comments.
if __name__ == "__main__":
    # first we need to search all related literature that might include data or information of thalamocortical connections
    # search for potentially related literature using the listed 3 methods

    # headers simulating a browser so that the web scraping won't get blocked
    headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9'}

    # method 1: search academic databases using keywords
    # `init_url` is the start-URL dict defined in the parameters cell
    LS.search_acad_dbs(acad_dbs, init_url, headers)

    # ------------------------------------------------------------------
    # Steps below are not enabled yet. They are written as `#` comments
    # rather than inside a triple-quoted string so the cell always parses.
    # ------------------------------------------------------------------

    # method 2: spanning citations of seed papers
    # LS.scan_poten_urls(path_urls, on_topic_kws)

    # method 3: search existing connectome databases
    # LS.search_conne_db(connec_db, connec_db_quries)

    # send .PDF publication of all potential related literatures to ChatPDF.com and ask for relatedness
    # then record the answer to the list_of_potential_related_literature.csv as well
    # ChatPDF_relatedness(path_urls, chatpdf_related_queries)

    # now we have a list of potential related literature and the information about relatedness
    # stored in the file "list_of_potential_related_literature.csv"
    # now we may perform an automatic filtering and manual filtering of the literature

    # automatic filtering
    # auto_filter(path_potential)

    # manual filtering
    # manual_filter(path_potential, path_related_urls)

In [None]:
# NOTE: this cell consists solely of one triple-quoted string literal, so none
# of the statements inside it are executed. The text is scratch code that
# requests a DOI link, a publisher link and a Wiley article page and prints
# the redirect history / final URL or the parsed page. Running it would need
# `requests` and `BeautifulSoup` to be imported first.
'''
# test the redirect of the urls
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9'} 

response_pdf = requests.get('https://doi.org/10.1016/j.neuron.2020.01.005', allow_redirects=True, headers = headers)
print(response_pdf.history)
print(response_pdf.url)
response_pdf_1 = requests.get('https://linkinghub.elsevier.com/retrieve/pii/S0896627320300052', allow_redirects=True, headers = headers)
print(response_pdf_1.history)
print(response_pdf_1.url)


response_pdf = requests.get('https://onlinelibrary.wiley.com/doi/10.1111/ejn.13910', headers = headers)
soup_pdf = BeautifulSoup(response_pdf.content,'lxml')
print(soup_pdf)
'''

This is the end of semi-automated literature search.

Now we have a list of actually related literature stored in list_of_related_literature.txt

Next step: we perform an information search on the list of related literature.
We now have a list of actually related literature, and we need to extract the information we need from it. We intend to achieve this with a combination of automated searching and manual extraction.

In [1]:
# semi-automated information search