<h2> Automatic filtering </h2> 

In [7]:
# import internal .py files
import file_path_management as fpath
import public_library as plib

In [8]:
# import packages
import csv
import pandas as pd
import re
from bs4 import BeautifulSoup
import requests
import os
from numpy import NaN

<h3> Parameters: </h3> 

In [9]:
# on-topic keyword lexicon: every entry becomes one count column in the output CSV,
# so the order of this list is also the order of those columns
on_topic_kws = [
    # pathway terms
    'thalamocortical', 'thalamo-cortical', 'corticothalamic', 'cortico-thalamic',
    # tracing terms
    'tracing', 'tracer', 'tract tracing', 'tract-tracing', 'axonal tracing', 'neural tracing',
    'anatomical tracing', 'neuroanatomical tracing',
    # anatomy and method terms
    'thalamus', 'cortex', 'thalamic', 'cortical', "staining", "dye",
    # connectivity and injection terms
    'connection', 'projection', 'connectivity', 'connectome', "anterograde", "retrograde",
    "injection", "injected", "injecting", "inject",
]

# weight of each keyword when scoring how related a paper is (count * weight, summed).
# NOTE(review): this table and on_topic_kws do not cover the same keys:
#   - several on_topic_kws entries have no weight here (e.g. 'staining', 'dye', 'anterograde',
#     'retrograde', 'injection', 'neuroanatomical tracing');
#   - 'anatomical neural tracing' is weighted but is not in on_topic_kws.
# Decide which weights the missing keywords should get.
related_kws_weights = {
    'tracing': 10, 'tracer': 10, 'tract tracing': 10, 'tract-tracing': 10,
    'axonal tracing': 10, 'neural tracing': 10, 'anatomical tracing': 10, 'anatomical neural tracing': 10,
    'thalamocortical': 5, 'thalamo-cortical': 5, 'corticothalamic': 5, 'cortico-thalamic': 5,
    'connection': 2, 'projection': 2, 'connectivity': 2,
    'thalamus': 1, 'cortex': 1, 'thalamic': 1, 'cortical': 1,
    'connectome': 1,
}

# false negative, false positive

# ChatGPT, queries for relatedness of topic
# (spelling fixed: the queries previously asked about "thalamocotical" connections)
ChatGPT_related_queries = [
    'Does the given text include information of thalamocortical connection?',
    'Does this paper provide data of thalamocortical connection?',
    'Does the given text include information of connection between thalamus and cortex?',
]

<h3> Predefined functions: </h3> 

In [10]:
# count the number of times that certain on-topic keyword appear in a given text
def count_keyword(text: str, keyword: str) -> int:
    """Count how many times ``keyword`` occurs in ``text`` as whole word(s).

    Both strings are split on any whitespace, so a keyword made of several
    words (e.g. ``'tract tracing'``) is matched as a run of consecutive words.
    Matching is exact and case-sensitive: ``'is'`` does not match ``'this'``.

    Args:
        text: the text to search.
        keyword: a single word or a whitespace-separated phrase.

    Returns:
        The number of positions where the keyword starts. Overlapping
        occurrences of a phrase are each counted. An empty or
        whitespace-only keyword yields 0.
    """
    words = text.split()
    phrase = keyword.split()
    width = len(phrase)
    if width == 0:
        return 0
    # slide a window of the phrase's length over the words of the text
    return sum(
        1
        for start in range(len(words) - width + 1)
        if words[start:start + width] == phrase
    )
# end of count_keyword
# --------------------start of test code--------------------
# text = 'This apple 6i7s very tasty？、  2but th&e banana is not delicious at all.6'
# keyword = 'is'
# count = count_keyword(text, keyword)
# print(count)
# ---------------------end of test code---------------------

In [12]:
# count the number of times all on-topic keywords appear in the text
# extracted from the given url
def count_freq_from_liter(text, on_topic_kws, type="count"):
    """Count every on-topic keyword in ``text``.

    Args:
        text: the text to search; it is passed unchanged to ``count_keyword``.
        on_topic_kws: list of keywords to look for.
        type: ``"count"`` (default) stores the number of occurrences;
            ``"frequency"`` stores occurrences divided by ``len(text)``,
            i.e. by the number of characters in the text.

    Returns:
        A dict mapping each keyword to its count or frequency.

    Raises:
        ValueError: if ``type`` is neither ``"count"`` nor ``"frequency"``.
    """
    if type not in ("count", "frequency"):
        # an unknown mode used to return an empty dict silently
        raise ValueError('type must be "count" or "frequency", got ' + repr(type))

    text_length = len(text)
    keywords_count_fre = {}
    for keyword in on_topic_kws:
        word_count = count_keyword(text, keyword)
        if type == "count":
            keywords_count_fre[keyword] = word_count
        elif text_length == 0:
            # empty text: nothing can occur, and dividing would raise ZeroDivisionError
            keywords_count_fre[keyword] = 0.0
        else:
            keywords_count_fre[keyword] = word_count / text_length
    return keywords_count_fre
# end of count_freq_from_liter
# --------------------start of test code--------------------
# text = 'Vision for action: thalamic and cortical inputs to the macaque superior parietal lobule The dorsal visual stream, the cortical circuit that in the primate brain is mainly dedicated to the visual control of actions, is split into two routes, a lateral and a medial one, both involved in coding different aspects of sensorimotor control of actions. The lateral route, named "lateral grasping network", is mainly involved in the control of the distal part of prehension, namely grasping and manipulation. The medial route, named "reach-to-grasp network", is involved in the control of the full deployment of prehension act, from the direction of arm movement to the shaping of the hand according to the object to be grasped. In macaque monkeys, the reach-to-grasp network (the target of this review) includes areas of the superior parietal lobule (SPL) that hosts visual and somatosensory neurons well suited to control goal-directed limb movements toward stationary as well as moving objects. After a brief summary of the neuronal functional properties of these areas, we will analyze their cortical and thalamic inputs thanks to retrograde neuronal tracers separately injected into the SPL areas V6, V6A, PEc, and PE. These areas receive visual and somatosensory information distributed in a caudorostral, visuosomatic trend, and some of them are directly connected with the dorsal premotor cortex. This review is particularly focused on the origin and type of visual information reaching the SPL, and on the functional role this information can play in guiding limb interaction with objects in structured and dynamic environments. Area PEc; Area V6; Area V6A; Dorsal visual stream; Goal-directed arm movement; Sensorimotor integration.'
# keywords_count_fre = count_freq_from_liter(text, on_topic_kws)
# print(keywords_count_fre)
# ---------------------end of test code---------------------

In [None]:
def calcul_related(info_count, related_kws_weights):
    """Compute the relatedness score of one paper.

    The score is the sum over all counted keywords of ``count * weight``.

    Args:
        info_count: dict mapping keyword -> number of occurrences.
        related_kws_weights: dict mapping keyword -> weight.

    Returns:
        The weighted sum; 0 when ``info_count`` is empty.
    """
    # a keyword that was counted but has no weight contributes nothing,
    # instead of aborting the whole run with a KeyError
    return sum(
        count * related_kws_weights.get(keyword, 0)
        for keyword, count in info_count.items()
    )
# end of def calcul_related(info_count, related_kws_weights):
# --------------------start of test code--------------------
# hand-made keyword counts used to try out calcul_related;
# with the weights defined in the parameters cell the printed score is 70
info_count = {
    # pathway terms
    'thalamocortical': 2,
    'thalamo-cortical': 2,
    'corticothalamic': 0,
    'cortico-thalamic': 1,
    # tracing terms
    'tracing': 0,
    'tracer': 0,
    'tract tracing': 0,
    'tract-tracing': 0,
    'axonal tracing': 1,
    'neural tracing': 1,
    'anatomical tracing': 1,
    'anatomical neural tracing': 0,
    # connectivity terms
    'connection': 1,
    'projection': 2,
    'connectivity': 0,
    'connectome': 0,
    # anatomy terms
    'thalamus': 0,
    'cortex': 4,
    'thalamic': 2,
    'cortical': 3,
}
weight = calcul_related(info_count, related_kws_weights)
print(weight)
# ---------------------end of test code---------------------


In [14]:
# read the candidate-literature CSV, scrape the PubMed page of every row that has a PMID,
# count the on-topic keywords and append one result row per paper to the output CSV
def _first_text(tags):
    """Return the stripped text of the first tag in ``tags``, or "" when ``tags`` is empty."""
    return tags[0].get_text().strip() if tags else ""


def _normalize_for_counting(*parts):
    """Join the text parts and reduce them to lower-case words separated by single spaces.

    Letters, apostrophes and hyphens are kept, so hyphenated keywords such as
    'tract-tracing' can still be matched. Every other character (digits,
    punctuation, newlines) is replaced by a space rather than deleted, so that
    neighbouring words are never fused together.
    """
    text = " ".join(parts).lower()
    text = re.sub(r"[^a-z' -]", " ", text)
    return re.sub(r" +", " ", text).strip()


def scan_download_record(on_topic_kws, pdf_folder_path):
    """Scrape PubMed for every PMID in the candidate CSV and record keyword matches.

    For each row of ``fpath.poten_litera_csv`` with a PMID, the PubMed page is
    fetched, the DOI, PMCID, title, abstract and keywords are extracted, the
    on-topic keywords are counted in title + abstract + keywords, a weight is
    computed with ``calcul_related`` and the module-level ``related_kws_weights``,
    and the result is appended to ``fpath.auto_filtered`` via ``plib.add_row_to_csv``.

    Args:
        on_topic_kws: list of keywords to count; also used as CSV column names.
        pdf_folder_path: currently unused; no PDF is downloaded by this function.

    Raises:
        Exception: if a PubMed page does not answer with HTTP status 200.
    """
    missing = float("nan")  # placeholder for identifiers not found on the page
    # the column layout is the same for every row, so build it once
    columns = ['DOI', "PMID", "PMCID", 'title', "abstract", "keywords", "weight"] + on_topic_kws

    # scan each row in the potential related literature and extract information
    df = pd.read_csv(fpath.poten_litera_csv, sep = ",")
    for pmid_raw in df["PMID"]:
        # `is not NaN` compares object identity and never filters missing values; pd.isna does
        if pd.isna(pmid_raw):
            continue
        # a PMID column that contains gaps is read as float (e.g. 12345678.0)
        pmid = str(pmid_raw).strip()
        if pmid.endswith(".0"):
            pmid = pmid[:-2]

        url = "https://pubmed.ncbi.nlm.nih.gov/" + pmid + "/"
        print(url)
        # timeout so that one stalled connection cannot hang the whole scan
        response = requests.get(url, headers = plib.headers, timeout = 30)
        if response.status_code != 200:
            raise Exception("Error when request webpages!")
        soup = BeautifulSoup(response.content, "lxml")

        # all attribute filters go into one `attrs` dict
        doi_links = soup.find_all("a", attrs = {"class": "id-link", "data-ga-action": "DOI"})
        doi = _first_text(doi_links) if doi_links else missing
        # NOTE(review): assumes the PMCID link is tagged data-ga-action="PMCID" in the same way
        # as the DOI link; confirm against a live PubMed page
        pmcid_links = soup.find_all("a", attrs = {"class": "id-link", "data-ga-action": "PMCID"})
        pmcid = _first_text(pmcid_links) if pmcid_links else missing

        # any of these parts may be absent on a PubMed page; fall back to ""
        title = _first_text(soup.find_all("h1", {"class": "heading-title"}))
        abstract = _first_text(soup.find_all("div", {"class": "abstract-content selected"}))
        keywords = _first_text(soup.find_all("p", {"class": "sub-title"}))

        # count keywords from title + abstract + keywords
        text = _normalize_for_counting(title, abstract, keywords)

        # record the information into json
        info_json = {}
        info_json['DOI'] = doi
        info_json['PMID'] = pmid
        info_json['PMCID'] = pmcid
        info_json['title'] = title
        info_json['abstract'] = abstract
        info_json['keywords'] = keywords
        info_count = count_freq_from_liter(text, on_topic_kws, type = "count")
        info_json = {**info_json, **info_count}
        # only keywords that have a weight take part in the score
        weighted_counts = {kw: n for kw, n in info_count.items() if kw in related_kws_weights}
        info_json["weight"] = calcul_related(weighted_counts, related_kws_weights)
        plib.add_row_to_csv(fpath.auto_filtered, info_json, columns)
# end of scan_download_record
# --------------------start of test code--------------------
# scan_record_download(path_urls, on_topic_kws, pdf_folder_path)
# ---------------------end of test code---------------------

In [None]:
def weight_and_rank(weights_dict):
    """Placeholder for the ranking step.

    The weight formula is not implemented yet: ``weights_dict`` is accepted
    but not used, and the function only prints a closing message.
    """
    # weight formula
    closing_message = "Enjoy reading!"
    print(closing_message)
# end of weight_and_rank
# --------------------start of test code--------------------
# test code
# ---------------------end of test code---------------------

<h3> Main program: </h3> 

In [None]:
# Overview of the pipeline: iterate over every row of the literature table, fill in the empty
# cells and look up the text information we need from the information that already exists.
# Target fields: DOI, PMID, PMCID, Title, Authors, Abstract, Keywords, full_text_url, pdf_url

# step 1: fill in empty cells of the three identifiers: DOI, PMID, PMCID, and make sure the title is in lower case
# NOTE(review): fill_in_identifier is not defined or imported in the cells of this notebook shown here;
# confirm it is provided before running all cells on a fresh kernel
fill_in_identifier()

In [None]:
# step 2: remove duplicated entries according to their identifiers
# NOTE(review): remove_dupli is not defined or imported in the cells of this notebook shown here;
# confirm it is provided before running all cells on a fresh kernel
remove_dupli()

In [None]:
# step 3: search for text information of the literature and
# record the keywords matching results
# NOTE(review): the pdf folder is passed in, but scan_download_record as defined in this
# notebook does not use that argument, so no .pdf file is downloaded by this step
scan_download_record(on_topic_kws, fpath.litera_pdf_folder)

In [None]:
# step 4: assign weight to each literature and rank them
# (weight_and_rank as defined in this notebook is still a stub that only prints a message)
weight_and_rank(related_kws_weights)

<h3> Some test code, please ignore: </h3> 

In [None]:
# import re
# if "//doi.org/" in "https://doi.org/10.1016/0165-0173(96)00003-3":
#     print("yes")

In [None]:
# # test redirect when access the doi link
# from elsapy.elsdoc import FullDoc, AbsDoc
# from elsapy.elsclient import ElsClient
# import json
# headers = {
#     "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9", 
#     "X-ELS-APIKEY": os.environ["ELS_API_KEY"],  # key redacted: never commit a real API key, load it from the environment (and revoke the key that was committed here)
#     "Accept": "application/pdf",
#     "X-ELS-Insttoken": "instToken",
#     "view": "FULL"
# } 
# # url = "https://www.jneurosci.org/content/28/43/11042.short"
#  #url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2613515/"

# # Journal of Neurophysiology
# # url = "https://doi.org/10.1152/jn.2001.85.1.219"
# # url = "https://journals.physiology.org/doi/10.1152/jn.2001.85.1.219"

# # science direct
# # url = "https://doi.org/10.1016/j.biopsych.2004.10.014"
# # url = "https://linkinghub.elsevier.com/retrieve/pii/S0006322304010947"
# # url = "https://www.sciencedirect.com/science/article/pii/S0006322304010947?via%3Dihub"
# url = "https://api.elsevier.com/content/article/doi/{10.1016/j.biopsych.2004.10.014}"

# # response = requests.get(url, headers = headers)
# # soup = BeautifulSoup(response.content,"lxml")
# # print(soup)
# # print(response.history)
# # print(response.url)
# # # Load configuration
# # con_file = open("config.json")
# # config = json.load(con_file)
# # con_file.close()

# # response = requests.get(url, headers = headers)
# # print(response)

# # ## Initialize client
# # client = ElsClient(config["apikey"])

# # ## ScienceDirect (full-text) document example using DOI
# # doi_doc = FullDoc(doi = "10.1016/j.biopsych.2004.10.014")
# # print(doi_doc)
# # if doi_doc.read(client):
# #     print ("doi_doc.title: ", doi_doc.title)
# #     doi_doc.write("doi_doc")   
# # else:
# #     print ("Read document failed.")

In [None]:
# # find DOI
# # this link does not have "DOI" in href form but in text form
# url = "https://www.jneurosci.org/content/28/43/11042.short"
# # url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2613515/"
# response = requests.get(url, headers = plib.headers)
# soup = BeautifulSoup(response.content,"lxml")
# # print(soup)
# doi_list = []
# num_results_str = soup.select("a", href = True)
# # print(num_results_str)
# for item in num_results_str:
#     if "//doi.org/" in item["href"]:
#         print(item["href"])
#         doi_list.append(item["href"].split("//doi.org/")[1])

# print(doi_list)
        
# if len(doi_list) == 0:
#     print("Ops! Did't find DOI on this page!")



# test extract doi from url
# with open(fpath.gs_poten_urls, "r") as file:
#     lines = []
#     for line in file:
#         print(line)
#         line = line.strip()
#         lines.append(line)
# print(len(lines))
# doi_list = []
# for url in lines:
#     response = requests.get(url, headers = plib.headers)
#     while
#     soup = BeautifulSoup(response.content,"lxml")
#     # print(soup)
#     num_results_str = soup.select("a", href = True)
#     print(num_results_str)
#     for href in num_results_str:
#         if "//doi.org/" in href["href"]:
#             doi_list.append(href["href"])
#             print(href["href"])
#         else:
#             print("Ops! Did't find DOI on this page!")

In [None]:
#     # extract PDF link if exists
#     print(doi)
#     response_pdf = requests.get(doi, headers = headers)
#     print(response_pdf.url)
#     pdf_page_link = response_pdf.url
        
#     pdf_page = soup.find_all("a", {'class':'link-item dialog-focus'}, href = True)[0]['href']
    
#     # print(pdf_page_link)
#     pdf_page = requests.get(pdf_page_link, headers = headers)
#     soup_pdf = BeautifulSoup(pdf_page.content,'lxml')
#     print(len(soup_pdf.find_all("a", href = True)))
#     pdf_link = soup_pdf.find_all("a", href = True)[0]['href']
    
    
#     print(pdf_link)
#     pdf_link = 'https://www.ncbi.nlm.nih.gov' + pdf_link

<h3> Next step: semi-automated information search </h3>