<h2> Automatic filtering </h2> 

In [None]:
# import internal .py files
import file_path_management as fpath
import public_library as plib

In [None]:
# import packages
import csv
import pandas as pd
import re
from bs4 import BeautifulSoup
import requests
import os
import numpy as np
import string
import math
from nltk import ngrams

<h3> Parameters: </h3> 

In [None]:
# on-topic keyword lexicon: single words and multi-word phrases that are counted
# in each paper's text (phrases are matched as space-separated n-grams)
on_topic_kws = [
    'thalamocortical', 'thalamo-cortical', 'corticothalamic', 'cortico-thalamic',
    'tracing', 'tracer', 'tract tracing', 'tract-tracing', 'axonal tracing', 'neural tracing', 'anatomical tracing', 'neuroanatomical tracing',
    'thalamus', 'cortex', 'thalamic', 'cortical', "staining", "dye", 
    'connection', 'projection', 'connectivity', 'connectome', "anterograde", "retrograde", "injection", "injected", "injecting", "inject"]

# weight applied to each keyword's count when scoring a paper
# 'neuroanatomical tracing' was added so that the lexicon entry of the same name
# actually has a weight; the former key 'anatomical neural tracing' is kept for
# backward compatibility even though it is not in on_topic_kws.
# NOTE(review): several lexicon entries ("staining", "dye", "anterograde", ...)
# have no weight here - confirm whether that is intentional.
on_topic_kws_weights = {
    'tracing': 10, 'tracer': 10, 'tract tracing': 10, 'tract-tracing': 10, 'axonal tracing': 10, 'neural tracing': 10, 'anatomical tracing': 10, 'neuroanatomical tracing': 10, 'anatomical neural tracing': 10,
    'thalamocortical': 5, 'thalamo-cortical': 5, 'corticothalamic': 5, 'cortico-thalamic': 5,
    'connection': 2, 'projection': 2, 'connectivity': 2, 
    'thalamus': 1, 'cortex': 1, 'thalamic': 1, 'cortical': 1, 
    'connectome': 1}

# pay attention to false negative, false positive cases

# ChatGPT, queries for relatedness of topic
# (spelling of "thalamocortical" corrected so the prompts name the intended term)
ChatGPT_related_queries = ['Does the given text include information of thalamocortical connection?',
                           'Does this paper provide data of thalamocortical connection?',
                           'Does the given text include information of connection between thalamus and cortex?']

<h3> Predefined functions: </h3> 

In [None]:
def extract_text(url):
    """Placeholder text extractor.

    The `url` argument is currently ignored; an empty string is always returned.
    """
    # Not implemented yet: no request is made and nothing is parsed.
    return ""
# --------------------start of test code--------------------
# test code
# ---------------------end of test code---------------------

In [None]:
# count the number of times that certain on-topic keyword appear in a given text
def count_keyword(text, keyword, keyword_length):
    """Count how many times `keyword` occurs in `text` as a run of whole words.

    The text is reduced to letters, hyphens and whitespace, lower-cased and
    split into words; every window of `keyword_length` consecutive words is
    joined with single spaces and compared to the keyword.

    Parameters
    ----------
    text : str
        Text to search.
    keyword : str
        Word or space-separated phrase to look for (compared case-insensitively).
    keyword_length : int
        Number of words in `keyword`, i.e. the size of the word window.

    Returns
    -------
    int
        Number of matching windows; 0 when `keyword_length` is smaller than 1.
    """
    n = keyword_length
    # a window of zero (or fewer) words cannot match anything
    if n < 1:
        return 0

    # Keep letters, "-" and ALL whitespace. Keeping newlines/tabs (not only " ")
    # matters: dropping them would glue the words on either side together.
    cleaned = ''.join(ch for ch in text if (ch.isalpha() or ch == "-" or ch.isspace()))
    tokens = cleaned.lower().split()

    # normalise the keyword the same way the text windows are built
    target = " ".join(keyword.lower().split())

    # slide a window of n words over the text and count exact phrase matches
    word_count = 0
    for start in range(len(tokens) - n + 1):
        if " ".join(tokens[start:start + n]) == target:
            word_count += 1
    return word_count
# --------------------start of test code--------------------
# text = 'This apple 6i7s very tasty？、  2but th&e banana this is not delicious at Is all.6'
# keyword = 'this apple'
# count = count_keyword(text, keyword, 2)
# print(count)
# ---------------------end of test code---------------------

In [None]:
# count the number of times all on-topic keywords appear in the text
def count_freq_from_liter(text, on_topic_kws, type):
    """Count every on-topic keyword in `text`.

    Parameters
    ----------
    text : str
        Text to analyse.
    on_topic_kws : list of str
        Keywords / phrases to count; a phrase's word count is taken from its
        whitespace-separated parts.
    type : str
        "count" to return raw occurrence counts, or "frequency" to return each
        count divided by the character length of the cleaned text, rounded up
        to two decimals.

    Returns
    -------
    dict
        Maps each keyword to its count (int) or frequency (float).

    Raises
    ------
    Exception
        If `type` is neither "count" nor "frequency".
    """
    # validate once, before doing any counting work
    if type not in ("count", "frequency"):
        raise Exception("Error! The only two options for type are 'count' or 'frequency'!")

    # Keep letters and "-"; turn every whitespace character into a plain space so
    # that words separated by newlines/tabs are not merged into one token.
    text = ''.join(" " if e.isspace() else e for e in text if (e.isalpha() or e.isspace() or e == "-"))
    text = text.strip().lower()

    text_length = len(text)
    results = {}

    # count the on-topic keywords
    for keyword in on_topic_kws:
        word_count = count_keyword(text, keyword, len(keyword.split()))
        if type == "count":
            results[keyword] = word_count
        elif text_length == 0:
            # empty text: nothing can occur, and dividing by zero must be avoided
            results[keyword] = 0.0
        else:
            results[keyword] = math.ceil((word_count*100/text_length))/100
    return results
# --------------------start of test code--------------------
# text = 'Vision for action: thalamic and cortical inputs to the macaque superior tract neural tracing, parietal lobule The dorsal visual stream, the cortical circuit that in the primate brain is mainly dedicated to the visual control of actions, is split into two routes, a lateral and a medial one, both involved in coding different aspects of sensorimotor control of actions. The lateral route, named "lateral grasping network", is mainly involved in the control of the distal part of prehension, namely grasping and manipulation. The medial route, named "reach-to-grasp network", is involved in the control of the full deployment of prehension act, from the direction of arm movement to the shaping of the hand according to the object to be grasped. In macaque monkeys, the reach-to-grasp network (the target of this review) includes areas of the superior parietal lobule (SPL) that hosts visual and somatosensory neurons well suited to control goal-directed limb movements toward stationary as well as moving objects. After a brief summary of the neuronal functional properties of these areas, we will analyze their cortical and thalamic inputs thanks to retrograde neuronal tracers separately injected into the SPL areas V6, V6A, PEc, and PE. These areas receive visual and somatosensory information distributed in a caudorostral, visuosomatic trend, and some of them are directly connected with the dorsal premotor cortex. This review is particularly focused on the origin and type of visual information reaching the SPL, and on the functional role this information can play in guiding limb interaction with objects in structured and dynamic environments. Area PEc; Area V6; Area V6A; Dorsal visual stream; Goal-directed arm movement; Sensorimotor integration.'
# keywords_count_fre = count_freq_from_liter(text, on_topic_kws, type="count")
# print(keywords_count_fre)
# ---------------------end of test code---------------------

In [None]:
def _first_text(soup, name, attrs):
    """Return the stripped text of the first tag matching `name`/`attrs`, or NaN if none."""
    matches = soup.find_all(name, attrs)
    if len(matches) == 0:
        return np.nan
    return matches[0].get_text().strip()


def info_filling(input_path, output_path):
    """Fetch PubMed pages for every row with a PMID and append extracted info to a CSV.

    For each row of the CSV at `input_path` that has a PMID, the PubMed page is
    downloaded, DOI / PMCID / title / abstract / keywords are extracted, the
    on-topic keywords are counted in title + abstract + keywords, and the
    combined record is appended to `output_path` via `plib.add_row_to_csv`.

    Parameters
    ----------
    input_path : path-like
        CSV of potentially related literature.
        NOTE(review): assumed to have a header row containing a "PMID" column - confirm.
    output_path : path-like
        CSV file the extracted records are appended to.

    Raises
    ------
    KeyError
        If the input CSV has no "PMID" column.
    Exception
        If a PubMed request does not return HTTP 200.
    """
    # scan each row in the potential related literature and extract information
    df = pd.read_csv(input_path, sep = ",")
    if "PMID" not in df.columns:
        raise KeyError("PMID")

    for ind in df.index:
        pmid_raw = df.loc[ind, "PMID"]
        # rows without a PMID cannot be looked up on PubMed
        if pd.isna(pmid_raw):
            continue

        # pandas reads a PMID column containing gaps as float (e.g. 12345.0);
        # normalise to the plain digit string used in the URL
        pmid = str(pmid_raw).strip()
        if pmid.endswith(".0"):
            pmid = pmid[:-2]

        url = "https://pubmed.ncbi.nlm.nih.gov/" + pmid + "/"
        print(url)
        response = requests.get(url, headers = plib.headers)
        if response.status_code != 200:
            raise Exception("Error when request webpages!")
        soup = BeautifulSoup(response.content, "lxml")

        # identifiers: both attribute filters belong in ONE attrs dict
        doi = _first_text(soup, "a", {"class": "id-link", "data-ga-action": "DOI"})
        # NOTE(review): assumes the PMCID link is marked data-ga-action="PMCID",
        # mirroring the DOI link - verify against a live PubMed page
        pmcid = _first_text(soup, "a", {"class": "id-link", "data-ga-action": "PMCID"})

        # extract title, abstract, keywords from the returned html file
        # (NaN when the page has no such element, e.g. papers without abstract)
        title = _first_text(soup, "h1", {"class": "heading-title"})
        abstract = _first_text(soup, "div", {"class": "abstract-content selected"})
        keywords = _first_text(soup, "p", {"class": "sub-title"})

        # count keywords from title + abstract + keywords; hyphens are kept so that
        # hyphenated keywords ("tract-tracing", "thalamo-cortical") can still match,
        # other cleaning is done by count_freq_from_liter
        text = " ".join(part for part in (title, abstract, keywords) if isinstance(part, str))
        text = re.sub(r"\s+", " ", text).strip()

        # record the information into json
        info_json = {}
        info_json['DOI'] = doi
        info_json['PMID'] = pmid
        info_json['PMCID'] = pmcid
        info_json['title'] = title
        info_json['abstract'] = abstract
        info_json['keywords'] = keywords
        info_count = count_freq_from_liter(text, on_topic_kws, type = "count")
        info_json = {**info_json, **info_count}
        print(info_json)
        plib.add_row_to_csv(output_path, info_json)
# --------------------start of test code--------------------
# test code
# ---------------------end of test code---------------------

In [None]:
def calcul_index(keywords_count_or_fre, on_topic_kws_weights):
    """Compute the weighted relevance index of one paper.

    Parameters
    ----------
    keywords_count_or_fre : dict
        Maps each keyword to its count (or frequency) in the paper.
    on_topic_kws_weights : dict
        Maps keywords to their weight.

    Returns
    -------
    number
        Sum over all keywords of count * weight. Keywords that have no entry in
        `on_topic_kws_weights` contribute 0.
    """
    index = 0
    for key, value in keywords_count_or_fre.items():
        # weight each count by its keyword's weight (unweighted keywords -> 0)
        index += value * on_topic_kws_weights.get(key, 0)
    return index
# --------------------start of test code--------------------
# keywords_count_or_fre = {}
# on_topic_kws_weights = {}
# index = calcul_index(keywords_count_or_fre, on_topic_kws_weights)
# print(index)
# ---------------------end of test code---------------------

In [None]:
def weight_and_rank(input_path, output_path, on_topic_kws_weights):
    """Score every paper by its weighted keyword counts and write them ranked.

    Parameters
    ----------
    input_path : path-like
        Header-less CSV with one paper per row.
        NOTE(review): assumed layout is DOI, PMID, PMCID, title, abstract,
        keywords, followed by one count column per entry of `on_topic_kws`
        (the record built in info_filling) - confirm against plib.add_row_to_csv.
    output_path : path-like
        Destination CSV; written with a header, sorted by "index" descending.
    on_topic_kws_weights : dict
        Keyword -> weight mapping passed on to `calcul_index`.

    Raises
    ------
    ValueError
        If the number of columns in the input does not match the assumed layout.
    """
    df = pd.read_csv(input_path, header=None, sep=",")

    kw_columns = list(on_topic_kws)
    columns = ["DOI", "PMID", "PMCID", "title", "abstract", "keywords"] + kw_columns
    # fail loudly instead of mislabelling columns when the layout is different
    if df.shape[1] != len(columns):
        raise ValueError(
            "Unexpected number of columns in input file: expected "
            + str(len(columns)) + ", found " + str(df.shape[1]))
    df.columns = columns

    # one {keyword: count} dict per paper; unreadable / missing counts become 0
    counts = df[kw_columns].apply(pd.to_numeric, errors="coerce").fillna(0)
    df["index"] = [
        calcul_index(row_counts, on_topic_kws_weights)
        for row_counts in counts.to_dict("records")
    ]

    # rank: most relevant papers first
    df = df.sort_values(by=['index'], ascending=False)
    df.to_csv(output_path, header=True, index=False)
    print("Weighting and ranking the potentially related literature succeded!")
    print("Enjoy reading!")
# --------------------start of test code--------------------
# test code
# ---------------------end of test code---------------------

<h3> Main program: </h3> 

In [None]:
# # check all possible full_text_source
# input_path = fpath.poten_litra_filtered
# df = pd.read_csv(input_path, header=None, sep=",")
# columns = ["DOI", "PMID", "PMCID", "Title", "full_text_link", "full_text_source", "pdf_link"]
# df.columns = columns

# print(df.head(5))
# print(df.shape)
# # (10768, 7)

# full_text_source = set(df['full_text_source'].tolist())
# print(full_text_source)
# # {'neuro.psychiatryonline.org', 'submissions.mirasmart.com', 'www.architalbiol.org', 'www.jstage.jst.go.jp', 
# #  'pubs.asahq.org', 'jpet.aspetjournals.org', 'ajp.psychiatryonline.org', 'papers.ssrn.com', 'ieeexplore.ieee.org', 
# #  'lww.com', 'n.neurology.org', 'ekja.org', 'symposium.cshlp.org', 'open.bu.edu', 'karger.com', 'wakespace.lib.wfu.edu', 
# #  'www.ingentaconnect.com', 'www.cambridge.org', 'ujms.net', 'www.taylorfrancis.com', 'journals.biologists.com', 
# #  'nrc-prod.literatumonline.com', 'opg.optica.org', 'pure.mpg.de', 'link.springer.com', 'neurologia.com', 'www.ajtmh.org', 
# #  'www.tandfonline.com', 'www.ncbi.nlm.nih.gov', 'www.biorxiv.org', 'journals.physiology.org', 'www.nature.com', 
# #  'www.thieme-connect.de', 'europepmc.org', 'www.degruyter.com', 'pharmrev.aspetjournals.org', 'academic.oup.com', 
# #  'var.scholarpedia.org', 'direct.mit.edu', 'journals.sagepub.com', 'www.microbiologyresearch.org', 'journals.aps.org', 
# #  'www.cabdirect.org', 'www.annualreviews.org', 'journals.lww.com', 'jnm.snmjournals.org', 'www.worldscientific.com', 
# #  'content.iospress.com:443', 'iovs.arvojournals.org', 'webview.isho.jp', 'books.google.de', 'www.elibrary.ru', 
# #  'www.theses.fr', 'jamanetwork.com', 'www.rbojournal.org', 'onlinelibrary.wiley.com', 'linkinghub.elsevier.com', 
# #  'royalsocietypublishing.org', 'www.ahajournals.org', 'www.imrpress.com', 'www.liebertpub.com', 'psycnet.apa.org:443', 
# #  'pubs.aip.org', 'escholarship.mcgill.ca', 'pubs.acs.org', 'elibrary.ru', 'thejns.org', 'www.science.org'}

In [1]:
# host name fragments of the full-text sources that are handled
# NOTE(review): "neurology_org" (with an underscore) cannot match a real host name
# and "neurology.org" is listed separately below - confirm whether it can be dropped.
websites = ["ncbi.nlm.nih.gov", "frontiersin.org", "sciencedirect.com", "wiley.com", "springer.com", "europepmc.org", 
            "www.biorxiv.org", "www.jneurosci.org", "orca.cardiff.ac.uk", "www.science.org", 
            "thejns.org", "www.cambridge.org", "www.ahajournals.org", "www.mdpi.com", "www.pnas.org", "www.nature.com", 
            "www.cell.com", "www.eneuro.org", "physiology.org", "ieee.org", "plos.org", "jstage.jst.go.jp", "biomedcentral.com", 
            "jamanetwork.com", "psycnet.apa.org", "bmj.com", "degruyter.com", "karger.com", "elifesciences.org", 
            "neurology_org", "asahq.org", "sagepub.com", "ekja.org", "liebertpub.com", "lww.com", "tandfonline.com", 
            "aspetjournals.org", "oup.com", "royalsocietypublishing.org", "psychiatryonline.org", "jpn.ca", "bu.edu",
            # the comma after "mpg.de" is required: without it Python concatenates
            # the two adjacent literals into "mpg.dearchitalbiol.org"
            "agro.icm.edu.pl", "lib.wfu.edu", "mirasmart.com", "jstor.org", "mpg.de",
            "architalbiol.org", "jpet.aspetjournals.org", "pharmrev.aspetjournals.org", "ssrn.com", "neurology.org", 
            "symposium.cshlp.org", "ingentaconnect.com", "ujms.net", "taylorfrancis.com", "biologists.com", "literatumonline.com",
            "opg.optica.org", "neurologia.com", "ajtmh.org", "www.thieme-connect.de", "var.scholarpedia.org", "direct.mit.edu",
            "www.microbiologyresearch.org", "journals.aps.org", "www.cabdirect.org", "www.annualreviews.org", "jnm.snmjournals.org",
            "www.worldscientific.com", "content.iospress.com", "iovs.arvojournals.org", "webview.isho.jp", "books.google.de",
            "www.theses.fr", "www.rbojournal.org", "linkinghub.elsevier.com", "www.imrpress.com", "pubs.aip.org",
            "escholarship.mcgill.ca", "pubs.acs.org", "elibrary.ru"]
# --------------------start of test code--------------------
# if len(websites) == len(set(websites)):
#     print("There are no duplicates in the list.")
# else:
#     print("There are duplicates in the list.")
# ---------------------end of test code---------------------

There are no duplicates in the list.


In [None]:
# # step 1: extract and filling text
# input_path = fpath.poten_litra_filtered
# output_path = fpath.poten_litera_litera_db

# # clear file
# plib.clear_file(output_path)

# info_filling(input_path, output_path)

In [None]:
# # step 2: assign index to each literature and rank them
# input_path = fpath.poten_litera_litera_db
# output_path = fpath.poten_litera_litera_db_ranked

# # clear file
# plib.clear_file(output_path)

# weight_and_rank(input_path, output_path, on_topic_kws_weights)

<h3> Next step: manually read papers and find all actually related literature </h3>