<h2> Automatic filtering </h2> 

In [None]:
# import internal .py files
import file_path_management as fpath
import public_library as plib

In [None]:
# import packages
import csv
import pandas as pd
import re
from bs4 import BeautifulSoup
import requests
import os
import numpy as np
import string
import math
from nltk import ngrams

<h3> Parameters: </h3> 

In [None]:
# On-topic keyword lexicon: the terms that are counted in each paper's text.
on_topic_kws = [
    'thalamocortical', 'thalamo-cortical', 'corticothalamic', 'cortico-thalamic',
    'tracing', 'tracer', 'tract tracing', 'tract-tracing', 'axonal tracing', 'neural tracing', 'anatomical tracing', 'neuroanatomical tracing',
    'thalamus', 'cortex', 'thalamic', 'cortical', "staining", "dye", 
    'connection', 'projection', 'connectivity', 'connectome', "anterograde", "retrograde", "injection", "injected", "injecting", "inject"]

# Weight of each keyword when scoring a paper.
# 'neuroanatomical tracing' is the spelling used in the lexicon above, so it
# needs an entry here; the older 'anatomical neural tracing' key is kept so
# that nothing relying on it breaks.
# NOTE(review): several lexicon terms ("staining", "dye", "anterograde",
# "retrograde", "injection", ...) still have no explicit weight -- confirm
# whether that is intended.
on_topic_kws_weights = {
    'tracing': 10, 'tracer': 10, 'tract tracing': 10, 'tract-tracing': 10, 'axonal tracing': 10, 'neural tracing': 10, 'anatomical tracing': 10, 'anatomical neural tracing': 10,
    'neuroanatomical tracing': 10,
    'thalamocortical': 5, 'thalamo-cortical': 5, 'corticothalamic': 5, 'cortico-thalamic': 5,
    'connection': 2, 'projection': 2, 'connectivity': 2, 
    'thalamus': 1, 'cortex': 1, 'thalamic': 1, 'cortical': 1, 
    'connectome': 1}

# pay attention to false negative, false positive cases

# ChatGPT, queries for relatedness of topic
# (the domain term was previously misspelled as "thalamocotical")
ChatGPT_related_queries = ['Does the given text include information of thalamocortical connection?',
                           'Does this paper provide data of thalamocortical connection?',
                           'Does the given text include information of connection between thalamus and cortex?']

<h3> Predefined functions: </h3> 

In [None]:
def _is_present(value):
    """Return True unless `value` is None or NaN (NaN is the only value that is unequal to itself)."""
    return value is not None and value == value


def _first_present(*candidates):
    """Return the first candidate that is present, or NaN when none of them is."""
    for candidate in candidates:
        if _is_present(candidate):
            return candidate
    return np.nan


def info_filling(input_path, output_path, start, end):
    """Scrape each candidate paper's web page and append one merged row per paper to a CSV.

    The input CSV (no header row) is read with the columns DOI, PMID, PMCID,
    title, full_text_link, full_text_source, pdf_link. For every row in
    ``[start, end)`` whose ``full_text_source`` contains one of ``plib.websites``,
    ``plib.extract_info_from_webpage`` is called on the full-text link; the
    returned dict is read through the keys 'doi', 'pmid', 'pmcid', 'title',
    'abstract', 'keywords', 'introduction' and 'pdf_link'. Values found on the
    web page take precedence over the values stored in the input CSV.

    Parameters
    ----------
    input_path : path of the input CSV.
    output_path : path of the CSV that rows are appended to via ``plib.add_row_to_csv``.
    start, end : half-open range of row positions to process; it is clamped to
        the rows that actually exist in the input.

    Raises
    ------
    Exception
        If a row has no full-text source, or a supported row has no full-text link.
    """
    df = pd.read_csv(input_path, header=None, sep=",")
    df.columns = ["DOI", "PMID", "PMCID", "title", "full_text_link", "full_text_source", "pdf_link"]

    columns = ["DOI", "PMID", "PMCID", "full_text_link", "full_text_source", "pdf_link", "Title", "Abstract", "Keywords", "Introduction"]

    # Clamp the requested range so an over-long `end` cannot index past the data.
    for ind in range(max(start, 0), min(end, len(df))):
        # full_text_surce
        full_text_surce = df.at[ind, "full_text_source"]
        if not _is_present(full_text_surce):
            # Checked first: a NaN source would otherwise fail with a TypeError
            # in the substring test below.
            raise Exception("Error! full text surce is not available!")

        # check if the full_text_link is one of our websites
        if not any(website in full_text_surce for website in plib.websites):
            continue

        # full_text_link
        full_text_link = df.at[ind, "full_text_link"]
        if not _is_present(full_text_link):
            raise Exception("Error! Full text link is not available!")
        info = plib.extract_info_from_webpage(full_text_link)

        # doi: the web page wins; fall back to the CSV value
        doi = _first_present(info['doi'], df.at[ind, "DOI"])

        # pmid: report a disagreement, then prefer the web page like for the DOI.
        # (Previously `pmid` was left unassigned on a mismatch, so the row either
        # inherited the previous row's value or raised UnboundLocalError.)
        web_pmid = info['pmid']
        csv_pmid = df.at[ind, "PMID"]
        if _is_present(web_pmid) and _is_present(csv_pmid) and str(int(web_pmid)) != str(int(csv_pmid)):
            print(web_pmid)
            print(csv_pmid)
        pmid = _first_present(web_pmid, csv_pmid)
        if _is_present(pmid):
            # the CSV column is parsed as float, so normalise to an integer string
            pmid = str(int(pmid))

        # pmcid: same policy as pmid, without the numeric normalisation
        web_pmcid = info['pmcid']
        csv_pmcid = df.at[ind, "PMCID"]
        if _is_present(web_pmcid) and _is_present(csv_pmcid) and web_pmcid != csv_pmcid:
            print(web_pmcid)
            print(csv_pmcid)
        pmcid = _first_present(web_pmcid, csv_pmcid)

        # pdf_link and title fall back to the CSV; the text sections only exist on the web page
        pdf_link = _first_present(info['pdf_link'], df.at[ind, "pdf_link"])
        title = _first_present(info['title'], df.at[ind, "title"])
        abstract = _first_present(info['abstract'])
        keywords = _first_present(info['keywords'])
        introduction = _first_present(info['introduction'])

        row = {
            "DOI": [doi],
            "PMID": [pmid],
            "PMCID": [pmcid],
            "full_text_link": [full_text_link],
            "full_text_source": [full_text_surce],
            "pdf_link": [pdf_link],
            "Title": [title],
            "Abstract": [abstract],
            "Keywords": [keywords],
            "Introduction": [introduction]
        }

        if not plib.add_row_to_csv(output_path, row, columns):
            print("Error detected when adding a row to csv!")
# --------------------start of test code--------------------
# input_path = fpath.poten_litra_filtered
# output_path = fpath.poten_litera_litera_db

# # clear file
# plib.clear_file(output_path)

# info_filling(input_path, output_path)
# ---------------------end of test code---------------------

In [None]:
def count_keyword(text, keyword, keyword_length):
    """Count how many times `keyword` occurs in `text` as a run of whole words.

    The text is reduced to letters and hyphens, lower-cased and split into
    words; every window of `keyword_length` consecutive words is then compared
    with the keyword.

    Parameters
    ----------
    text : str
        Text to search.
    keyword : str
        Keyword or multi-word phrase; compared case-insensitively.
    keyword_length : int
        Number of words in `keyword`.

    Returns
    -------
    int
        Number of matching windows; 0 when `keyword_length` is below 1 or does
        not equal the number of words in `keyword`.
    """
    if keyword_length < 1:
        return 0

    # Keep letters and "-"; turn every whitespace character (including
    # newlines and tabs) into a separator so that words on adjacent lines are
    # not fused together; drop everything else (digits, punctuation, symbols).
    kept = []
    for ch in text:
        if ch.isalpha() or ch == "-":
            kept.append(ch)
        elif ch.isspace():
            kept.append(" ")
    tokens = "".join(kept).lower().split()

    # The text is lower-cased, so the keyword has to be as well.
    target = keyword.lower().split()
    if len(target) != keyword_length:
        return 0

    last_start = len(tokens) - keyword_length
    return sum(
        1
        for begin in range(last_start + 1)
        if tokens[begin:begin + keyword_length] == target
    )
# --------------------start of test code--------------------
# text = 'This apple 6i7s very tasty？、  2but th&e banana this is not delicious at Is all.6'
# keyword = 'this apple'
# count = count_keyword(text, keyword, 2)
# print(count)
# ---------------------end of test code---------------------

In [None]:
def count_freq_from_liter(text, on_topic_kws, type):
    """Count every on-topic keyword in `text`, as raw counts or as frequencies.

    Parameters
    ----------
    text : str
        Text of one paper.
    on_topic_kws : list of str
        Keywords / phrases to look for.
    type : str
        "count" for raw occurrence counts, "frequency" for
        ``ceil(count * 100 / text_length) / 100``. (The name shadows the
        built-in ``type``; it is kept because callers pass it by keyword.)

    Returns
    -------
    dict
        Maps each keyword to its count or frequency.

    Raises
    ------
    Exception
        If `type` is neither "count" nor "frequency".
    """
    # Validate once up front instead of once per keyword.
    if type not in ("count", "frequency"):
        raise Exception("Error! The only two options for type are 'count' or 'frequency'!")

    # Keep letters and "-"; map any whitespace (newlines, tabs, ...) to a
    # space so that words on adjacent lines are not fused; drop the rest.
    kept = []
    for ch in text:
        if ch.isalpha() or ch == "-":
            kept.append(ch)
        elif ch.isspace():
            kept.append(" ")
    text = "".join(kept).strip().lower()

    # NOTE(review): the length is measured in characters, and rounding up means
    # any non-zero count yields at least 0.01 -- confirm this is intended.
    text_length = len(text)

    result = {}
    for keyword in on_topic_kws:
        word_count = count_keyword(text, keyword, len(keyword.split()))
        if type == "count":
            result[keyword] = word_count
        elif text_length == 0:
            # Nothing left after cleaning: avoid dividing by zero.
            result[keyword] = 0.0
        else:
            result[keyword] = math.ceil((word_count*100/text_length))/100
    return result
# --------------------start of test code--------------------
# text = 'Vision for action: thalamic and cortical inputs to the macaque superior tract neural tracing, parietal lobule The dorsal visual stream, the cortical circuit that in the primate brain is mainly dedicated to the visual control of actions, is split into two routes, a lateral and a medial one, both involved in coding different aspects of sensorimotor control of actions. The lateral route, named "lateral grasping network", is mainly involved in the control of the distal part of prehension, namely grasping and manipulation. The medial route, named "reach-to-grasp network", is involved in the control of the full deployment of prehension act, from the direction of arm movement to the shaping of the hand according to the object to be grasped. In macaque monkeys, the reach-to-grasp network (the target of this review) includes areas of the superior parietal lobule (SPL) that hosts visual and somatosensory neurons well suited to control goal-directed limb movements toward stationary as well as moving objects. After a brief summary of the neuronal functional properties of these areas, we will analyze their cortical and thalamic inputs thanks to retrograde neuronal tracers separately injected into the SPL areas V6, V6A, PEc, and PE. These areas receive visual and somatosensory information distributed in a caudorostral, visuosomatic trend, and some of them are directly connected with the dorsal premotor cortex. This review is particularly focused on the origin and type of visual information reaching the SPL, and on the functional role this information can play in guiding limb interaction with objects in structured and dynamic environments. Area PEc; Area V6; Area V6A; Dorsal visual stream; Goal-directed arm movement; Sensorimotor integration.'
# keywords_count_fre = count_freq_from_liter(text, on_topic_kws, type="count")
# print(keywords_count_fre)
# ---------------------end of test code---------------------

In [None]:
def calcul_index(keywords_count_or_fre, on_topic_kws_weights):
    """Return the weighted relatedness score of one paper.

    Each keyword's count (or frequency) is multiplied by its weight and the
    products are summed. The previous version multiplied every value by
    itself and never read the weights.

    Parameters
    ----------
    keywords_count_or_fre : dict
        Maps keyword -> count or frequency.
    on_topic_kws_weights : dict
        Maps keyword -> weight.

    Returns
    -------
    number
        The weighted sum; 0 for an empty input.
    """
    # NOTE(review): keywords without an explicit weight are scored with a
    # weight of 1 (the lowest weight in use) -- confirm this default.
    return sum(
        value * on_topic_kws_weights.get(keyword, 1)
        for keyword, value in keywords_count_or_fre.items()
    )
# --------------------start of test code--------------------
# keywords_count_or_fre = {}
# on_topic_kws_weights = {}
# index = calcul_index(keywords_count_or_fre, on_topic_kws_weights)
# print(index)
# ---------------------end of test code---------------------

In [None]:
def weight_and_rank(input_path, output_path, on_topic_kws_weights):
    """Score every paper by weighted keyword frequency and write them out ranked.

    Reads the header-less CSV at `input_path` (columns DOI, PMID, PMCID,
    full_text_link, full_text_source, pdf_link, Title, Abstract, Keywords,
    Introduction), computes an "index" score per row from the title, abstract,
    keywords and introduction, and writes the rows sorted by descending score
    to `output_path` with a header row.

    The keyword lexicon is taken from the notebook-level `on_topic_kws`.

    Parameters
    ----------
    input_path : path of the CSV to score.
    output_path : path of the ranked CSV to write.
    on_topic_kws_weights : dict mapping keyword -> weight.
    """
    df = pd.read_csv(input_path, header=None, sep=",")
    df.columns = ["DOI", "PMID", "PMCID", "full_text_link", "full_text_source", "pdf_link", "Title", "Abstract", "Keywords", "Introduction"]

    text_columns = ["Title", "Abstract", "Keywords", "Introduction"]
    measure = "frequency"  # alternative: "count"

    scores = []
    for ind in df.index:
        # Missing sections are read as NaN; skip them instead of adding a
        # float to a string.
        parts = [str(df.at[ind, col]) for col in text_columns if pd.notna(df.at[ind, col])]
        text = " ".join(parts)

        if text:
            # The original built a tuple here instead of calling the counter.
            keywords_count_or_fre = count_freq_from_liter(text, on_topic_kws, measure)
            index = calcul_index(keywords_count_or_fre, on_topic_kws_weights)
        else:
            # no text at all for this paper, so it cannot score
            index = 0
        scores.append(index)
        print(ind)

    df["index"] = scores

    # rank
    df = df.sort_values(by=["index"], ascending=False)
    df.to_csv(output_path, header=True, index=False)
    print("Weighting and ranking the potentially related literature succeded!")
    print("Enjoy reading!")
# --------------------start of test code--------------------
# test code
# ---------------------end of test code---------------------

<h3> Main program: </h3> 

In [None]:
# Load the filtered candidate list and list every distinct full_text_source host.
input_path = fpath.poten_litra_filtered
columns = ["DOI", "PMID", "PMCID", "Title", "full_text_link", "full_text_source", "pdf_link"]
df = pd.read_csv(input_path, header=None, sep=",")
df.columns = columns

# quick look at the first rows and the overall size
print(df.head(5))
print(df.shape)
# # (10768, 7)

# distinct hosts that serve the full texts
full_text_source = set(df['full_text_source'])
print(full_text_source)
# {'karger.com', 'www.rbojournal.org', 'journals.sagepub.com', 'n.neurology.org', 'pubs.asahq.org', 'jpet.aspetjournals.org', 
#  'www.thieme-connect.de', 'www.taylorfrancis.com', 'lww.com', 'neurologia.com', 'ekja.org', 'neuro.psychiatryonline.org', 
#  'pharmrev.aspetjournals.org', 'www.imrpress.com', 'europepmc.org', 'link.springer.com', 'www.theses.fr', 'ieeexplore.ieee.org', 
#  'papers.ssrn.com', 'www.nature.com', 'www.liebertpub.com', 'academic.oup.com', 'open.bu.edu', 'www.elibrary.ru', 
#  'journals.biologists.com', 'pubs.aip.org', 'pure.mpg.de', 'wakespace.lib.wfu.edu', 'www.cambridge.org', 
#  'nrc-prod.literatumonline.com', 'pubs.acs.org', 'var.scholarpedia.org', 'webview.isho.jp', 'submissions.mirasmart.com', 
#  'www.jstage.jst.go.jp', 'ajp.psychiatryonline.org', 'psycnet.apa.org:443', 'thejns.org', 'www.microbiologyresearch.org', 
#  'onlinelibrary.wiley.com', 'jnm.snmjournals.org', 'www.degruyter.com', 'www.worldscientific.com', 'opg.optica.org', 
#  'journals.lww.com', 'www.science.org', 'journals.aps.org', 'ujms.net', 'direct.mit.edu', 'www.biorxiv.org', 
#  'www.annualreviews.org', 'elibrary.ru', 'www.ingentaconnect.com', 'escholarship.mcgill.ca', 'symposium.cshlp.org', 
#  'www.architalbiol.org', 'iovs.arvojournals.org', 'jamanetwork.com', 'linkinghub.elsevier.com', 'www.ncbi.nlm.nih.gov', 
#  'www.cabdirect.org', 'books.google.de', 'content.iospress.com:443', 'www.tandfonline.com', 'www.ajtmh.org', 
#  'royalsocietypublishing.org', 'www.ahajournals.org', 'journals.physiology.org'}

# ["DOI", "PMID", "PMCID", "Title", "full_text_link", "full_text_source", "pdf_link"]
# For each column, report whether it contains any missing value.
for column_name in ("DOI", "PMID", "PMCID", "Title", "full_text_link", "full_text_source", "pdf_link"):
    print(df[column_name].isnull().any())
# True, True, True, False, False, False, True

# For each column, report the dtype pandas inferred.
for column_name in ("DOI", "PMID", "PMCID", "Title", "full_text_link", "full_text_source", "pdf_link"):
    print(df[column_name].dtype)
# object
# float64
# object
# object
# object
# object
# object

In [None]:
# websites_hosts
# {'karger.com', 'www.rbojournal.org', 'journals.sagepub.com', 'n.neurology.org', 'pubs.asahq.org', 'jpet.aspetjournals.org', 
#  'www.thieme-connect.de', 'www.taylorfrancis.com', 'lww.com', 'neurologia.com', 'ekja.org', 'neuro.psychiatryonline.org', 
#  'pharmrev.aspetjournals.org', 'www.imrpress.com', 'europepmc.org', 'link.springer.com', 'www.theses.fr', 'ieeexplore.ieee.org', 
#  'papers.ssrn.com', 'www.nature.com', 'www.liebertpub.com', 'academic.oup.com', 'open.bu.edu', 'www.elibrary.ru', 
#  'journals.biologists.com', 'pubs.aip.org', 'pure.mpg.de', 'wakespace.lib.wfu.edu', 'www.cambridge.org', 
#  'nrc-prod.literatumonline.com', 'pubs.acs.org', 'var.scholarpedia.org', 'webview.isho.jp', 'submissions.mirasmart.com', 
#  'www.jstage.jst.go.jp', 'ajp.psychiatryonline.org', 'psycnet.apa.org:443', 'thejns.org', 'www.microbiologyresearch.org', 
#  'onlinelibrary.wiley.com', 'jnm.snmjournals.org', 'www.degruyter.com', 'www.worldscientific.com', 'opg.optica.org', 
#  'journals.lww.com', 'www.science.org', 'journals.aps.org', 'ujms.net', 'direct.mit.edu', 'www.biorxiv.org', 
#  'www.annualreviews.org', 'elibrary.ru', 'www.ingentaconnect.com', 'escholarship.mcgill.ca', 'symposium.cshlp.org', 
#  'www.architalbiol.org', 'iovs.arvojournals.org', 'jamanetwork.com', 'linkinghub.elsevier.com', 'www.ncbi.nlm.nih.gov', 
#  'www.cabdirect.org', 'books.google.de', 'content.iospress.com:443', 'www.tandfonline.com', 'www.ajtmh.org', 
#  'royalsocietypublishing.org', 'www.ahajournals.org', 'journals.physiology.org'}
# Host names shortened to the part shared by all sub-domains of a publisher.
# The order matters: the counting loop stops at the first host that matches.
websites_hosts = [
    'karger.com', 'rbojournal.org', 'sagepub.com', 'neurology.org',
    'asahq.org', 'aspetjournals.org', 'thieme-connect.de', 'taylorfrancis.com',
    'lww.com', 'neurologia.com', 'ekja.org', 'www.imrpress.com',
    'europepmc.org', 'springer.com', 'theses.fr', 'ieee.org',
    'ssrn.com', 'nature.com', 'liebertpub.com', 'oup.com',
    'open.bu.edu', 'journals.biologists.com', 'aip.org', 'mpg.de',
    'lib.wfu.edu', 'cambridge.org', 'literatumonline.com', 'acs.org',
    'scholarpedia.org', 'isho.jp', 'mirasmart.com', 'jstage.jst.go.jp',
    'psychiatryonline.org', 'psycnet.apa.org', 'thejns.org', 'microbiologyresearch.org',
    'wiley.com', 'snmjournals.org', 'degruyter.com', 'worldscientific.com',
    'opg.optica.org', 'science.org', 'aps.org', 'ujms.net',
    'mit.edu', 'biorxiv.org', 'annualreviews.org', 'elibrary.ru',
    'www.ingentaconnect.com', 'mcgill.ca', 'symposium.cshlp.org', 'architalbiol.org',
    'arvojournals.org', 'jamanetwork.com', 'elsevier.com', 'ncbi.nlm.nih.gov',
    'cabdirect.org', 'books.google.de', 'iospress.com', 'tandfonline.com',
    'ajtmh.org', 'royalsocietypublishing.org', 'ahajournals.org', 'physiology.org',
]
# --------------------start of test code--------------------
# a set drops repeated entries, so a shorter set means the list has duplicates
print(
    "There are duplicates in the list."
    if len(set(websites_hosts)) != len(websites_hosts)
    else "There are no duplicates in the list."
)
# ---------------------end of test code---------------------

In [None]:
# Count how many articles each host serves, then order the hosts by that count.
input_path = fpath.poten_litra_filtered
columns = ["DOI", "PMID", "PMCID", "Title", "full_text_link", "full_text_source", "pdf_link"]
df = pd.read_csv(input_path, header=None, sep=",")
df.columns = columns

# every known host starts with zero articles
func_dict = dict.fromkeys(websites_hosts, 0)

for ind in df.index:
    source = df.loc[ind, "full_text_source"]
    # credit the article to the first host (in list order) found in its source
    for website in websites_hosts:
        if website not in source:
            continue
        func_dict[website] += 1
        break

# Most frequent host first; the sort is stable, so ties keep their list order.
sorted_dict = dict(sorted(func_dict.items(), key=lambda pair: pair[1], reverse=True))
print(sorted_dict)
# {
#     'ncbi.nlm.nih.gov': 7886, 'elsevier.com': 1019, 'wiley.com': 696, 'springer.com': 285, 'physiology.org': 205, 
#     'oup.com': 152, 'cambridge.org': 74, 'karger.com': 53, 'lww.com': 48, 'nature.com': 44, 'science.org': 30, 
#     'tandfonline.com': 29, 'sagepub.com': 21, 'jamanetwork.com': 20, 'neurology.org': 16, 'biorxiv.org': 15, 
#     'royalsocietypublishing.org': 13, 'psycnet.apa.org': 12, 'arvojournals.org': 12, 'jstage.jst.go.jp': 11, 
#     'psychiatryonline.org': 11, 'europepmc.org': 10, 'mit.edu': 10, 'thejns.org': 8, 'annualreviews.org': 8, 
#     'snmjournals.org': 7, 'aspetjournals.org': 6, 'elibrary.ru': 5, 'books.google.de': 5, 'architalbiol.org': 4, 
#     'ahajournals.org': 4, 'liebertpub.com': 3, 'acs.org': 3, 'degruyter.com': 3, 'worldscientific.com': 3, 
#     'iospress.com': 3, 'asahq.org': 2, 'thieme-connect.de': 2, 'neurologia.com': 2, 'mpg.de': 2, 'opg.optica.org': 2, 
#     'mcgill.ca': 2, 'rbojournal.org': 1, 'taylorfrancis.com': 1, 'ekja.org': 1, 'www.imrpress.com': 1, 'theses.fr': 1, 
#     'ieee.org': 1, 'ssrn.com': 1, 'open.bu.edu': 1, 'journals.biologists.com': 1, 'aip.org': 1, 'lib.wfu.edu': 1, 
#     'literatumonline.com': 1, 'scholarpedia.org': 1, 'isho.jp': 1, 'mirasmart.com': 1, 'microbiologyresearch.org': 1, 
#     'aps.org': 1, 'ujms.net': 1, 'www.ingentaconnect.com': 1, 'symposium.cshlp.org': 1, 'cabdirect.org': 1, 'ajtmh.org': 1
#  }

# hosts that serve at least one article, still ordered by article count
non_zero_keys = [host for host, n_articles in sorted_dict.items() if n_articles != 0]
print(non_zero_keys)
# [
#     'ncbi.nlm.nih.gov', 'elsevier.com', 'wiley.com', 'springer.com', 'physiology.org', 'oup.com', 'cambridge.org', 
#     'karger.com', 'lww.com', 'nature.com', 'science.org', 'tandfonline.com', 'sagepub.com', 'jamanetwork.com', 
#     'neurology.org', 'biorxiv.org', 'royalsocietypublishing.org', 'psycnet.apa.org', 'arvojournals.org', 'jstage.jst.go.jp', 
#     'psychiatryonline.org', 'europepmc.org', 'mit.edu', 'thejns.org', 'annualreviews.org', 'snmjournals.org', 
#     'aspetjournals.org', 'elibrary.ru', 'books.google.de', 'architalbiol.org', 'ahajournals.org', 'liebertpub.com', 
#     'acs.org', 'degruyter.com', 'worldscientific.com', 'iospress.com', 'asahq.org', 'thieme-connect.de', 'neurologia.com', 
#     'mpg.de', 'opg.optica.org', 'mcgill.ca', 'rbojournal.org', 'taylorfrancis.com', 'ekja.org', 'www.imrpress.com', 
#     'theses.fr', 'ieee.org', 'ssrn.com', 'open.bu.edu', 'journals.biologists.com', 'aip.org', 'lib.wfu.edu', 
#     'literatumonline.com', 'scholarpedia.org', 'isho.jp', 'mirasmart.com', 'microbiologyresearch.org', 'aps.org', 
#     'ujms.net', 'www.ingentaconnect.com', 'symposium.cshlp.org', 'cabdirect.org', 'ajtmh.org'
#  ]

In [None]:
# websites
# Hosts that serve at least one article, most frequent first.
websites = [
    'ncbi.nlm.nih.gov', 'elsevier.com', 'wiley.com', 'springer.com',
    'physiology.org', 'oup.com', 'cambridge.org', 'karger.com',
    'lww.com', 'nature.com', 'science.org', 'tandfonline.com',
    'sagepub.com', 'jamanetwork.com', 'neurology.org', 'biorxiv.org',
    'royalsocietypublishing.org', 'psycnet.apa.org', 'arvojournals.org', 'jstage.jst.go.jp',
    'psychiatryonline.org', 'europepmc.org', 'mit.edu', 'thejns.org',
    'annualreviews.org', 'snmjournals.org', 'aspetjournals.org', 'elibrary.ru',
    'books.google.de', 'architalbiol.org', 'ahajournals.org', 'liebertpub.com',
    'acs.org', 'degruyter.com', 'worldscientific.com', 'iospress.com',
    'asahq.org', 'thieme-connect.de', 'neurologia.com', 'mpg.de',
    'opg.optica.org', 'mcgill.ca', 'rbojournal.org', 'taylorfrancis.com',
    'ekja.org', 'www.imrpress.com', 'theses.fr', 'ieee.org',
    'ssrn.com', 'open.bu.edu', 'journals.biologists.com', 'aip.org',
    'lib.wfu.edu', 'literatumonline.com', 'scholarpedia.org', 'isho.jp',
    'mirasmart.com', 'microbiologyresearch.org', 'aps.org', 'ujms.net',
    'www.ingentaconnect.com', 'symposium.cshlp.org', 'cabdirect.org', 'ajtmh.org',
]
# --------------------start of test code--------------------
# a set drops repeated entries, so a shorter set means the list has duplicates
print(
    "There are duplicates in the list."
    if len(set(websites)) != len(websites)
    else "There are no duplicates in the list."
)
# ---------------------end of test code---------------------

In [None]:
# step 1: extract and filling text
input_path = fpath.poten_litra_filtered
output_path = fpath.poten_litera_litera_db

# clear file
plib.clear_file(output_path)

# Process every row that is actually in the input instead of a hard-coded
# row count (previously 10768), which breaks as soon as the input changes.
n_rows = len(pd.read_csv(input_path, header=None, sep=","))
info_filling(input_path, output_path, 0, n_rows)

In [None]:
# # step 2: assign index to each literature and rank them
# input_path = fpath.poten_litera_litera_db
# output_path = fpath.poten_litera_litera_db_ranked

# # clear file
# plib.clear_file(output_path)

# weight_and_rank(input_path, output_path, on_topic_kws_weights)

<h3> Next step: manually read papers and find all actually related literature </h3>