In [1]:
import pickle
import pandas as pd

In [2]:
import re

def clean_text(text):
    """Normalize free text (e.g. a pasted abstract) into one tidy line.

    The steps run in this order:
      1. remove URLs (any run of non-whitespace characters starting with "http"),
      2. undo hyphenation across a line break (a hyphen immediately followed
         by a newline is deleted, joining the two word halves),
      3. collapse every run of whitespace, newlines included, to one space,
      4. delete whitespace that sits directly before "?", ".", "!" or ",".

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string with leading and trailing whitespace stripped.
    """
    cleaned_text = re.sub(r'http\S+', '', text)  # Remove URLs
    cleaned_text = re.sub(r'-\n', '', cleaned_text)  # Remove Hyphenations
    # Raw string on purpose: "\s" inside a plain literal is an invalid escape
    # sequence, which newer Python versions warn about and will eventually reject.
    cleaned_text = re.sub(r"\s+", " ", cleaned_text)  # Remove Duplicate Spaces
    cleaned_text = re.sub(r"\s+([?.!,])", r"\1", cleaned_text)  # Remove Spaces Before Punctuation
    return cleaned_text.strip()

def save_df(df, name):
    """Write ``df`` to disk three times, using ``name`` as the base file name.

    Produces ``{name}.csv`` (UTF-8, without the index column), ``{name}.xlsx``
    (without the index column, with the header row) and ``{name}.pkl``.

    Args:
        df: The DataFrame to persist.
        name: Base name (or path prefix) shared by the three output files.
    """
    csv_path = f'{name}.csv'
    excel_path = f'{name}.xlsx'
    pickle_path = f'{name}.pkl'

    df.to_csv(csv_path, encoding='utf-8', index=False)
    df.to_excel(excel_path, index=False, header=True)
    df.to_pickle(pickle_path)

def load_df(name, filetype='csv', verbose=False):
    """Load a DataFrame from ``{name}.{filetype}``.

    Args:
        name: Base name (or path prefix) of the file, without extension.
        filetype: One of ``'csv'``, ``'xlsx'`` or ``'pkl'``.
        verbose: When true, print the resolved file name before loading.

    Returns:
        The DataFrame read from the file.

    Raises:
        ValueError: If ``filetype`` is not one of the supported extensions.
            (``ValueError`` is an ``Exception`` subclass, so callers that
            catch ``Exception`` keep working.)
    """
    filename = f"{name}.{filetype}"
    if verbose:
        print(f"File Name: {filename}")

    # NOTE: read_pickle can execute arbitrary code while unpickling; only load
    # .pkl files that this notebook (or another trusted source) produced.
    readers = {
        'csv': pd.read_csv,
        'xlsx': pd.read_excel,
        'pkl': pd.read_pickle,
    }
    reader = readers.get(filetype)
    if reader is None:
        supported = ", ".join(readers)
        raise ValueError(f"Invalid File Type: {filetype!r} (expected one of: {supported})")
    return reader(filename)

def remove_unnamed_columns(name, filetype='csv'):
    """Strip "unnamed" columns from a stored dataset and re-save it.

    Loads ``{name}.{filetype}`` via ``load_df``, removes every column whose
    label contains "unnamed" (case-insensitive), and writes the result back
    through ``save_df`` under the same base name.

    Args:
        name: Base name of the stored dataset.
        filetype: Extension to load from; forwarded to ``load_df``.
    """
    frame = load_df(name, filetype)
    is_unnamed = frame.columns.str.contains('unnamed', case=False)
    unnamed_labels = frame.columns[is_unnamed]
    frame = frame.drop(columns=unnamed_labels)
    save_df(frame, name)

def sample_df(name, filetype='csv', frac=None, n=None, save=False, random_state=None):
    """Load a stored dataset and return a random subset of its rows.

    Args:
        name: Base name of the stored dataset (loaded via ``load_df``).
        filetype: Extension to load from; forwarded to ``load_df``.
        frac: Fraction of rows to draw; forwarded to ``DataFrame.sample``.
        n: Number of rows to draw; forwarded to ``DataFrame.sample``.
        save: When true, also persist the subset via ``save_df`` under the
            base name ``Sub{name}``.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            same subset can be reproduced. ``None`` keeps the draw random.

    Returns:
        The sampled subset (not the full dataset).
    """
    # Fixed: this previously referenced the undefined name ``filtype``.
    df = load_df(name, filetype=filetype)
    # ``sample`` already returns a new DataFrame, so no defensive copy is needed.
    sub_df = df.sample(frac=frac, n=n, random_state=random_state)
    if save:
        save_df(sub_df, f"Sub{name}")
    # Fixed: return the subset; the full frame used to be returned by mistake.
    return sub_df

In [3]:
# !python -m pip install tf-keras
from transformers import pipeline

# Zero-shot classification pipeline used by on_topic(). It is built once when
# this cell runs, so every on_topic() call reuses the same pipeline object.
classifier = pipeline("zero-shot-classification", model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli")
# Alternative checkpoint, currently disabled:
# classifier = pipeline("zero-shot-classification", model="MoritzLaurer/deberta-v3-large-zeroshot-v2.0")

def on_topic(text, verbose=False, topics=None):
    """Return whether ``text`` matches every required topic.

    Each topic is scored on its own by the module-level zero-shot
    ``classifier`` (one call per topic, with that topic as the only candidate
    label). The text is on topic only if every score reaches its threshold;
    checking stops at the first topic that falls short.

    Args:
        text: The passage to classify (e.g. a paper abstract).
        verbose: When true, print the score obtained for each checked topic.
        topics: Optional iterable of ``(label, threshold)`` pairs. Defaults to
            ``[("ecology", 0.9), ("interaction", 0.9)]``.

    Returns:
        ``True`` if every topic scores at or above its threshold (vacuously
        ``True`` when ``topics`` is empty), otherwise ``False``.
    """
    if topics is None:
        # Topic and Threshold
        # The current model seems to be more confident than the one used
        # previously, so the thresholds are set high.
        topics = [("ecology", 0.9), ("interaction", 0.9)]

    for topic, threshold in topics:
        result = classifier(text, [topic])
        score = result["scores"][0]
        if verbose:
            print(score)
        if score < threshold:
            return False
    return True

# print(on_topic(examples[0]["Abstract"], verbose=True))
# print(on_topic(examples[1]["Abstract"]))
# print(on_topic(examples[2]["Abstract"]))
# print(on_topic(examples[3]["Abstract"]))
# print(on_topic("Fatal familial insomnia (FFI) and a subtype of familial Creutzfeldt-Jakob disease (CJD), two clinically and pathologically distinct diseases, are linked to the same mutation at codon 178 (Asn 178 ) of the prion protein gene. The possibility that a second genetic component modified the phenotypic expression of the Asn 178 mutation was investigated. FFI and the familial CJD subtype segregated with different genotypes determined by the Asn 178 mutation and the methionine-valine polymorphism at codon 129. The Met 129 , Asn 178 allele segregated with FFI in all 15 affected members of five kindreds whereas the Val 129 , Asn 178 allele segregated with the familial CJD subtype in all 15 affected members of six kindreds. Thus, two distinct disease phenotypes linked to a single pathogenic mutation can be determined by a common polymorphism.", verbose=True))
# print(on_topic("Metacognition, the ability to reflect upon, and evaluate our own beliefs, can help us avoid making decisions based on unreliable evidence. Here, we provide empirical tests of the importance of human metacognition during the COVID-19 pandemic. ...Metacognition, our ability to reflect on our own beliefs, manifests itself in the confidence we have in these beliefs, and helps us guide our behavior in complex and uncertain environments. Here, we provide empirical tests of the importance of...", verbose=True))
# print(on_topic("The serotonin transporter (5-HTT) regulates serotonergic neurotransmission and is thought to influence emotion. A 5-HTT-linked polymorphic region (5-HTTLPR) has two common variants, short (s) and long (l). We previously found population and within-family associations between the lower-expressing s allele and neuroticism, a trait related to anxiety, hostility, and depression, on a standard measure (the NEO Personality Inventory, Revised [NEO-PI-R]) in a primarily male population (n=505), and that the s allele was dominant. We investigated this association in a new sample (n=397, 84% female, primarily sib-pairs). The results robustly replicated the 5-HTTLPR neuroticism association, and the dominance of the s allele. Combined data from the two studies (n=902) showed a highly significant association between the s allele and higher NEO Neuroticism both across individuals and within families. Association between genotype and a related measure, Anxiety on the 16PF inventory, was replicated in the new population and within families in the combined sample. Association to another trait, estimated TPQ Harm Avoidance, was not replicated in the new sample but found only within the combined sibship group. Another association found in our original study, between the s allele and lower scores on NEO-PI-R Agreeableness, was also replicated and was more robust in the current and the combined samples. Associations between the functional 5-HTTLPR polymorphism were similar in women and men. These results help to define specific personality features reproducibly associated with 5-HTTLPR genotype. Such associations were strongest for traits defined by the NEO, enhancing the attractiveness of the five-factor personality model in genetic research on complex behavioral dimensions. Am. J. Med. Genet. (Neuropsychiatr. Genet.) 96:202â€“216, 2000. Published 2000 Wiley-Liss, Inc.", verbose=True))
# print(on_topic("This investigation examines the role of trait-mediated indirect interactions in a simple aquatic food web. We conducted the experiments in cattle watering tanks in order to establish whether competitive and predatorâ€“prey interactions between two species are affected by other species in the system; i.e., are pairwise interaction strengths affected by the background species assemblage? We examined the survival and growth response of small bullfrog (Rana catesbeiana) and small green frog (Rana clamitans) tadpoles in the presence and absence of a competitor (large bullfrogs), the lethal presence of the larval odonate predator Tramea lacerata, and the nonlethal (caged) presence of the larval odonate predators Anax junius and Anax longipes. We demonstrate that large bullfrog competitors and caged Anax affect traits (foraging activity level) of small bullfrog and small green frog tadpoles and that these changes in traits, in turn, affect interactions of the small tadpole species with each other and with the other species. In particular, the following four trait-mediated indirect interactions were evident: (1) Presence of large bullfrog competitors increased the predation rate of Tramea on small green frogs and small bullfrogs. (2) Presence of nonlethal Anax reduced the predation rate of Tramea on small green frogs. (3) Presence of nonlethal Anax increased the competitive advantage of bullfrogs over green frogs. (4) Presence of nonlethal Anax facilitated midge invasion of the experimental units. The proposed mechanisms (changes in small tadpole activity) involved in these trait-mediated indirect interactions were supported by observational data on tadpole activity and resource levels in the experimental units, and in laboratory experiments examining tadpole activity responses to predators. 
# The occurrence of strong trait-mediated indirect interactions in this simple food web underscores the potential importance of such interactions in animal communities.", verbose=True))

  from .autonotebook import tqdm as notebook_tqdm





Device set to use cuda:0


In [4]:
# Column layout shared by the DataFrames built in this notebook
# (search_papers() fills these four fields for every work it returns).
columns = ["Title", "Abstract", "DOI", "Score"]

In [5]:
# Load Examples (of TMII Papers)
# Hand-entered reference papers. Only their abstracts are used; "DOI" is left
# empty and "Score" is a placeholder 0. Each abstract is pasted with its
# original line breaks and normalized through clean_text().
# NOTE(review): "TMII" presumably stands for trait-mediated indirect
# interaction - confirm.
examples = [
    {
        "Title": "Dynamic population stage structure due to juvenile–adult asymmetry stabilizes complex ecological communities",
        "Abstract": clean_text('''
            Using food web models that account for juvenile and adult
            individuals of species, I show that commonly observed differences between juveniles and adults in foraging capacity
            and predation risk result in larger, more complex communities than predicted by models without stage structure.
            Based on their species interaction networks these complex
            and diverse communities would be expected to be unstable, but these destabilizing effects of species interactions are
            overruled by stabilizing changes in juvenile–adult stage structure. Differences between juvenile and adult individuals hence
            offer a natural resolution to the diversity–stability enigma of
            ecological communities.
        '''),
        "DOI": "",
        "Score": 0
    },
    {
        "Title": "EFFECTS OF TOP PREDATOR SPECIES ON DIRECT AND INDIRECT INTERACTIONS IN A FOOD WEB",
        "Abstract": clean_text('''
            Current theory on trophic interactions in food webs assumes thatecologically
            similar species can be treated collectively as a single functional unit such as a guild or
            trophic level. This theory implies that all species within that unit transmit identical direct
            and indirect effects throughout the community. We evaluated this assumption by conducting
            experiments to compare the direct and indirect effects of three top-predator species, be
            longing to the same hunting spider guild, on the same species of grasshopper and on old
            field grasses and herbs. Observations under field conditions revealed that each spiderspecies
            exhibited different hunting behavior (i.e., sit-and-wait, sit-and-pursue, and active hunting)
            and occupied different locations within the vegetation canopy. These differences resulted
            in different direct effects on grasshopper prey. Grasshoppers demonstrated significant be
            havioral (diet) shifts in the presence of sit-and-wait and sit-and-pursue species but not when
            faced with actively hunting species. Grasshopper density was significantly reduced byspider
            species that occupied lower parts of the vegetation canopy (sit-and-pursue and actively
            hunting species), but it was not significantly reduced by the sit-and-wait spider species that
            occupied the upper parts of the canopy. These direct effects manifested themselves differ
            ently in the plant trophic level. The sit-and-wait spider caused indirect effects on plants
            by changing grasshopper foraging behavior (a trait-mediated effect). The sit-and-pursue
            spider caused indirect effects by reducing grasshopper density (density-mediated effects);
            the effects of changes in grasshopper behavior were thus not reflected in the plant trophic
            level. The actively hunting spiders had strictly density-mediated indirect effects on plants.
            The study offers mechanistic insight into how predator species within the same guild can
            have very different trophic effects in food webs. Thus classical modeling approaches that
            treat all predator species as a single functional unit may not adequately capture biologically
            relevant details that influence community dynamics.
        '''),
        "DOI": "",
        "Score": 0
    },
    {
        "Title": "Higher-Order Interaction between Species Inhibits Bacterial Invasion of a Phototroph-Predator Microbial Community",
        "Abstract": clean_text('''
            The composition of an ecosystem is thought to be
            important for determining its resistance to invasion.
            Studies of natural ecosystems, from plant to microbial communities, have found that more diverse communities are more resistant to invasion. In some
            cases, more diverse communities resist invasion by more completely consuming the resources necessary for the invader. We show that Escherichia
            coli can successfully invade cultures of the alga Chlamydomonas reinhardtii (phototroph) or the ciliate
            Tetrahymena thermophila (predator) but cannot invade a community where both are present. The invasion resistance of the algae-ciliate community
            arises from a higher-order interaction between species (interaction modification) that is unrelated to resource consumption. We show that the mode of
            this interaction is the algal inhibition of bacterial aggregation, which leaves bacteria vulnerable to predation. This mode requires both the algae and the ciliate
            to be present and provides an example of invasion resistance through an interaction modification.
        '''),
        "DOI": "",
        "Score": 0
    },
    {
        "Title": "Multiple predator effects result in risk reduction for prey across multiple prey densities",
        "Abstract": clean_text('''
            Investigating how prey density influences a
            prey’s combined predation risk from multiple predator
            species is critical for understanding the widespread
            importance of multiple predator effects. We conducted
            experiments that crossed six treatments consisting of
            zero, one, or two predator species (hellgrammites,
            greenside darters, and creek chubs) with three treat
            ments in which we varied the density of mayfly prey.
            None of the multiple predator effects in our system were
            independent, and instead, the presence of multiple
            predator species resulted in risk reduction for the prey
            across both multiple predator combinations and all
            three levels of prey density. Risk reduction is likely to
            have population-level consequences for the prey,
            resulting in larger prey populations than would be pre
            dicted if the effects of multiple predator species were
            independent. For one of the two multiple predator
            combinations, the magnitude of risk reduction margin
            ally increased with prey density. As a result, models
            predicting the combined risk from multiple predator
            species in this system will sometimes need to account for
            prey density as a factor influencing per-capita prey death
            rates.
        '''),
        "DOI": "",
        "Score": 0
    },
]

# # Create Dataset
# examples_df = pd.DataFrame([], columns=columns)
# for example in examples:
#     example = pd.DataFrame([[example["Title"], example["Abstract"], example["DOI"], example["Score"]]],  columns=columns)
#     examples_df = pd.concat([examples_df, example])
# examples_df.reset_index(drop=True, inplace=True)

# # Store Dataset (CSV and Pickle)
# save_df(examples_df, "Examples")

In [6]:
# Level 1 Keywords
keywords_1 = [
    "trait",
    "phenotype"
]

# Level 2 Keywords
keywords_2 = [
    "trait-mediated",
    "higher-order interaction",
    "polymorphism",
    "interaction modification",
    "indirect effect"
]

# Level 3 Keywords
keywords_3 = [
    "apparent competition",
    "resource competition",
    "keystone predation",
    "intraguild predation",
    "intransitive competition",
    "trophic chain",
    "competition chain",
    "mutual competition"
]

# Broken Level 3 Keywords
# The level 3 phrases split into their individual words, e.g. "apparent" and
# "competition" instead of "apparent competition".
# dict.fromkeys removes duplicates while keeping first-seen order. A set would
# give a different order on every kernel start (string hashing is randomized),
# which would make the printed list and the search URL non-reproducible.
broken_keywords_3 = list(dict.fromkeys(" ".join(keywords_3).split(" ")))
print(f"Broken Keywords: {broken_keywords_3}")

# Search strings for the OpenAlex title.search filter: alternatives are joined
# with "|" (OR) and spaces are percent-encoded so they can be put in a URL.
search_1 = "|".join(keywords_1).replace(" ", "%20")
search_2 = "|".join(keywords_2).replace(" ", "%20")
search_3 = "|".join(keywords_3).replace(" ", "%20")
search_3_broken = "|".join(broken_keywords_3).replace(" ", "%20")

Broken Keywords: ['mutual', 'trophic', 'intransitive', 'competition', 'keystone', 'predation', 'intraguild', 'resource', 'apparent', 'chain']


In [20]:
import requests

def revert_abstract(inverted_abstract):
    """Rebuild plain text from an inverted index (token -> list of positions).

    This is the shape of the ``abstract_inverted_index`` field that
    ``search_papers`` passes in. Tokens are emitted in position order starting
    at 0 and joined with single spaces, except that no space is inserted
    before a token that is exactly ".", ",", "?" or "!". Reconstruction stops
    at the first position that no token claims.

    Args:
        inverted_abstract: Mapping of token to the positions where it occurs,
            or a falsy value (``None`` / empty mapping) when there is no
            abstract.

    Returns:
        The reconstructed text, or ``""`` when the input is falsy.
    """
    if not inverted_abstract:
        return ""

    # Invert the index once (position -> token) instead of rescanning every
    # token list for every position. If a malformed index lists a position
    # under several tokens, the first token in mapping order wins.
    token_at = {}
    for token, positions in inverted_abstract.items():
        for position in positions:
            token_at.setdefault(position, token)

    no_space_before = (".", ",", "?", "!")
    pieces = []
    has_text = False  # true once any non-empty text has been emitted
    position = 0
    while position in token_at:
        token = token_at[position]
        if has_text and token not in no_space_before:
            pieces.append(" ")
        pieces.append(token)
        has_text = has_text or bool(token)
        position += 1
    return "".join(pieces)

def search_papers(searches, verbose=False, timeout=30):
    """Query the OpenAlex works endpoint by title and collect the matches.

    Every entry of ``searches`` becomes one ``title.search:`` filter; the
    filters are comma-joined into a single ``filter=`` parameter. Pages are
    requested one after another (starting at page 1) until a response has no
    ``results`` key or an empty result list.

    Args:
        searches: Iterable of URL-ready search strings (e.g. ``search_1``).
        verbose: When true, print each requested URL, then the number of
            works found and the first ten titles.
        timeout: Seconds to wait for each HTTP response before ``requests``
            raises, so a stalled connection cannot hang the notebook.

    Returns:
        A DataFrame laid out per the module-level ``columns`` list: title,
        abstract text rebuilt with ``revert_abstract``, DOI, and a score
        initialised to 0.
    """
    search_filter = ','.join(f'title.search:{search}' for search in searches)

    # NOTE(review): OpenAlex documents a cap on how far page-number paging can
    # go; past it the response presumably has no 'results' key and the loop
    # ends quietly with a truncated list. Consider cursor paging for very
    # large result sets.
    works = []
    page = 1
    while True:
        url = f"https://api.openalex.org/works?page={page}&filter={search_filter}"
        if verbose:
            print(f"URL: {url}")
        payload = requests.get(url, timeout=timeout).json()
        results = payload.get('results')
        if not results:
            break
        works += results
        page += 1

    # Print Size and 1st 10 Titles
    if verbose:
        num_works = len(works)
        print(f"Number Works: {num_works}")
        for work in works[:10]:
            print(work["title"])

    # Create Dataset
    # Build all rows first and construct the frame once; concatenating one
    # single-row frame per work copies the accumulated frame every time.
    rows = [
        [
            work.get("title"),
            revert_abstract(work.get("abstract_inverted_index")),
            work.get("doi"),
            0,
        ]
        for work in works
    ]
    return pd.DataFrame(rows, columns=columns)

def on_topic_filter(row):
    """Row predicate for ``df.apply(..., axis=1)``: keep rows whose abstract is on topic.

    Args:
        row: A row (or any mapping) with an ``'Abstract'`` entry.

    Returns:
        ``False`` when the abstract is missing, not a string, or blank;
        otherwise the result of ``on_topic(abstract)``.
    """
    abstract = row['Abstract']
    # An empty abstract that went through save_df/load_df as CSV comes back as
    # NaN (a float), which is truthy. A plain ``not abstract`` check would let
    # it through to the classifier, so require a non-blank string.
    if not isinstance(abstract, str) or not abstract.strip():
        return False

    # Imported here so the function does not depend on a later cell having
    # imported torch; repeated imports are a cheap lookup.
    import torch

    # Release PyTorch's unused cached GPU memory before each classification.
    torch.cuda.empty_cache()
    return on_topic(abstract)

In [23]:
import torch

# Dataset A
# df_a = search_papers([search_1], verbose=False)
# save_df(df_a, "A")

# Dataset B
# df_b = search_papers([search_1, search_2], verbose=False)
# save_df(df_b, "B")

# Dataset B + Filtered
# This portion is unable to run because my memory keeps running out.
# I'll try it out on HiPerGator as soon as that works.
# df_b_filtered = df_b[df_b.apply(on_topic_filter, axis=1)]
# save_df(df_b_filtered, "BFiltered")

# Dataset C
# df_c = search_papers([search_1, search_2, search_3], verbose=False)
# save_df(df_c, "C")

# Dataset C + Filtered
# df_c_filtered = df_c[df_c.apply(on_topic_filter, axis=1)]
# save_df(df_c_filtered, "CFiltered")

# Dataset D
# df_d = search_papers([search_1, search_2, search_3_broken], verbose=False)
# save_df(df_d, "D")

# Dataset D + Filtered
# df_d_filtered = df_d[df_d.apply(on_topic_filter, axis=1)]
# save_df(df_d_filtered, "DFiltered")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [11]:
# A and B Subsets
# Datasets A and B are too large (so it takes too long to run).
# I'll use a subset of each.
# The filtering function should be able to run now.

# Subset of Dataset A
# sub_df_a = sample_df("A", n=150, save=True)

# # Subset of Dataset A + Filtered
# sub_df_a_filtered = sub_df_a[sub_df_a.apply(on_topic_filter, axis=1)]
# save_df(sub_df_a_filtered, "SubAFiltered")

# # Subset of Dataset B
# sub_df_b = sample_df("B", n=150, save=True)

# # Subset of Dataset B + Filtered
# sub_df_b_filtered = sub_df_b[sub_df_b.apply(on_topic_filter, axis=1)]
# save_df(sub_df_b_filtered, "SubBFiltered")