In [1]:
import re
import math
import requests
import pandas as pd
from transformers import pipeline
%run "./DataFrame.ipynb"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def clean(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'-\n', '', text)
    text = re.sub("\s+", " ", text)
    text = re.sub(r"\s+([?.!,])", r"\1", text)
    text = text.strip()
    return text

In [3]:
class Papers(DataFrame):
    COLUMNS = {"DOI": str, "Title": str, "Abstract": str, "ID": str}

    def __init__(self):
        super(Papers, self).__init__(columns=Papers.COLUMNS)

In [4]:
class Source(Papers):
    CLF_MDL = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
    CLF = pipeline("zero-shot-classification", model=CLF_MDL, device="cpu")

    SEARCH_TERMS = [
        [
            "trait",
            "phenotype"
        ],
        [
            "trait-mediated",
            "higher-order interaction",
            "polymorphism",
            "interaction modification",
            "indirect effect"
        ],
        [
            "apparent competition",
            "resource competition",
            "intransitive competition",
            "mutual competition",
            "keystone predation",
            "intraguild predation",
            "trophic chain",
            "competition chain"
        ]
    ]

    def __init__(self, name):
        super().__init__()
        self.name = name
        
    def on_topic(self, text):
        topics = [("ecology", 0.9)]
        scores = {}

        is_on_topic = True
    
        for topic, threshold in topics:
            out = Source.CLF(text, [topic])
            score = out["scores"][0]
            scores[topic] = score
            
            if score < threshold:
                is_on_topic = False
                
        return (is_on_topic, scores)

    def find(self):
        return None

    def clean(self):
        self.df.drop_duplicates(subset=["DOI"], inplace=True)
        print(f"Dropped Duplicates, Number Papers: {self.df.shape[0]}")
        
        self.df.dropna(inplace=True)
        print(f"Dropped NAs, Number Papers: {self.df.shape[0]}")
    
    def filter(self):
        if "Is Ecology" not in self.df.columns:
            self.add_col("Is Ecology", False, bool)

        if "Ecology Score" not in self.df.columns:
            self.add_col("Ecology Score", 0)
        
        for idx, row in self.df.iterrows():
            out = self.on_topic(self.df.loc[idx].Abstract[:3000])
            self.df.loc[idx, "Is Ecology"] = out[0]
            self.df.loc[idx, "Ecology Score"] = out[1]["ecology"]

        df_eco = DataFrame(df=self.df.loc[self.df["Is Ecology"] == True])
        df_eco.write(f"{self.name}-Ecology", reset_index=True)
        
        df_not_eco = self.df.loc[self.df["Is Ecology"] == False]
        df_not_eco.write(f"{self.name}-NotEcology", reset_index=True)

        self.df = df_eco.df
        print(f"Number Ecological Papers: {self.df.shape[0]}")
        
    def expand(self):
        return None

    def run(self):
        return None




Device set to use cpu


In [5]:
class OpenAlex(Source):
    def __init__(self):
        super().__init__("OpenAlex")

    def extract_id(self, url):
        url_prefix = "https://openalex.org/"
        url_prefix_len = len(url_prefix)
        return url[url_prefix_len:]
  
    def revert_abstract(self, inverted_abstract):
        if not inverted_abstract:
            return ""

        i = 0
        abstract = ""
        
        while True:
            index_found = False
            for k, v in inverted_abstract.items():
                if i in v:
                    abstract += " "
                    abstract += k
                    i += 1
                    index_found = True
            if not index_found:
                break
        
        return abstract

    def make_search_filter(self, search_terms_2D, from_date="1800-01-01", to_date="2030-01-01"):
        search_terms_1D = []
        
        for search_terms in search_terms_2D:
            search_terms_1D.append("|".join(search_terms).replace(" ", "%20"))
        search_filter = ",".join([f"title_and_abstract.search:{search}" for search in search_terms_1D])

        search_filter += f',has_abstract:true'
        search_filter += f',to_publication_date:{to_date}'
        search_filter += f',from_publication_date:{from_date}'
        search_filter += f',type:article'

        return search_filter

    def find(self, search_filter, max_num_papers=math.inf):
        URL = f"https://api.openalex.org/works?filter={search_filter}"

        page = 1
        while self.size() < max_num_papers:
            url = f"{URL}&page={page}"
            print(url)
            out = requests.get(url).json()

            if not out or not out.get("results", []):
                print("No Results")
                break

            if page == 1:
                print(f"Count: {out['meta']['count']}")

            i = 0
            papers = out["results"]
            num_papers = len(papers)
            print('a')
            while i < num_papers and self.size() < max_num_papers:
                paper = papers[i]
                doi = paper.get("doi", "")
                
                if not doi:
                    i += 1
                    continue
                
                print('b')
                abstract = paper["abstract_inverted_index"]
                abstract = clean(self.revert_abstract(abstract))
                print('c')
                self.add_row({
                    "DOI": doi, 
                    "Title": paper["title"],
                    "Abstract": abstract, 
                    "ID": self.extract_id(paper["id"])
                })
                print('d')
                i += 1

            print(self.size(), end=', ')          
            page += 1

        print(f"Retrieved {self.size()} Paper(s)")

    def expand(self, max_hops=2, max_citations=math.inf):
        seen = df["ID"].tolist()
        stack = [(id_, 0) for id_ in seen]

        while stack:
            curr_id, curr_hops = stack.pop()
            
            if curr_hops >= max_hops:
                continue
            
            out = requests.get(f"https://api.openalex.org/works/{curr_id}?select=referenced_works").json()
        
            if not out or not out.get('referenced_works', []):
                continue

            num_citations = 0
            for citation_url in out['referenced_works']:
                if num_citations >= max_citations:
                    break
    
                citation_id = self.extract_id(citation_url)
                
                if citation_id in seen:
                    continue

                seen.append(citation_id)
                stack.append((citation_id, curr_hops + 1))
                num_citations += 1

        fields = "id,doi,title,abstract_inverted_index"
        for id_ in seen:
            try:
                out = requests.get(f"https://api.openalex.org/works/{id_}?select={fields}").json()
                if not out:
                    continue
                
                self.add_row({
                    "DOI": out["doi"], 
                    "Title": out["title"],
                    "Abstract": clean(self.revert_abstract(out["abstract_inverted_index"])), 
                    "ID": self.extract_id(out["id"])
                })
            except Exception as e:
                print(e)
                continue
        
        print(f"Expanded, Number of Papers: {self.size()}")

    def run(self):
        search_filter = self.make_search_filter(Source.SEARCH_TERMS)
        print(1)
        self.find(search_filter)
        print(2)
        self.clean()
        print(3)
        self.filter()
        print(4)
        self.expand()
        print(5)
        self.filter()
        print(6)
        self.write(self.name, reset_index=True)
        print(7)

In [6]:
class SemanticScholar(Source):
    def __init__(self):
        super().__init__(self, "Semantic Scholar")

    def make_search_filter(self):
        return "query=(trait %7C phenotype) (higher order interaction %7C interaction modification %7C indirect effect %7C trait mediated %7C polymorphism) (competition chain %7C trophic chain %7C intraguild predation %7C keystone predation %7C apparent competition %7C resource competition %7C intransitive competition %7C mutual competition)&fields=title,abstract,externalIds&publicationTypes=CaseReport,Book,BookSection,JournalArticle,Study"

    def find(self, search_filter, max_num_papers=math.inf):
        URL = f"https://api.semanticscholar.org/graph/v1/paper/search/bulk?{search_filter}"
        print(URL)

        token = ""
        while self.size() < max_num_papers:
            url = URL
            if token:
                url += f"&token={token}"

            out = requests.get(url).json()

            if not out or not "data" in out:
                print("No Results")
                break

            if not token:
                print(f"Count: {out['total']}")

            i = 0
            papers = out["data"]
            num_papers = len(data)

            while i < num_papers and self.size() < max_num_papers:    
                paper = papers[i]
                doi = paper["externalIds"].get("DOI", "")
                
                if doi:
                    self.add_row({
                        "DOI": f'https://doi.org/{paper_doi}', 
                        "Title": paper["title"], 
                        "Abstract": paper.get("abstract"), 
                        "ID": paper["paperId"]
                    })
                    
                i += 1

            print(self.size(), end=", ")
            
            token = out["token"]
            if not token:
                break

        print(f"Retrieved {self.size()} Paper(s)")

    def expand(self, max_hops=5, max_citations=math.inf):
        seen = df.loc[df["ID"]].tolist()
        stack = [(id_, 0) for id_ in seen]
        
        while stack:
            curr_id, curr_hops = stack.pop()
            
            if curr_hops >= max_hops:
                continue
            
            out = requests.get(f"https://api.semanticscholar.org/graph/v1/paper/{curr_id}/citations").json()
        
            if not out or not out.get('data', []):
                continue
        
            num_citations = 0
            for citation in out['data']:
                if num_citations >= max_citations:
                    break
                
                citation_id = citation['citingPaper']['paperId']
                
                if citation_id in seen:
                    continue
                
                seen.append(citation_id)
                stack.append((citation_id, curr_hops + 1))
                num_citations += 1

        fields = "title,externalIds,abstract"
        for id_ in seen:
            try:
                out = requests.get(f"https://api.semanticscholar.org/graph/v1/paper/{id_}?fields={fields}").json()
                if not out:
                    continue
                
                self.add_row({
                    "DOI": out["externalIds"]["DOI"], 
                    "Title": out["title"],
                    "Abstract": out["abstract"], 
                    "ID": out["paperId"]
                })
            except Exception as e:
                print(e)
                continue
        
        print(f"Expanded, Number of Papers: {self.size()}")

    def run(self):
        search_filter = self.make_search_filter()
        self.find(search_filter)
        self.clean()
        self.filter()
        self.expand()
        self.filter()
        self.write(self.name, reset_index=True)

In [7]:
OpenAlex().run()
SemanticScholar().run()

Empty DataFrame
Columns: [DOI, Title, Abstract, ID]
Index: []
1
https://api.openalex.org/works?filter=title_and_abstract.search:trait|phenotype,title_and_abstract.search:trait-mediated|higher-order%20interaction|polymorphism|interaction%20modification|indirect%20effect,title_and_abstract.search:apparent%20competition|resource%20competition|intransitive%20competition|mutual%20competition|keystone%20predation|intraguild%20predation|trophic%20chain|competition%20chain,has_abstract:true,to_publication_date:2030-01-01,from_publication_date:1800-01-01,type:article&page=1
Count: 664
a
b
c
d
b
c
d
b
c
d
b
c
d
b
c
d
b
c
d
b
c
d
b
c
d
b
c
d
b
c
d
b
c
d
b
c
d
b
c
d
b
c
d
b
c
d
b
c
d
b
c
d
b
c
d
b
c
d
b
c
d
b
c
d
b
c
d
b
c
d
b
c
d
b
c
d
25, https://api.openalex.org/works?filter=title_and_abstract.search:trait|phenotype,title_and_abstract.search:trait-mediated|higher-order%20interaction|polymorphism|interaction%20modification|indirect%20effect,title_and_abstract.search:apparent%20competition|resour

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


No Results
Retrieved 629 Paper(s)
2
Dropped Duplicates, Number Papers: 628
Dropped NAs, Number Papers: 628
3


  self.df.loc[idx, "Ecology Score"] = out[1]["ecology"]


ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().