In [None]:
import re
import math
import time
import requests
import pandas as pd
from transformers import pipeline
%run "./DataFrame.ipynb"

In [None]:
def clean(text):
    """Normalize a piece of free text.

    Steps, in order:
      1. remove anything that starts with ``http`` up to the next whitespace,
      2. re-join words that were hyphenated across a line break (``-\\n``),
      3. collapse every run of whitespace into a single space,
      4. remove whitespace that precedes ``?``, ``.``, ``!`` or ``,``,
      5. strip leading/trailing whitespace.

    Args:
        text: the string to normalize.

    Returns:
        The normalized string.
    """
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'-\n', '', text)
    # Raw string: "\s" in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"\s+([?.!,])", r"\1", text)
    return text.strip()

In [None]:
class Papers(DataFrame):
    """A DataFrame wrapper whose columns are fixed to the paper schema."""

    # Column name -> type, handed to the base-class constructor as `columns`.
    COLUMNS = dict.fromkeys(("DOI", "Title", "Abstract", "ID"), str)

    def __init__(self):
        super().__init__(columns=Papers.COLUMNS)

In [None]:
class Source(Papers):
    """Base class for a paper source.

    Holds the shared search terms and the clean/filter steps that operate on
    ``self.df``. Subclasses are expected to implement ``search``, ``expand``
    and ``identify``, which raise ``NotImplementedError`` here.
    """

    # Zero-shot classifier used by on_topic(). The pipeline is constructed
    # while this class body executes, i.e. as soon as the cell is run.
    CLF_MDL = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
    CLF = pipeline("zero-shot-classification", model=CLF_MDL, device="cpu")

    # Groups of search terms; subclasses turn these into an API query.
    SEARCH_TERMS = [
        [
            "trait",
            "phenotype"
        ],
        [
            "trait-mediated",
            "higher-order interaction",
            "polymorphism",
            "interaction modification",
            "indirect effect"
        ],
        [
            "apparent competition",
            "resource competition",
            "intransitive competition",
            "mutual competition",
            "keystone predation",
            "intraguild predation",
            "trophic chain",
            "competition chain"
        ]
    ]

    def __init__(self, name):
        """Create an empty source; ``name`` is stored on ``self.name``."""
        super().__init__()
        self.name = name

    def on_topic(self, text):
        """Score ``text`` against each topic with the zero-shot classifier.

        Returns:
            ``(good, scores)`` where ``scores`` maps each topic to the first
            score the classifier returned for it, and ``good`` is ``False``
            as soon as any score is below that topic's threshold.
        """
        topics = [("ecology", 0.9)]
        scores = {}

        good = True

        for topic, threshold in topics:
            output = Source.CLF(text, [topic])

            score = output["scores"][0]
            scores[topic] = score

            if score < threshold:
                good = False

        return (good, scores)

    def clean(self):
        """Drop rows with a duplicate DOI, drop rows containing NAs, and reset the index (all in place)."""
        print("Cleaning...")

        self.df.drop_duplicates(subset=["DOI"], inplace=True)
        print(f"Dropped Duplicates, Number Papers: {self.df.shape[0]}")

        self.df.dropna(inplace=True)
        print(f"Dropped NAs, Number Papers: {self.df.shape[0]}")

        self.df.reset_index(drop=True, inplace=True)

    def save_filter_results(self, name):
        """Write the ecology / not-ecology partitions of ``self.df``.

        The partitions are written as ``"{name}-Ecology"`` and
        ``"{name}-NotEcology"``.

        Returns:
            ``(df_eco, df_not_eco)``, the two ``DataFrame`` wrappers.
        """
        # Ecology
        df_eco = DataFrame(df=self.df.loc[self.df["Is Ecology"] == True])
        df_eco.write(f"{name}-Ecology", reset_index=True)

        # Not Ecology
        df_not_eco = DataFrame(df=self.df.loc[self.df["Is Ecology"] == False])
        df_not_eco.write(f"{name}-NotEcology", reset_index=True)

        return (df_eco, df_not_eco)

    def filter(self, name, auto_save=500):
        """Classify every abstract and keep only the ecology papers.

        Adds the "Is Ecology" and "Ecology Score" columns if they are absent.
        Rows whose score is already set are not re-classified, so an
        interrupted run can be resumed. Rows without a usable abstract are
        marked not-ecology with a score of -1. At the end ``self.df`` is
        replaced by the ecology partition.

        Args:
            name: prefix passed to ``save_filter_results``.
            auto_save: save intermediate results every ``auto_save`` newly
                classified rows; a falsy value (0/None) disables the
                periodic save.
        """
        print("Filtering...")

        if "Is Ecology" not in self.df.columns:
            self.add_col("Is Ecology", False, dtype=bool)

        if "Ecology Score" not in self.df.columns:
            self.add_col("Ecology Score", math.nan, dtype=float)

        num_filtered = 0
        total = self.df.shape[0]

        for i, idx in enumerate(list(self.df.index), start=1):
            print(f"{i}/{total}")

            # Abstract
            # Anything that is not a string (None, NaN, pd.NA, ...) counts as
            # a missing abstract; only the first 3000 characters are scored.
            abstract = self.df.loc[idx, "Abstract"]
            if not isinstance(abstract, str):
                abstract = ""

            abstract = abstract[:3000]

            # Deters Redundancy
            unfiltered = pd.isna(self.df.loc[idx, "Ecology Score"])

            if not abstract:
                self.df.loc[idx, "Is Ecology"] = False
                self.df.loc[idx, "Ecology Score"] = -1
            elif unfiltered:
                is_ecology, scores = self.on_topic(abstract)
                self.df.loc[idx, "Is Ecology"] = is_ecology
                self.df.loc[idx, "Ecology Score"] = scores["ecology"]
                num_filtered += 1

                # Guard against auto_save == 0 (modulo by zero).
                if auto_save and num_filtered % auto_save == 0:
                    self.save_filter_results(name)

        df_eco, df_not_eco = self.save_filter_results(name)

        print(f"Number Ecology Papers: {df_eco.df.shape[0]}")
        print(f"Number Not-Ecology Papers: {df_not_eco.df.shape[0]}")

        self.df = df_eco.df

    def search(self):
        """Populate ``self.df`` from the source; implemented by subclasses."""
        raise NotImplementedError

    def expand(self, max_hops=3, max_citations=3):
        """Grow the paper set by walking the citation tree; implemented by subclasses.

        Args:
            max_hops: The maximum level we can reach in the citation tree.
            max_citations: The maximum number of citations we can visit for
                each work.
        """
        raise NotImplementedError

    def identify(self):
        """Run the whole pipeline for this source; implemented by subclasses."""
        raise NotImplementedError

In [None]:
class OpenAlex(Source):
    """Paper source backed by the OpenAlex ``works`` API."""

    # Prefix OpenAlex puts in front of work IDs when it returns them as URLs.
    ID_PREFIX = "https://openalex.org/"

    # Seconds to wait for a response before the request counts as failed.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        super().__init__("OpenAlex")

    def _get_json(self, url):
        """GET ``url`` and return the decoded JSON body.

        Raises on network errors, timeouts and undecodable bodies. HTTP 429
        and 5xx responses also raise so that callers treat them as failures
        (and back off) instead of parsing an error payload; every other
        response is decoded as-is.
        """
        response = requests.get(url, timeout=OpenAlex.REQUEST_TIMEOUT)
        if response.status_code == 429 or response.status_code >= 500:
            response.raise_for_status()
        return response.json()

    def extract_id(self, url):
        """Return the bare work ID from an OpenAlex ID URL.

        An empty/None ``url`` yields ``""``. A value that does not start with
        the OpenAlex prefix is returned unchanged rather than being sliced.
        """
        if not url:
            return ""

        if url.startswith(OpenAlex.ID_PREFIX):
            return url[len(OpenAlex.ID_PREFIX):]
        return url

    def revert_abstract(self, inverted_abstract):
        """Rebuild the abstract text from an inverted index.

        Args:
            inverted_abstract: mapping ``word -> list of word positions``;
                empty or None yields ``""``.

        Returns:
            The words joined by single spaces in position order.
        """
        if not inverted_abstract:
            return ""

        # One pass to place every word at its position(s), then one sort,
        # instead of rescanning the whole index for each position.
        words_by_position = {}
        for word, positions in inverted_abstract.items():
            for position in positions:
                words_by_position[position] = word

        return " ".join(words_by_position[p] for p in sorted(words_by_position))

    def get_search_filter(self, search_terms_2D, from_date="1800-01-01", to_date="2030-01-01"):
        """Build the value of the ``filter`` query parameter.

        Each inner list of ``search_terms_2D`` becomes one
        ``title_and_abstract.search:`` clause whose terms are joined with
        ``|`` (spaces encoded as ``%20``); clauses are joined with ``,``.
        NOTE(review): presumably ``|`` is OR and ``,`` is AND in the OpenAlex
        filter syntax - confirm against the API documentation.

        The abstract, date-range and ``type:article`` restrictions are
        appended afterwards.
        """
        search_terms_1D = [
            "|".join(search_terms).replace(" ", "%20")
            for search_terms in search_terms_2D
        ]
        search_filter = ",".join([f"title_and_abstract.search:{search}" for search in search_terms_1D])

        search_filter += ',has_abstract:true'
        search_filter += f',to_publication_date:{to_date}'
        search_filter += f',from_publication_date:{from_date}'
        search_filter += ',type:article'

        return search_filter

    def search(self, search_filter, max_num_papers=math.inf):
        """Page through the works matching ``search_filter`` and store them.

        Stops when ``max_num_papers`` rows are stored, a page has no results,
        or ``max_c`` consecutive requests have failed.
        """
        print("Search...")

        # Exponential Backoff
        # c is the number of consecutive adverse events. An adverse
        # event is a thrown exception. If c >= max_c, we exit the
        # process. c is reset after every successful request so that a
        # single transient error does not slow down all later pages.
        c = 0
        max_c = 6

        URL = f"https://api.openalex.org/works?filter={search_filter}"
        print(URL)

        page = 1
        while self.df.shape[0] < max_num_papers:
            if c > 0:
                t = 2 ** c
                print(f"Sleeping for {t}s")
                time.sleep(t)

            # Requesting the API
            url = f"{URL}&page={page}"
            print(url)

            try:
                out = self._get_json(url)
            except Exception as e:
                print(e)

                c += 1
                if c >= max_c:
                    # Things aren't working out, and we
                    # should exit the process.
                    break
                # We can still try again.
                continue

            c = 0

            # If the output is null or there are no
            # results, we exit the search.
            if not out or not out.get("results"):
                print("No Results")
                break

            # This prints the number of papers
            # the API identified.
            if page == 1:
                count = (out.get("meta") or {}).get("count")
                print(f"Count: {count}")

            # Storing the Results
            for paper in out["results"]:
                if self.df.shape[0] >= max_num_papers:
                    break

                # The abstract needs to be converted
                # into readable text (and cleaned).
                abstract = paper.get("abstract_inverted_index", "")
                abstract = self.revert_abstract(abstract)
                abstract = clean(abstract)

                # The ID is returned as an URL. We take
                # the minimal portion of that ID.
                id_ = paper.get("id", "")
                id_ = self.extract_id(id_)

                self.add_row({
                    "DOI": paper.get("doi", ""),
                    "Title": paper.get("title", ""),
                    "Abstract": abstract,
                    "ID": id_
                })

            page += 1

        print(f"Retrieved {self.size()} Paper(s)")

    def expand(self, max_hops=3, max_citations=3):
        """Follow ``referenced_works`` from the stored papers and store what is found.

        Args:
            max_hops: The maximum level we can reach in the citation tree.
            max_citations: The maximum number of new works queued per
                visited work.
        """
        # This list contains the IDs that we have
        # already seen and thus cannot visit. The set
        # mirrors it for constant-time membership tests;
        # the list keeps discovery order for storing.
        seen = self.df["ID"].tolist()
        seen_lookup = set(seen)

        # This stack contains the IDs that we need
        # to visit.
        stack = [(start_id, 0) for start_id in seen]

        # These two variables are used as a progress
        # indicator. We monitor the number of start
        # IDs that have been searched.
        start_seen = 0
        start_size = len(seen)

        while stack:
            curr_id, curr_hops = stack.pop()

            # Print Progress
            # The starting IDs have a hop value
            # of 0.
            if curr_hops == 0:
                start_seen += 1
                print(f"({start_seen}/{start_size}) Searching...")

            # We have reached the maximum level
            # of the citation tree.
            if curr_hops >= max_hops:
                continue

            try:
                url = f"https://api.openalex.org/works/{curr_id}?select=referenced_works"
                print(url)
                out = self._get_json(url)
            except Exception as e:
                # We can continue as we have already popped the
                # troublesome ID from the stack.
                print(e)
                continue

            if not out or not out.get('referenced_works'):
                continue

            num_citations = 0
            for paper_id in out['referenced_works']:
                if num_citations >= max_citations:
                    break

                paper_id = self.extract_id(paper_id)
                print(f"\t{paper_id}")

                if paper_id in seen_lookup:
                    continue

                seen.append(paper_id)
                seen_lookup.add(paper_id)
                stack.append((paper_id, curr_hops + 1))
                num_citations += 1

        # Storing Papers
        # NOTE(review): `seen` also contains the starting IDs, which are
        # already rows of self.df, so they are fetched and added again here.
        # Whether that duplicates rows depends on add_row - confirm.
        print("Storing Papers...")

        FIELDS = "id,doi,title,abstract_inverted_index"

        for i, paper_id in enumerate(seen):
            print(f"{i+1}/{len(seen)}")

            url = f"https://api.openalex.org/works/{paper_id}?select={FIELDS}"
            print(url)

            try:
                out = self._get_json(url)
            except Exception as e:
                print(e)
                continue

            # Skip empty bodies and error payloads (which carry no work id)
            # instead of failing on a missing key.
            if not out or not out.get("id"):
                continue

            # Process Abstract
            abstract = out.get("abstract_inverted_index", "")
            abstract = self.revert_abstract(abstract)
            abstract = clean(abstract)

            self.add_row({
                "DOI": out.get("doi"),
                "Title": out.get("title"),
                "Abstract": abstract,
                "ID": paper_id
            })

        self.write(f"{self.name} Expanded", reset_index=True)
        print(f"Expanded, Number of Papers: {self.size()}")

    def identify(self):
        """Run search -> clean -> filter -> expand -> filter for OpenAlex."""
        # NOTE: This likely will not be called as
        # running all of the functions below without
        # running out of memory (or bug) is unlikely.

        # 1. Search
        # This class builds its filter with get_search_filter.
        search_filter = self.get_search_filter(Source.SEARCH_TERMS)
        self.search(search_filter)
        self.clean()
        self.filter(f"{self.name}")

        # 2. Expand
        self.expand()
        self.filter(f"{self.name} Expanded")

        # self.write(self.name, reset_index=True)

In [None]:
class SemanticScholar(Source):
    """Paper source backed by the Semantic Scholar Graph API."""

    # Seconds to wait for a response before the request counts as failed.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        super().__init__("Semantic Scholar")

    def _get_json(self, url):
        """GET ``url`` and return the decoded JSON body.

        Raises on network errors, timeouts and undecodable bodies. HTTP 429
        and 5xx responses also raise so that the callers' exponential
        backoff actually kicks in when the API rate-limits us; every other
        response is decoded as-is.
        """
        response = requests.get(url, timeout=SemanticScholar.REQUEST_TIMEOUT)
        if response.status_code == 429 or response.status_code >= 500:
            response.raise_for_status()
        return response.json()

    def _doi_url(self, paper):
        """Return the paper's DOI as a ``https://doi.org/`` URL.

        If there is no DOI, we do not convert it into an URL: the missing
        value is returned as-is (``""`` when the key is absent).
        """
        external_ids = paper.get("externalIds") or {}
        doi = external_ids.get("DOI", "")
        if doi:
            doi = f'https://doi.org/{doi}'
        return doi

    def make_search_filter(self):
        """Return the query string (everything after ``?``) for the bulk search."""
        # NOTE: No hate, but this source is questionable.
        # Maybe it's me. I did not create a more adaptable
        # function as the API is finicky.
        # Hence, it's a literal.
        a = "(trait %7C phenotype)"
        b = "(higher order interaction %7C interaction modification %7C indirect effect %7C trait mediated %7C polymorphism)"
        c = "(competition chain %7C trophic chain %7C intraguild predation %7C keystone predation %7C apparent competition %7C resource competition %7C intransitive competition %7C mutual competition)"
        fields = "title,abstract,externalIds"
        types = "CaseReport,Book,BookSection,JournalArticle,Study"
        return f"query={a} {b} {c}&fields={fields}&publicationTypes={types}"

    def search(self, search_filter, max_num_papers=math.inf):
        """Run the bulk search and store the returned papers.

        Follows the continuation ``token`` until it is absent, a response
        has no data, ``max_num_papers`` rows are stored, or ``max_c``
        consecutive requests have failed.
        """
        print("Search...")

        # Exponential Backoff
        # c is the number of consecutive adverse events. An adverse
        # event is a thrown exception. If c >= max_c, we exit the
        # process. c is reset after every successful request.
        c = 0
        max_c = 6

        URL = f"https://api.semanticscholar.org/graph/v1/paper/search/bulk?{search_filter}"
        print(URL)

        token = ""
        while self.size() < max_num_papers:
            # Sleep
            if c > 0:
                t = 2 ** c
                print(f"Sleeping for {t}s")
                time.sleep(t)

            # Add Token
            url = URL
            if token:
                url += f"&token={token}"

            # Fetch Works
            try:
                out = self._get_json(url)
            except Exception as e:
                print(e)

                c += 1
                if c >= max_c:
                    break
                continue

            c = 0

            # If there are no results, we end
            # the search.
            if not out or not out.get("data"):
                print("No Results")
                break

            # Print Count
            if not token:
                total = out.get("total")
                print(f"Count: {total}")

            for paper in out["data"]:
                if self.size() >= max_num_papers:
                    break

                self.add_row({
                    "DOI": self._doi_url(paper),
                    "Title": paper.get("title"),
                    "Abstract": paper.get("abstract"),
                    "ID": paper.get("paperId")
                })

            token = out.get("token")
            if not token:
                break

        print(f"Retrieved {self.size()} Paper(s)")

    def expand(self, max_hops=3, max_citations=3):
        """Follow the ``citations`` endpoint from the stored papers and store what is found.

        Args:
            max_hops: The maximum level we can reach in the citation tree.
            max_citations: The maximum number of new works queued per
                visited work.
        """
        # `seen` keeps discovery order for the storing step; the set mirrors
        # it for constant-time membership tests.
        seen = self.df["ID"].tolist()
        seen_lookup = set(seen)
        stack = [(start_id, 0) for start_id in seen]

        start_seen = 0
        start_size = len(seen)

        while stack:
            curr_id, curr_hops = stack.pop()

            if curr_hops == 0:
                start_seen += 1
                print(f"{start_seen}/{start_size} Searching...")

            if curr_hops >= max_hops:
                continue

            # Exponential Backoff
            # Ideally, I wouldn't need this, but
            # Semantic Scholar has strict request rates.
            c = 0
            max_c = 6
            out = None
            url = f"https://api.semanticscholar.org/graph/v1/paper/{curr_id}/citations"

            while c < max_c:
                # Sleep
                if c > 0:
                    t = 2 ** c
                    print(f"Sleeping for {t}s")
                    time.sleep(t)

                try:
                    out = self._get_json(url)
                    # Success: leave the retry loop. Without this break the
                    # same URL would be requested over and over.
                    break
                except Exception as e:
                    print(e)
                    c += 1

            # Either every attempt failed (out is still None) or the
            # response carries no citations: move on to the next ID.
            if not out or not out.get("data"):
                continue

            num_citations = 0
            for citation in out['data']:
                if num_citations >= max_citations:
                    break

                paper_id = (citation.get('citingPaper') or {}).get('paperId')

                # Entries without an ID cannot be fetched later.
                if not paper_id or paper_id in seen_lookup:
                    continue

                seen.append(paper_id)
                seen_lookup.add(paper_id)
                stack.append((paper_id, curr_hops + 1))
                num_citations += 1

        # Storing Papers
        # NOTE(review): `seen` also contains the starting IDs, which are
        # already rows of self.df, so they are fetched and added again here.
        # Whether that duplicates rows depends on add_row - confirm.
        print("Storing Papers...")

        # Exponential Backoff
        # c counts consecutive failures; a failed paper is skipped (not
        # retried) and we give up entirely after max_c failures in a row.
        c = 0
        max_c = 6

        FIELDS = "title,externalIds,abstract"

        for i, paper_id in enumerate(seen):
            if c > 0:
                t = 2 ** c
                print(f"Sleeping for {t}s")
                time.sleep(t)

            print(f"{i+1}/{len(seen)}")

            try:
                url = f"https://api.semanticscholar.org/graph/v1/paper/{paper_id}?fields={FIELDS}"
                out = self._get_json(url)

                # Empty bodies and error payloads (no paperId) are skipped.
                if not out or not out.get("paperId"):
                    print(f"No Output")
                    continue

                self.add_row({
                    # The DOI comes from the fetched record `out`.
                    "DOI": self._doi_url(out),
                    "Title": out.get("title"),
                    "Abstract": out.get("abstract"),
                    "ID": out["paperId"]
                })
                c = 0
            except Exception as e:
                print(e)
                c += 1
                if c >= max_c:
                    break
                continue

        self.write(f"{self.name} Expanded", reset_index=True)
        print(f"Expanded, Number of Papers: {self.size()}")

    def identify(self):
        """Run search -> clean -> filter -> expand -> filter for Semantic Scholar."""
        # 1. Search
        search_filter = self.make_search_filter()
        self.search(search_filter)
        self.clean()
        self.filter(name=f"{self.name}")

        # 2. Expand
        self.expand()
        self.filter(name=f"{self.name} Expanded")

        # self.write(self.name, reset_index=True)

    def run(self):
        """Alias of ``identify`` kept for existing callers."""
        self.identify()

In [None]:
# Completing Interrupted Work
# Apparently, the run did finish. When I was running it yesterday morning,
# the page ran out of memory. I never closed the page, and it seemed like
# I would have to run it again, but that turned out not to be necessary.
# oa = OpenAlex()
# oa.df = pd.read_csv("OpenAlex Expanded.csv")
# oa.filter(name=f"{oa.name} Expanded")
# oa.write(oa.name, reset_index=True)

In [None]:
# sc = SemanticScholar()
# sc.expand()
# sc.df = pd.read_csv("Semantic Scholar Expanded-5.csv")
# sc.filter(name=f"{sc.name} Expanded")

In [None]:
# Combining the Two Sources
# Read the ecology partition written for each source, then stack the two
# frames under a fresh 0..n-1 index.
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "OpenAlex Expanded-Ecology.csv",
        "Semantic Scholar Expanded-Ecology-1.csv",
    )
)
df = pd.concat((df1, df2), ignore_index=True, sort=False)

In [None]:
# Wrap the combined frame in the project's DataFrame helper and write it
# out under the name "Papers".
df_wrapper = DataFrame(df=df)
df_wrapper.write("Papers", reset_index=True)