In [1]:
import re
import math
import json
import requests
import pandas as pd

In [2]:
search_terms = [
    [
        "trait",
        "phenotype"
    ],
    [
        "trait-mediated",
        "higher-order interaction",
        "polymorphism",
        "interaction modification",
        "indirect effect"
    ],
    [
        "apparent competition",
        "resource competition",
        "intransitive competition",
        "mutual competition",
        "keystone predation",
        "intraguild predation",
        "trophic chain",
        "competition chain"
    ]
]

In [75]:
def clean(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'-\n', '', text)
    text = re.sub("\s+", " ", text)
    text = re.sub(r"\s+([?.!,])", r"\1", text)
    text = text.strip()
    return text

def store_df(df, name):
    df.reset_index(drop=True, inplace=True)
    df.to_excel(f'{name}.xlsx', header=True, index=False)
    df.to_csv(f'{name}.csv', encoding='utf-8', index=False)

In [None]:
class Papers():
    COLUMNS = ["DOI", "Title", "Abstract", "LID", "UID", "Score"]
    
    def __init__(self, source):
        self.source = source
        self.data = []
        self.df = None

    def size(self):
        return len(self.data)

    def add(self, *, doi, title, abstract, lid):
        self.data.append({
            "DOI": doi,
            "Title": title,
            "Abstract": abstract,
            "LID": lid
        })
        return len(self.data)

    def update_df(self):
        df = pd.DataFrame([], columns=Papers.COLUMNS)
        for paper in self.data:
            rw = pd.DataFrame([[paper["DOI"], paper["Title"], paper["Abstract"], paper["LID"], None, 0]],  columns=Papers.COLUMNS)
            df = pd.concat([df, rw])
        self.df = df
    
    def store_df(self, name):
        store_df(self.df, name)

In [None]:
class OpenAlex(Papers):
    def __init__(self):
        super().__init__("OA")


    def revert_abstract(self, inverted_abstract):
        if not inverted_abstract:
            return ""

        i = 0
        abstract = ""
        
        while True:
            index_found = False
            for k, v in inverted_abstract.items():
                if i in v:
                    abstract += " "
                    abstract += k
                    i += 1
                    index_found = True
            if not index_found:
                break
        
        return abstract

    
    def create_search_filter(self, search_terms_2D, from_date="1800-01-01", to_date="2030-01-01"):
        search_terms_1D = []

        # Field
        for search_terms in search_terms_2D:
            search_terms_1D.append("|".join(search_terms).replace(" ", "%20"))
        search_filter = ",".join([f"title_and_abstract.search:{search}" for search in search_terms_1D])

        search_filter += f',has_abstract:true'
        search_filter += f',to_publication_date:{to_date}'
        search_filter += f',from_publication_date:{from_date}'
        search_filter += f',type:article'

        return search_filter


    def search_papers(self, search_filter, max_num_papers=math.inf):
        BASE_URL = f"https://api.openalex.org/works?filter={search_filter}"
        print(BASE_URL)

        page = 1

        while self.size() < max_num_papers:
            url = f"{BASE_URL}&page={page}"
            out = requests.get(url).json()

            if not out or not "results" in out or not out["results"]:
                print("No Results")
                break

            if page == 1:
                print(f"Count: {out['meta']['count']}")

            i = 0
            results = out["results"]
            page_size = len(results)
            
            while i < page_size and self.size() < max_num_papers:
                paper = results[i]
                paper_doi = paper.get("doi", "")
                
                if paper_doi:
                    paper_abstract = clean(self.revert_abstract(paper["abstract_inverted_index"]))
                    self.add(
                        doi=paper_doi, 
                        title=paper["title"], 
                        abstract=paper_abstract, 
                        lid=paper["id"]
                    )
                
                i += 1

            print(self.size())
            
            page += 1

        print(f"Retrieved {self.size()} Paper(s)")

    
    def run(self):
        search_filter = self.create_search_filter(search_terms)
        self.search_papers(search_filter)
        self.update_df()
        self.store_df("OA")

In [98]:
class SemanticScholar(Papers):
    def __init__(self):
        super().__init__("OA")


    def create_search_filter(self, search_terms_2D, from_year="1800", to_year="2030"):
        # Something is fishy. I'm not sure what it is. So, I will be hardcoding in
        # the filter that worked moderately well.
        return "query=(trait %7C phenotype) (higher order interaction %7C interaction modification %7C indirect effect %7C trait mediated %7C polymorphism) (competition chain %7C trophic chain %7C intraguild predation %7C keystone predation %7C apparent competition %7C resource competition %7C intransitive competition %7C mutual competition)&fields=title,abstract,externalIds&publicationTypes=CaseReport,Book,BookSection,JournalArticle,Study"
        
        # search_terms_1D = []
        # for search_terms in search_terms_2D:
        #     search_terms_1D.append("(" + "%7C".join([f"\"{term}*\"" for term in search_terms]).replace(" ", "%20") + ")")
        
        # search_filter = "&query=" + "+".join(search_terms_1D)
        # search_filter += f'&publicationTypes=CaseReport,Book,BookSection,JournalArticle,Study'
        # search_filter += f'&year={from_year}-{to_year}'
        # search_filter += f'&fields=abstract,title,externalIds'

        # return search_filter

    
    def search_papers(self, search_filter, max_num_papers=math.inf):
        BASE_URL = f"https://api.semanticscholar.org/graph/v1/paper/search/bulk?{search_filter}"
        print(BASE_URL)

        token = ""
        while self.size() < max_num_papers:
            url = BASE_URL
            if token:
                url += f"&token={token}"

            out = requests.get(url).json()

            if not out or not "data" in out:
                print("No Results")
                break

            if not token:
                print(f"Count: {out['total']}")

            i = 0
            data = out["data"]
            page_size = len(data)

            while i < page_size and self.size() < max_num_papers:    
                paper = data[i]
                paper_doi = paper["externalIds"].get("DOI", "")
                
                if paper_doi:
                    self.add(
                        doi=f'https://doi.org/{paper_doi}', 
                        title=paper["title"], 
                        abstract=paper.get("abstract"), 
                        lid=paper["paperId"]
                    )
                    
                i += 1

            print(self.size())
            
            token = out["token"]
            if not token:
                break

        print(f"Retrieved {self.size()} Paper(s)")
    
    
    def run(self):
        search_filter = self.create_search_filter(search_terms)
        self.search_papers(search_filter)
        self.update_df()
        self.store_df("SC")

In [56]:
oa = OpenAlex()
oa.run()

https://api.openalex.org/works?filter=title_and_abstract.search:trait|phenotype,title_and_abstract.search:trait-mediated|higher-order%20interaction|polymorphism|interaction%20modification|indirect%20effect,title_and_abstract.search:apparent%20competition|resource%20competition|intransitive%20competition|mutual%20competition|keystone%20predation|intraguild%20predation|trophic%20chain|competition%20chain,has_abstract:true,to_publication_date:2030-01-01,from_publication_date:1800-01-01,type:article
Count: 661
0
25
50
75
100
125
150
175
200
225
250
275
300
325
350
375
400
425
450
475
500
525
550
575
600
625
650
No Results
Retrieved 661 Paper(s)


In [99]:
sc = SemanticScholar()
sc.run()

https://api.semanticscholar.org/graph/v1/paper/search/bulk?query=(trait %7C phenotype) (higher order interaction %7C interaction modification %7C indirect effect %7C trait mediated %7C polymorphism) (competition chain %7C trophic chain %7C intraguild predation %7C keystone predation %7C apparent competition %7C resource competition %7C intransitive competition %7C mutual competition)&fields=title,abstract,externalIds
Count: 372
372
Retrieved 372 Paper(s)


In [58]:
# Add Universal IDs
oa_df = pd.read_csv("OA.csv")
oa_df["UID"] = oa_df["UID"].astype(str)

for index, row in oa_df.iterrows():
    if not row['DOI'] or row['DOI'] != row['DOI']:
        continue

    title = row['Title']
    url = f"https://api.semanticscholar.org/graph/v1/paper/search/match?query={row['Title'].replace(' ', '%20')}&fields=title,externalIds"
    output = requests.get(url).json()

    if not output or 'data' not in output:
        continue

    print(row['DOI'], index)
    doi = row['DOI']
    doi = doi[doi.find("doi.org/")+8:].lower()
    
    for _ in output['data']:
        if _['externalIds'].get('DOI', '').lower() == doi:
            oa_df.loc[index, "UID"] = _['paperId']
            break
store_df(oa_df, "OA")

In [100]:
# Set Univeral IDs
sc_df = pd.read_csv("SC.csv")
sc_df["UID"] = sc_df["UID"].astype(str)

for index, row in sc_df.iterrows():
    sc_df.loc[index, "UID"] = row["LID"]

In [101]:
df1 = pd.concat([oa_df, sc_df])

store_df(df1, "Papers-1")
print(f"Number Papers: {df1.shape[0]}")

df2 = df1.drop_duplicates(subset=['DOI'])
print(f"Dropped Duplicates, Number Papers: {df2.shape[0]}")

df2.dropna(inplace=True)
print(f"Dropped Rows w NAs, Number Papers: {df2.shape[0]}")

store_df(df2, "Papers-2")
print(f"Number Papers: {df2.shape[0]}")

Number Papers: 1033
Dropped Duplicates, Number Papers: 786
Dropped Rows w NAs, Number Papers: 657


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.dropna(inplace=True)


Number Papers: 657


In [2]:
oa_df = pd.read_csv("OA.csv")
sc_df = pd.read_csv("SC.csv")

df1 = pd.concat([oa_df, sc_df])
df2 = df1.drop_duplicates(subset=['DOI'])
df2.dropna(subset=["Abstract", "DOI"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.dropna(subset=["Abstract", "DOI"], inplace=True)


In [3]:
df2.shape
store_df(df2, "Papers-3")

(724, 6)