In [2]:
import os
import time

import numpy as np
import requests

In [3]:
def create_query(title_words=[], abstract_words=[]):
    """Build the pre-escaped search string used as the `q` URL parameter.

    Each non-empty word list becomes one clause, `title:` or `abstract:`
    followed by its words joined with " OR ". When both lists are non-empty
    the two clauses are themselves joined with " OR ". Words containing a
    space are wrapped in `%22` and every remaining space is replaced by `%20`.

    Args:
        title_words: Words or phrases to look for in the title field.
        abstract_words: Words or phrases to look for in the abstract field.

    Returns:
        The escaped query string, or "" when both lists are empty.
    """
    def quote_phrase(term):
        # Multi-word terms are surrounded by %22 so they stay together.
        if " " in term:
            return f"%22{term}%22"
        return term

    clauses = []
    if len(title_words) != 0:
        clauses.append("title:" + " OR ".join(quote_phrase(term) for term in title_words))
    if len(abstract_words) != 0:
        clauses.append("abstract:" + " OR ".join(quote_phrase(term) for term in abstract_words))

    # Joining zero clauses yields "", matching the "nothing to search" case.
    return " OR ".join(clauses).replace(" ", "%20")

In [4]:
def print_keys(d):
    """Print a "Keys:" header, then each key of `d` on its own tab-indented line.

    Args:
        d: A mapping (anything exposing `.keys()`).
    """
    lines = ["Keys:"]
    lines.extend(f"\t{name}" for name in d.keys())
    print("\n".join(lines))

In [11]:
# The CORE API key is read from the CORE_API_KEY environment variable so the
# secret is never stored in (or committed with) the notebook. When the variable
# is unset, requests are sent without an Authorization header.
KEY = os.environ.get("CORE_API_KEY", "")

def get_response(title_words=[], abstract_words=[], limit=1000, max_size=10000,
                 timeout=120, max_retries=5):
    """Collect works from the CORE search endpoint, one scroll page at a time.

    The first request is sent with `scroll=true`; each later request passes the
    `scrollId` returned by the previous page. Fetching stops when a page is
    empty, when at least `max_size` results have been collected, when the
    server answers with a non-200 status, when no `scrollId` is returned, or
    when any exception occurs. Whatever was collected so far is returned.

    Note: the recorded run of this notebook with the default `limit=1000`
    received a server-side "memory exhausted" error, so a smaller `limit` may
    be needed for some queries.

    Args:
        title_words: Words/phrases to search for in titles.
        abstract_words: Words/phrases to search for in abstracts.
        limit: Page size sent as the `limit` URL parameter.
        max_size: Stop requesting once this many results are collected. The
            last page is not truncated, so slightly more may be returned.
        timeout: Seconds passed to `requests.get` so a stalled connection
            cannot hang the notebook forever.
        max_retries: Maximum consecutive retries after HTTP 429 responses.

    Returns:
        A list with the items of every fetched page's "results" field.
    """
    base_url = "https://api.core.ac.uk/v3/search/works"
    pause_seconds = 65  # a little over a minute, to let the rate-limit window reset
    headers = {"Authorization": f"Bearer {KEY}"} if KEY else {}

    # The query does not change between pages, so build it once.
    query = create_query(title_words, abstract_words)

    results = []
    scroll_id = None
    retries = 0

    while len(results) < max_size:
        # Deliberately best-effort: any failure is reported and the partial
        # results gathered so far are returned instead of raising.
        try:
            if scroll_id:
                paging = f"&scrollId={scroll_id}"
            else:
                paging = "&scroll=true"
            url = f"{base_url}?q={query}&limit={limit}{paging}"
            print(url)
            response = requests.get(url, headers=headers, timeout=timeout)

            # .get(): a missing header must not abort the whole download.
            remaining = response.headers.get("x-ratelimit-remaining")
            print(f"Remaining Tokens: {remaining}")

            if response.status_code == 429:
                # Rate limited: wait and retry the same page, but not forever.
                retries += 1
                if retries > max_retries:
                    print(f"Still rate limited after {max_retries} retries; giving up")
                    break
                print("\tPausing for a Minute...")
                time.sleep(pause_seconds)
                continue

            if response.status_code != 200:
                print(f"Request failed with status {response.status_code}: {response.text}")
                break
            retries = 0

            payload = response.json()
            page = payload["results"]
            if len(page) == 0:
                print("No More Results")
                break

            results += page
            print(f"Added {len(page)} Papers\t\t|\t\tTotal: {len(results)} Papers")

            scroll_id = payload.get("scrollId")
            if not scroll_id:
                # Without a scroll id the next request would start the scroll
                # over and append the first page again.
                print("No scrollId returned; stopping")
                break

            # The page just received is valid even when it used the last
            # token, so it is kept; only the *next* request has to wait.
            if remaining == "0" and len(results) < max_size:
                print("\tPausing for a Minute...")
                time.sleep(pause_seconds)
        except Exception as e:
            print(f"{type(e).__name__}: {e}")
            break

    return results

In [14]:
# Fetch works whose abstract mentions "trait"; no title filter is applied.
# NOTE(review): the recorded output of this cell shows a server-side memory
# error for this request, so `r` was empty in that run.
r = get_response(title_words=[], abstract_words=["trait"])

https://api.core.ac.uk/v3/search/works?q=abstract:trait&limit=1000&scroll=true
Remaining Tokens: 4
Status Code of 200 {"message":"Error: Allowed memory size of 134217728 bytes exhausted (tried to allocate 67732752 bytes)"}


In [None]:
%%capture
# Execute the companion notebook in this kernel with its output captured.
# NOTE(review): `classify_text`, used in the following cell, is presumably
# defined by that notebook — confirm.
%run DocumentSimilarity1.ipynb

In [None]:
%%time
similarities = []
for result in r:
    if "language" in result and result["language"]["name"] != "English":
        similarities.append(0)
        continue
    if "abstract" not in result or not result["abstract"]:
        similarities.append(0)
        continue
    label, similarity = classify_text(result["abstract"])
    if label == 0:
        similarity = 1 - similarity
    similarities.append(similarity)

In [None]:
# List the papers from lowest to highest similarity (np.argsort is ascending),
# showing the first 25 characters of each abstract.
sortedIndices = np.argsort(similarities)
for index in sortedIndices:
    # .get(): records without an "abstract" key are tolerated by the scoring
    # cell, so they must not raise KeyError here either.
    abstract = r[index].get("abstract") or ""
    print(f"Index: {index}\t|\tSimilarity: {similarities[index]}\t:\tText: {abstract[:25]}")