In [12]:
import requests
from tqdm import tqdm

In [2]:
# Root URL of the inspection service queried by this notebook.
base_url = "http://0.0.0.0:5000"

# Endpoints are derived from base_url so the host/port only has to be changed
# in one place. Each entry ends with "/" because the resource URL is appended
# directly to it when the request is built.
api_endpoints = [
    base_url + "/api/inspect/describe_openaire/",
    base_url + "/api/inspect/describe_opencitation/",
    base_url + "/api/inspect/describe_wikidata/",
]

In [3]:
# Passed as `timeout=` to requests.get in zenodoRestRequest; requests interprets
# a 2-tuple as (connect timeout, read timeout) in seconds.
TIMEOUT = (10, 300)
# Upper bound on the number of DOIs collected by zenodoJsonParser.
NB = 10000
# Only records whose metadata resource_type.type equals this value are kept.
TYPE = "dataset"
# NOTE(review): not referenced in the visible cells — presumably the directory
# where results are meant to be written; confirm.
OUTPUT_DIR = "./results/zenodo_data/"

def zenodoRestRequest(size=7000, retry_delay=5, max_retries=None):
    """Fetch the first page of the most recent Zenodo records.

    The request is retried after a pause whenever it fails with an SSL error
    or a timeout.

    Args:
        size: Number of records requested in the single page (the `size`
            query parameter). Defaults to 7000, the value previously
            hard-coded in the URL.
        retry_delay: Seconds to sleep between two attempts.
        max_retries: Maximum number of retries after the first attempt.
            None (default) retries indefinitely, matching the historical
            behaviour of this function.

    Returns:
        The `requests.Response` returned by Zenodo. The HTTP status is not
        checked here.

    Raises:
        requests.exceptions.SSLError, requests.exceptions.Timeout: only when
            `max_retries` is set and has been exhausted.
    """
    # Local import: `time` is not imported by the notebook's import cell.
    import time

    print("REST request to zenodo...")
    url = 'https://zenodo.org/api/records/?sort=mostrecent&page=1&size=' + str(size)

    attempts = 0
    while True:
        try:
            return requests.get(url, timeout=TIMEOUT)
        # The exception classes must be qualified: a bare `SSLError` is not
        # defined in this notebook and would raise NameError on the first failure.
        except (requests.exceptions.SSLError, requests.exceptions.Timeout):
            attempts += 1
            if max_retries is not None and attempts > max_retries:
                raise
            time.sleep(retry_delay)

def zenodoJsonParser(response, resource_type=None, limit=None):
    """Extract DOI URLs of a given resource type from a Zenodo search response.

    Args:
        response: Object exposing a `.json()` method that returns the decoded
            payload; the records are read from `payload["hits"]["hits"]`.
        resource_type: Value that a record's `metadata.resource_type.type`
            must equal to be kept. Defaults to the module-level TYPE.
        limit: Maximum number of DOIs to return. Defaults to the module-level NB.

    Returns:
        A list of strings of the form "https://doi.org/<doi>", in the order
        the records appear in the response. Records without a resource type
        or without a DOI are skipped instead of raising KeyError.
    """
    if resource_type is None:
        resource_type = TYPE
    if limit is None:
        limit = NB

    print("Parsing result...")
    json_response = response.json()

    dois_list = []
    for element in json_response["hits"]["hits"]:
        if len(dois_list) >= limit:
            break
        # `record_type` rather than `type`, to avoid shadowing the builtin.
        record_type = element.get("metadata", {}).get("resource_type", {}).get("type")
        if record_type != resource_type:
            continue
        doi = element.get("doi")
        if not doi:
            continue
        dois_list.append("https://doi.org/" + doi)

    print("Retrieved " + str(len(dois_list)) + " DOIs")
    return dois_list

In [4]:
# Fetch the most recent Zenodo records, then keep the DOI URLs of the records
# whose resource type matches TYPE (at most NB of them).
response = zenodoRestRequest()
dois_list = zenodoJsonParser(response)

REST request to zenodo...
Parsing result...
Retrieved 331 DOIs


In [15]:
import random

# Fixed seed so the same DOIs are drawn on every run.
random.seed(10)

print(len(dois_list))

# Number of DOIs to evaluate. The request is capped at the number of available
# DOIs because random.sample raises ValueError when asked for more items than
# the population contains.
SAMPLE_SIZE = 20
samples_dois_list = random.sample(dois_list, min(SAMPLE_SIZE, len(dois_list)))

# Split the sample into batches of at most `splitedSize` DOIs.
# NOTE(review): samples_chunks is not used by the visible cells — confirm it is still needed.
splitedSize = 100
samples_chunks = [
    samples_dois_list[x:x + splitedSize]
    for x in range(0, len(samples_dois_list), splitedSize)
]

331


In [16]:
# Sanity check: number of DOIs that were actually sampled.
print(len(samples_dois_list))

20


In [17]:
# Query every inspection endpoint for every sampled DOI and record the
# "triples_before" / "triples_after" counts reported by the service.
results = []
for url in tqdm(samples_dois_list):
    for endpoint in api_endpoints:
        # A timeout keeps one stalled request from blocking the whole run.
        response = requests.get(endpoint + url, timeout=TIMEOUT)
        # Fail with a clear HTTP error rather than an obscure JSON/KeyError below.
        response.raise_for_status()
        # Decode the body once; each response.json() call re-parses the payload.
        payload = response.json()
        triples_before = payload["triples_before"]
        triples_after = payload["triples_after"]
        results.append({
            "url": url,
            # Endpoints end with "/", so the route name is the second-to-last segment.
            "endpoint": endpoint.split("/")[-2],
            "triples_before": triples_before,
            "triples_after": triples_after,
            "improved": triples_before < triples_after,
        })

100%|██████████| 20/20 [16:33<00:00, 49.70s/it]


In [18]:
# Report, for each (DOI, endpoint) pair, whether the endpoint added triples.
for res in results:
    if res["improved"]:
        # The key stored in each result dict is "endpoint"; the previous
        # "api_endpoints" lookup raised KeyError on the first improved result.
        print("ENDPOINT: " + res["endpoint"])
        print("URL: " + res["url"])
    else:
        print("nop")
        