In [1]:
import json
import random
import time

import requests
from tqdm import tqdm

In [2]:
# Root URL of the inspection service that the rest of the notebook queries.
base_url = "http://0.0.0.0:5000"

# The describe_* routes that are called for every sampled DOI. They are built
# from base_url so the service address only has to be edited in one place.
api_endpoints = [
    base_url + "/api/inspect/describe_openaire/",
    base_url + "/api/inspect/describe_opencitation/",
    base_url + "/api/inspect/describe_wikidata/",
]

In [3]:
# Only records whose resource type equals TYPE are kept by zenodoJsonParser,
# and at most NB of them.
TYPE = "dataset"
NB = 10_000

# Passed as the `timeout` argument of the Zenodo request; requests reads a
# 2-tuple as (connect timeout, read timeout) in seconds.
TIMEOUT = (10, 300)

# NOTE(review): not referenced by any cell visible here - confirm it is still needed.
OUTPUT_DIR = "./results/zenodo_data/"

def zenodoRestRequest(size=7000, retry_delay=5):
    """Fetch the most recent Zenodo records with a single REST call.

    SSL errors and timeouts raised by ``requests`` are retried indefinitely,
    sleeping ``retry_delay`` seconds between attempts. Any other exception
    propagates to the caller.

    Args:
        size: Value sent as the ``size`` query parameter (records requested).
        retry_delay: Seconds to sleep before retrying after an SSL error or
            a timeout.

    Returns:
        The ``requests.Response`` returned by the records endpoint. The HTTP
        status code is not checked here.
    """
    print("REST request to zenodo...")
    url = 'https://zenodo.org/api/records/?sort=mostrecent&page=1&size=' + str(size)
    while True:
        try:
            return requests.get(url, timeout=TIMEOUT)
        except (requests.exceptions.SSLError, requests.exceptions.Timeout):
            # Both exception classes are qualified through requests.exceptions:
            # a bare, undefined name in an except clause raises NameError as
            # soon as any exception reaches it, which defeats the retry.
            time.sleep(retry_delay)

def zenodoJsonParser(response, resource_type=None, limit=None):
    """Extract DOI URLs of matching records from a Zenodo search response.

    Args:
        response: Object whose ``json()`` method returns the search payload,
            with the records listed under ``["hits"]["hits"]``.
        resource_type: Keep only records whose
            ``metadata.resource_type.type`` equals this value. Defaults to the
            module-level ``TYPE``.
        limit: Maximum number of DOIs to return. Defaults to the module-level
            ``NB``.

    Returns:
        A list of ``"https://doi.org/<doi>"`` strings in response order.
        Records without a resource type or without a DOI are skipped.

    Raises:
        KeyError: If the payload has no ``["hits"]["hits"]`` entry.
    """
    print("Parsing result...")
    if resource_type is None:
        resource_type = TYPE
    if limit is None:
        limit = NB

    dois_list = []
    for element in response.json()["hits"]["hits"]:
        if len(dois_list) >= limit:
            break
        # Tolerate incomplete records instead of aborting the whole parse.
        metadata = element.get("metadata") or {}
        record_type = (metadata.get("resource_type") or {}).get("type")
        doi = element.get("doi")
        if record_type != resource_type or not doi:
            continue
        dois_list.append("https://doi.org/" + doi)

    print("Retrieved " + str(len(dois_list)) + " DOIs")
    return dois_list

In [4]:
# Query Zenodo once, then keep the DOI URLs of the records whose resource
# type matches TYPE (at most NB of them).
response = zenodoRestRequest()
dois_list = zenodoJsonParser(response)

REST request to zenodo...
Parsing result...
Retrieved 405 DOIs


In [8]:
# Fixed seed so the same sample is drawn on every run.
random.seed(10)

print(len(dois_list))

# random.sample raises ValueError when asked for more items than the
# population holds, so cap the sample at the number of DOIs retrieved.
sample_size = min(100, len(dois_list))
samples_dois_list = random.sample(dois_list, sample_size)

# Split the sample into consecutive chunks of at most splitedSize DOIs.
splitedSize = 100
samples_chunks = [samples_dois_list[x:x+splitedSize] for x in range(0, len(samples_dois_list), splitedSize)]

405


In [9]:
# Sanity check: number of DOIs drawn into the sample.
print(len(samples_dois_list))

100


In [10]:
# One entry per (DOI, endpoint) pair that was evaluated successfully.
results = []
# One entry per request that failed, so a long run is not lost to one error.
failures = []

# Exceptions treated as "this item failed": transport/HTTP errors from
# requests, an undecodable JSON body (ValueError), or a missing result key.
_request_errors = (requests.exceptions.RequestException, ValueError, KeyError)

for url in tqdm(samples_dois_list):
    try:
        # Fetch the RDF metadata once per DOI; the same graph is then posted
        # to every describe_* endpoint.
        response = requests.get(
            base_url + "/api/inspect/get_rdf_metadata/" + url,
            timeout=TIMEOUT,
        )
        response.raise_for_status()
        graph = json.dumps(response.json(), ensure_ascii=False)
    except _request_errors as err:
        failures.append({"url": url, "endpoint": "get_rdf_metadata", "error": repr(err)})
        tqdm.write("FAILED get_rdf_metadata for " + url + ": " + repr(err))
        continue

    for endpoint in api_endpoints:
        # Endpoints end with "/", so the route name is the second-to-last part.
        endpoint_name = endpoint.split("/")[-2]
        try:
            response = requests.post(endpoint + url, json={"graph": graph}, timeout=TIMEOUT)
            response.raise_for_status()
            # Decode the body once instead of once per field.
            payload = response.json()
            triples_before = payload["triples_before"]
            triples_after = payload["triples_after"]
        except _request_errors as err:
            failures.append({"url": url, "endpoint": endpoint_name, "error": repr(err)})
            tqdm.write("FAILED " + endpoint_name + " for " + url + ": " + repr(err))
            continue

        result = {
            "url": url,
            "endpoint": endpoint_name,
            "triples_before": triples_before,
            "triples_after": triples_after,
            "improved": triples_before < triples_after,
        }
        results.append(result)

  7%|▋         | 7/100 [02:33<34:01, 21.95s/it]

In [None]:
# Count how many (DOI, endpoint) evaluations gained triples, listing each gain.
improved = 0
not_improved = 0

for res in results:
    if res["improved"]:
        # The endpoint name is stored under "endpoint" when results is built.
        print("ENDPOINT: " + res["endpoint"])
        print("URL: " + res["url"])
        improved += 1
    else:
        not_improved += 1

# The counters are ints; convert them before concatenating with the label.
print("Improved: " + str(improved))
print("Not improved: " + str(not_improved))