## 1. Import Libraries

In [None]:
import sys
parentdir = ".."
sys.path.insert(0, parentdir)

import requests
import os
import datetime
#from os import path
from tqdm.notebook import tqdm
import pandas as pd
import time

from metrics.WebResource import WebResource
from rdflib import Graph
from xml.dom import minidom
from rdflib import ConjunctiveGraph, Namespace, URIRef
from rdflib.namespace import RDF, RDFS

import random
import matplotlib.pyplot as plt

from SPARQLWrapper import SPARQLWrapper, JSON

from rdflib import Graph, URIRef
from rdflib.namespace import RDFS, SKOS

## 2. Retrieve a List of Target URLs

In [None]:
# Read the sitemap and collect every <ExpertLink> element it contains
mydoc = minidom.parse('Sitemap.xml')
urls = mydoc.getElementsByTagName('ExpertLink')

# Dump the text of each element to URLs.txt, one entry per line
with open('URLs.txt', 'w') as f:
    for u in urls:
        f.write(f"{u.firstChild.data}\n")

In [None]:
# Reload the URL list from URLs.txt.
# The context manager closes the file handle once the list is built, and each
# line is stripped so that no trailing newline ends up inside a URL; blank
# lines are skipped.
with open('URLs.txt', 'r') as mydoc:
    urls = [line.strip() for line in mydoc if line.strip()]

## 3. Feed the RDF Graph

In [None]:
# Namespace object rooted at the schema.org IRI
schema = Namespace("http://schema.org/")

In [None]:
## ratelimit decorator
from ratelimit import limits, RateLimitException, sleep_and_retry

# Length of the rate-limit window handed to `limits` below.
# NOTE(review): despite the name, the decorator is configured for a single
# call per window (calls=1), not five.
five_by_seconds = 0.8


@sleep_and_retry
@limits(calls=1, period=five_by_seconds)
def ask_orphanet(u):
    """Return the RDF obtained from the web resource at URL ``u``.

    The URL is wrapped in a ``WebResource`` and the result of its
    ``get_rdf()`` method is returned unchanged. Calls are throttled by the
    ``limits`` / ``sleep_and_retry`` decorators applied above.
    """
    return WebResource(url=u).get_rdf()

In [None]:
%%time
# Graph that accumulates the RDF returned for every target URL
KG_Total = ConjunctiveGraph()

# tqdm wraps the URL list to show progress; ask_orphanet is throttled by its
# rate-limit decorators, and its result is merged into KG_Total in place.
for u in tqdm(urls):
    KG_Total += ask_orphanet(u)

In [None]:
# Display the total number of triples in the graph after scraping all URLs listed in the XML file
print(f"Loaded {len(KG_Total)} triples")

In [None]:
# Save the knowledge graph to a Turtle file
KG_Total.serialize("new_orphanet_dump.ttl", format="turtle")

In [None]:
!cat new_orphanet_dump.ttl

In [None]:
# Report the on-disk size of the serialized dump
from pathlib import Path

dump_path = Path('new_orphanet_dump.ttl')
file_size = dump_path.stat().st_size
print(f"The file size is: {file_size} bytes")

## 5. Display Bioschemas Property Counts

In [None]:
def get_dataframe_from_query_results(res):
    """Build a pandas DataFrame from the ``bindings`` of a query result.

    Args:
        res: Object exposing a ``bindings`` attribute; it is passed as-is to
            the ``pd.DataFrame`` constructor.

    Returns:
        The DataFrame constructed from ``res.bindings``.
    """
    bindings = res.bindings
    return pd.DataFrame(bindings)

In [None]:
### Display the properties used in the graph, most frequent first
# SPARQL: count how many triples use each predicate, sorted by descending count
property_counts = """
SELECT ?p (count(?p) as ?count) WHERE {
    ?s ?p ?o .
} 
GROUP BY ?p
ORDER BY DESC(?count)
"""

# Run the query against the accumulated graph
res = KG_Total.query(property_counts)
# NOTE(review): this prints the result object itself rather than its rows;
# the parallel "used classes" cell prints an empty line here - confirm intent.
print(res)
print("********** Used properties **********")
df = get_dataframe_from_query_results(res)
# Bare expression so the notebook renders the table as the cell output
df

## 6. Display Bioschemas Class Counts

In [None]:
### Display the classes used in the graph, most frequent first
# SPARQL: count how many rdf:type statements point at each class, sorted by descending count.
# NOTE(review): the query uses the rdf: prefix without declaring it - presumably
# resolved through the graph's bound namespaces; confirm.
classes_counts = """
SELECT ?c (count(?c) as ?count) WHERE {
    ?s rdf:type ?c .
} 
GROUP BY ?c
ORDER BY DESC(?count)
"""

# Run the query against the accumulated graph
res = KG_Total.query(classes_counts)
print()
print("********** Used classes **********")
df = get_dataframe_from_query_results(res)
# Bare expression so the notebook renders the table as the cell output
df

In [None]:
# Rebuild the table directly from the query result rows with explicit column names.
# NOTE(review): relies on `res` still holding the class-count query result from the
# preceding cell - confirm the cells are run in order.
df = pd.DataFrame(res ,columns=['class','count'])

# Cast both columns to plain Python-friendly dtypes
df["class"] = df["class"].astype("str")
df["count"] = df["count"].astype("int")
# Bare expression so the notebook shows the resulting dtypes
df.dtypes

In [None]:
# Default denominator used by pourcentage().
# NOTE(review): hard-coded value - confirm it matches the sum of the counts
# for the data actually loaded, or pass `total` explicitly.
count_sum = 94


def pourcentage(x, total=None):
    """Return ``x`` expressed as a percentage of ``total``.

    Args:
        x: Value to convert.
        total: Denominator of the ratio. When omitted (``None``), the
            module-level ``count_sum`` is used, which preserves the original
            single-argument behaviour.

    Returns:
        ``x * 100 / total``.

    Raises:
        ZeroDivisionError: If the denominator is zero.
    """
    denominator = count_sum if total is None else total
    return x * 100 / denominator
def get_suffix(x):
    """Return the part of ``str(x)`` that follows its last ``/``.

    If the text contains no ``/``, the whole text is returned; if it ends
    with ``/``, the result is the empty string.
    """
    _, _, tail = str(x).rpartition("/")
    return tail

# Work on a copy so the original class-count table stays untouched
df2 = df.copy()
# Share of each class count.
# NOTE(review): pourcentage divides by the hard-coded count_sum, not by the
# actual sum of this column - confirm the constant matches the data.
df2["%"] = df2['count'].apply(pourcentage)
# Short label: the last path segment of the class IRI
df2["label"] = df2['class'].apply(get_suffix)
# Bare expression so the notebook renders the table as the cell output
df2

In [None]:
# Keep only the columns needed for plotting, label first.
# Columns are selected by name rather than by position so the cell can be
# re-run safely: positional indexing fails once df2 has already been reduced
# to these two columns.
cols = ["label", "count"]
df2 = df2[cols]
df2

In [None]:
print("********** ********** Scatter Chart ********** **********")
df2.plot(x ='label', y='count', kind = 'scatter', rot=80)
plt.show()

print("********** ********** Bar Chart ********** **********")
df2.plot(x ='label', y='count', kind = 'bar')
plt.show()

print("********** ********** Pie Chart ********** **********")

# Take the wedge labels from the data itself so they always match the rows
# being plotted (same number, same order), whatever classes the graph contains.
my_labels = df2['label'].tolist()
df2.plot.pie(title="Pie Chart",y='count',figsize=(4,4),labels=my_labels)