In [None]:
import sys
fc_src = "../.."
sys.path.insert(0, fc_src)

import requests
from os import path
from tqdm.notebook import tqdm
import pandas as pd
import time

from metrics.WebResource import WebResource
from metrics.FAIRMetricsFactory import FAIRMetricsFactory
from metrics.AbstractFAIRMetrics import AbstractFAIRMetrics

# Retrieve a Bioschemas RDF dump

In [None]:
# Download the Bioschemas dump once; later runs reuse the local copy.
dump = "bioschemas-dump.ttl"
if not path.isfile(dump):
    r = requests.get(
        "https://github.com/bio-tools/content/raw/master/datasets/bioschemas-dump.ttl",
        # Without a timeout, requests can block forever on a stalled connection.
        timeout=60,
    )
    # Raises requests.HTTPError carrying the status code and URL, instead of a
    # bare AssertionError (which is also skipped when Python runs with -O).
    r.raise_for_status()
    with open(dump, "wb") as f:
        f.write(r.content)

assert path.isfile(dump)

# Load the RDF dump

In [None]:
from rdflib import ConjunctiveGraph, Namespace, URIRef
from rdflib.namespace import RDF, RDFS

# schema.org vocabulary (http scheme); used below to select schema:SoftwareApplication resources.
schema = Namespace("http://schema.org/")

In [None]:
# Parse the local Turtle dump into one in-memory graph and report its size in triples.
KG = ConjunctiveGraph()
KG.parse(dump, format="turtle")
print(f"{len(KG)} loaded triples")

In [None]:
# Map the URI of every schema:SoftwareApplication found in KG to a placeholder.
# The None values are replaced by per-tool sub-graphs when index_dump() runs.
index = {}
for s, p, o in KG.triples((None, RDF.type, schema.SoftwareApplication)):
    index[str(s)] = None
# Single print: nesting print() inside print() would also emit a stray "None" line.
print(f"{len(index)} software applications")

In [None]:
def index_dump():
    """Fill the global ``index`` with one sub-graph per indexed URI.

    For every key of ``index``, all triples of ``KG`` whose subject is that
    URI are copied into a fresh ``ConjunctiveGraph``, which then replaces the
    value stored under that key. Progress is reported through ``tqdm``.
    """
    for tool_uri in tqdm(index.keys()):
        tool_graph = ConjunctiveGraph()
        subject = URIRef(tool_uri)
        # Only values are reassigned, so iterating over the live key view is safe.
        for triple in KG.triples((subject, None, None)):
            tool_graph.add(triple)
        index[tool_uri] = tool_graph


def get_RDF_sparql(bio_tools_Id):
    """Print, as Turtle, the triples describing one tool, selected with SPARQL.

    Runs a CONSTRUCT query against the global ``KG`` that keeps every
    ``<bio_tools_Id> ?p ?o`` triple, provided the resource is typed as
    ``schema:SoftwareApplication``.

    Args:
        bio_tools_Id: URI of the resource, e.g. "https://bio.tools/bwa".
            It is interpolated verbatim between angle brackets in the query.
    """
    q = f"CONSTRUCT {{<{bio_tools_Id}> ?p ?o}} WHERE {{<{bio_tools_Id}> rdf:type schema:SoftwareApplication . <{bio_tools_Id}> ?p ?o .}}"
    # The query text declares no PREFIX, so bind rdf: and schema: explicitly.
    # Otherwise resolution depends on whatever prefixes the graph happens to
    # carry, which may not point at the http://schema.org/ namespace used by
    # the rest of this notebook.
    res = KG.query(q, initNs={"rdf": RDF, "schema": schema})
    turtle = res.serialize(format="turtle")
    # serialize() may hand back bytes (rdflib-version dependent); decode so the
    # output is readable Turtle rather than a b'...' repr.
    if isinstance(turtle, bytes):
        turtle = turtle.decode("utf-8")
    print(turtle)

def get_RDF(bio_tools_Id):
    """Print, as Turtle, every triple of ``KG`` whose subject is ``bio_tools_Id``.

    The matching triples are copied into a temporary ``ConjunctiveGraph``,
    which is then serialized and printed.

    Args:
        bio_tools_Id: URI of the resource, e.g. "https://bio.tools/bwa".
    """
    tool_graph = ConjunctiveGraph()
    subject = URIRef(bio_tools_Id)
    for triple in KG.triples((subject, None, None)):
        tool_graph.add(triple)
    print(tool_graph.serialize(format="turtle"))
    
#get_RDF_sparql("https://bio.tools/bwa")
#get_RDF("https://bio.tools/bwa")

In [None]:
# Replace every placeholder in `index` with the sub-graph of its tool.
index_dump()

# Sampling 10 entries

In [None]:
import random
#sample_tool = random.choice(index.keys())

# Fixed seed so the draw below is repeatable for a given ordering of `index`.
random.seed(10)

# Draw up to 10 (URI, sub-graph) pairs. random.sample raises ValueError when
# asked for more items than the population holds, so cap the size for small dumps.
samples = random.sample(list(index.items()), min(10, len(index)))
#samples = random.sample(list(index.items()), len(index.items()))

# Split the sample into chunks of at most `splitedSize` entries; each chunk is
# evaluated and saved separately further down.
splitedSize = 1000
samples_chunks = [samples[x:x+splitedSize] for x in range(0, len(samples), splitedSize)]

# Evaluating 10 entries

In [None]:
# Build the three metrics without a web resource (None), only to read their
# principle tags for the result-table header.
metrics_collection_remote = [
    FAIRMetricsFactory.get_F2B(None),
    FAIRMetricsFactory.get_I2(None),
    FAIRMetricsFactory.get_R13(None),
]

# Header: the resource ID followed by one column per metric tag.
df_columns = ['ID']
for m in metrics_collection_remote:
    df_columns += [m.get_principle_tag()]
    

In [None]:
# Empty frame whose columns are 'ID' plus one per metric tag (see df_columns).
# NOTE(review): not referenced again in the cells visible here — confirm whether it is still needed.
exec_time_df = pd.DataFrame(columns=df_columns)

def eval_metrics(web_res):
    """Evaluate the F2B, I2 and R13 metrics on one web resource and time each.

    Args:
        web_res: resource to assess; it is passed to the ``FAIRMetricsFactory``
            getters and must provide ``get_url()``.

    Returns:
        A ``(row, row_time)`` tuple of dicts. Both carry ``"ID"`` (the resource
        URL) plus one entry per metric keyed by ``get_principle_tag()``:
        ``row`` holds the score, ``row_time`` the evaluation time in seconds
        rounded to two decimals. A metric whose ``evaluate()`` returns ``None``
        is left out of both dicts.
    """
    metrics_collection_remote = [
        FAIRMetricsFactory.get_F2B(web_res),
        FAIRMetricsFactory.get_I2(web_res),
        FAIRMetricsFactory.get_R13(web_res),
    ]

    resource_url = web_res.get_url()
    row = {"ID": resource_url}
    row_time = {"ID": resource_url}
    for m in metrics_collection_remote:
        # perf_counter() is the clock meant for measuring intervals; time.time()
        # is wall-clock and can jump if the system clock is adjusted.
        started = time.perf_counter()
        e = m.evaluate()
        duration = round(time.perf_counter() - started, 2)
        if e is not None:
            tag = m.get_principle_tag()
            row[tag] = e.get_score()
            row_time[tag] = duration

    return row, row_time

In [None]:
def mass_eval(samples):
    """Run ``eval_metrics`` on every sample, showing a progress bar.

    Args:
        samples: iterable of indexable entries where element 0 is the resource
            URL and element 1 its RDF graph.

    Returns:
        A ``(evals, exec_time)`` tuple of lists holding, for each sample in
        order, the score dict and the timing dict returned by ``eval_metrics``.
    """
    score_rows, timing_rows = [], []

    for entry in tqdm(samples):
        resource = WebResource(url=entry[0], rdf_graph=entry[1])
        scores, timings = eval_metrics(resource)
        score_rows.append(scores)
        timing_rows.append(timings)

    return score_rows, timing_rows

In [None]:
import logging
import os

# Silence everything below ERROR on the root logger while the metrics run.
logging.getLogger().setLevel(logging.ERROR)

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("../results/biotools", exist_ok=True)

# Evaluate chunk by chunk and write one score file and one timing file per
# chunk, numbered from 1. After the loop, `df` and `df_time` hold the frames of
# the last chunk only.
for i, c in enumerate(tqdm(samples_chunks), start=1):
    evals, exec_time = mass_eval(c)
    df = pd.DataFrame.from_records(evals)
    df_time = pd.DataFrame.from_records(exec_time)

    df.to_csv("../results/biotools/FC_results_long_metrics_"+str(i)+".csv")
    df_time.to_csv("../results/biotools/exec_time_long_metrics_"+str(i)+".csv")

In [None]:
# Scores of the most recently evaluated chunk (`df` is rebuilt for every chunk).
df

In [None]:
# Per-metric evaluation times of the most recently evaluated chunk.
df_time

# Plotting results

In [None]:
import glob

# glob returns matches in arbitrary order; sort so the row order of the merged
# frame does not depend on the file system.
all_files = sorted(glob.glob("../results/biotools/FC_results_long*.csv"))
if not all_files:
    # pd.concat on an empty list fails with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(
        "No result file matches ../results/biotools/FC_results_long*.csv; "
        "run the evaluation cell first."
    )

li = []

for filename in tqdm(all_files):
    # Append directly instead of going through `df`, which would be shadowed
    # by each per-file frame on the way.
    li.append(pd.read_csv(filename, index_col=None, header=0))
# Merge every per-chunk result file into one frame with a fresh 0..n-1 index.
df = pd.concat(li, axis=0, ignore_index=True)

In [None]:
from upsetplot import generate_counts, from_contents, generate_samples, UpSet, plot
import matplotlib.pyplot as plt

plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 200

# IDs of the resources scoring above 0 for each metric.
# eval_metrics leaves a metric out of a row when its evaluation returned None,
# which surfaces here as NaN; astype(int) raises on NaN, so count a missing
# score as 0 (not satisfied) first.
F2B = df[df["F2B"].fillna(0).astype(int) > 0]["ID"]
I2 = df[df["I2"].fillna(0).astype(int) > 0]["ID"]
R13 = df[df["R1.3"].fillna(0).astype(int) > 0]["ID"]

# Membership table consumed by UpSet: one category per metric.
df_upset = from_contents({'F2B': F2B,
                          'I2': I2,
                          'R13': R13})
df_upset

In [None]:
# UpSet diagram of which resources satisfy which combination of metrics.
upset = UpSet(
    df_upset,
    subset_size='count',
    show_counts=True,
    sort_categories_by=None,
    orientation='horizontal',
)

# Shrink the legend font for this figure only; rcParams are restored on exit.
params = {'legend.fontsize': 8}
with plt.rc_context(params):
    upset.plot()

plt.suptitle("Bioinformatics softwares from Bio.Tools")
# Save before show().
plt.savefig('../results/biotools-F2B-I2-R13.png', format="png")
plt.show()

In [None]:
import seaborn as sns

# Keep only the timing columns for the plot.
# - `columns=` replaces the positional axis argument, which pandas 2.0 removed.
# - Rebinding (rather than inplace=True) plus errors='ignore' lets this cell be
#   re-run without a KeyError once 'ID' is already gone.
# NOTE: df_time holds the timings of the last evaluated chunk only.
df_time = df_time.drop(columns='ID', errors='ignore')

# Distribution of evaluation times (seconds) per metric.
sns.boxplot(data=df_time)
plt.savefig('../results/exec_time.png')
# Mean evaluation time per metric.
print(df_time["F2B"].mean())
print(df_time["I2"].mean())
print(df_time["R1.3"].mean())