# Aim of this notebook
The goal of this notebook is to automate the FAIR assessment of multiple online resources through the [FAIR-Checker tool](https://fair-checker.france-bioinformatique.fr). 
All results are stored in a matrix and serialized into a CSV file. Scores can be interpreted as follows: 
 - 0 -> `failure`
 - 1 -> `weak` assessment
 - 2 -> `strong` assessment

To run this notebook you just need the `requests` and `pandas` python libraries. 

The FAIR-Checker API is better described at https://fair-checker.france-bioinformatique.fr/swagger 

Please report any issue at https://github.com/IFB-ElixirFr/fair-checker/issues or contact alban.gaignard@univ-nantes.fr. 

In [1]:
import time
import requests
import pandas as pd
from rdflib import ConjunctiveGraph
from json import JSONDecodeError
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
import matplotlib as mpl

In [2]:
# FAIR-Checker endpoint that evaluates all metrics for a given resource URL.
FC_all_metrics_url = "https://fair-checker.france-bioinformatique.fr/api/check/metrics_all"

def retrieve_rdf(url, timeout=120):
    """Load the RDF metadata that FAIR-Checker returns for a resource.

    Queries the FAIR-Checker `get_rdf_metadata` endpoint for `url` and parses
    the response body as JSON-LD into a fresh graph.

    Parameters
    ----------
    url : str
        URL of the online resource to inspect.
    timeout : float, optional
        Seconds to wait for the FAIR-Checker API before giving up, so that one
        unresponsive call cannot block the notebook forever.

    Returns
    -------
    ConjunctiveGraph
        The parsed triples. The graph is empty when the request or the parsing
        failed; the error is printed instead of being raised so that a batch
        over many URLs keeps going.
    """
    FC_get_md = "https://fair-checker.france-bioinformatique.fr/api/inspect/get_rdf_metadata"
    kg = ConjunctiveGraph()
    try:
        # The request lives inside the try block: a network failure for one
        # resource is reported like a parsing failure and yields an empty graph.
        res = requests.get(url=FC_get_md, params={"url": url}, timeout=timeout)
        # Surface HTTP errors explicitly rather than as an obscure parse error.
        res.raise_for_status()
        kg.parse(data=res.text, format="json-ld")
    except Exception as e:
        print(e)
    print(f"Loaded {len(kg)} RDF triples from {url}")
    return kg

## Input dataset

In [3]:
# Benchmark of online resources to assess. The order matters: later cells
# attach F-UJI scores and resource titles to the rows by position.
bench_urls = [
    # COVID-19 related datasets
    "https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/JGO6VI", 
    "https://www.data.gouv.fr/en/datasets/donnees-relatives-a-lepidemie-de-covid-19-en-france-vue-densemble/", 
    "https://www.kaggle.com/datasets/imdevskp/corona-virus-report",
    "https://data.who.int/dashboards/covid19/data", 
    "https://data.opendatasoft.com/explore/dataset/donnees-hospitalieres-covid-19-dep-france%40public/table/?disjunctive.countrycode_iso_3166_1_alpha3&disjunctive.nom_dep_min", 
    
    # bioinformatics tool registry entry
    "https://bio.tools/bwa", 
    
    # ontology pages
    # NOTE(review): the second URL points to the GO ontology, while the
    # visualisation cell labels this row "HPO (Ontology Lookup Service)" -- confirm
    "https://hpo.jax.org", 
    "https://www.ebi.ac.uk/ols4/ontologies/go", 
    
    # training material and online course
    "https://tess.elixir-europe.org/materials/make-your-research-fairer-with-quarto-github-and-zenodo", 
    "https://moodle.polytechnique.fr/course/index.php?categoryid=1018", 
    
    # additional datasets (PANGAEA, Kaggle, RDF metadata record)
    "http://doi.org/10.1594/PANGAEA.908011", 
    "http://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge", 
    "https://data.rivm.nl/meta/srv/eng/rdf.metadata.get?uuid=1c0fcd57-1102-4620-9cfa-441e93ea5604&approved=true"
]

## FAIR assessment over all inputs

In [14]:
# Seconds to wait for one FAIR-Checker evaluation before giving up.
FC_TIMEOUT_S = 300

rows = []                  # one dict per benchmark URL: {"URL": ..., <metric>: <score>, ...}
kg = ConjunctiveGraph()    # union of the RDF metadata retrieved for all URLs

for u in tqdm(bench_urls):
    # A row is recorded for every URL, even when its evaluation fails, so that
    # rows stay aligned by position with `bench_urls`. A failed evaluation
    # leaves only the "URL" key, i.e. missing scores in the final matrix.
    row = {"URL": u}

    # call to the FC API
    start = time.time()
    try:
        res = requests.get(url=FC_all_metrics_url, params={"url": u}, timeout=FC_TIMEOUT_S)
        # Depending on the installed requests/simplejson versions, an invalid
        # body raises different JSONDecodeError classes; all of them derive
        # from ValueError.
        evaluations = res.json()
    except (requests.RequestException, ValueError) as err:
        print(f"FAIR-Checker evaluation failed for {u}: {err}")
        evaluations = None
    # Duration of the last API call; measured but not stored in the results.
    eval_in_sec = time.time() - start

    # iterating over all evaluation results
    if isinstance(evaluations, list):
        for evaluation in evaluations:
            row[evaluation["metric"]] = int(evaluation["score"])
    rows.append(row)

    # Accumulate the RDF metadata of this resource into the global graph.
    kg += retrieve_rdf(u)

kg.serialize("out.ttl", format="turtle")

  0%|          | 0/13 [00:00<?, ?it/s]

Loaded 136 RDF triples from https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/JGO6VI
Loaded 198 RDF triples from https://www.data.gouv.fr/en/datasets/donnees-relatives-a-lepidemie-de-covid-19-en-france-vue-densemble/
Loaded 110 RDF triples from https://www.kaggle.com/datasets/imdevskp/corona-virus-report
Loaded 6 RDF triples from https://data.who.int/dashboards/covid19/data
Loaded 7 RDF triples from https://data.opendatasoft.com/explore/dataset/donnees-hospitalieres-covid-19-dep-france%40public/table/?disjunctive.countrycode_iso_3166_1_alpha3&disjunctive.nom_dep_min
Loaded 127 RDF triples from https://bio.tools/bwa
Loaded 0 RDF triples from https://hpo.jax.org
Loaded 0 RDF triples from https://www.ebi.ac.uk/ols4/ontologies/go
Loaded 27 RDF triples from https://tess.elixir-europe.org/materials/make-your-research-fairer-with-quarto-github-and-zenodo
Loaded 0 RDF triples from https://moodle.polytechnique.fr/course/index.php?categoryid=1018
Loaded 168 RDF triples fro

<Graph identifier=Neff308aef5b54e8babfe3085e75affd2 (<class 'rdflib.graph.ConjunctiveGraph'>)>

## Evaluation matrix

In [15]:
# NOTE(review): `display` and `Markdown` are not used in the cells visible here.
from IPython.display import display, Markdown
# Evaluation matrix: one row per entry of `rows`, one column per key found in
# the row dicts ("URL" plus the metric identifiers returned by FAIR-Checker).
df = pd.DataFrame.from_records(rows)
df

Unnamed: 0,URL,F1A,F1B,F2A,F2B,A1.1,A1.2,I1,I2,I3,R1.1,R1.2,R1.3
0,https://dataverse.harvard.edu/dataset.xhtml?pe...,2,2,1,1,2,2,1,1,2,2,2,1
1,https://www.data.gouv.fr/en/datasets/donnees-r...,2,0,1,1,2,2,1,1,2,2,2,1
2,https://www.kaggle.com/datasets/imdevskp/coron...,2,2,1,1,2,2,1,1,0,2,2,1
3,https://data.who.int/dashboards/covid19/data,2,0,1,2,2,0,1,2,0,0,0,2
4,https://data.opendatasoft.com/explore/dataset/...,2,0,1,2,2,0,1,2,0,0,0,2
5,https://bio.tools/bwa,2,2,1,1,2,0,1,1,2,0,0,1
6,https://hpo.jax.org,2,0,0,0,2,0,0,0,0,0,0,0
7,https://www.ebi.ac.uk/ols4/ontologies/go,2,0,0,0,2,0,0,0,0,0,0,0
8,https://tess.elixir-europe.org/materials/make-...,2,0,1,1,2,2,1,1,2,2,2,1
9,https://moodle.polytechnique.fr/course/index.p...,2,0,0,0,2,0,0,0,0,0,0,0


In [16]:
# Each metric is scored 0 (failure), 1 (weak) or 2 (strong).
MAX_METRIC_SCORE = 2

# Metrics contributing to each per-principle score column.
METRICS_BY_PRINCIPLE = {
    "F_score": ["F1A", "F1B", "F2A", "F2B"],
    "A_score": ["A1.1", "A1.2"],
    "I_score": ["I1", "I2", "I3"],
    "R_score": ["R1.1", "R1.2", "R1.3"],
}


def percent_score(evals, metrics):
    """Return, per row, the summed `metrics` as a percentage of the maximum.

    Parameters
    ----------
    evals : pandas.DataFrame
        Evaluation matrix holding one column per metric.
    metrics : list of str
        Names of the metric columns to aggregate.

    Returns
    -------
    pandas.Series
        Percentage rounded to one decimal. A row with a missing metric yields
        NaN (`skipna=False`) instead of a silently under-estimated score.
    """
    max_total = MAX_METRIC_SCORE * len(metrics)
    return (evals[metrics].sum(axis=1, skipna=False) * 100 / max_total).round(1)


for score_column, principle_metrics in METRICS_BY_PRINCIPLE.items():
    df[score_column] = percent_score(df, principle_metrics)

# The global score weights every metric equally (sum over all 12 metrics); it
# is not the mean of the four per-principle scores, which would weight
# metrics unevenly because the principles have different numbers of metrics.
all_metrics = [m for principle_metrics in METRICS_BY_PRINCIPLE.values() for m in principle_metrics]
df['FAIR_score'] = percent_score(df, all_metrics)

df.to_csv("fairchecker_dekalog_evals.csv")
df

Unnamed: 0,URL,F1A,F1B,F2A,F2B,A1.1,A1.2,I1,I2,I3,R1.1,R1.2,R1.3,F_score,A_score,I_score,R_score,FAIR_score
0,https://dataverse.harvard.edu/dataset.xhtml?pe...,2,2,1,1,2,2,1,1,2,2,2,1,75.0,100.0,66.7,83.3,79.2
1,https://www.data.gouv.fr/en/datasets/donnees-r...,2,0,1,1,2,2,1,1,2,2,2,1,50.0,100.0,66.7,83.3,70.8
2,https://www.kaggle.com/datasets/imdevskp/coron...,2,2,1,1,2,2,1,1,0,2,2,1,75.0,100.0,33.3,83.3,70.8
3,https://data.who.int/dashboards/covid19/data,2,0,1,2,2,0,1,2,0,0,0,2,62.5,50.0,50.0,33.3,50.0
4,https://data.opendatasoft.com/explore/dataset/...,2,0,1,2,2,0,1,2,0,0,0,2,62.5,50.0,50.0,33.3,50.0
5,https://bio.tools/bwa,2,2,1,1,2,0,1,1,2,0,0,1,75.0,50.0,66.7,16.7,54.2
6,https://hpo.jax.org,2,0,0,0,2,0,0,0,0,0,0,0,25.0,50.0,0.0,0.0,16.7
7,https://www.ebi.ac.uk/ols4/ontologies/go,2,0,0,0,2,0,0,0,0,0,0,0,25.0,50.0,0.0,0.0,16.7
8,https://tess.elixir-europe.org/materials/make-...,2,0,1,1,2,2,1,1,2,2,2,1,50.0,100.0,66.7,83.3,70.8
9,https://moodle.polytechnique.fr/course/index.p...,2,0,0,0,2,0,0,0,0,0,0,0,25.0,50.0,0.0,0.0,16.7


# Manual evaluations with F-UJI

In [17]:
# Scores obtained manually with F-UJI, keyed by a short label per resource.
# Presumably listed in the same order as the benchmark URLs -- verify before
# relying on positional alignment.
fuji_evals = {
    "harvard_dataverse": 75,
    "data.gouv.fr": 52,
    "kaggle": 60,
    "who": 27,
    "opendatasoft": 31,
    "bio.tools": 18,
    "HPO": 18,
    "HPO_ols": 18,
    "tess": 39,
    "moodle": 4,
    "pangaea": 91,
    "kaggle_2": 60,
    "rdf_content_neg": 43,
}

## Visualisation

In [18]:
#import seaborn as sns 
#sns.set_theme(style="whitegrid", palette="muted")

# Reload the evaluation matrix from the CSV file written by the scoring cell,
# so that a fresh "Restart & Run All" reads what this notebook just produced.
df = pd.read_csv("fairchecker_dekalog_evals.csv")
df = df[["URL", "FAIR_score"]]
df = df.rename(columns={"FAIR_score": "FAIR-Checker"})

# F-UJI scores are attached by position: the insertion order of `fuji_evals`
# must match the row order of the evaluation matrix.
fuji_scores = list(fuji_evals.values())
df['F-UJI'] = fuji_scores

# Human-readable titles, attached by position as well.
resource_title = ["Dataset (Harvard Dataverse)", 
                  "Dataset (Governmental platform)", 
                  "Dataset (Kaggle)",
                  "Dataset (WHO)",
                  "Dataset (Opendatasoft)",
                  "Bioinformatics tool (bio.tools)",
                  "HPO (website)",
                  "HPO (Ontology Lookup Service)",
                  "Training material (TeSS)",
                  "Online course (Moodle)",
                  "Dataset (PANGAEA)",
                  "Dataset (Kaggle)",
                  "Dataset (RDF metadata)"]

df['Resource'] = resource_title
#df['Mean'] = df.mean(axis=1)
# Restrict the standard deviation to the two score columns: the frame also
# holds text columns (URL, Resource), which pandas 1.x only skipped with a
# warning and pandas 2.x rejects with a TypeError.
df['Std dev'] = df[['F-UJI', 'FAIR-Checker']].std(axis=1)

df = df.reindex(columns=['Resource', 'F-UJI', 'FAIR-Checker', 'Std dev', 'URL'])

# Resources on which both tools agree the most come first.
df_sorted = df.sort_values(by='Std dev', ascending=True)
df_sorted = df_sorted.drop(columns=['URL'])
df_sorted

  df['Std dev'] = df.std(axis=1)


Unnamed: 0,Resource,F-UJI,FAIR-Checker,Std dev
10,Dataset (PANGAEA),91,91.7,0.494975
6,HPO (website),18,16.7,0.919239
7,HPO (Ontology Lookup Service),18,16.7,0.919239
0,Dataset (Harvard Dataverse),75,79.2,2.969848
2,Dataset (Kaggle),60,70.8,7.636753
11,Dataset (Kaggle),60,70.8,7.636753
9,Online course (Moodle),4,16.7,8.980256
1,Dataset (Governmental platform),52,70.8,13.293607
4,Dataset (Opendatasoft),31,50.0,13.435029
3,Dataset (WHO),27,50.0,16.263456


In [19]:
## Table conversion to latex
#latex_table = df_sorted.to_latex(index=False, column_format='lrrrr', float_format="%.2f", escape=False)
#latex_table = latex_table.replace("\\begin{tabular}", "\\begin{tabular}{\\small}")
#print(latex_table)