Create Ontology

In [14]:
from owlready2 import *

In [15]:
from  common.cleaner import clean_char

def _escape_value(text: str) -> str:
    """Escape the illegal characters for an ontology property"""
    if text is None:
        return None
    # function to escape XML character data
    text = escape(text)
    text = text.replace('\n', '')
    text = text.replace('\r', '')
    text = text.replace('\f', '')
    text = text.replace('\b', '')
    text = text.replace('"', '')
    text = text.replace('[', '')
    text = text.replace(']', '')
    text = text.replace('{', '')
    text = text.replace('}', '')
    text = text.replace('#', '')
    text = text.replace('|', '')
    text = clean_char(text)
    return text

def _escape_iri(text: str) -> str:
    """For IRI, we replace space character by _"""
    if text is None:
        return None
    text = _escape_value(text)
    text = text.replace(' ', '_')
    text = text.replace('.', '_')
    return text


In [16]:
onto = get_ontology("http://authors_relations.org/onto.owl")

In [17]:
# Create classes

with onto :
    class Author(Thing):
        pass
    class ConnectedAuthor(Author):
        pass

    class Article(Thing):
        # name of an article is the pdf_id in arxiv
        pass


In [20]:
# Data property

with onto :
    class has_number_citations(DataProperty):
        range = [int]

In [21]:
list(onto.classes())

[onto.Author, onto.ConnectedAuthor, onto.InterestingAuthor, onto.Article]

In [22]:
print(Author.subclasses())
print(ConnectedAuthor.ancestors())

<generator object EntityClass.subclasses at 0x000001FF7F507AE0>
{onto.Author, owl.Thing, onto.ConnectedAuthor}


In [24]:
# Save the ontology
onto.save(file = "authors_relations.owl")

In [25]:
# Create new properties to create relations
with onto:
    class wrote_Article(ObjectProperty):
        pass

    class quoted_Auteur(ObjectProperty):
        domain = [Author]
        range = [ConnectedAuthor, Author]

    class quoted_by(ConnectedAuthor >> Author):
        inverse_property = quoted_Auteur

    class wrote_by(Article >> Author):
        inverse_property = wrote_Article

    class quoted_in(ObjectProperty):
        domain = [ConnectedAuthor, Author]
        range = [Article]

        
    

Import data

In [26]:
# Import the Data to put in ontology
# importing the module
import json
 
# Opening JSON file with references
with open('pdf_references.json') as json_file:
    prediction = json.load(json_file)
prediction = prediction['References']

# Opening JSON file with metadata
with open('pdf_metadata.json') as json_file:
    metadata = json.load(json_file)
metadata = metadata['Papers']

print("References : "+str(len(prediction)))
print("Authors : "+str(len(metadata)))

References : 217
Authors : 216


Créer les instances

In [72]:
# Créer les auteurs et les articles d'abord

for article in metadata:
    pdf_id = article['id']
    article_name = pdf_id
    article_name = Article(_escape_iri(pdf_id))
    for author_qui_cite in article['authors']:   
        # identifier les noms
        author_qui_cite_name = _escape_value(author_qui_cite)
        
        # identifier les classes
        author_qui_cite_name = Author(_escape_iri(author_qui_cite))
        # identifier les relations
        
        author_qui_cite_name.wrote_Article=[article_name]
        # print(str(author_qui_cite_name) + ' a écrit l\'article ' + str(article_name))

In [73]:
# References

for reference in prediction:
            for reference_line in reference:
                ref_id = _escape_iri(reference_line['id'])

                for instance_article in onto.Article.instances():
                    pdf_id = str(instance_article).split('.')[-1]

                    
                    if pdf_id == ref_id:
                        for author_reference in reference_line['list_authors']:
 
                            # identifier les noms
                            author_reference_name = _escape_value(author_reference)

                            if onto.search(iri = "*{}".format(_escape_iri(author_reference))) != []:
                                author_reference_name = onto.search(iri = "*{}".format(_escape_iri(author_reference)))[0]
                                author_reference_name.quoted_in.append(instance_article)
                            else :
                                author_reference_name = ConnectedAuthor(_escape_iri(author_reference))
                                author_reference_name.quoted_in = [instance_article]



In [74]:
author_reference = "J__Comput"
interesting_author = onto.search(iri = "*{}".format(_escape_iri(author_reference)))[0]
interesting_author.quoted_in

[onto.2203_02700v1, onto.2203_01205v1, onto.2203_00938v1, onto.2203_00872v1]

In [84]:
sync_reasoner()

* Owlready2 * Running HermiT...
    java -Xmx2000M -cp c:\Users\Maria\Documents\00-projets\fil-rouge-master\fil-rouge-master\.venv\lib\site-packages\owlready2\hermit;c:\Users\Maria\Documents\00-projets\fil-rouge-master\fil-rouge-master\.venv\lib\site-packages\owlready2\hermit\HermiT.jar org.semanticweb.HermiT.cli.CommandLine -c -O -D -I file:///C:/Users/Maria/AppData/Local/Temp/tmpmta45dli
* Owlready2 * HermiT took 2.669063091278076 seconds
* Owlready * (NB: only changes on entities loaded in Python are shown, other changes are done but not listed)


In [86]:
onto.save(file = "authors_relations.owl")

Créer les règles

In [77]:
# Première règle : si un auteur a écrit un article et qu’un auteur cité est cité dans cet article, alors l’auteur cite l’auteur cité
with onto :
    rule = Imp()
    rule.set_as_rule("""wrote_Article(?r,?a), quoted_in(?c, ?r) -> quoted_Auteur(?a, ?c) """)

In [80]:
# Calcul nombre d'auteurs qui cites

for MonAuteur in Author.instances():
    nombre_de_citations = 0

    for citation in MonAuteur.quoted_in:

        nombre_de_citations +=1

    MonAuteur.has_number_citations=[nombre_de_citations]


In [81]:
# Calcul nombre d'auteurs qui cites

for Connected in ConnectedAuthor.instances():
    nombre_de_citations = 0

    for citation in Connected.quoted_in:

        nombre_de_citations +=1

    Connected.has_number_citations=[nombre_de_citations]


        
    

In [89]:
# un auteur qui a une relation quoted_in est un ConnectedAuthor

with onto :
    rule = Imp()
    rule.set_as_rule("""Author(?a) , quoted_in(?a, ?r) -> ConnectedAuthor(?a)""")

In [92]:
# Règle 2 : Connected Author qui est cité plus de 50 fois 
# (comptage de relations quoted_in pour ConnectedAuthor) 
# est un InterestingAuthor
with onto :
    rule = Imp()
    rule.set_as_rule("""ConnectedAuthor(?c) , has_number_citations(?c, ?x) , greaterThan(?x, 5) -> InterestingAuthor(?c)""")



In [93]:
onto.save(file = "authors_relations.owl")

Réinitialisation de l'ontologie

In [70]:
# Case permet de réinitialiser les instances enregistrées
for individual in Author.instances(): 
    destroy_entity(individual)

for individual in ConnectedAuthor.instances(): 
    destroy_entity(individual)

for individual in Article.instances(): 
    destroy_entity(individual)