Create Ontology

In [1]:
from owlready2 import *



In [2]:
from  common.cleaner import clean_char

def _escape_value(text: str) -> str:
    """Escape the illegal characters for an ontology property"""
    if text is None:
        return None
    # function to escape XML character data
    text = escape(text)
    text = text.replace('\n', '')
    text = text.replace('\r', '')
    text = text.replace('\f', '')
    text = text.replace('\b', '')
    text = text.replace('"', '')
    text = text.replace('[', '')
    text = text.replace(']', '')
    text = text.replace('{', '')
    text = text.replace('}', '')
    text = text.replace('#', '')
    text = text.replace('|', '')
    text = clean_char(text)
    return text

def _escape_iri(text: str) -> str:
    """For IRI, we replace space character by _"""
    if text is None:
        return None
    text = _escape_value(text)
    text = text.replace(' ', '_')
    text = text.replace('.', '_')
    return text


In [3]:
onto = get_ontology("http://authors_relations.org/onto.owl")

In [4]:
# Create classes

with onto :
    class Author(Thing):
        pass
    class ConnectedAuthor(Author):
        pass
    class InterestingAuthor(ConnectedAuthor):
        pass

    class Article(Thing):
        # name of an article is the pdf_id in arxiv
        pass


In [5]:
list(onto.classes())

[onto.Author, onto.ConnectedAuthor, onto.InterestingAuthor, onto.Article]

In [6]:
print(Author.subclasses())
print(ConnectedAuthor.ancestors())

<generator object EntityClass.subclasses at 0x00000235C9CF89E0>
{owl.Thing, onto.ConnectedAuthor, onto.Author}


In [7]:
onto

get_ontology("http://authors_relations.org/onto.owl#")

In [8]:
# Save the ontology
onto.save(file = "authors_relations.owl")

In [9]:
# Create new class properties
with onto:
    class wrote_Article(Author >> Article):
        pass
    class quoted_Auteur(Author >> ConnectedAuthor):
        pass
    class quoted_by(ConnectedAuthor >> Author):
        inverse_property = quoted_Auteur
    class wrote_by(Article >> Author):
        inverse_property = wrote_Article
    class quoted_in(ConnectedAuthor >> Article):
        pass
    

Import data

In [10]:
# Import the Data to put in ontology
# importing the module
import json
 
# Opening JSON file with references
with open('pdf_references.json') as json_file:
    prediction = json.load(json_file)
prediction = prediction['References']

# Opening JSON file with metadata
with open('pdf_metadata.json') as json_file:
    metadata = json.load(json_file)
metadata = metadata['Papers']

print("References : "+str(len(prediction)))
print("Authors : "+str(len(metadata)))

References : 217
Authors : 216


Nettoyage de données

In [11]:
 for reference in prediction:

        for reference_line in reference:

            for author_reference in reference_line['list_authors']:
                if '|' in author_reference:
                    print(author_reference)
                

|T
≤ |A(s)| ·'
µh(ah|xh
|Ch(xh−2
µh(·|xh)||µt h(·|xh
µh(·|xh)||µt h(·|xh
µh(·|xh)||µt h(·|xh
||µt h′
||µt h′
||µt h′
||µt h′
µh(·|xh)||µt h(·|xh
∈ ∆(S1)}∪{ph(·|sh
∈ ∆(S1)}∪{ph(·|sh


In [33]:
# Créer les auteurs et les articles d'abord

for article in metadata[:50]:
    pdf_id = article['id']
    article_name = pdf_id
    article_name = Article(_escape_iri(pdf_id))
    for author_qui_cite in article['authors']:   
        # identifier les noms
        author_qui_cite_name = _escape_value(author_qui_cite)
        
        # identifier les classes
        author_qui_cite_name = Author(_escape_iri(author_qui_cite))
        # identifier les relations
        
        author_qui_cite_name.wrote_Article=[article_name]
        # print(str(author_qui_cite_name) + ' a écrit l\'article ' + str(article_name))

In [38]:
# References

for reference in prediction:
            for reference_line in reference:
                ref_id = _escape_iri(reference_line['id'])

                for inst in onto.Article.instances():
                    pdf_id = str(inst).split('.')[-1]


                    if pdf_id == ref_id:
                        for author_reference in reference_line['list_authors']:
 
                            # identifier les noms
                            author_reference_name = _escape_value(author_reference)
                            article_name = inst

                            # identifier les classes
                            try :
                                author_reference_name = ConnectedAuthor(_escape_iri(author_reference))
                                
                            except:
                                pass
                            author_reference_name.quoted_in=[article_name]

In [39]:
count = 0
for rel in onto.quoted_in.get_relations():
    count +=1
count


4613

In [40]:
sync_reasoner()

* Owlready2 * Running HermiT...
    java -Xmx2000M -cp c:\Users\Maria\Documents\00-projets\fil-rouge-master\fil-rouge-master\.venv\lib\site-packages\owlready2\hermit;c:\Users\Maria\Documents\00-projets\fil-rouge-master\fil-rouge-master\.venv\lib\site-packages\owlready2\hermit\HermiT.jar org.semanticweb.HermiT.cli.CommandLine -c -O -D -I file:///C:/Users/Maria/AppData/Local/Temp/tmp1zl_ymno
* Owlready2 * HermiT took 1.0788357257843018 seconds
* Owlready * (NB: only changes on entities loaded in Python are shown, other changes are done but not listed)


In [41]:
onto.save(file = "authors_relations.owl")

In [None]:
# TODO
# qui a été cité combien de fois (ajouter deux règle)
# qui a ecrit combien d'articles
# Règle 1 : Connected Author qui est cité plus de 50 fois est une InterestingAuthor




In [21]:
onto.Siyu_Tang

onto.Siyu_Tang

In [32]:
onto.save(file = "authors_relations.owl")

In [31]:
# Case permet de réinitialiser les instances enregistrées
for individual in Author.instances(): 
    destroy_entity(individual)

for individual in ConnectedAuthor.instances(): 
    destroy_entity(individual)

for individual in Article.instances(): 
    destroy_entity(individual)