# goal of notebook
- align authors with their names (worked well without preprocessing, slight improvement with preprocessing)
- BTLF has a comma between Last Name and First Name: {Last Name, First Name}
- BNF/Constellation has no separation -> not possible to differenciate: {First Name (Possible Middle Names) Last Name}
- Steps: 
    - parsing of BTLF and creating keys (without preprocess)
    - parsing of Constellationsand matching with BTLF keys
    - parsing of BNF and matching with BTLF+Constellations Key

# prerequisite
- execute ExtractionGrapheBTLF to have latest version of authors (and regenerate BTLF book file so the authors are coherent)
- change source file to last version of files -> output files will be the final files

# outputs
- new BNF and Constellations files with authors aligned with BTLF authors


In [1]:
from rdflib import Graph, Namespace, Literal, URIRef
from rdflib import RDF, RDFS, OWL, XSD
import sys
sys.path.append('../')

sys.path.append('../andre')
import andre.utils as utils
import collections
SCHEMA = Namespace("http://schema.org/")
PBS = Namespace("http://www.example.org/pbs#")

In [2]:
graphe_Auteurs = Graph()
graphe_Auteurs.bind("schema","http://schema.org/", override=True, replace=True)
graphe_Auteurs.parse("../final_datasets/btlf_books/Graphes/grapheAuteurs_BTLF.ttl")

<Graph identifier=N45c5c824a1834ef98592dadb76a006f4 (<class 'rdflib.graph.Graph'>)>

# auteurs ont forcement un nom de famille ou pseudo

In [3]:
def extract_author_turtle(graph, author):
    last_name = str(graph.value(author, SCHEMA.familyName))
    first_name = str(graph.value(author, SCHEMA.givenName)) if graph.value(author, SCHEMA.givenName) else ""
    description = str(graph.value(author, SCHEMA.description)) if graph.value(author, SCHEMA.description) else ""
    return last_name, first_name, description

btlf_authors = {}

for author_uri in graphe_Auteurs.subjects(RDF.type, utils.schema.Person):
    last_name, first_name, description = extract_author_turtle(graphe_Auteurs, author_uri)
    # btlf_authors[first_name + " " + last_name] = author


    if last_name and first_name:
        raw_author_name = first_name + " " + last_name
        preprocessed_author_name = utils.preprocess_author_name(raw_author_name)
        if preprocessed_author_name in btlf_authors:
            print("dupliqué", preprocessed_author_name, raw_author_name, btlf_authors[preprocessed_author_name], author_uri)
        btlf_authors[preprocessed_author_name] = author_uri
    elif last_name and not first_name:
        raw_author_name = last_name
        preprocessed_author_name = utils.preprocess_author_name(raw_author_name)
        if preprocessed_author_name in btlf_authors:
            print("dupliqué", preprocessed_author_name, raw_author_name, btlf_authors[preprocessed_author_name], author_uri)
        btlf_authors[preprocessed_author_name] = author_uri
    elif not last_name and first_name:
        print("no last name", author_uri)
        raw_author_name = first_name
        preprocessed_author_name = utils.preprocess_author_name(raw_author_name)
        if preprocessed_author_name in btlf_authors:
            print("dupliqué", preprocessed_author_name, raw_author_name, btlf_authors[preprocessed_author_name], author_uri)
        btlf_authors[preprocessed_author_name] = author_uri
    elif not last_name and not first_name:
        print("no first name and no last name", author_uri)
    
    # print(last_name, first_name,  description)

# print(btlf_authors)
len(btlf_authors)

dupliqué matteo berton Mattéo Berton http://schema.org/author10934 http://schema.org/author11904
dupliqué alex nogues Alex Nogues http://schema.org/author12630 http://schema.org/author13591
dupliqué lucile de peslouan Lucile de Peslou¨an http://schema.org/author11506 http://schema.org/author14838
dupliqué sophie bedard Sophie Bédard http://schema.org/author14950 http://schema.org/author15169
dupliqué camilla de la bedoyere Camilla De la Bédoyère http://schema.org/author15325 http://schema.org/author15608
dupliqué eve patenaude Ève Patenaude http://schema.org/author15283 http://schema.org/author16156
dupliqué marsha forchuk skrypuch Marsha Forchuk Skrypuch http://schema.org/author15909 http://schema.org/author16677
dupliqué julia sarda Júlia Sardà http://schema.org/author14215 http://schema.org/author16959
dupliqué matt de la pena Matt de la Peña http://schema.org/author16708 http://schema.org/author17405
dupliqué genevieve darling Genevie`ve Darling http://schema.org/author11507 http:/

18122

In [4]:
def populate_authors_bnf_constellations(source, graph_source, graphe_auteurs):
    if source == "constellations":
        uri_base = "authorConstellations" 
    elif source == "bnf":
        uri_base = "authorBNF"
    else:
        assert(False) # function not designed for other sources

    array_size_counter = collections.defaultdict(lambda: 0)
    array_size_names = collections.defaultdict(lambda: [])
    author_match_counter = 0
    id = 0
    multiple_authors_count = 0
    for book in graph_source.subjects(RDF.type, utils.schema.Book):
        if source == "constellations":
            book_data = utils.extract_data_constellation(graph_source, book)
            if book_data.isbn == "9782740427415":
                print(book_data.book_authors)
        elif source == "bnf":
            book_data = utils.extract_data_bnf(graph_source, book)
        else:
            assert(False)
            
        author_names = book_data.book_authors
        for raw_author_name in author_names:
            preprocessed_author_name = utils.preprocess_author_name(raw_author_name)
            if preprocessed_author_name in btlf_authors.keys():
                if uri_base not in btlf_authors[preprocessed_author_name]: # prevent matching stats with previously added author
                    author_match_counter +=1
                graph_source.remove((book, SCHEMA.author, None)) 
                graph_source.remove((book, PBS.authorURI, None)) 

                graph_source.add((book, SCHEMA.author, btlf_authors[preprocessed_author_name])) 
                graph_source.add((book, PBS.authorString, Literal(raw_author_name))) 
            else:
                print("rate", raw_author_name)
                graph_source.remove((book, PBS.authorURI, None)) 
                graph_source.remove((book, SCHEMA.author, None)) 

                author_uri = SCHEMA[uri_base + str(id)]
                btlf_authors[preprocessed_author_name] = author_uri
                graphe_auteurs.add((author_uri, RDF.type, SCHEMA['Person']))
                graphe_auteurs.add((author_uri, SCHEMA.familyName,Literal(raw_author_name)))
                graph_source.add((book, SCHEMA.author, author_uri)) 
                graph_source.add((book, PBS.authorString, Literal(raw_author_name)))

            id += 1
            # checking if we can deduce first and last name from author name -> no, because some cases with more than 2 words in author name
            # not checking multiple authors because need to be fixed
            if (" et " not in raw_author_name 
                and " and " not in raw_author_name 
                and "d'après"  not in raw_author_name 
                and "d’après" not in raw_author_name 
                and "&" not in raw_author_name 
                and "with" not in raw_author_name
                and " | " not in raw_author_name):
                name_array = raw_author_name.rsplit(" ")
                if len(name_array) == 0:
                    array_size_counter["0"] += 1
                    array_size_names["0"].append(name_array)
                elif len(name_array) == 1:
                    array_size_counter["1"] += 1
                    array_size_names["1"].append(name_array)
                elif len(name_array) == 2:
                    array_size_counter["2"] += 1
                    array_size_names["2"].append(name_array)
                elif len(name_array) == 3:
                    array_size_counter["3"] += 1
                    array_size_names["3"].append(name_array)
                elif len(name_array) == 4:
                    array_size_counter["4"] += 1
                    array_size_names["4"].append(name_array)
                elif len(name_array) == 5:
                    array_size_counter["5"] += 1
                    array_size_names["5"].append(name_array)
                elif len(name_array) == 6:
                    array_size_counter["6"] += 1
                    array_size_names["6"].append(name_array)
                elif len(name_array) == 7:
                    array_size_counter["7"] += 1
                    array_size_names["7"].append(name_array)
                elif len(name_array) == 8:
                    array_size_counter["8"] += 1
                elif len(name_array) == 9:
                    array_size_counter["9"] += 1
            else:
                multiple_authors_count +=1
        
    print(multiple_authors_count, " multiple authors")
    print(author_match_counter, "author alignments")
    print( "word number , count")
    for k, v in sorted(array_size_counter.items(), key=lambda x: x[1], reverse=True):
        print(k,"            ", v)

In [5]:
btlf_authors

{'anthony kemp': rdflib.term.URIRef('http://schema.org/author1'),
 'michel honaker': rdflib.term.URIRef('http://schema.org/author10'),
 'dominique piat': rdflib.term.URIRef('http://schema.org/author100'),
 'jules supervielle': rdflib.term.URIRef('http://schema.org/author1000'),
 'john tiffany': rdflib.term.URIRef('http://schema.org/author10000'),
 'guilhem flouzat': rdflib.term.URIRef('http://schema.org/author10001'),
 'cecile terouanne': rdflib.term.URIRef('http://schema.org/author10002'),
 'rachel williams': rdflib.term.URIRef('http://schema.org/author10003'),
 'lucy letherland': rdflib.term.URIRef('http://schema.org/author10004'),
 'jeanclaude dusaussoy': rdflib.term.URIRef('http://schema.org/author10005'),
 'yolande moreau': rdflib.term.URIRef('http://schema.org/author10006'),
 'katherine j willis': rdflib.term.URIRef('http://schema.org/author10007'),
 'jeanchristophe fournier': rdflib.term.URIRef('http://schema.org/author10008'),
 'aurex verdon': rdflib.term.URIRef('http://schema.

# constellations

In [6]:
g = Graph()
g.bind("schema","http://schema.org/", override=True, replace=True)
g.bind("pbs","http://www.example.org/pbs#", override=True, replace=True)

g.parse("../final_datasets/constellations_cleanISBN13.ttl", format="turtle")
len(list(g.subjects(RDF.type, utils.schema.Book)))

11267

In [7]:
list(g.subjects(RDF.type, utils.schema.Book))[0]

rdflib.term.URIRef('http://schema.org/Book000aaa0d-d6f6-4c18-8c65-b4f972aae3d8')

In [8]:
populate_authors_bnf_constellations(source="constellations", graph_source=g, graphe_auteurs=graphe_Auteurs)

rate Julie Delporte
rate Deborah Hodge
rate Lionel Groulx
rate Natacha Scheidhauer
rate Evan H. Rhodes
rate Sarah Tsiang
rate Randall de Sève
rate Virginie Jouannet Roussel
rate Collectif
rate Jez Alborough
rate Céline et al. Bathias Rascalou
rate Helen Lester
rate PatrickGeorge
rate Vikki Vansickle
rate Jörg Steiner
rate Philippe de La Cotardière
rate Sarah Elton
rate David McPhail
rate Robie H. Harris
rate Wladyslaw Szpilman
rate Agnès de Lestrade
rate Rukhsana Khan
rate Muriel Mingau
rate Liane Shaw
rate Clive A. Lawton
rate Nick Dowson
rate del Pup
rate Edgar Poe
rate Prosper Gautier
rate Fred Bernard
rate Élisabeth de Lambilly
rate Kevin Major
rate Jean de La Fontaine
rate Jennifer Armstrong
rate John Wookward
rate Julie Brinckloe
rate Rachel Qitsualik
rate Chris Butterworth
rate Robert Munsch
rate Takabatake Jun
rate Rona Arato
rate David Alazraki
rate Esmé Shapiro
rate Martine Podesto
rate Corinne King
rate Nimrod
rate John Choi
rate James Gladstone
rate Geraldine McCaughrean
ra

In [9]:
with open("../final_datasets/btlf_books/Graphes/grapheAuteurs_BTLF_Constellations.ttl","wb") as fichier:
    graphe_Auteurs.serialize(fichier) 

with open("../final_datasets/constellations_auteursURI.ttl","wb") as fichier:
    g.serialize(fichier) 

# bnf

In [10]:
g = Graph()
g.bind("schema","http://schema.org/", override=True, replace=True)
g.bind("pbs","http://www.example.org/pbs#", override=True, replace=True)

g.parse("../final_datasets/bnf_cleanISBN13.ttl", format="turtle")
len(list(g.subjects(RDF.type, utils.schema.Book)))

Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x000001D541DA8360>
Traceback (most recent call last):
  File "C:\Users\dre\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\rdflib\term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
           ^^^^^^^^^^^^^^^^^^
  File "C:\Users\dre\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\isodate\isodates.py", line 203, in parse_date
    raise ISO8601Error('Unrecognised ISO 8601 date format: %r' % datestring)
isodate.isoerror.ISO8601Error: Unrecognised ISO 8601 date format: 'Port- d-es'
Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x000001D541DA8360>
Traceback (most recent call last)

30384

In [11]:
populate_authors_bnf_constellations(source="bnf", graph_source=g, graphe_auteurs=graphe_Auteurs)

rate Jean-Claude Brisville
rate Association Calladium
rate Natalie Savage Carlson
rate Bernard Stone
rate Heinrich Hoffmann
rate Idries Shah
rate Lucien Voy
rate Kay Fender
rate Pierre Pluchon
rate Shigeo Watanabe
rate J. M. G. Le Clézio
rate Ian Serraillier
rate Raymond Sinamal
rate Isaac Leib Peretz
rate Heinz Edelmann
rate Marie-Thérèse Rouil
rate Kay Thompson
rate Tan Koide
rate Yasuko Koide
rate Claude Auclair
rate François Migeat
rate Bette Greene
rate Guillermo Mordillo
rate Toshi Yoshida
rate Geneviève Duroselle
rate Claude Hudelot
rate Jeannine Baticle
rate Jean-Clément Martin
rate Jacques Delval
rate Jean Meyer
rate Paul Maar
rate Charlotte Pomerantz
rate Anne Holm
rate Jean-Marie Petit
rate Françoise Lebrun
rate Ernestine Gilbreth
rate Frank Bunker Gilbreth
rate Randall Jarrell
rate Janni Howker
rate Gérard Pussey
rate Henri Rey
rate Janine Mossuz-Lavau
rate Paul Hitaisse
rate Sterling North
rate Ann-Madeleine Gelotte
rate Alain Broutin
rate Philippe Jacquin
rate Beverly Cle

In [12]:
with open("../final_datasets/authors_BTLF_Constellations_BNF.ttl","wb") as fichier:
    graphe_Auteurs.serialize(fichier) 

with open("../final_datasets/bnf_auteursURI.ttl","wb") as fichier:
    g.serialize(fichier) 

: 