# goal of notebook
- align authors with their names (worked well without preprocessing, slight improvement with preprocessing)
- BTLF has a comma between Last Name and First Name: {Last Name, First Name}
- BNF/Constellations have no separator between name parts -> not possible to differentiate first name from last name: {First Name (Possible Middle Names) Last Name}
- Steps: 
    - parsing of BTLF and creating keys (without preprocess)
    - parsing of Constellations and matching with BTLF keys
    - parsing of BNF and matching with BTLF+Constellations Key

# prerequisite
- execute ExtractionGrapheBTLF to have latest version of authors (and regenerate BTLF book file so the authors are coherent)
- change source file to last version of files -> output files will be the final files

# outputs
- new BNF and Constellations files with authors aligned with BTLF authors


In [1]:
from rdflib import Graph, Namespace, Literal, URIRef
from rdflib import RDF, RDFS, OWL, XSD
import sys
sys.path.append('../../')

sys.path.append('../../andre')
import andre.utils as utils
import collections
SCHEMA = Namespace("http://schema.org/")
PBS = Namespace("http://www.example.org/pbs#")

In [2]:
# Load the BTLF author graph; every later cell looks authors up in (and adds
# new authors to) this graph.
graphe_Auteurs = Graph()
graphe_Auteurs.bind(
    prefix="schema",
    namespace="http://schema.org/",
    override=True,
    replace=True,
)
graphe_Auteurs.parse(source="../final_datasets/authors_graph_btlf.ttl")

<Graph identifier=Nc4f6412e314747f98131b2a0c0fba35a (<class 'rdflib.graph.Graph'>)>

# authors always have a last name or a pseudonym

In [3]:
def extract_author_turtle(graph, author):
    """Return ``(last_name, first_name, description)`` of ``author`` as strings.

    Each field is read from ``graph`` through the schema.org ``familyName``,
    ``givenName`` and ``description`` predicates.  A field whose value is
    missing (or otherwise falsy) is returned as the empty string.
    """

    def _text_of(predicate):
        # Look the value up once, then fall back to "" when nothing usable is found.
        found = graph.value(author, predicate)
        if found:
            return str(found)
        return ""

    return (
        _text_of(SCHEMA.familyName),
        _text_of(SCHEMA.givenName),
        _text_of(SCHEMA.description),
    )

# Lookup table: preprocessed author name -> author URI in the BTLF graph.
# NOTE: two authors whose preprocessed names collide share one key; the one
# visited last overwrites the earlier entry.
authors_dict = {}

for author_uri in graphe_Auteurs.subjects(RDF.type, utils.schema.Person):
    last_name, first_name, description = extract_author_turtle(graphe_Auteurs, author_uri)

    # Nothing to build a key from: report the author and move on.
    if not last_name and not first_name:
        print("no first name and no last name", author_uri)
        continue

    # A first name without a last name is unexpected but still usable as a key.
    if not last_name:
        print("no last name", author_uri)

    # "First Last" when both parts exist, otherwise whichever part is present.
    raw_author_name = " ".join(part for part in (first_name, last_name) if part)
    preprocessed_author_name = utils.preprocess_author_name(raw_author_name)
    authors_dict[preprocessed_author_name] = author_uri

len(authors_dict)

18177

In [4]:
import Levenshtein

def populate_authors_in_graph(source, graph_source, author_graph, btlf_only=False):
    """Replace the author strings of the books in ``graph_source`` by author URIs.

    Every author name of every book is normalised with
    ``utils.preprocess_author_name`` and looked up in the module-level
    ``authors_dict``.  A known name links the book to the existing URI.  An
    unknown name gets a new ``schema:Person`` in ``author_graph`` (the whole
    raw name is stored as ``schema:familyName``) and is registered in
    ``authors_dict`` so that later occurrences reuse the same URI.

    Side effects: mutates ``graph_source``, ``author_graph`` and the global
    ``authors_dict``; prints every unmatched name and summary statistics.
    The numbering of newly created author URIs restarts at 0 on every call.

    Args:
        source: ``"constellations"``, ``"bnf"`` or ``"babelio"``; selects the
            extraction helper from ``utils`` and the prefix of new author URIs.
        graph_source: rdflib graph holding the books to rewrite.
        author_graph: rdflib graph that receives newly created authors.
        btlf_only: when true, books whose extracted data has no ISBN are
            skipped (they are left untouched and not counted).

    Returns:
        A ``defaultdict(list)`` mapping the number of space-separated words of
        a raw author name (as a string, e.g. ``"2"``) to the list of split
        names having that many words.

    Raises:
        ValueError: if ``source`` is not one of the supported dataset names.
    """
    uri_base = {
        "constellations": "authorConstellations",
        "bnf": "authorBNF",
        "babelio": "authorBabelio"
    }

    # Resolve the extraction helper once and reject unknown sources with a
    # real exception (an ``assert`` would be stripped under ``python -O``).
    if source == "constellations":
        extract_book_data = utils.extract_data_constellations
    elif source == "bnf":
        extract_book_data = utils.extract_data_bnf
    elif source == "babelio":
        extract_book_data = utils.extract_data_babelio
    else:
        raise ValueError(
            f"unknown source {source!r}; expected one of {sorted(uri_base)}"
        )
    source_prefix = uri_base[source]

    array_size_counter = collections.defaultdict(int)
    array_size_names = collections.defaultdict(list)
    author_match_counter = 0
    next_author_id = 0
    multiple_authors_count = 0
    book_counter = 0
    levenstein_matches = {}
    multiple_author_markers = (" et ", " and ", "d'après", "d’après", "&", "with", " | ")

    # Snapshot the books first: the loop below adds and removes triples in
    # graph_source, so it must not run on a live iterator over that graph.
    books = list(graph_source.subjects(RDF.type, utils.schema.Book))

    for book in books:
        book_data = extract_book_data(graph_source, book)

        if btlf_only and not book_data.isbn:  # books without valid isbn are sure not to be in BTLF
            continue

        author_names = book_data.book_authors
        # Remove the old author field; nothing is lost, the names were extracted above.
        graph_source.remove((book, SCHEMA.author, None))
        for raw_author_name in author_names:
            preprocessed_author_name = utils.preprocess_author_name(raw_author_name)
            graph_source.remove((book, PBS.authorURI, None))

            if preprocessed_author_name in authors_dict:
                author_uri = authors_dict[preprocessed_author_name]
                # Do not count matches against authors this same source added earlier.
                if source_prefix not in author_uri:
                    author_match_counter += 1
            else:
                # Diagnostic only: remember the first existing key within an edit
                # distance of 2.  The near match is reported, not used for linking.
                for key in authors_dict:
                    if Levenshtein.distance(preprocessed_author_name, key) <= 2:
                        levenstein_matches[preprocessed_author_name] = key
                        break
                print("raté", raw_author_name)

                # Create a new author and register it so later books reuse it.
                author_uri = SCHEMA[source_prefix + str(next_author_id)]
                authors_dict[preprocessed_author_name] = author_uri
                author_graph.add((author_uri, RDF.type, SCHEMA['Person']))
                author_graph.add((author_uri, SCHEMA.familyName, Literal(raw_author_name)))
                next_author_id += 1

            graph_source.add((book, SCHEMA.author, author_uri))
            graph_source.add((book, PBS.authorString, Literal(raw_author_name)))

            # Statistics on how many words raw author names contain.
            name_array = raw_author_name.split(" ")
            size_key = str(len(name_array))
            array_size_counter[size_key] += 1
            array_size_names[size_key].append(name_array)

            # Statistics on strings that probably hold several authors at once.
            if any(marker in raw_author_name for marker in multiple_author_markers):
                multiple_authors_count += 1

        book_counter += 1

    print("matched with Levenstein", levenstein_matches)
    print("word number , count")
    for k, v in sorted(array_size_counter.items(), key=lambda x: int(x[0]), reverse=True):
        print(k,"            ", v)
    print("processed book number", book_counter)
    print(" multiple authors", multiple_authors_count)
    print("author alignments", author_match_counter)
    return array_size_names

In [7]:
# Display the preprocessed-name -> author-URI lookup table.
authors_dict

{'anthony kemp': rdflib.term.URIRef('http://schema.org/author1'),
 'michel honaker': rdflib.term.URIRef('http://schema.org/author10'),
 'olivier desvaux': rdflib.term.URIRef('http://schema.org/author1000'),
 'laure fissore': rdflib.term.URIRef('http://schema.org/author10001'),
 'sophie van der linden': rdflib.term.URIRef('http://schema.org/author10002'),
 'kate messner': rdflib.term.URIRef('http://schema.org/author10004'),
 'christopher silas neal': rdflib.term.URIRef('http://schema.org/author10005'),
 'damien marie': rdflib.term.URIRef('http://schema.org/author10008'),
 'loic malnati': rdflib.term.URIRef('http://schema.org/author10009'),
 'mary higgins clark': rdflib.term.URIRef('http://schema.org/author1001'),
 'floriane herrero': rdflib.term.URIRef('http://schema.org/author10013'),
 'nonoya masaki': rdflib.term.URIRef('http://schema.org/author10018'),
 'kei tsuchiya': rdflib.term.URIRef('http://schema.org/author10019'),
 'wendell minor': rdflib.term.URIRef('http://schema.org/author1

# constellations

In [5]:
# Load the Constellations book graph into ``g`` and show how many books it holds.
g = Graph()
g.bind(
    prefix="schema",
    namespace="http://schema.org/",
    override=True,
    replace=True,
)
g.bind(
    prefix="pbs",
    namespace="http://www.example.org/pbs#",
    override=True,
    replace=True,
)

g.parse(
    source="../../final_datasets/intermediary_files/constellations_NoDuplicatePublisher.ttl",
    format="turtle",
)
sum(1 for _ in g.subjects(RDF.type, utils.schema.Book))

11267

In [None]:
# Show the first book URI yielded by the graph.
list(g.subjects(RDF.type, utils.schema.Book))[0]

rdflib.term.URIRef('http://schema.org/Book000aaa0d-d6f6-4c18-8c65-b4f972aae3d8')

In [6]:
# Link Constellations books to author URIs; names missing from authors_dict
# are added to graphe_Auteurs as new authors.
populate_authors_in_graph(source="constellations", graph_source=g, author_graph=graphe_Auteurs)

raté Julie Delporte
raté Deborah Hodge
raté Lionel Groulx
raté Natacha Scheidhauer
raté Evan H. Rhodes
raté Sarah Tsiang
raté Randall de Sève
raté Virginie Jouannet Roussel
raté Collectif
raté Jez Alborough
raté Céline et al. Bathias Rascalou
raté Helen Lester
raté PatrickGeorge
raté Vikki Vansickle
raté Jörg Steiner
raté Philippe de La Cotardière
raté Sarah Elton
raté David McPhail
raté Robie H. Harris
raté Wladyslaw Szpilman
raté Agnès de Lestrade
raté Rukhsana Khan
raté Muriel Mingau
raté Liane Shaw
raté Clive A. Lawton
raté Nick Dowson
raté del Pup
raté Edgar Poe
raté Prosper Gautier
raté Fred Bernard
raté Élisabeth de Lambilly
raté Kevin Major
raté Jean de La Fontaine
raté Jennifer Armstrong
raté John Wookward
raté Julie Brinckloe
raté Rachel Qitsualik
raté Chris Butterworth
raté Robert Munsch
raté Takabatake Jun
raté Rona Arato
raté David Alazraki
raté Esmé Shapiro
raté Martine Podesto
raté Corinne King
raté Nimrod
raté John Choi
raté James Gladstone
raté Geraldine McCaughrean
ra

defaultdict(list,
            {'2': [['Torben', 'Kuhlmann'],
              ['Roselyne', 'Bertin'],
              ['Patrick', 'Banon'],
              ['Julie', 'Delporte'],
              ['Jean', 'Sioui'],
              ['Sandra', 'Laboucarie'],
              ['Robert', 'Soulières'],
              ['Deborah', 'Hodge'],
              ['Lionel', 'Groulx'],
              ['David', 'McKee'],
              ['Marine', 'Degli'],
              ['Olivier', 'Morel'],
              ['Jean-Michel', 'Lienhardt'],
              ['Jeanne', 'Boyer'],
              ['Natacha', 'Scheidhauer'],
              ['Nicolas', 'Debon'],
              ['Jacques', 'Pasquet'],
              ['Alan', 'Mets'],
              ['Fiona', 'Watt'],
              ['Amanda', 'McCardie'],
              ['Marie-Thérèse', 'Davidson'],
              ['Gabrielle', 'Vincent'],
              ['Sarah', 'Tsiang'],
              ['Cécile', 'Denis'],
              ['Ted', 'Lewin'],
              ['Olga', 'Lecaye'],
              ['Juli

In [None]:
# with open("../final_datasets/btlf_books/Graphes/grapheAuteurs_BTLF_Constellations.ttl","wb") as fichier:
#     graphe_Auteurs.serialize(fichier) 

# with open("../final_datasets/constellations_auteursURI.ttl","wb") as fichier:
#     g.serialize(fichier) 

# bnf

In [None]:
# Load the BNF book graph into ``g`` (replacing the previous graph) and show
# how many books it holds.
g = Graph()
g.bind(
    prefix="schema",
    namespace="http://schema.org/",
    override=True,
    replace=True,
)
g.bind(
    prefix="pbs",
    namespace="http://www.example.org/pbs#",
    override=True,
    replace=True,
)

g.parse(
    source="../../final_datasets/intermediary_files/bnf.ttl",
    format="turtle",
)
sum(1 for _ in g.subjects(RDF.type, utils.schema.Book))

Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x0000016E76D6C900>
Traceback (most recent call last):
  File "C:\Users\dre\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\rdflib\term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
           ^^^^^^^^^^^^^^^^^^
  File "C:\Users\dre\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\isodate\isodates.py", line 203, in parse_date
    raise ISO8601Error('Unrecognised ISO 8601 date format: %r' % datestring)
isodate.isoerror.ISO8601Error: Unrecognised ISO 8601 date format: 'Port- d-es'
Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x0000016E76D6C900>
Traceback (most recent call last)

30384

In [None]:
# Link BNF books to author URIs; names missing from authors_dict are added
# to graphe_Auteurs as new authors.
populate_authors_in_graph(source="bnf", graph_source=g, author_graph=graphe_Auteurs)

rate Jean-Claude Brisville
rate Association Calladium
rate Natalie Savage Carlson
rate Bernard Stone
rate Heinrich Hoffmann
rate Idries Shah
rate Lucien Voy
rate Kay Fender
rate Pierre Pluchon
rate Shigeo Watanabe
rate Robert Kraus
rate J. M. G. Le Clézio
rate Ian Serraillier
rate Raymond Sinamal
rate Isaac Leib Peretz
rate Heinz Edelmann
rate Marie-Thérèse Rouil
rate Kay Thompson
rate Tan Koide
rate Yasuko Koide
rate Susan Pearson
rate Ann Jonas
rate Claude Auclair
rate François Migeat
rate Bette Greene
rate Guillermo Mordillo
rate Aliki
rate Toshi Yoshida
rate Geneviève Duroselle
rate Claude Hudelot
rate Jeannine Baticle
rate Jean-Clément Martin
rate Jacques Delval
rate Jean Meyer
rate Paul Maar
rate Charlotte Pomerantz
rate Sue Alexander
rate Anne Holm
rate Jean-Marie Petit
rate Françoise Lebrun
rate Trinka Hakes Noble
rate Ernestine Gilbreth
rate Frank Bunker Gilbreth
rate Randall Jarrell
rate Janni Howker
rate Jean Craighead George
rate Gérard Pussey
rate Henri Rey
rate Janine Mos

In [None]:
# with open("../final_datasets/authors_BTLF_Constellations_BNF.ttl","wb") as fichier:
#     graphe_Auteurs.serialize(fichier) 

# Write the BNF book graph ``g`` to disk.
with open("../final_datasets/bnf_auteursURI.ttl", "wb") as output_file:
    g.serialize(destination=output_file)

# babelio

In [None]:
# Load the Babelio book graph into ``g`` (replacing the previous graph) and
# show how many books it holds.
g = Graph()
g.bind(
    prefix="schema",
    namespace="http://schema.org/",
    override=True,
    replace=True,
)
g.bind(
    prefix="pbs",
    namespace="http://www.example.org/pbs#",
    override=True,
    replace=True,
)

g.parse(
    source="../final_datasets/babelio.ttl",
    format="turtle",
)
sum(1 for _ in g.subjects(RDF.type, utils.schema.Book))

4169

In [None]:
# Count the subjects typed utils.schema.Book in the currently loaded graph.
len(list(g.subjects(RDF.type, utils.schema.Book)))

4169

In [None]:
# Link Babelio books to author URIs. btlf_only=True skips books whose
# extracted data has no ISBN.
populate_authors_in_graph(source="babelio", graph_source=g, author_graph=graphe_Auteurs, btlf_only=True)

rate Kirihara Izumi
rate Miaki Sugaru
rate Taguchi Shuichi
rate Nakatani Nio
rate Gondaira Hitsuji
rate Nanao Itsuki
rate Endo Tatsuya
rate Kingyobachi Deme
rate Bingo Morihashi
rate Jimpachi Miri
rate Kouga Yun
rate Norie Yamada
rate Harold Sakuishi
rate Ubi Umeda
rate Hino Matsuri
rate Monique Bosco
rate Simon Roy
rate Jordan Tannahill
rate Delphine Perre
rate C. Helft
rate T.S. Easton
rate Myriam Vincent
rate Dai Sijie
rate Aimée de Jongh
rate Chantal Groléziat
rate F. Mauve
rate M. Leydier
rate Joanne K. Rowling
rate Fabien Cloutier
rate Edward van de Vendel
rate Margot Bruyère
rate Sébastien Gagnon
rate Margaret MacMillan
rate Joshua Khan
rate Alain Gagno
rate Dedieu
rate Fred Bernard
rate Jean Cômes Noguès
rate Anne Jonaz
rate Marc Voltenauer
rate Matthieu Radenac
rate Dana Wulfekotte
rate Agnès de Lestrade
rate Emilie Angebault
rate Mélanie Edwards
rate Célina Guiné
rate Sève Laurent-Fajal
rate Véronique Olivier-Barberon
rate Svjetlan Junaković
rate Illya Green
rate Dorothy-Shoe

In [None]:
# Write the author graph to disk.
with open("../final_datasets/authors_BTLF_Constellations_BNF_Babelio.ttl", "wb") as authors_file:
    graphe_Auteurs.serialize(destination=authors_file)

# Write the Babelio book graph ``g`` to disk.
with open("../final_datasets/babelio_auteursURI.ttl", "wb") as books_file:
    g.serialize(destination=books_file)