In [7]:
from rdflib import Graph, Namespace, Literal, URIRef
from rdflib import RDF, RDFS, OWL, XSD
import xml.etree.ElementTree as ET

import sys
sys.path.append('../')

sys.path.append('../andre')
import andre.utils as utils
from andre.utils import schema as SCHEMA
from andre.utils import pbs as PBS
import isbnlib
import collections
import os
from pprint import pprint

SAVE_INTERMEDIARY_FILES = True
DELETE_INTERMEDIARY_FILES = False
ELIMINATE_DUPLICATES_PUBLISHERS = False # NoDuplicatePublisher files already generated, True iff need to generata agein (takes 10 mins)

# file paths

In [8]:
# publishers
bnf_NoDuplicatePublisher_file_path = "../final_datasets/intermediary_files/bnf_NoDuplicatePublisher.ttl"
constellations_NoDuplicatePublisher_file_path = "../final_datasets/intermediary_files/constellations_NoDuplicatePublisher.ttl"
babelio_NoDuplicatePublisher_file_path = "../final_datasets/intermediary_files/babelio_NoDuplicatePublisher.ttl"
publisher_BNF_Constellation_Babelio_file_path = "../final_datasets/intermediary_files/publisher_BNF_Constellation_Babelio_NoDuplicate.ttl"
publishers_btlf_bnf_constellations_babelio_file_path = "../final_datasets/publishers_BTLF_BNF_Constellations_Babelio.ttl"

# authors
authors_btlf_file_path = "../final_datasets/authors_graph_btlf.ttl"
constellations_authorURI_file_path = "../final_datasets/intermediary_files/constellations_authorURI.ttl"
bnf_authorURI_file_path = "../final_datasets/intermediary_files/bnf_authorURI.ttl"
babelio_authorURI_file_path = "../final_datasets/intermediary_files/babelio_authorURI.ttl"
authors_graph_all_sources_file_path = "../final_datasets/authors_BTLF_Constellations_BNF_Babelio.ttl"

# books
btlf_books_file_path = "../final_datasets/BTLF.ttl"


# publisher duplicates elimination

- eliminate duplicates of publishers in BNF + Constellations 
- done before manually matching with BTLF
- in alignement_editeur_analyse & extractionGrapheBTLF

In [9]:
# create dictionary of duplicates replacements with the key = duplicate, value = replacement
def create_duplicate_replacement(duplicates):
        duplicate_counter = 0 
        duplicate_replacements = {}
        for key in duplicates:
            duplicate_list = duplicates[key]
            if len(duplicate_list) >= 2:
                duplicate_counter += 1
                print("\""+ key + "\"")
                replacement = duplicate_list[0] # choosing 1st item of list because all the choices are equivalent
                print("  ", duplicate_list[0].replace("http://schema.org/", " "))
                for i in range(1,len(duplicate_list)):
                    duplicate_replacements[duplicate_list[i]] = replacement
                    print("  ", duplicate_list[i].replace("http://schema.org/", " "))
        print("duplicate", duplicate_counter)
        return duplicate_replacements

if ELIMINATE_DUPLICATES_PUBLISHERS:
    # original publisher graph load
    publishers_bnf_constellations_babelio_graph = Graph()
    publishers_bnf_constellations_babelio_graph.bind("schema", SCHEMA, override=True, replace=True)
    publishers_bnf_constellations_babelio_graph.parse("final_datasets/publishers-original-karim.ttl")
    print(len(list(publishers_bnf_constellations_babelio_graph.subjects(RDF.type, SCHEMA.Publisher))))

    # eliminate duplicates in publisher graph
    publishers_bnf_constellations_babelio_dict = {}
    duplicates_bnf_constellations_babelio = collections.defaultdict(lambda: [])
    for publisher in publishers_bnf_constellations_babelio_graph.subjects(RDF.type, SCHEMA.Publisher):
        raw_name = str(publishers_bnf_constellations_babelio_graph.value(publisher, SCHEMA.name))
        preprocessed_name = utils.preprocess_publisher_name(raw_name)

        duplicates_bnf_constellations_babelio[preprocessed_name].append(publisher)
        
    duplicates_replacements_bnf_constellations_babelio = create_duplicate_replacement(duplicates_bnf_constellations_babelio)

    # manual duplicates found when doing manual matches
    duplicates_replacements_bnf_constellations_babelio[URIRef('http://schema.org/MLafon')] = URIRef('http://schema.org/Michel_Lafon')
    duplicates_replacements_bnf_constellations_babelio[URIRef('http://schema.org/duSorbier')] = URIRef('http://schema.org/Le_Sorbier')
    duplicates_replacements_bnf_constellations_babelio[URIRef('http://schema.org/Les_Écrits_Des_Forges')] = URIRef('http://schema.org/Écrits_Des_Forges')

    print(len(duplicates_replacements_bnf_constellations_babelio))
    


### load initial book graphs

In [10]:
if ELIMINATE_DUPLICATES_PUBLISHERS:
    graph_bnf = Graph()
    graph_bnf.parse("final_datasets/bnf-original-karim.ttl", format="turtle")

    graph_constellations = Graph()
    graph_constellations.parse("final_datasets/constellations-original-karim.ttl", format="turtle")

    graph_babelio = Graph()
    graph_babelio.parse("final_datasets/babelio-original-karim.ttl", format="turtle")


### compute: replace duplicate publishers in book graphs

In [11]:
if ELIMINATE_DUPLICATES_PUBLISHERS:
    # 7 mins of execution
    for duplicate_publisher in duplicates_replacements_bnf_constellations_babelio:

        replacement_publisher = duplicates_replacements_bnf_constellations_babelio[duplicate_publisher]
        print(replacement_publisher)
        for publisher in publishers_bnf_constellations_babelio_graph.subjects(RDF.type, SCHEMA.Publisher):
            if publisher == duplicate_publisher:
                publishers_bnf_constellations_babelio_graph.remove((publisher, SCHEMA.name, None)) # remove attributes before reference so reference doesn't become orphan in graph
                publishers_bnf_constellations_babelio_graph.remove((publisher, RDF.type, None))
                # print(publisher)

        for bnf_book in graph_bnf.subjects(RDF.type, utils.schema.Book):
            book_data_source = utils.extract_data_bnf(graph_bnf, bnf_book)
            book_publisher = book_data_source.publisher
            if book_publisher == str(duplicate_publisher):
                graph_bnf.remove((bnf_book, SCHEMA.publisher, None))
                graph_bnf.add((bnf_book, SCHEMA.publisher, replacement_publisher))

        for constellations_book in graph_constellations.subjects(RDF.type, utils.schema.Book):
            book_data_source = utils.extract_data_constellations(graph_constellations, constellations_book)
            book_publisher = book_data_source.publisher   
            if book_publisher == str(duplicate_publisher):
                # print(book_publisher)
                graph_constellations.remove((constellations_book, SCHEMA.publisher, None))
                graph_constellations.add((constellations_book, SCHEMA.publisher, replacement_publisher))

        for babelio_book in graph_babelio.subjects(RDF.type, utils.schema.Book):
            book_data_source = utils.extract_data_babelio(graph_babelio, babelio_book)
            book_publisher = book_data_source.publisher
            if book_publisher == str(duplicate_publisher):
                graph_babelio.remove((babelio_book, SCHEMA.publisher, None))
                graph_babelio.add((babelio_book, SCHEMA.publisher, replacement_publisher))

### save graph to file

In [12]:
if ELIMINATE_DUPLICATES_PUBLISHERS and SAVE_INTERMEDIARY_FILES:
    with open(bnf_NoDuplicatePublisher_file_path, "wb") as fichier:
        graph_bnf.serialize(fichier)

    with open(constellations_NoDuplicatePublisher_file_path, "wb") as fichier:
        graph_constellations.serialize(fichier)

    with open(babelio_NoDuplicatePublisher_file_path, "wb") as fichier:
        graph_babelio.serialize(fichier)

    with open(publisher_BNF_Constellation_Babelio_file_path, "wb") as fichier:
        publishers_bnf_constellations_babelio_graph.serialize(fichier)

# BTLF Graph extraction

- parse xml files of BTLF (ONIX format)
- generate book, publisher and authors for BLTF compatible with knowledge base of BNF / BTLF
- match publishers of BNF_Constellation with publishers of BTLF
    -> hybrid exact and manual matching because editor names are tricky
    -> generate btlf books with editors consolidated with BNF/ BTLF

### prerequisite
- onix files
- manual_matches_v2 from alignement_editeurs_analyse.ipynb:

### output
- btlf book file
- bltf/bnf/constellation aligned editors
- btlf autors (not aligned, have to execute alignement_auteurs.ipynb)

### load ONIX

In [13]:
filePattern = "../final_datasets/btlf_books/Onix_synapseC_2/202312181524_onix3_M3_{}.xml"


ns = { 'd': "http://ns.editeur.org/onix/3.0/reference" }


btlf_books_onix = [] 
for n in range (0, 13):
    file = filePattern.format(n)
    data = ET.parse(file)
    root = data.getroot()
    products = root.findall("./d:Product", ns)
    btlf_books_onix += products


age_ranges = {
    "PC03" : [0, 1, 2, 3],
    "PC04" : [3, 4, 5, 6],
    "PC05" : [6, 8, 9],
    "PC06" : [9, 10, 11, 12],
    "PC07" : [12, 13, 14, 15],
    "PC01" : [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18], # Tout Public: > 8 ans car sinon, longueur et format livre incompatible entre ages extrêmes
    "NS03" : [12, 13, 14, 15, 16],
    "NS02" : [6, 7, 8, 9, 10, 11],
    "NS01" : [0, 1, 2, 3, 4, 5], # prescolaire: < 6 ans
    "NS05" : [17, 18] # college = CEGEP
    }

print(len(btlf_books_onix), " livres au total")

26920  livres au total


### author alignement

##### btlf author extraction

In [14]:
class Author:
    def __init__(self, raw_name, uri):
        self.raw_name = raw_name
        self.uri = uri
    
    def __str__(self):
        return f"Author: {self.raw_name} | URI: {self.uri}"

authors_graph_all_sources = Graph()
authors_graph_all_sources.bind("schema", SCHEMA, override=True, replace=True)
authors_graph_all_sources.bind("pbs", PBS, override=True)
authors_graph_all_sources.bind("author", SCHEMA["Person"], override=True, replace=True)

processed_authors = {} # key = "last_name, first_name"
btlf_isbn_authors = collections.defaultdict(list) # key = isbn, value = all btlf authors of this isbn
excluded_authors = {} # key = name, value = record reference
id = 0
for item in btlf_books_onix:
    rf = item.find('.//d:RecordReference', ns)
    isbn = item.findall(".//d:ProductIdentifier/d:ProductIDType[.='15']../d:IDValue", ns)
    if not isbn:
        print('ERREUR isbn',item)
    else:
        isbn = isbn[0].text
    editeur = item.findall(".//d:PublishingRole[.='01']../d:PublisherName", ns) 
    if not editeur:
        print('ERREUR editeur',item)   

    liste_auteurs = item.findall('.//d:DescriptiveDetail/d:Contributor', ns)
    for a in liste_auteurs: 
        name_inverted = a.find(".//d:PersonNameInverted", ns)
        if name_inverted == None:
            continue
        if name_inverted.text.strip() == "": # nom vide
            print(rf.text, "nom vide")
            continue
        split_name = name_inverted.text.rsplit(",")
        if len(split_name) > 2:
            print(rf.text, split_name)
            excluded_authors[name_inverted.text] = rf.text
            continue

        # populating authors graph and processed_authors dict
        if name_inverted.text not in processed_authors:
            id += 1 # uri should be created here to prevent duplictes
            author_uri = SCHEMA['authorBTLF_' + str(id)]
            processed_authors[name_inverted.text] = author_uri
            authors_graph_all_sources.add((author_uri,RDF.type, SCHEMA['Person']))
            if len(split_name) == 2:
                family_name = split_name[0].strip()
                first_name = split_name[1].strip()
                authors_graph_all_sources.add((author_uri,SCHEMA.givenName,Literal(first_name)))
                authors_graph_all_sources.add((author_uri,SCHEMA.familyName,Literal(family_name)))
            elif len(split_name) == 1: # pseudonyme ex: Barasui ou nom/prenom pas clairement défini ex: Vieira da Silva
                family_name = split_name[0].strip()
                authors_graph_all_sources.add((author_uri,SCHEMA.familyName,Literal(family_name)))
                # print(nom.text)
                
            bio = a.findall('./d:BiographicalNote', ns)
            if bio:
                authors_graph_all_sources.add((author_uri,SCHEMA.description	,Literal(bio[0].text)))

        # populating the btlf_isbn_authors dict with same URI as in the authors graph, but allow author duplicates (if 1 author is present is multiple isbns)
        if len(split_name) == 2:
            family_name = split_name[0].strip()
            first_name = split_name[1].strip()
            author_name_uri = Author(raw_name=first_name + " " + family_name,
                                    uri=processed_authors[name_inverted.text])
            btlf_isbn_authors[isbn].append(author_name_uri)
        elif len(split_name) == 1:
            family_name = split_name[0].strip()
            author_name_uri = Author(raw_name=family_name, 
                                     uri=processed_authors[name_inverted.text])
            btlf_isbn_authors[isbn].append(author_name_uri)

print('max id = ', id)
lengths = [len(value) for value in btlf_isbn_authors.values()]
length_distribution = collections.Counter(lengths)
print()
print("isbn author length distribution")
pprint(length_distribution)


NOT16385126581471 ['Saint-Mars', ' Dominique', ' de']
NOT16374463272094 ['Linden', ' Martijn', ' van der']
NOT16381758815244 ['Preux', ' Anne', ' de']
NOT16377345544194 ['Boyce', ' Frank', ' Cottrell']
NOT16384081182349 ['La Fontaine', ' Jean', ' de']
NOT16372454138223 ['Leeuwen', ' Joke', ' van']
NOT16384766842081 ['El Fathi', ' Mickaël', ' el']
NOT16379705320588 ['Panafieu', ' Jean-Baptiste', ' de']
NOT16379706895944 ['Panafieu', ' Jean-Baptiste', ' de']
NOT16384049205966 ['Panafieu', ' Jean-Baptiste', ' de']
NOT16388489374960 ['Vendel', ' Edward', ' van de']
NOT16388490240730 ['La Fontaine', ' Jean', ' de']
NOT16558488968844 ['Berge', ' Marieke', ' ten']
NOT16376036362764 ['Lestrade', ' Agnès', ' de']
NOT1636036742705 ['Bolduc', ' Edouard', ' Mme']
NOT16376024151906 ['Laubier', ' Matthieu', ' de']
NOT16389880890813 ['Saint-Chamas', ' Benoît', ' de']
NOT16389880890813 ['Saint-Chamas', ' Emmanuelle', ' de']
NOT16383996837844 ['Chantérac', ' Inès', ' de']
NOT16376020131342 ['Saint-Cham

In [15]:
for isbn, author in btlf_isbn_authors.items():
    print(f"Isbn: {isbn}")
    for a in author:
        print(a)   

Isbn: 9782070533206
Author: Anthony Kemp | URI: http://schema.org/authorBTLF_1
Isbn: 9782070534159
Author: Nicolas Werth | URI: http://schema.org/authorBTLF_2
Isbn: 9782917057063
Author: Katsumi Komagata | URI: http://schema.org/authorBTLF_3
Isbn: 9782215084563
Author: Agnès Vandewiele | URI: http://schema.org/authorBTLF_4
Author: Pascal Laheurte | URI: http://schema.org/authorBTLF_5
Isbn: 9782746708402
Author: Serge Hayat | URI: http://schema.org/authorBTLF_6
Author: Philippe Hayat | URI: http://schema.org/authorBTLF_7
Isbn: 9782081631427
Author: Barry Jonsberg | URI: http://schema.org/authorBTLF_8
Author: Luc Rigoureau | URI: http://schema.org/authorBTLF_9
Isbn: 9782081613324
Author: Michel Honaker | URI: http://schema.org/authorBTLF_10
Isbn: 9782081634176
Author: Emmanuel Bourdier | URI: http://schema.org/authorBTLF_11
Isbn: 9782081634701
Author: James V. Hart | URI: http://schema.org/authorBTLF_12
Author: Alice Marchand | URI: http://schema.org/authorBTLF_13
Isbn: 9782746708723
Aut

In [16]:
len(list(authors_graph_all_sources.subjects(RDF.type, SCHEMA.Person)))

18203

##### align btlf authors with BNF-Constellations-Babelio

In [17]:
def extract_author_turtle(graph, author):
    last_name = str(graph.value(author, SCHEMA.familyName)) if graph.value(author, SCHEMA.familyName) else ""
    first_name = str(graph.value(author, SCHEMA.givenName)) if graph.value(author, SCHEMA.givenName) else ""
    description = str(graph.value(author, SCHEMA.description)) if graph.value(author, SCHEMA.description) else ""
    return last_name, first_name, description

authors_dict = {}

# check if there are duplicates and missing data in the authors graph
for author_uri in authors_graph_all_sources.subjects(RDF.type, utils.schema.Person):
    last_name, first_name, description = extract_author_turtle(authors_graph_all_sources, author_uri)

    if last_name and first_name:
        raw_source_author_name = first_name + " " + last_name
        preprocessed_source_author_name = utils.preprocess_author_name(raw_source_author_name)
        if preprocessed_source_author_name in authors_dict:
            print("dupliqué", preprocessed_source_author_name, raw_source_author_name, authors_dict[preprocessed_source_author_name], author_uri)
        authors_dict[preprocessed_source_author_name] = author_uri
    elif last_name and not first_name:
        raw_source_author_name = last_name
        preprocessed_source_author_name = utils.preprocess_author_name(raw_source_author_name)
        if preprocessed_source_author_name in authors_dict:
            print("dupliqué", preprocessed_source_author_name, raw_source_author_name, authors_dict[preprocessed_source_author_name], author_uri)
        authors_dict[preprocessed_source_author_name] = author_uri
    elif not last_name and first_name:
        print("no last name", author_uri)
        raw_source_author_name = first_name
        preprocessed_source_author_name = utils.preprocess_author_name(raw_source_author_name)
        if preprocessed_source_author_name in authors_dict:
            print("dupliqué", preprocessed_source_author_name, raw_source_author_name, authors_dict[preprocessed_source_author_name], author_uri)
        authors_dict[preprocessed_source_author_name] = author_uri
    elif not last_name and not first_name:
        print("no first name and no last name", author_uri)

print(len(authors_dict), "non duplicate authors")

dupliqué claudine galea Claudine Galea http://schema.org/authorBTLF_205 http://schema.org/authorBTLF_2534
dupliqué bernard cheze Bernard Cheze http://schema.org/authorBTLF_2447 http://schema.org/authorBTLF_5612
dupliqué ceka Céka http://schema.org/authorBTLF_2595 http://schema.org/authorBTLF_6309
dupliqué petr sis Petr Sís http://schema.org/authorBTLF_1146 http://schema.org/authorBTLF_7377
dupliqué jean rene Jean René http://schema.org/authorBTLF_6812 http://schema.org/authorBTLF_11227
dupliqué matteo berton Mattéo Berton http://schema.org/authorBTLF_10992 http://schema.org/authorBTLF_11962
dupliqué gwenaelle boulet Gwenaëlle Boulet http://schema.org/authorBTLF_1811 http://schema.org/authorBTLF_13014
dupliqué alex nogues Alex Nogues http://schema.org/authorBTLF_12688 http://schema.org/authorBTLF_13649
dupliqué elise gravel Elise Gravel http://schema.org/authorBTLF_6731 http://schema.org/authorBTLF_14607
dupliqué christophe leon Christophe Léon http://schema.org/authorBTLF_609 http://sc

In [18]:
def populate_authors_in_graph_by_isbn(source, graph_source, author_graph, btlf_only=False):
    uri_base = {
        "constellations": "authorConstellations",
        "bnf": "authorBNF",
        "babelio": "authorBabelio"
    }

    array_size_counter = collections.defaultdict(int)
    array_size_names = collections.defaultdict(list)
    id = 0
    multiple_authors_count = 0
    book_counter = 0

    for book in graph_source.subjects(RDF.type, utils.schema.Book):
        if source == "constellations":
            book_data_source = utils.extract_data_constellations_before_alignement(graph_source, book)
        elif source == "bnf":
            book_data_source = utils.extract_data_bnf_before_alignement(graph_source, book)
        elif source == "babelio":
            book_data_source = utils.extract_data_babelio_before_alignement(graph_source, book)
        else:
            assert(False)

        if btlf_only and not book_data_source.isbn: # books without valid isbn are sure not to be in BTLF
            continue

        graph_source.remove((book, SCHEMA.author, None)) # remove old author field, no information lost because already extracted
        for raw_source_author_name in book_data_source.book_authors:
            id += 1
            preprocessed_source_author_name = utils.preprocess_author_name(raw_source_author_name)

            for isbn_source in book_data_source.isbn: # align authors of the same isbn if there distance is less than 3
                    authors_candidate_btlf = btlf_isbn_authors[isbn_source]
                    for author_btlf in authors_candidate_btlf:
                        if utils.align_with_levenshtein(author_btlf=author_btlf.raw_name, author_source=raw_source_author_name, levenshtein_distance=3):
                            # print("###############", isbn_source)
                            # print("source author", raw_source_author_name)
                            # print(author_btlf)
                            authors_dict[preprocessed_source_author_name] = author_btlf.uri
                            authors_dict[utils.preprocess_author_name(author_btlf.raw_name)] = author_btlf.uri

            if preprocessed_source_author_name in authors_dict:
                graph_source.remove((book, PBS.authorURI, None))
                graph_source.add((book, SCHEMA.author, authors_dict[preprocessed_source_author_name]))
                graph_source.add((book, PBS.authorString, Literal(raw_source_author_name)))
            else:
                graph_source.remove((book, PBS.authorURI, None))
                author_uri = SCHEMA[uri_base[source]+ "_" + str(id)]           
                print("pas aligné", raw_source_author_name)
                authors_dict[preprocessed_source_author_name] = author_uri
                author_graph.add((author_uri, RDF.type, SCHEMA['Person']))
                author_graph.add((author_uri, SCHEMA.familyName, Literal(raw_source_author_name)))
                graph_source.add((book, SCHEMA.author, author_uri))
                graph_source.add((book, PBS.authorString, Literal(raw_source_author_name)))


            name_array = raw_source_author_name.rsplit(" ")
            array_size_counter[str(len(name_array))] += 1
            array_size_names[str(len(name_array))].append(name_array)

            if any(word in raw_source_author_name for word in [" et ", " and ", "d'après", "d’après", "&", "with", " | "]):
                multiple_authors_count += 1

        book_counter += 1

    print("word number , count")
    for k, v in sorted(array_size_counter.items(), key=lambda x: int(x[0]), reverse=True):
        print(k,"            ", v)
    print("processed book number", book_counter)
    print("multiple authors", multiple_authors_count)
    return array_size_names

###### constellations

In [19]:
graph_constellations = Graph()
graph_constellations.bind("schema", SCHEMA, override=True, replace=True)
graph_constellations.bind("pbs", PBS, override=True, replace=True)

graph_constellations.parse(constellations_NoDuplicatePublisher_file_path, format="turtle")
len(list(graph_constellations.subjects(RDF.type, utils.schema.Book)))

11267

In [20]:
array_size_names_constellations = populate_authors_in_graph_by_isbn(source="constellations", graph_source=graph_constellations, author_graph=authors_graph_all_sources)

Levenshtein Author alignment: Roselyne Bertin -> Roselyne Bertin | distance = 0
Levenshtein Author alignment: Patrick Banon -> Patrick Banon | distance = 0
pas aligné Julie Delporte
Levenshtein Author alignment: Jean Sioui -> Jean Sioui | distance = 0
Levenshtein Author alignment: Sandra Laboucarie -> Sandra Laboucarie | distance = 0
Levenshtein Author alignment: Robert Soulières -> Robert Soulières | distance = 0
pas aligné Deborah Hodge
pas aligné Lionel Groulx
Levenshtein Author alignment: Marine Degli -> Marine Degli | distance = 0
Levenshtein Author alignment: Olivier Morel -> Olivier Morel | distance = 0
Levenshtein Author alignment: Jean-Michel Lienhardt -> Jean-Michel Lienhardt | distance = 0
Levenshtein Author alignment: Jeanne Boyer -> Jeanne Boyer | distance = 0
pas aligné Natacha Scheidhauer
Levenshtein Author alignment: Alan Mets -> Alan Mets | distance = 0
Levenshtein Author alignment: Fiona Watt -> Fiona Watt | distance = 0
Levenshtein Author alignment: Amanda McCardie -

In [21]:
array_size_names_constellations['6']

[['Joan', 'de', 'Déu', 'Prats', 'i', 'Pijoan'],
 ['Michael', 'Bill', 'Jr.', '&', 'Samson', 'Martin'],
 ['Michael', 'E.', 'Megan', '&', 'Mann', 'Herbert']]

In [22]:
# save constellations graph with authors
if SAVE_INTERMEDIARY_FILES:
    with open(constellations_authorURI_file_path,"wb") as fichier:
        graph_constellations.serialize(fichier) 

###### bnf

In [23]:
graph_bnf = Graph()
graph_bnf.bind("pbs", PBS, override=True, replace=True)
graph_bnf.bind("schema", SCHEMA, override=True, replace=True)

graph_bnf.parse(bnf_NoDuplicatePublisher_file_path, format="turtle")
len(list(graph_bnf.subjects(RDF.type, utils.schema.Book)))

Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x000001DD0027D080>
Traceback (most recent call last):
  File "C:\Users\dre\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\rdflib\term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
           ^^^^^^^^^^^^^^^^^^
  File "C:\Users\dre\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\isodate\isodates.py", line 203, in parse_date
    raise ISO8601Error('Unrecognised ISO 8601 date format: %r' % datestring)
isodate.isoerror.ISO8601Error: Unrecognised ISO 8601 date format: 'Port- d-es'
Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x000001DD0027D080>
Traceback (most recent call last)

30384

In [24]:
array_size_names_bnf = populate_authors_in_graph_by_isbn(source="bnf", graph_source=graph_bnf, author_graph=authors_graph_all_sources)

pas aligné Jean-Claude Brisville
pas aligné Association Calladium
pas aligné Natalie Savage Carlson
pas aligné Bernard Stone
pas aligné Heinrich Hoffmann
pas aligné Idries Shah
pas aligné Lucien Voy
pas aligné Kay Fender
pas aligné Pierre Pluchon
pas aligné Shigeo Watanabe
pas aligné J. M. G. Le Clézio
pas aligné Ian Serraillier
pas aligné Raymond Sinamal
pas aligné Isaac Leib Peretz
pas aligné Heinz Edelmann
pas aligné Marie-Thérèse Rouil
pas aligné Kay Thompson
pas aligné Tan Koide
pas aligné Yasuko Koide
pas aligné Claude Auclair
pas aligné François Migeat
pas aligné Bette Greene
pas aligné Guillermo Mordillo
pas aligné Toshi Yoshida
pas aligné Geneviève Duroselle
pas aligné Claude Hudelot
pas aligné Jeannine Baticle
pas aligné Jean-Clément Martin
pas aligné Jacques Delval
pas aligné Jean Meyer
pas aligné Paul Maar
pas aligné Charlotte Pomerantz
pas aligné Anne Holm
pas aligné Jean-Marie Petit
pas aligné Françoise Lebrun
pas aligné Ernestine Gilbreth
pas aligné Frank Bunker Gilbreth

In [25]:
array_size_names_bnf['343']

[['Tao',
  '.',
  'Interprète',
  '|',
  'Nathalie',
  'Soussana.',
  'Interprète',
  '|',
  'Ahlam',
  'Boina.',
  'Interprète',
  '|',
  'Nathalie',
  'Soussana.',
  'Interprète',
  '|',
  'Ahlam',
  'Boina.',
  'Interprète',
  '|',
  'Ayane',
  'Teouri.',
  'Interprète',
  '|',
  'Ichane',
  'Teouri.',
  'Interprète',
  '|',
  'Nathalie',
  'Soussana.',
  'Interprète',
  '|',
  'Nayah',
  'Cheny.',
  'Interprète',
  '|',
  'Nathalie',
  'Soussana.',
  'Interprète',
  '|',
  'Nawal',
  'Milanao.',
  'Interprète',
  '|',
  'Nathalie',
  'Soussana.',
  'Interprète',
  '|',
  'Tiana',
  'Masselot.',
  'Interprète',
  '|',
  'Hélène',
  'Groeme.',
  'Interprète',
  '|',
  'Jean',
  'Groeme.',
  'Interprète',
  '|',
  'Nathalie',
  'Soussana.',
  'Interprète',
  '|',
  'Tao',
  '.',
  'Interprète',
  '|',
  'Nathalie',
  'Soussana.',
  'Interprète',
  '|',
  'Tiana',
  'Masselot.',
  'Interprète',
  '|',
  'Lionel',
  'Agenor.',
  'Interprète',
  '|',
  'Nathalie',
  'Soussana.',
  'Inter

In [26]:
if SAVE_INTERMEDIARY_FILES:
    with open(bnf_authorURI_file_path,"wb") as fichier:
        graph_bnf.serialize(fichier) 

###### babelio

In [27]:
graph_babelio = Graph()
graph_babelio.bind("schema", SCHEMA, override=True, replace=True)
graph_babelio.bind("pbs", PBS, override=True, replace=True)

graph_babelio.parse(babelio_NoDuplicatePublisher_file_path, format="turtle")
len(list(graph_babelio.subjects(RDF.type, utils.schema.Book))) # seulement ~2000 ont des isbns

4169

In [28]:
array_size_names_babelio = populate_authors_in_graph_by_isbn(source="babelio", graph_source=graph_babelio, author_graph=authors_graph_all_sources)

pas aligné Arleen Paré
pas aligné Al Purdy
pas aligné P. Bougeault
pas aligné Normand de Bellefeuille
pas aligné Annie Dubreuil
pas aligné Sylvie Marcoux
pas aligné David Zieroth
pas aligné Ross King
pas aligné Kirihara Izumi
pas aligné Miaki Sugaru
pas aligné Taguchi Shuichi
pas aligné Nakatani Nio
pas aligné Gondaira Hitsuji
pas aligné Nanao Itsuki
pas aligné Endo Tatsuya
pas aligné Kingyobachi Deme
pas aligné Bingo Morihashi
pas aligné Kaiu Shirai
pas aligné Posuka Demizu
pas aligné Ryuhei Tamura
pas aligné Jiro Andro
pas aligné Masaru Katori
pas aligné Jung-man Cho
pas aligné Jimpachi Miri
pas aligné Kouga Yun
pas aligné Kin-Hwan Park
pas aligné Rie Takada
pas aligné Amagi Koshiba
pas aligné Norie Yamada
pas aligné Harold Sakuishi
pas aligné Ubi Umeda
pas aligné Hino Matsuri
pas aligné Pierre Perrault
pas aligné Michel Brunet
pas aligné Robert Dickson
pas aligné Anne Vallières
pas aligné Rachna Gilmore
pas aligné Nino Ricci
pas aligné Chiki Kikuchi
pas aligné Raymond Milési
pas ali

In [29]:
with open(authors_graph_all_sources_file_path,"wb") as fichier:
    authors_graph_all_sources.serialize(fichier) # final file

if SAVE_INTERMEDIARY_FILES:
    with open(babelio_authorURI_file_path,"wb") as fichier:
        graph_babelio.serialize(fichier) 

### btlf editor extraction and alignment with BNF-Constellations-Babelio

##### load publisher graph

In [31]:
publishers_bnf_constellations_babelio_graph = Graph()
publishers_bnf_constellations_babelio_graph.bind("schema",SCHEMA , override=True, replace=True)
publishers_bnf_constellations_babelio_graph.parse(publisher_BNF_Constellation_Babelio_file_path)
len(list(publishers_bnf_constellations_babelio_graph.subjects(RDF.type, utils.schema.Publisher)))

2222

##### create publisher dict and assertions

In [32]:
# create a dict with key = preprocessed publisher , value = raw name, source, uri, preprocesd name

def find_publisher_by_raw_name(target_raw_name):
    for preprocessed_name in publishers_bnf_constellations_dict.keys(): # defined later
        raw_name = publishers_bnf_constellations_dict[preprocessed_name].raw_name
        # print(uri)
        if raw_name == target_raw_name:
            return publishers_bnf_constellations_dict[preprocessed_name]
    return "not found"

publishers_bnf_constellations_dict = {}
doublons = []
for publisher in publishers_bnf_constellations_babelio_graph.subjects(RDF.type, SCHEMA.Publisher):
    raw_name = str(publishers_bnf_constellations_babelio_graph.value(publisher, SCHEMA.name))
    preprocessed_name = utils.preprocess_publisher_name(raw_name)

    
    if preprocessed_name in publishers_bnf_constellations_dict.keys():
        doublons.append(preprocessed_name)
    else: 
        publishers_bnf_constellations_dict[preprocessed_name] = utils.Publisher(source="bnf_constellations", 
                                                                        uri=publisher, 
                                                                        raw_name=raw_name, 
                                                                        preprocessed_name=preprocessed_name)

print("doublons: ", len(doublons)) 
print("len publisher graph", len(list(publishers_bnf_constellations_babelio_graph.subjects(RDF.type, SCHEMA.Publisher))))
print("len publisher dict", len(publishers_bnf_constellations_dict)) #
assert(len(doublons) == 0) # should be 0 because loading the NoDuplicate file
assert(len(list(publishers_bnf_constellations_babelio_graph.subjects(RDF.type, SCHEMA.Publisher))) == len(publishers_bnf_constellations_dict)) # should be same because eliminated publisher duplicates with same preprocessing
print("assertions passed")

doublons:  0
len publisher graph 2222
len publisher dict 2222
assertions passed


##### align btlf publishers with bnf-constellations-babelio

In [34]:
manual_matches_v2 = { #  key = raw_name_btlf, value = raw_name_bnf_constellations_babelio
'Gulf Stream (Editions du)': 
   'Gulf Stream',
'Magnard-Vuibert': 
   'Magnard',
'Musée du Louvre': 
   'Musée du Louvre éd.',
'Soleil productions': 
   'Soleil production',
'Editions Ouest-France-Edilarge': 
   'Ouest-France',
'Plume de carotte': 
   'Plume carotte',
'Editions Dadoclem': 
   'Dadoclem éd.',
'Jean-Claude Gawsewitch éditeur': 
   'J.-C. Gawsewitch',
"Editions de L'Archipel": 
   "L'Archipel",
'Odile Jacob': 
   'O. Jacob',
'Editions du Centre Pompidou': 
   'Centre Pompidou',
'Musées de la ville de Strasbourg': 
   'les Musées de la Ville de Strasbourg',
'Lancosme multimédia': 
   'Lancosme éd.',
'Critères': 
   'Critères éd.',
'Quespire': 
   'Quespire éd.',
'Les éditions du Bonhomme vert': 
   'le Bonhomme vert',
'Vengeur masqué': 
   'Le Vengeur Masqué',
'Editions Ecole active': 
   "L'École active",
'Editions [MiC_MaC]': 
   'Mic-Mac',
'Eveil et découvertes': 
   'Eveil et découverte',
'Editions des Grandes personnes': 
   'Les Grandes Personnes',
'Editions-Diffusion Karthala': 
   'Karthala',
'Les éditions Carpentier': 
   'D. Carpentier',
'Pascal Galodé éditeurs': 
   'P. Galodé',
'Prisma Media': 
   'Prisma',
'Art à la page': 
   "l'Art à la page",
'CERCIL': 
   'du Cercil',
'Les Editions de la Grande ourse': 
   'La Grande Ourse',
'Editions de La Différence': 
   'La Différence',
'Tertium': 
   'Tertium éd.',
'Editions de la Renarde rouge': 
   'La Renarde Rouge',
'Aniss éditions': 
   'Aniss éd.',
'Editions du Rouergue': 
   'Rouergue',
'Les éditions du soleil de minuit': 
   'Soleil De Minuit',
'Editions des Arènes': 
   'Les Arènes',
'Le Square éditeur': 
   'le Square éd.',
'Griffon bleu': 
   'le Griffon bleu',
'Les éditions du Kilowatt': 
   'Kilowatt',
'Ophelbe': 
   'Ofelbe',
'Editions Privat SAS': 
   'Privat',
'Losange': 
   'du Losange',
'Les Editions Sandawe': 
   'Sandawe.com',
'Nouvelles éditions ivoiriennes': 
   'NEI - Nouvelles éditions Ivoiriennes',
'Studio des Trois Becs': 
   'Studio les Trois Becs',
'Editions Imago': 
   'Auzas éditeurs-Imago',
'205 Corp': 
   '205.fr',
'Librairie Téqui (Pierre)': 
   'Téqui',
'Editions de la Coopérative': 
   'la Coopérative',
'Les Productions Somme toute': 
   'Somme toute',
'Réseau Canopé': 
   'Canopé éditions',
'Editions Maelström': 
   'maelstrÖm reEvolution',
'Le Dilettante': 
   'Le dilletante',
'Les Editions de Minuit': 
   'Minuit',
'Editions Jean-Paul Gisserot': 
   'J.-P. Gisserot',
'Les éditions du dernier havre': 
   'Le dernier havre',
'Éditions du Septentrion': 
   'Septentrion',
'Orca Book Publishing': 
   'Orca',
'LéR - Les éditeurs réunis': 
   'Les Éditeurs Réunis',
"Presses de l'Université de Montréal": 
   'Les Presses De L’université De Montréal',
'Éditions du Noroit': 
   'Le Noroît',
'Editions de Fallois': 
   'Fallois',
'6 pieds sous terre éditions': 
   'Six Pieds Sous Terre',
'Editions du Triomphe': 
   'Triomphe',
'Éditions du Vermillon': 
   'Vermillon',
'Éditions ADA inc.': 
   'AdA',
"Homme sans nom (Editions de l')": 
   "L'Homme Sans Nom",
'Editions des Mille saisons': 
   'Mille Saisons',
'Vincze (Eva)': 
   'É. Vincze',
'Edizioni Corraini': 
   'Corraini',
'Editions Picquier (Philippe)': 
   'P. Picquier',
"Ed. de l'Homme sans nom": 
   "L'Homme Sans Nom",
'Jaguar': 
   'du Jaguar',
'Palissade': 
   'La Palissade',
'Note de partage': 
   'une Note de partage',
'Privat SAS': 
   'Privat',
'Marmaille et compagnie': 
   'Marmaille & compagnie',
'Alain Gorius': 
   'al Manar-A. Gorius',
'Harmattan Guinée': 
   "l'Harmattan Guinée",
'les Ed. Ago': 
   'AGO',
'Louvre éditions': 
   'Musée du Louvre éd.',
'Calicot': 
   'le Calicot',
"Aventuriers de l'étrange": 
   "les Aventuriers de l'étrange",
'Revue dessinée': 
   'La Revue dessinée',
'Diplodocus': 
   'le Diplodocus',
'Nouv. éd. ivoiriennes': 
   'NEI - Nouvelles éditions Ivoiriennes',
"Museum d'histoire naturelle": 
   "Muséum national d'histoire naturelle",
'Musée cantonal des beaux-arts de Lausanne': 
   'Musée des beaux-arts de Lausanne',
'Editions Deux-Cent-Cinq': 
   '205.fr',
'Inédite': 
   "l'Inédite",
"Néva éd.":"Neva",
"Ed. des Mille saisons": 
   "Mille Saisons",
"Sorbier":"Le Sorbier",
"Quadrants":"Quadrant",
"Rouge et or":"Rouge & Or",
"Nuée bleue": 
   "la Nuée bleue",
"Rocher": 
   "du Rocher",
"EP Emmanuel Proust éditions": 
   "Emmanuel Proust",
"Petit Lézard": 
   "Le Petit lézard",
"Afrobul":
   "Afro bulles éd.",
"Atalante": 
   "L'Atalante",
"Cherche Midi": 
   "Le Cherche Midi",
"Courrier du livre": "le Courrier du livre",
"Cité des sciences & de l'industrie":
   "Cité des sciences et de l'industrie",
"Ecole des loisirs":
   "L'École Des Loisirs",
"De La Martinière Jeunesse":
   "La Martinière Jeunesse",
"Temps apprivoisé":
   "le Temps apprivoisé",
"Archipel":
   "L'Archipel",
"Ecole active":
   "L'École active",
"Blake et Mortimer":
   "Blake & Mortimer",
"les Ed. du jardin des mots":
   "du Jardin des mots",
"M. Lafon": 
   "Michel Lafon",
"les Ed. du bonhomme vert":
   "le Bonhomme vert",
"Fond du Tiroir (Le)":
   "le Fond du tiroir",
"Ed. des Grandes personnes":
   "Les Grandes Personnes",
"Éditions de la Bagnole":
   "La Bagnole",
"les Ed. Fei":
   "Fei",
"Tulipe noire":
   "la Tulipe noire",
"Joie de lire":
   "La Joie De Lire",
"Temps des cerises":
   "le Temps des cerises",
"Orbestier":
   "d'Orbestier",
"Ed. de la Gouttière":
   "La Gouttière",
"Éditions Hurtubise inc.":
   "Hurtubise",
"Ed. de la Renarde rouge":
   "La Renarde Rouge",
"S. Diantantu":
   "Diantantu Editions",
"Arche éditeur":
   "L'Arche",
"Rocher jeunesse":
   "Le Rocher Jeunesse",
"Éditions de l'Isatis":
   "Isatis",
"Agrume":
   "L'Agrume",
"Petites moustaches éditions (Les)":
   "les Petites moustaches",
"Square éditeur":
   "le Square éd.",
"Cosmographe":
   "le Cosmographe",
"Harmattan Mali":
   "l'Harmattan Mali",
"Tropique édition":
   "Tropiques",
"Ed. du Centre Pompidou":
   "Centre Pompidou",
"Lattès":
   "JC Lattès",
"Atelier des Noyers":
   "l'Atelier des noyers",
"Lac aux Fées":
   "le lac aux fées",
"Le Robert":
   "Robert",
"Éditions du Phoenix":
   "Du Phœnix",
"Saint-Jean":
   "Guy Saint-Jean",
"Éditions du Trécarré":
   "Trécarré",
"Humanoïdes associés":
   "Les Humanoïdes Associés",
"Beech Street Books (French)":
   "Beech Street Books",
"Ed. de Fallois":
   "Fallois",
"Pré-aux-Clercs":
   "Le Pré Aux Clercs",
"Éditions de l'Hexagone":
   "L'Hexagone",
"6 pieds sous terre":
   "Six Pieds Sous Terre",
"Ed. de l'Olivier":
   "L'Olivier",
"éditions de la Bagnole":
   "La Bagnole",
"Sandawe":
   "Sandawe.com"
 }
 
id = 0

# editeurs = {}
btlf_publishers_dict = {}
doublons_btlf = []
exact_match_counter = 0
manual_match_counter = 0
no_match_counter = 0
    
for root in btlf_books_onix:

   # raw_names = root.findall(".//d:PublishingRole[.='01']../d:PublisherName", ns) 
   raw_names = root.findall(".//d:ImprintIDType[.='01']../../d:ImprintName", ns)

   if not raw_names:
      print('ERREUR', root)
   for raw_name in raw_names:
      raw_name = raw_name.text
      preprocessed_name = utils.preprocess_publisher_name(raw_name)

      if preprocessed_name in btlf_publishers_dict:
         doublons_btlf.append(preprocessed_name) # don't include in stats very large number of duplicates (here we extract them from the books directly)
      else: 
         btlf_publishers_dict[preprocessed_name] = raw_name
         # if "paquet" in preprocessed_name:
         #     print("\""+ raw_name+ "\"")
         if preprocessed_name in publishers_bnf_constellations_dict: # don't change graph
               exact_match_counter += 1 
         elif raw_name in manual_matches_v2: # don't change graph
               manual_match_counter += 1
               matched_raw_name = manual_matches_v2[raw_name]
               print(matched_raw_name)
               matched_publisher = find_publisher_by_raw_name(matched_raw_name)
               print(matched_publisher)
               publishers_bnf_constellations_babelio_graph.add((SCHEMA[matched_publisher.uri], SCHEMA.name, Literal(matched_publisher.raw_name, datatype=utils.xsd.string))) # ajouter le nom pour tester les matches et avertir de 2 noms possible pour maison edition
         else:
               no_match_counter += 1
               uri = raw_name.replace(' ', '_').replace("'", "").replace("&", "_").replace("/", "_").replace(".", "") # same name creation as Karim's script
               uri = URIRef(f'{SCHEMA}{uri}')
               publishers_bnf_constellations_babelio_graph.add((uri, RDF.type, SCHEMA.Publisher))
               publishers_bnf_constellations_babelio_graph.add((uri, SCHEMA.name, Literal(raw_name, datatype=utils.xsd.string)))
               preprocessed_name = utils.preprocess_publisher_name(raw_name)
               publishers_bnf_constellations_dict[preprocessed_name] = utils.Publisher(source="btlf", 
                                                                     uri=uri, 
                                                                     raw_name=raw_name, 
                                                                     preprocessed_name=preprocessed_name)

# print(editeurs)
print("exact", exact_match_counter)
print("manual", manual_match_counter)
print("no match", no_match_counter)

L'École Des Loisirs
uri = http://schema.org/LÉcole_Des_Loisirs, 
  source = bnf_constellations,
  raw_name = L'École Des Loisirs,
  preprocessed_name = lecoledesloisirs
La Martinière Jeunesse
uri = http://schema.org/La_Martinière_Jeunesse, 
  source = bnf_constellations,
  raw_name = La Martinière Jeunesse,
  preprocessed_name = lamartinierejeunesse
L'Arche
uri = http://schema.org/LArche, 
  source = bnf_constellations,
  raw_name = L'Arche,
  preprocessed_name = larche
Le Rocher Jeunesse
uri = http://schema.org/Le_Rocher_Jeunesse, 
  source = bnf_constellations,
  raw_name = Le Rocher Jeunesse,
  preprocessed_name = lerocherjeunesse
Musée du Louvre éd.
uri = http://schema.org/MuseduLouvred, 
  source = bnf_constellations,
  raw_name = Musée du Louvre éd.,
  preprocessed_name = museedulouvre
Centre Pompidou
uri = http://schema.org/CentrePompidou, 
  source = bnf_constellations,
  raw_name = Centre Pompidou,
  preprocessed_name = centrepompidou
Les Humanoïdes Associés
uri = http://schem

Blake & Mortimer
uri = http://schema.org/BlakeMortimer, 
  source = bnf_constellations,
  raw_name = Blake & Mortimer,
  preprocessed_name = blake&mortimer
du Jardin des mots
uri = http://schema.org/duJardindesmots, 
  source = bnf_constellations,
  raw_name = du Jardin des mots,
  preprocessed_name = dujardindesmots
Michel Lafon
uri = http://schema.org/Michel_Lafon, 
  source = bnf_constellations,
  raw_name = Michel Lafon,
  preprocessed_name = michellafon
le Bonhomme vert
uri = http://schema.org/leBonhommevert, 
  source = bnf_constellations,
  raw_name = le Bonhomme vert,
  preprocessed_name = lebonhommevert
Le Vengeur Masqué
uri = http://schema.org/Le_Vengeur_Masqué, 
  source = bnf_constellations,
  raw_name = Le Vengeur Masqué,
  preprocessed_name = levengeurmasque
Eveil et découverte
uri = http://schema.org/Eveiletdcouverte, 
  source = bnf_constellations,
  raw_name = Eveil et découverte,
  preprocessed_name = eveiletdecouverte
le Fond du tiroir
uri = http://schema.org/leFondd

In [35]:
# final file (not intermediary)
with open(publishers_btlf_bnf_constellations_babelio_file_path, "wb") as file:
    publishers_bnf_constellations_babelio_graph.serialize(file)

# BTLF books creation

In [37]:
btlf_books_graph = Graph()
btlf_books_graph.bind("schema", SCHEMA, override=True, replace=True)
btlf_books_graph.bind("pbs", PBS, override=True)

exact_match_counter = 0
manual_match_counter = 0
no_match_counter = 0

class BTLF_Book():
    def __init__(self, btlf_id, isbn=None, name=None, author=None, publisher=None, date=None):
        self.btlf_id = btlf_id
        self.isbn = isbn
        self.name = name
        self.author = author
        self.publisher = publisher
        self.date = date
    
    def __eq__(self, other):
        if(self.btlf_id == other.btlf_id
            and self.isbn == other.isbn 
            and self.name == other.name
            and self.author == other.author
            and self.publisher == other.publisher
            and self.date == other.date):
            return True
        else:
            return False


duplicate_books = collections.defaultdict(lambda: [])

livres = []
for root in btlf_books_onix:
    rf = root.find('.//d:RecordReference', ns)

    # if rf.text in duplicate_books:
    #     print("duplicate", rf.text)

    btlf_book_object = BTLF_Book(btlf_id=rf.text)
    
    btlf_books_graph.add((SCHEMA[rf.text], RDF.type, SCHEMA['Book']))
    btlf_books_graph.add((SCHEMA[rf.text], PBS.infoSource, PBS.BTLF))

    editeursLivre = root.findall(".//d:ImprintIDType[.='01']../../d:ImprintName", ns)


    for raw_name in editeursLivre:
        raw_name = raw_name.text
        preprocessed_name = utils.preprocess_publisher_name(raw_name)
        btlf_book_object.publisher = raw_name
        # print(preprocessed_name)
        if preprocessed_name in publishers_bnf_constellations_dict:
            exact_match_counter += 1
            publisher_uri = publishers_bnf_constellations_dict[preprocessed_name].uri
            btlf_books_graph.add((SCHEMA[rf.text], SCHEMA.publisher, publisher_uri))
        elif raw_name in manual_matches_v2:
            manual_match_counter += 1
            # print(raw_name)
            matched_raw_name = manual_matches_v2[raw_name]
            matched_publisher = find_publisher_by_raw_name(matched_raw_name)
            btlf_books_graph.add((SCHEMA[rf.text], SCHEMA.publisher, matched_publisher.uri))
        else:
            no_match_counter += 1
            # print("# si pas trouvé avec le preprocessed name, forcement dans le dict")
        
    liste_auteurs = root.findall('.//d:Contributor', ns)
    if len(liste_auteurs) > 0:
        for a in liste_auteurs: 
            name_inverted = a.find(".//d:ContributorRole[.='A01']../d:PersonNameInverted", ns)
            if name_inverted != None and name_inverted.text not in excluded_authors:
                btlf_books_graph.add((SCHEMA[rf.text], SCHEMA.author, processed_authors[name_inverted.text]))
                btlf_books_graph.add((SCHEMA[rf.text], PBS.authorString, Literal(name_inverted.text)))

        for a in liste_auteurs: 
            name_inverted = a.find(".//d:ContributorRole[.='A12']../d:PersonNameInverted", ns)
            if name_inverted != None and name_inverted.text not in excluded_authors:
                btlf_books_graph.add((SCHEMA[rf.text], SCHEMA.illustrator, processed_authors[name_inverted.text]))
                btlf_books_graph.add((SCHEMA[rf.text], PBS.illustratorString, Literal(name_inverted.text)))


    isbn = root.findall(".//d:ProductIdentifier/d:ProductIDType[.='15']../d:IDValue", ns)
    if isbn:
        isbn_str = isbn[0].text
        btlf_book_object.isbn = isbn_str
        btlf_books_graph.add((SCHEMA[rf.text], SCHEMA.isbn, Literal(isbn_str)))

        isbn13 = isbnlib.to_isbn13(isbn_str)
        if not isbn13:
            btlf_books_graph.add((SCHEMA[rf.text], PBS.invalidISBN, Literal(isbn)))
            print(" isbn not valid", rf.text)

    
       
    language = root.find('.//d:Language/d:LanguageCode', ns)
    if language != None:
        btlf_books_graph.add((SCHEMA[rf.text], SCHEMA.inLanguage, Literal(language.text)))

    

    liste_thema_main_subjet = root.findall(".//d:Subject/d:MainSubject/../d:SubjectSchemeIdentifier[.='93']../d:SubjectCode", ns)
    for t in liste_thema_main_subjet:
        btlf_books_graph.add((SCHEMA[rf.text], PBS.mainSubjectThema, Literal(t.text)))

                  
    liste_thema_subject = root.findall(".//d:Subject/d:SubjectSchemeIdentifier[.='93']../d:SubjectCode", ns)
    for t in liste_thema_subject:
        btlf_books_graph.add((SCHEMA[rf.text], PBS.subjectThema, Literal(t.text)))
          
    titre = root.find(".//d:DescriptiveDetail/d:TitleDetail/d:TitleType[.='01']../d:TitleElement/d:TitleElementLevel[.='01']../../d:TitleStatement", ns)

    if titre != None:
        titre_str = titre.text
        btlf_book_object.titre = titre_str
        btlf_books_graph.add((SCHEMA[rf.text], SCHEMA.name, Literal(titre_str)))        
        
        
    description = root.findall(".//d:TextContent/d:TextType[.='03']../d:Text", ns)
    if description:
        btlf_books_graph.add((SCHEMA[rf.text], SCHEMA.abstract, Literal(description[0].text)))
       
        
    reviewQuote = root.findall(".//d:TextContent/d:TextType[.='06']../d:Text", ns)
    if reviewQuote:
        btlf_books_graph.add((SCHEMA[rf.text], SCHEMA.review, Literal(reviewQuote[0].text)))


    date = root.find(".//d:PublishingDate/d:Date", ns)
    if date != None:
        date_str = date.text
        btlf_book_object.date = date_str
        btlf_books_graph.add((SCHEMA[rf.text], SCHEMA.datePublished, Literal(date_str)))
        
    audience = root.find(".//d:Audience/d:AudienceCodeValue", ns)
    if audience != None and audience.text in age_ranges:
        age_range = age_ranges[audience.text]
        for age in age_range:     
            btlf_books_graph.add((SCHEMA[rf.text], PBS.age, Literal(str(age)))) # cast to string to be consistent with BNF / Constellations
    
    duplicate_books[rf.text].append(btlf_book_object)

    


print("direct publisher match", exact_match_counter)
print("manual publisher match", manual_match_counter)
print("no publisher match", no_match_counter)

direct publisher match 24126
manual publisher match 3238
no publisher match 6


In [38]:
with open(btlf_books_file_path, "wb") as file:
    btlf_books_graph.serialize(file)

# isbn cleaning BNF - Constellations - Babelio

In [39]:
def sbn_to_isbn10(sbn):
    # Check if the input is a valid SBN string
    if len(sbn) == 9 and sbn.isdigit():
        return "0" + sbn

### bnf

In [40]:
graph_bnf = Graph()
graph_bnf.parse(bnf_authorURI_file_path, format="turtle")

for bnf_book in graph_bnf.subjects(RDF.type, utils.schema.Book):
    book_data_source = utils.extract_data_bnf(graph_bnf, bnf_book)
    if len(book_data_source.isbn) == 1:
        isbn = isbnlib.canonical(book_data_source.isbn[0])
    elif len(book_data_source.isbn) > 1:
        assert(False)
    else:
        isbn = ""

    if len(book_data_source.ean) == 1:
        ean = isbnlib.canonical(book_data_source.ean[0])
    elif len(book_data_source.ean) > 1:
        assert(False)
    else:
        ean = ""

    graph_bnf.remove((bnf_book, SCHEMA.isbn, None)) # if empty value, graphdb will match with all other empty isbn values
    
    if len(isbn) == 9:
        isbn = sbn_to_isbn10(isbn)
    
    valid_isbn13 = isbnlib.to_isbn13(isbn)
    valid_ean13 = isbnlib.to_isbn13(ean) # if valid_ean13 exists and valid should equals to original EAN

    if valid_isbn13 and valid_ean13 and valid_isbn13 != valid_ean13: # put both in graph because likely to be different editions but difficult to choose between the right one
        # print(isbn, valid_isbn13, valid_ean13)
        graph_bnf.add((bnf_book, SCHEMA.isbn, Literal(valid_isbn13)))
        graph_bnf.add((bnf_book, SCHEMA.isbn, Literal(valid_ean13))) 
    elif valid_isbn13 and valid_ean13 and valid_isbn13 == valid_ean13:
        graph_bnf.add((bnf_book, SCHEMA.isbn, Literal(valid_isbn13)))
    elif valid_isbn13 and not valid_ean13:
        graph_bnf.add((bnf_book, SCHEMA.isbn, Literal(valid_isbn13)))
    elif not valid_isbn13 and valid_ean13:
        graph_bnf.add((bnf_book, SCHEMA.isbn, Literal(valid_ean13)))
    else:
        print("isbn and ean not valid", isbn, ean)
        graph_bnf.add((bnf_book, utils.pbs.invalidISBN13, Literal(isbn)))

 
with open("../final_datasets/bnf.ttl","wb") as file:
    graph_bnf.serialize(file) 

Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x000001DD0027D080>
Traceback (most recent call last):
  File "C:\Users\dre\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\rdflib\term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
           ^^^^^^^^^^^^^^^^^^
  File "C:\Users\dre\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\isodate\isodates.py", line 203, in parse_date
    raise ISO8601Error('Unrecognised ISO 8601 date format: %r' % datestring)
isodate.isoerror.ISO8601Error: Unrecognised ISO 8601 date format: 'Port- d-es'
Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x000001DD0027D080>
Traceback (most recent call last)

isbn and ean not valid  
isbn and ean not valid  
isbn and ean not valid  
isbn and ean not valid  
isbn and ean not valid  
isbn and ean not valid  
isbn and ean not valid 2081617074 2081617074
isbn and ean not valid  
isbn and ean not valid  
isbn and ean not valid  
isbn and ean not valid  
isbn and ean not valid 2211012152 2211012152
isbn and ean not valid 2211016832 2211016832
isbn and ean not valid 2211026953 2211026953
isbn and ean not valid  
isbn and ean not valid  
isbn and ean not valid  
isbn and ean not valid  
isbn and ean not valid  
isbn and ean not valid  
isbn and ean not valid 2904292424 2904292424
isbn and ean not valid 2878330885 2878330885
isbn and ean not valid  
isbn and ean not valid  
isbn and ean not valid 221102684X 221102684X
isbn and ean not valid  
isbn and ean not valid  
isbn and ean not valid 2226070696 2226070696
isbn and ean not valid 2910635685 2910635685
isbn and ean not valid  
isbn and ean not valid 2868770941 2868770941
isbn and ean not valid  


### constellations

In [41]:
graph_constellations = Graph()
graph_constellations.parse(constellations_authorURI_file_path, format="turtle")

for constellations_book in graph_constellations.subjects(RDF.type, utils.schema.Book):
    book_data_source = utils.extract_data_constellations(graph_constellations, constellations_book)
    if len(book_data_source.isbn) == 1:
        isbn = isbnlib.canonical(book_data_source.isbn[0])
    elif len(book_data_source.isbn) > 1:
        assert(False)
    else:
        isbn = ""
    graph_constellations.remove((constellations_book, SCHEMA.isbn, None)) 
    if len(isbn) == 9:
        isbn = sbn_to_isbn10(isbn)
    
    valid_isbn13 = isbnlib.to_isbn13(isbn)
    if valid_isbn13:
        graph_constellations.add((constellations_book, SCHEMA.isbn, Literal(valid_isbn13)))
    else:
        print("isbn not valid", isbn, "for book", constellations_book)
        graph_constellations.add((constellations_book, utils.pbs.invalidISBN13, Literal(isbn)))

with open("../final_datasets/constellations.ttl","wb") as file:
    graph_constellations.serialize(file)

isbn not valid 9782070610942 for book http://schema.org/Book0386261c-576c-483c-a8f5-173f57bbe036
isbn not valid  for book http://schema.org/Book05a14fe4-7672-4f35-9879-cf6b373a6ce7
isbn not valid  for book http://schema.org/Book1047b2ef-29ae-4e1b-bc6c-79740550f68b
isbn not valid 9782894281591 for book http://schema.org/Book13a0e400-2761-4932-881b-e2a6e47bea6a
isbn not valid 2278056729 for book http://schema.org/Book14b01b25-d3bf-415e-a6ea-ce4e7f14d5ec
isbn not valid  for book http://schema.org/Book160d93d9-f47a-4169-9ea4-b8141a97c40d
isbn not valid 2844141040 for book http://schema.org/Book18469950-c918-4e29-b0fd-16828a786983
isbn not valid  for book http://schema.org/Book1eb3f32f-bf76-407a-aeef-d784905ad6d0
isbn not valid  for book http://schema.org/Book20d84d10-1c67-49bd-b350-f76cf9289900
isbn not valid  for book http://schema.org/Book21fc84a1-6315-4652-aa74-60e8fa7ee621
isbn not valid 9782923163303 for book http://schema.org/Book228a80d3-e380-4a36-909d-e2eed573a572
isbn not valid  f

### babelio

In [42]:
graph_babelio = Graph()
graph_babelio.parse(babelio_authorURI_file_path, format="turtle")

for babelio_book in graph_babelio.subjects(RDF.type, utils.schema.Book):
    book_data_source = utils.extract_data_babelio(graph_babelio, babelio_book)
    if len(book_data_source.ean) == 1:
        isbn = isbnlib.canonical(book_data_source.ean[0])
    elif len(book_data_source.ean) > 1:
        assert(False)
    else:
        isbn = ""

    graph_babelio.remove((babelio_book, PBS.ean, None)) # all isbn are stored in ean -> doesn't respect schema standart
    
    if len(isbn) == 9:
        isbn = sbn_to_isbn10(isbn)
    
    valid_isbn13 = isbnlib.to_isbn13(isbn)
    if valid_isbn13:
        graph_babelio.add((babelio_book, SCHEMA.isbn, Literal(valid_isbn13)))
    elif isbn and not valid_isbn13:
        print("isbn not valid", isbn, "for book", babelio_book)
        graph_babelio.add((babelio_book, utils.pbs.invalidISBN13, Literal(isbn)))

with open("../final_datasets/babelio.ttl","wb") as file:
    graph_babelio.serialize(file)


In [43]:
if DELETE_INTERMEDIARY_FILES:
    os.remove(constellations_NoDuplicatePublisher_file_path)
    os.remove(bnf_NoDuplicatePublisher_file_path)
    os.remove(babelio_NoDuplicatePublisher_file_path)
    os.remove(constellations_authorURI_file_path)
    os.remove(bnf_authorURI_file_path)
    os.remove(babelio_authorURI_file_path)
    os.remove(publisher_BNF_Constellation_Babelio_file_path)