In [8]:
from rdflib import Graph
from rdflib import RDF
import sys
from pathlib import Path
sys.path.append('../')

sys.path.append('../andre')
import andre.utils as utils
from andre.utils import schema as SCHEMA
import collections
import isbnlib
import itertools


# Relative directory that holds the Turtle (.ttl) knowledge-base files parsed in the cells below.
data_path = Path("final_datasets/knowledge_base")

In [9]:
def print_number_of_isbns_per_book_count(isbn_array):
    """Print how many books carry 0, 1, 2, ... ISBNs.

    Args:
        isbn_array: sequence with one sized entry per book (that book's ISBNs).

    Prints the number of books, then one line per distinct ISBN count, in the
    order in which each count is first encountered.
    """
    books_per_isbn_count = collections.Counter(map(len, isbn_array))
    print("--- number of isbns per book:")
    print("total length: ", len(isbn_array))
    for isbn_count in books_per_isbn_count:
        occurrences = books_per_isbn_count[isbn_count]
        print(f"Length {isbn_count}: {occurrences} occurrences")
    print()


def print_different_books_with_same_isbn_count(isbn_dict):
    """Print how many ISBNs are shared by 1, 2, 3, ... book records.

    Args:
        isbn_dict: mapping from ISBN to the sized collection of books carrying it.

    Prints the number of distinct ISBNs, then one line per distinct group size,
    in the order in which each size is first encountered.
    """
    isbns_per_group_size = collections.Counter(
        len(books) for books in isbn_dict.values()
    )
    print("--- different books with same isbn:")
    print("total length: ", len(isbn_dict))
    for group_size in isbns_per_group_size:
        occurrences = isbns_per_group_size[group_size]
        print(f"Length {group_size}: {occurrences} occurrences")
    print()


def create_isbn_dict(graph, source):
    """Index the books of one source graph by ISBN and print summary counts.

    Args:
        graph: rdflib graph; every subject typed ``SCHEMA.Book`` is processed.
        source: name of the extractor to use, one of "btlf", "bnf",
            "constellations" or "babelio".

    Returns:
        A tuple ``(isbn_dict, isbn_array)``. ``isbn_dict`` maps each ISBN to the
        list of extracted book records that carry it. ``isbn_array`` holds one
        entry per book: that book's ``isbn`` collection (possibly empty).

    Raises:
        ValueError: if ``source`` is not one of the supported names.
    """
    supported_sources = ("btlf", "bnf", "constellations", "babelio")
    # Fail fast: without this check an unknown source left ``book_data`` unbound
    # inside the loop, or silently produced empty results on an empty graph.
    if source not in supported_sources:
        raise ValueError(
            f"Unknown source {source!r}; expected one of {', '.join(supported_sources)}"
        )
    # The extractors in ``utils`` are named extract_data_<source>; the name is
    # only resolved after ``source`` has been checked against the whitelist.
    extract_book_data = getattr(utils, f"extract_data_{source}")

    isbn_dict = collections.defaultdict(list)
    isbn_array = []
    for book in graph.subjects(RDF.type, SCHEMA.Book):
        book_data = extract_book_data(graph, book)

        # A book may carry several ISBNs; register it under each of them.
        for isbn in book_data.isbn:
            isbn_dict[isbn].append(book_data)

        # Kept even when empty so that len(isbn_array) equals the book count.
        isbn_array.append(book_data.isbn)
    print_number_of_isbns_per_book_count(isbn_array)
    print_different_books_with_same_isbn_count(isbn_dict)
    return isbn_dict, isbn_array


def process_isbn_list(isbn_array):
    """Flatten per-book ISBN collections, de-duplicate them and convert to ISBN-13.

    Args:
        isbn_array: iterable with one iterable of ISBN strings per book.

    Returns:
        A list with ``isbnlib.to_isbn13(isbn)`` for every distinct input ISBN.
        The list follows set iteration order and is not de-duplicated again
        after conversion.

    Also prints how many converted values equal ``''`` and the size of the
    returned list.
    """
    distinct_isbns = {isbn for book_isbns in isbn_array for isbn in book_isbns}
    isbn13_array = []
    for raw_isbn in distinct_isbns:
        isbn13_array.append(isbnlib.to_isbn13(raw_isbn))
    # '' is presumably what isbnlib returns for a value it cannot convert — confirm.
    blank_total = sum(1 for converted in isbn13_array if converted == '')
    print("Number of empty elements:", blank_total)
    print("Number of unique isbns 13:", len(isbn13_array))
    return isbn13_array

# BTLF

In [10]:
# Load the BTLF Turtle file into its own rdflib graph.
graph_btlf = Graph()
graph_btlf.parse((data_path / "BTLF.ttl").as_posix(), format="turtle")


<Graph identifier=N9b0a0d848811432d9cdb014ac79e1d08 (<class 'rdflib.graph.Graph'>)>

In [11]:
# Index BTLF books by ISBN; the array has one entry per book, so its length is the book count.
btlf_isbn_dict, btlf_isbn_array = create_isbn_dict(graph_btlf, "btlf")
# Regression guard on the number of BTLF book records.
assert(len(btlf_isbn_array) == 26920)

--- number of isbns per book:
total length:  26920
Length 1: 26920 occurrences

--- different books with same isbn:
total length:  26920
Length 1: 26920 occurrences



In [12]:
# Flatten, de-duplicate and convert the BTLF ISBNs to ISBN-13.
btlf_isbns = process_isbn_list(btlf_isbn_array)

Number of empty elements: 0
Number of unique isbns 13: 26920


# BNF

In [13]:
# Load the BNF Turtle file into its own rdflib graph.
# The recorded output shows rdflib logging "Failed to convert Literal lexical form"
# for xsd:date literals that are not valid ISO 8601 dates; the parse still returns the graph.
graph_bnf = Graph()
graph_bnf.parse((data_path / "bnf.ttl").as_posix(), format="turtle")


Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x00000223AC7C0680>
Traceback (most recent call last):
  File "c:\Users\dre\Documents\projet biblio scolaire\Books-Python\books-python-env\Lib\site-packages\rdflib\term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
           ^^^^^^^^^^^^^^^^^^
  File "c:\Users\dre\Documents\projet biblio scolaire\Books-Python\books-python-env\Lib\site-packages\isodate\isodates.py", line 203, in parse_date
    raise ISO8601Error('Unrecognised ISO 8601 date format: %r' % datestring)
isodate.isoerror.ISO8601Error: Unrecognised ISO 8601 date format: 'Port- d-es'
Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x00000223AC7C0680>
Traceback (most recent call last):
  File "c:\Users\dre\Documents\projet biblio scolaire\Books-Python\books-python-env\

<Graph identifier=N408d9e81aa81402eb4b1b509522e1151 (<class 'rdflib.graph.Graph'>)>

In [14]:
# Index BNF books by ISBN; the array has one entry per book, so its length is the book count.
bnf_isbn_dict, bnf_isbn_array = create_isbn_dict(graph_bnf, "bnf")
# Regression guard on the number of BNF book records.
assert(len(bnf_isbn_array) == 30384)

--- number of isbns per book:
total length:  30384
Length 0: 366 occurrences
Length 1: 29828 occurrences
Length 2: 190 occurrences

--- different books with same isbn:
total length:  29914
Length 1: 29669 occurrences
Length 2: 226 occurrences
Length 3: 12 occurrences
Length 4: 2 occurrences
Length 9: 1 occurrences
Length 6: 2 occurrences
Length 17: 1 occurrences
Length 5: 1 occurrences



In [15]:
# Flatten, de-duplicate and convert the BNF ISBNs to ISBN-13.
bnf_isbns = process_isbn_list(bnf_isbn_array)

Number of empty elements: 0
Number of unique isbns 13: 29914


# Constellations

In [16]:
# Load the Constellations Turtle file into its own rdflib graph.
graph_constellations = Graph()
graph_constellations.parse((data_path / "constellations.ttl").as_posix(), format="turtle")

<Graph identifier=N8e66f466be3a4b86913d6a8040e417b8 (<class 'rdflib.graph.Graph'>)>

In [17]:
# Index Constellations books by ISBN; the array has one entry per book, so its length is the book count.
constellations_isbn_dict, constellations_isbn_array = create_isbn_dict(graph_constellations, "constellations")
# Regression guard on the number of Constellations book records.
assert(len(constellations_isbn_array) == 11267)

--- number of isbns per book:
total length:  11267
Length 1: 11177 occurrences
Length 0: 90 occurrences

--- different books with same isbn:
total length:  11177
Length 1: 11177 occurrences



In [18]:
# Flatten, de-duplicate and convert the Constellations ISBNs to ISBN-13.
constellations_isbns = process_isbn_list(constellations_isbn_array)

Number of empty elements: 0
Number of unique isbns 13: 11177


# Babelio

In [19]:
# Load the Babelio Turtle file into its own rdflib graph.
graph_babelio = Graph()
graph_babelio.parse((data_path / "babelio.ttl").as_posix(), format="turtle")


<Graph identifier=Nd6458d3da7ab45b2891bf61c0b610a56 (<class 'rdflib.graph.Graph'>)>

In [20]:
# Index Babelio books by ISBN; the array has one entry per book, so its length is the book count.
babelio_isbn_dict, babelio_isbn_array = create_isbn_dict(graph_babelio, "babelio")
# Regression guard on the number of Babelio book records.
assert(len(babelio_isbn_array) == 4169)

--- number of isbns per book:
total length:  4169
Length 1: 2252 occurrences
Length 0: 1917 occurrences

--- different books with same isbn:
total length:  2190
Length 1: 2133 occurrences
Length 2: 54 occurrences
Length 3: 1 occurrences
Length 4: 2 occurrences



In [21]:
# Flatten, de-duplicate and convert the Babelio ISBNs to ISBN-13.
babelio_isbns = process_isbn_list(babelio_isbn_array)

Number of empty elements: 0
Number of unique isbns 13: 2190


# Merge ISBN arrays

In [22]:
# Pool the ISBN-13 lists of the four sources; ISBNs shared across sources are still repeated here.
all_isbns = [*btlf_isbns, *bnf_isbns, *constellations_isbns, *babelio_isbns]
print(len(all_isbns))
# Regression guard: sum of the per-source list sizes (BTLF, BNF, Constellations, Babelio).
assert len(all_isbns) == 26920 + 29914 + 11177 + 2190
# Collapse the ISBNs that appear in more than one source.
all_isbns_unique = list({*all_isbns})
print(len(all_isbns_unique))

70201
40109


In [23]:
# Write one ISBN per line.
# Mode "w" (not "a"): re-running the cell rewrites the file instead of appending
# a second copy of every ISBN.
# sorted(): all_isbns_unique comes from a set, whose iteration order is not stable
# between kernel runs; sorting makes the file reproducible.
# NOTE(review): the target is "../" / data_path, whereas the .ttl inputs are read
# from data_path itself — confirm the different base directory is intended.
with open(("../" / data_path / "isbns_base_connaissance.txt").as_posix(), "w", encoding="utf-8") as file:
    for isbn in sorted(all_isbns_unique):
        file.write(isbn + "\n")