# Goal of notebook

- Understand where the BTLF data comes from: BNF, Constellations, or Babelio.
- Understand why data is missing for each source.

In [1]:
from rdflib import Graph, Namespace, Literal, URIRef
from rdflib import RDF, RDFS, OWL, XSD
import sys
import Levenshtein
sys.path.append('../')

sys.path.append('../andre')
import andre.utils as utils
from andre.utils import schema as SCHEMA
import collections
import isbnlib

In [2]:
def print_number_of_isbns_per_book_count(isbn_array):
    """Print a histogram of how many ISBNs each book carries.

    Args:
        isbn_array: sequence with one sized collection of ISBNs per book.
    """
    # Counter keeps first-seen order, so buckets are printed in the order
    # in which each ISBN count is first encountered.
    histogram = collections.Counter(len(book_isbns) for book_isbns in isbn_array)
    print("--- number of isbns per book:")
    print("total length: ", len(isbn_array))
    for isbn_count, book_count in histogram.items():
        print(f"Length {isbn_count}: {book_count} occurrences")
    print()


def print_different_books_with_same_isbn_count(isbn_dict):
    """Print a histogram of how many books share each ISBN.

    Args:
        isbn_dict: mapping of ISBN to the sized collection of books
            registered under that ISBN.
    """
    # One bucket per "number of books under the same ISBN", in first-seen order.
    histogram = collections.Counter(map(len, isbn_dict.values()))
    print("--- different books with same isbn:")
    print("total length: ", len(isbn_dict))
    for books_per_isbn, isbn_count in histogram.items():
        print(f"Length {books_per_isbn}: {isbn_count} occurrences")
    print()


def create_isbn_dict(graph, source):
    """Index every schema:Book of ``graph`` by ISBN and print summary counts.

    Args:
        graph: rdflib graph holding the dataset.
        source: which extractor of ``utils`` to use; one of "btlf", "bnf",
            "constellations" or "babelio".

    Returns:
        A tuple ``(isbn_dict, isbn_array)``:
        - ``isbn_dict`` is a defaultdict mapping each ISBN to the list of
          book data objects carrying it;
        - ``isbn_array`` holds, per book, its ``isbn`` collection (books
          without any ISBN are therefore still represented here).

    Raises:
        ValueError: if ``source`` is not one of the supported names.
    """
    supported_sources = ("btlf", "bnf", "constellations", "babelio")
    # Fail early: previously an unknown source left `book_data` unbound and
    # only crashed (or reused stale data) once the first book was reached.
    if source not in supported_sources:
        raise ValueError(
            f"unknown source {source!r}; expected one of {supported_sources}"
        )
    # The extractors follow the naming scheme utils.extract_data_<source>.
    extract_book_data = getattr(utils, f"extract_data_{source}")

    isbn_dict = collections.defaultdict(list)
    isbn_array = []
    for book in graph.subjects(RDF.type, SCHEMA.Book):
        book_data = extract_book_data(graph, book)

        for isbn in book_data.isbn:
            isbn_dict[isbn].append(book_data)

        isbn_array.append(book_data.isbn)
    print_number_of_isbns_per_book_count(isbn_array)
    print_different_books_with_same_isbn_count(isbn_dict)
    return isbn_dict, isbn_array



# BTLF

In [3]:
# Load the BTLF dataset from its Turtle file into an in-memory rdflib graph.
graph_btlf = Graph()
graph_btlf.parse("final_datasets/BTLF.ttl", format="turtle")

<Graph identifier=N29896910968a4f4b848787ab8ca55db7 (<class 'rdflib.graph.Graph'>)>

In [4]:
# Index BTLF books by ISBN; the assert pins the expected number of books so a
# changed dataset file is noticed immediately.
btlf_isbn_dict, btlf_isbn_array = create_isbn_dict(graph_btlf, "btlf")
assert len(btlf_isbn_array) == 26920

--- number of isbns per book:
total length:  26920
Length 1: 26920 occurrences

--- different books with same isbn:
total length:  26920
Length 1: 26920 occurrences



# bnf

In [5]:
# Load the BNF dataset from its Turtle file into an in-memory rdflib graph.
# NOTE: rdflib reports "Failed to convert Literal lexical form" for malformed
# xsd:date values in this file (e.g. 'Port- d-es'); the graph is still returned.
graph_bnf = Graph()
graph_bnf.parse("final_datasets/bnf.ttl", format="turtle")

Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x000001851BE8CB80>
Traceback (most recent call last):
  File "C:\Users\dre\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\rdflib\term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
           ^^^^^^^^^^^^^^^^^^
  File "C:\Users\dre\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\isodate\isodates.py", line 203, in parse_date
    raise ISO8601Error('Unrecognised ISO 8601 date format: %r' % datestring)
isodate.isoerror.ISO8601Error: Unrecognised ISO 8601 date format: 'Port- d-es'
Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x000001851BE8CB80>
Traceback (most recent call last)

<Graph identifier=N3dbf0b514a734b94878636073dcced8a (<class 'rdflib.graph.Graph'>)>

In [6]:
# Index BNF books by ISBN; the assert pins the expected number of books.
bnf_isbn_dict, bnf_isbn_array = create_isbn_dict(graph_bnf, "bnf")
assert(len(bnf_isbn_array) == 30384)

# Regex to search bnf.ttl for books with two ISBNs (presumably an ISBN value
# followed by a comma means another value follows for the same book):
# ns1:isbn "[0-9]+",

--- number of isbns per book:
total length:  30384
Length 0: 366 occurrences
Length 1: 29828 occurrences
Length 2: 190 occurrences

--- different books with same isbn:
total length:  29914
Length 1: 29669 occurrences
Length 2: 226 occurrences
Length 3: 12 occurrences
Length 4: 2 occurrences
Length 9: 1 occurrences
Length 6: 2 occurrences
Length 17: 1 occurrences
Length 5: 1 occurrences



In [7]:
def is_language_french(language):
    """Return True when ``language`` is one of the two French labels in use.

    The accepted spellings are the full name "Français" and the code "fre".
    """
    return language in ("Français", "fre")

In [8]:
# Partition ISBNs into: in both BNF and BTLF, only in BNF, only in BTLF.
bnf_btlf = []
not_bnf_btlf = []
bnf_not_btlf = []
not_bnf_not_btlf = [] # should stay empty

for isbn_bnf in bnf_isbn_dict:
    if isbn_bnf in btlf_isbn_dict:
        bnf_btlf.append(isbn_bnf)
    else:
        bnf_not_btlf.append(isbn_bnf)

# Set copy of the intersection so the sanity check below is a constant-time
# lookup instead of a scan of the whole list for every BTLF ISBN.
bnf_btlf_lookup = set(bnf_btlf)
for isbn_btlf in btlf_isbn_dict:
    if isbn_btlf not in bnf_isbn_dict:
        not_bnf_btlf.append(isbn_btlf)
    elif isbn_btlf not in bnf_btlf_lookup:
        # Consistency check: an ISBN present on both sides must already be in
        # the intersection built from the BNF side.
        print("error")

In [9]:
# Sizes of the three partitions: shared ISBNs, ISBNs in BNF but not in BTLF,
# and ISBNs in BTLF but not in BNF (labels are printed in French).
print("intersection", len(bnf_btlf))
print("dans bnf mais pas btlf", len(bnf_not_btlf))
print("dans btlf mais pas bnf", len(not_bnf_btlf))

intersection 20970
dans bnf mais pas btlf 8944
dans btlf mais pas bnf 5950


In [10]:
# verifier proportion livres non francais dans (BNF not BTLF)
non_french_book_counter = 0
french_book_counter = 0
non_isbn_13_counter = 0

for isbn in bnf_not_btlf:
    if isbnlib.is_isbn13(isbn) and not is_language_french(bnf_isbn_dict[isbn][0].language):
        non_french_book_counter += 1
    elif isbnlib.is_isbn13(isbn) and is_language_french(bnf_isbn_dict[isbn][0].language):
        french_book_counter += 1
    if not isbnlib.is_isbn13(isbn):
        non_isbn_13_counter += 1

print("french", french_book_counter)
print("non french", non_french_book_counter)
print("non isbn13", non_isbn_13_counter)

french 7729
non french 1215
non isbn13 0


In [11]:
# Number of BNF-only ISBNs that isbnlib does not accept as ISBN-13.
print("non isbn13 in bnf not present in btlf", sum(1 for candidate in bnf_not_btlf if not isbnlib.is_isbn13(candidate)))

non isbn13 in bnf not present in btlf 0


# constellations

In [12]:
# Load the Constellations dataset from its Turtle file into an rdflib graph.
graph_constellations = Graph()
graph_constellations.parse("final_datasets/constellations.ttl", format="turtle")

<Graph identifier=N3cbc2187578e43cdaafb7af7c46e39bf (<class 'rdflib.graph.Graph'>)>

In [13]:
# Index Constellations books by ISBN; the assert pins the expected book count.
constellations_isbn_dict, constellations_isbn_array = create_isbn_dict(
    graph_constellations, "constellations"
)
assert len(constellations_isbn_array) == 11267

--- number of isbns per book:
total length:  11267
Length 1: 11177 occurrences
Length 0: 90 occurrences

--- different books with same isbn:
total length:  11177
Length 1: 11177 occurrences



In [14]:
# Partition ISBNs into: in both Constellations and BTLF, only in
# Constellations, only in BTLF.
constellations_btlf = []
not_constellations_btlf = []
constellations_not_btlf = []
not_constellations_not_btlf = [] # should stay empty

for isbn_constellations in constellations_isbn_dict:
    if isbn_constellations in btlf_isbn_dict:
        constellations_btlf.append(isbn_constellations)
    else:
        constellations_not_btlf.append(isbn_constellations)

# Set copy of the intersection so the sanity check below is a constant-time
# lookup instead of a scan of the whole list for every BTLF ISBN.
constellations_btlf_lookup = set(constellations_btlf)
for isbn_btlf in btlf_isbn_dict:
    if isbn_btlf not in constellations_isbn_dict:
        not_constellations_btlf.append(isbn_btlf)
    elif isbn_btlf not in constellations_btlf_lookup:
        # Consistency check: an ISBN present on both sides must already be in
        # the intersection built from the Constellations side.
        print("error")

In [15]:
# Sizes of the three partitions: shared ISBNs, ISBNs in Constellations but not
# in BTLF, and ISBNs in BTLF but not in Constellations (labels are in French).
print("intersection", len(constellations_btlf))
print("dans constellations mais pas btlf", len(constellations_not_btlf))
print("dans btlf mais pas constellations", len(not_constellations_btlf))

intersection 6721
dans constellations mais pas btlf 4456
dans btlf mais pas constellations 20199


In [16]:
# Display the ISBNs shared by Constellations and BTLF (the full list is shown).
constellations_btlf

['9782350007625',
 '9782742769681',
 '9782895291930',
 '9782848017860',
 '9782924277812',
 '9782352900658',
 '9782896071449',
 '9782081305410',
 '9782330039516',
 '9782211097734',
 '9781474907354',
 '9782747058261',
 '9782070626489',
 '9782203075344',
 '9782812600593',
 '9782070576418',
 '9782740427798',
 '9782748520637',
 '9782745980663',
 '9782362900617',
 '9782742785995',
 '9782745933188',
 '9782362900433',
 '9782070641536',
 '9782035861986',
 '9782253073093',
 '9782745968777',
 '9782373491746',
 '9782922585506',
 '9782070659043',
 '9782732443522',
 '9782894289372',
 '9782812618840',
 '9782896080823',
 '9782764408520',
 '9782070589876',
 '9782266305914',
 '9782211092876',
 '9782211091336',
 '9782354132200',
 '9782355040627',
 '9782211089876',
 '9782215135302',
 '9782211078764',
 '9782020639507',
 '9782918689119',
 '9782733838730',
 '9782211208055',
 '9782070629923',
 '9782211207928',
 '9782882583888',
 '9782354883058',
 '9782812606830',
 '9782764445044',
 '9782211093750',
 '97823550

In [17]:
# Number of BTLF ISBNs that are absent from BNF but present in Constellations.
len(set(not_bnf_btlf).intersection(constellations_btlf))

4881

In [18]:
# Number of Constellations-only ISBNs that isbnlib does not accept as ISBN-13.
print("non isbn13 in constellations", sum(1 for candidate in constellations_not_btlf if not isbnlib.is_isbn13(candidate)))

non isbn13 in constellations 0


# investigation of BTLF sources

In [19]:
# Attribute every BTLF ISBN to BNF, Constellations, both, or neither.
# Here `bnf_counter` / `constellation_counter` count ISBNs found in that
# source ONLY; ISBNs found in neither source are printed one per line.
constellation_counter = 0
bnf_counter = 0
constellations_and_bnf_counter = 0
source_unknown_counter = 0

for btlf_book in btlf_isbn_dict:
    in_bnf = btlf_book in bnf_isbn_dict
    in_constellations = btlf_book in constellations_isbn_dict
    if in_bnf and in_constellations:
        constellations_and_bnf_counter += 1
    elif in_bnf:
        bnf_counter += 1
    elif in_constellations:
        constellation_counter += 1
    else:
        print(btlf_book)
        source_unknown_counter += 1

print("constellations and bnf", constellations_and_bnf_counter)
print("constellations only", constellation_counter)
print("bnf only", bnf_counter)
print("source unkown", source_unknown_counter)

9782924645536
9782897091279
9782896579006
9782895911210
9782924996003
9782896578382
9782890749658
9782896576654
9782895370642
9782896332083
9782890463660
9782890469105
9782896577422
9782894354889
9782894355619
9782895850892
9782897512538
9782890218383
9782897740153
9782896510955
9782890218239
9782896512553
9782923342191
9782897140090
9782923342344
9782892950120
9782923342030
9782896074136
9782897142872
9782897700133
9782922225723
9782923896151
9782897141257
9782922225990
9782896070077
9782896480562
9782923813745
9782896074259
9782897701499
9782893813127
9782924563953
9782922225518
9782896074013
9782896074464
9782896071272
9782896481125
9782897525385
9782897125776
9782762117363
9782897125417
9782895129783
9782897770532
9782760942219
9782896867196
9782895123897
9782760999169
9782922892970
9782895124504
9782897852948
9782760933521
9782895124658
9782760942141
9782760947719
9782760933385
9782923196084
9782898011139
9782895129073
9782760942226
9782895123118
9782760942257
9782922868104
978289

# babelio: last BTLF source

In [20]:
# Load the Babelio dataset from its Turtle file into an in-memory rdflib graph.
graph_babelio = Graph()
graph_babelio.parse("final_datasets/babelio.ttl", format="turtle")

<Graph identifier=N93661a46dfef4a30bbc27afdd2e02f80 (<class 'rdflib.graph.Graph'>)>

In [21]:
# Index Babelio books by ISBN; the assert pins the expected number of books.
babelio_isbn_dict, babelio_isbn_array = create_isbn_dict(graph_babelio, "babelio")
assert len(babelio_isbn_array) == 4169

--- number of isbns per book:
total length:  4169
Length 1: 2252 occurrences
Length 0: 1917 occurrences

--- different books with same isbn:
total length:  2190
Length 1: 2133 occurrences
Length 2: 54 occurrences
Length 3: 1 occurrences
Length 4: 2 occurrences



In [22]:
# BTLF ISBNs that appear in none of the three sources (BNF, Constellations,
# Babelio), kept in BTLF iteration order.
source_unknown_isbns = [
    btlf_book
    for btlf_book in btlf_isbn_dict
    if btlf_book not in bnf_isbn_dict
    and btlf_book not in constellations_isbn_dict
    and btlf_book not in babelio_isbn_dict
]
source_unknown_counter = len(source_unknown_isbns)

print("source unkown", source_unknown_counter)

source unkown 0


In [23]:
# Display the BTLF ISBNs found in none of the three sources.
source_unknown_isbns

[]

In [24]:
# Count, over all BTLF ISBNs, membership in each source and in every
# combination of sources. The plain per-source counters and the pairwise
# counters are inclusive (an ISBN in all three sources also counts in each
# pair); the "*_only_*" counters require absence from the other two sources.
constellations_and_bnf_and_babelio_counter = 0
constellations_and_bnf_counter = 0
constellations_and_babelio_counter = 0
babelio_and_bnf_counter = 0
constellation_only_counter = 0
bnf_only_counter = 0
babelio_only_counter = 0
bnf_counter = 0
constellations_counter = 0
babelio_counter = 0

for btlf_book in btlf_isbn_dict:
    # Evaluate each membership once and reuse it in all combinations below.
    in_bnf = btlf_book in bnf_isbn_dict
    in_constellations = btlf_book in constellations_isbn_dict
    in_babelio = btlf_book in babelio_isbn_dict

    if in_bnf:
        bnf_counter += 1
    if in_constellations:
        constellations_counter += 1
    if in_babelio:
        babelio_counter += 1

    if in_bnf and in_constellations:
        constellations_and_bnf_counter += 1
        if in_babelio:
            constellations_and_bnf_and_babelio_counter += 1
    if in_constellations and in_babelio:
        constellations_and_babelio_counter += 1
    if in_bnf and in_babelio:
        babelio_and_bnf_counter += 1

    if in_bnf and not in_constellations and not in_babelio:
        bnf_only_counter += 1
    if in_constellations and not in_bnf and not in_babelio:
        constellation_only_counter += 1
    if in_babelio and not in_bnf and not in_constellations:
        babelio_only_counter += 1

print("constellations", constellations_counter)
print("bnf", bnf_counter)
print("babelio", babelio_counter)
print("constellations and bnf and babelio", constellations_and_bnf_and_babelio_counter)
print("constellations and bnf", constellations_and_bnf_counter)
print("constellations and babelio", constellations_and_babelio_counter)
print("babelio and bnf", babelio_and_bnf_counter)
print("babelio only", babelio_only_counter)
print("bnf only", bnf_only_counter)
print("constellations only", constellation_only_counter)

constellations 6721
bnf 20970
babelio 2052
constellations and bnf and babelio 135
constellations and bnf 1840
constellations and babelio 251
babelio and bnf 867
babelio only 1069
bnf only 18398
constellations only 4765


# author alignment stats

In [66]:
class Author_Alignment:
    """Measure how well author/illustrator names of one source match BTLF.

    Books are paired by ISBN, and only the first book registered under an ISBN
    is used on each side. Names are normalised with
    ``utils.preprocess_author_name`` and compared by Levenshtein distance.

    Args:
        source: source label; "bnf", "constellations" or "babelio".
        isbn_dict: mapping ISBN -> list of book data objects of the source.
        btlf_dict: mapping ISBN -> list of BTLF book data objects. When
            omitted, the notebook-level ``btlf_isbn_dict`` is used.
    """

    def __init__(self, source, isbn_dict, btlf_dict=None):
        self.source = source
        self.total_author_alignment_count = 0
        self.total_author_alignment_count_levenshtein = 0
        self.matched_isbn_count = 0
        self.one_author_alignment_count = 0 # counts if at least 1 author/illustrator is aligned per isbn
        self.source_count = 0
        self.btlf_count = 0
        self.isbn_dict = isbn_dict
        # Explicit BTLF index; falls back to the notebook global for callers
        # that do not pass one.
        self.btlf_dict = btlf_isbn_dict if btlf_dict is None else btlf_dict
        self.non_aligned_authors = []
        self.metadata_different = set()
        self.one_author_alignment_count_levenshtein = 0
        self.levenshtein_distance = 3

    @staticmethod
    def _reorder_btlf_name(author_btlf):
        """Turn a two-part "X,Y" name into "Y X"; other names are returned unchanged."""
        parts = author_btlf.rsplit(",")
        if len(parts) == 2:
            return parts[1] + " " + parts[0]
        return author_btlf

    @staticmethod
    def _authors_with_illustrator(book_data):
        """Return the book's authors, plus its illustrator when one is set."""
        if book_data.illustrator:
            return book_data.book_authors + [book_data.illustrator]
        return book_data.book_authors

    @staticmethod
    def _percentage(numerator, denominator):
        """Rounded percentage; 0 when the denominator is 0 (nothing was counted)."""
        if not denominator:
            return 0
        return round(numerator / denominator * 100)

    def align_with_levenshtein(self, author_source, author_btlf, isbn, levenshtein_distance=0):
        """Return True when both normalised names are within ``levenshtein_distance``.

        ``isbn`` is only used in the diagnostic line printed for matches found
        at a distance of exactly 3.
        """
        author_source_preprocessed = utils.preprocess_author_name(author_source)
        author_btlf_preprocessed = utils.preprocess_author_name(author_btlf)
        distance = Levenshtein.distance(author_source_preprocessed, author_btlf_preprocessed)
        if (len(author_source_preprocessed) > distance + 1 # to avoid false positives with short names and big distances
            and len(author_btlf_preprocessed) > distance + 1
            and distance <= levenshtein_distance):
            if distance == 3:
                # Show the loosest accepted matches so they can be reviewed by eye.
                print(author_source_preprocessed, author_btlf_preprocessed, distance, isbn)
            return True
        return False

    def extract_authors_illustrators(self, isbn, include_illustrators=True):
        """Return ``(source_names, btlf_names)`` for the first book under ``isbn``.

        The BTLF list always includes the illustrator when one is set. The
        source list includes it only for "constellations" and "babelio" and
        only when ``include_illustrators`` is true; for "bnf" only
        ``book_authors`` is used.

        Raises:
            ValueError: if illustrators are requested for an unknown source.
        """
        authors_illustrators_btlf = self._authors_with_illustrator(self.btlf_dict[isbn][0])
        source_book_data = self.isbn_dict[isbn][0]
        if not include_illustrators or self.source == "bnf":
            authors_illustrators_source = source_book_data.book_authors
        elif self.source in ("constellations", "babelio"):
            authors_illustrators_source = self._authors_with_illustrator(source_book_data)
        else:
            # Previously this fell through and crashed with an unbound local.
            raise ValueError(f"unknown source {self.source!r}")
        return authors_illustrators_source, authors_illustrators_btlf

    def count_total_author_alignments_source(self): # will count every alignement
        """Count every aligned (source name, BTLF name) pair over shared ISBNs."""
        print("## total author alignment levenshtein distance = 0&3")
        # Reset everything this pass accumulates so a re-run does not double count.
        self.source_count = 0
        self.btlf_count = 0
        self.total_author_alignment_count = 0
        self.total_author_alignment_count_levenshtein = 0
        for btlf_book in self.btlf_dict:
            if btlf_book in self.isbn_dict:
                self.count_author_alignements_isbn(btlf_book)

    def count_one_author_alignments_source(self): # will check if at least one author is aligned per isbn
        """Count shared ISBNs having at least one exactly matching name."""
        print("## one author alignment levenshtein distance = 0")
        self.matched_isbn_count = 0
        self.one_author_alignment_count = 0
        for btlf_book in self.btlf_dict:
            if btlf_book in self.isbn_dict:
                self.matched_isbn_count += 1
                if self.is_one_author_aligned_isbn(btlf_book):
                    self.one_author_alignment_count += 1

    def count_one_author_alignments_source_levensthein(self): # different method because is_one_author_aligned_isbn returns bineary value
        """Same as the exact variant but tolerating ``self.levenshtein_distance`` edits."""
        print("## one author alignment levenshtein distance = 3")
        self.matched_isbn_count = 0
        self.one_author_alignment_count_levenshtein = 0
        for btlf_book in self.btlf_dict:
            if btlf_book in self.isbn_dict:
                self.matched_isbn_count += 1
                if self.is_one_author_aligned_isbn(btlf_book, self.levenshtein_distance):
                    self.one_author_alignment_count_levenshtein += 1

    def count_author_alignements_isbn(self, isbn):
        """Add this ISBN's name counts and aligned pairs to the running totals."""
        authors_illustrators_source, authors_illustrators_btlf = self.extract_authors_illustrators(isbn)
        if authors_illustrators_source and authors_illustrators_btlf:
            self.source_count += len(authors_illustrators_source)
            self.btlf_count += len(authors_illustrators_btlf)
            for author_source in authors_illustrators_source:
                for author_btlf in authors_illustrators_btlf:
                    if author_source and author_btlf:
                        author_btlf = self._reorder_btlf_name(author_btlf)
                        if self.align_with_levenshtein(author_source, author_btlf, isbn, levenshtein_distance=0):
                            self.total_author_alignment_count += 1 # adding 1 for each author/illustrator aligned
                        if self.align_with_levenshtein(author_source, author_btlf, isbn, self.levenshtein_distance):
                            self.total_author_alignment_count_levenshtein += 1

    def is_one_author_aligned_isbn(self, isbn, levensthein_distance=0):
        """Return True as soon as one source name aligns with one BTLF name.

        Side effect: for every pair that does not align, the ISBN is added to
        ``self.metadata_different`` when the normalised titles of both books
        differ.
        """
        authors_illustrators_source, authors_illustrators_btlf = self.extract_authors_illustrators(isbn)
        if authors_illustrators_source and authors_illustrators_btlf:
            for author_source in authors_illustrators_source:
                for author_btlf in authors_illustrators_btlf:
                    if author_source and author_btlf:
                        author_btlf = self._reorder_btlf_name(author_btlf)
                        if self.align_with_levenshtein(author_source, author_btlf, isbn, levenshtein_distance=levensthein_distance):
                            return True
                        # exploration of how many books have incorrect mapping between their isbn and metadata
                        # NOTE(review): this also records ISBNs whose later name pair does align - confirm intended.
                        book_data_source = self.isbn_dict[isbn][0]
                        book_data_btlf = self.btlf_dict[isbn][0]
                        if utils.preprocess_author_name(book_data_source.book_name) != utils.preprocess_author_name(book_data_btlf.book_name):
                            self.metadata_different.add(isbn)
        return False

    def print(self):
        """Print the four alignment ratios; a ratio with nothing counted shows 0 %."""
        print(f"{self.source} total author alignement", self.total_author_alignment_count , "/", self.source_count, "=", self._percentage(self.total_author_alignment_count, self.source_count), "%")
        print(f"{self.source} total author alignement levenshtein", self.total_author_alignment_count_levenshtein , "/", self.source_count, "=", self._percentage(self.total_author_alignment_count_levenshtein, self.source_count), "%")
        print(f"{self.source} one author alignement", self.one_author_alignment_count , "/", self.matched_isbn_count, "=", self._percentage(self.one_author_alignment_count, self.matched_isbn_count), "%")
        print(f"{self.source} one author alignement levenshtein", self.one_author_alignment_count_levenshtein , "/", self.matched_isbn_count, "=", self._percentage(self.one_author_alignment_count_levenshtein, self.matched_isbn_count), "%")

def _report_author_alignment(source, isbn_dict):
    """Print a banner, run the three author alignment passes and print the ratios."""
    print(f"###################### {source}")
    alignment = Author_Alignment(source=source, isbn_dict=isbn_dict)
    alignment.count_total_author_alignments_source()
    alignment.count_one_author_alignments_source()
    alignment.count_one_author_alignments_source_levensthein()
    alignment.print()
    return alignment


# One report per source, in the same order as the rest of the notebook.
bnf_btlf_alignements = _report_author_alignment("bnf", bnf_isbn_dict)
constellations_btlf_alignements = _report_author_alignment("constellations", constellations_isbn_dict)
babelio_btlf_alignements = _report_author_alignment("babelio", babelio_isbn_dict)

###################### bnf
## total author alignment levenshtein distance = 0&3
aleksandr nikolaevic afanasev aleksandr nikolaevitch afanasiev 3 9782210966826
gabriellesuzanne de villeneuve gabriellesuzanne villeneuve 3 9782362663345
eunyeong cho eun young cho 3 9782352890874
alice de nussy alice nussy 3 9782246824374
jean de la fontaine jean la fontaine 3 9782360811724
dorothee de monfreid dorothee monfreid 3 9782330034948
alice de poncheville alice poncheville 3 9782747045537
agnes de lestrade agnes lestrade 3 9782812602221
dorothee de monfreid dorothee monfreid 3 9782070637102
dorothee de monfreid dorothee monfreid 3 9782211095655
agnes de lestrade agnes lestrade 3 9782092513415
edith de cornulierluciniere edith cornulierluciniere 3 9782746709355
iris de mouy iris mouy 3 9782350211039
agnes de lestrade agnes lestrade 3 9782848655253
iris de vericourt iris vericourt 3 9782358510615
agnes de lestrade agnes lestrade 3 9782812601804
agnes de lestrade agnes lestrade 3 9782747034623
elisa

# publisher alignment stats

In [26]:
class Publisher_Alignment():
    """Measure how often the publisher of one source matches BTLF's, per ISBN.

    Only the first book registered under an ISBN is used on each side.
    Publisher names are stripped of the "http://schema.org/" prefix and
    normalised with ``utils.preprocess_publisher_name`` before comparison.

    Args:
        source: source label, used only in the printed report.
        isbn_dict: mapping ISBN -> list of book data objects of the source.
        btlf_dict: mapping ISBN -> list of BTLF book data objects. When
            omitted, the notebook-level ``btlf_isbn_dict`` is used.
    """

    def __init__(self, source, isbn_dict, btlf_dict=None):
        self.source = source
        self.alignment_count = 0
        self.matched_isbn_count = 0
        self.source_count = 0
        self.btlf_count = 0
        self.isbn_dict = isbn_dict
        # Explicit BTLF index; falls back to the notebook global for callers
        # that do not pass one.
        self.btlf_dict = btlf_isbn_dict if btlf_dict is None else btlf_dict
        self.non_aligned_publishers = []

    def count(self): # will count every alignement
        """Count shared ISBNs and how many of them have matching publishers."""
        # Start from scratch so that calling count() twice does not double
        # the totals or duplicate the mismatch list.
        self.alignment_count = 0
        self.matched_isbn_count = 0
        self.non_aligned_publishers = []
        for btlf_book in self.btlf_dict:
            if btlf_book in self.isbn_dict:
                self.matched_isbn_count += 1
                if self.is_one_publisher_aligned_isbn(btlf_book):
                    self.alignment_count += 1

    def is_one_publisher_aligned_isbn(self, isbn):
        """Return True when both publishers are equal after normalisation.

        Mismatches are recorded in ``self.non_aligned_publishers`` as
        ``(isbn, publisher_source, publisher_btlf)`` tuples.
        """
        publisher_source = self.isbn_dict[isbn][0].publisher.replace("http://schema.org/", "")
        publisher_btlf = self.btlf_dict[isbn][0].publisher.replace("http://schema.org/", "")
        if utils.preprocess_publisher_name(publisher_btlf) == utils.preprocess_publisher_name(publisher_source):
            return True
        self.non_aligned_publishers.append((isbn, publisher_source, publisher_btlf))
        return False

    def print(self):
        """Print the alignment ratio; shows 0 % when no ISBN was shared."""
        if self.matched_isbn_count:
            percentage = round(self.alignment_count / self.matched_isbn_count * 100)
        else:
            percentage = 0
        print(f"{self.source} one publisher alignment", self.alignment_count , "/", self.matched_isbn_count, "=", percentage, "%")

def _report_publisher_alignment(source, isbn_dict):
    """Print a banner, count publisher alignments for one source and print the ratio."""
    print(f"###################### {source}")
    alignment = Publisher_Alignment(source=source, isbn_dict=isbn_dict)
    alignment.count()
    alignment.print()
    return alignment


# One report per source, in the same order as the rest of the notebook.
bnf_btlf_publisher_alignements = _report_publisher_alignment("bnf", bnf_isbn_dict)
constellations_btlf_publisher_alignements = _report_publisher_alignment("constellations", constellations_isbn_dict)
babelio_btlf_publisher_alignements = _report_publisher_alignment("babelio", babelio_isbn_dict)

###################### bnf
bnf one publisher alignment 17252 / 20970 = 82 %
###################### constellations
a
constellations one publisher alignment 5168 / 6721 = 77 %
###################### babelio
babelio one publisher alignment 1449 / 2052 = 71 %


In [27]:
# Display the (isbn, source publisher, BTLF publisher) tuples whose publishers
# did not match between Constellations and BTLF.
constellations_btlf_publisher_alignements.non_aligned_publishers

[('9782895850458', 'Les_Éditeurs_Réunis', 'LéR_-_Les_Éditeurs_réunis'),
 ('9782895980131', 'Le_dernier_havre', 'Les__#233;ditions_du_dernier_havre'),
 ('9782923425368', 'Du_Phœnix', 'éditions_du_Phoenix'),
 ('9782924253113', 'Du_Phœnix', 'éditions_du_Phoenix'),
 ('9782895793540', 'Bayard_Canada', 'Bayard_Canada_Livres'),
 ('9782895791034', 'Bayard_Canada', 'Bayard_Canada_Livres'),
 ('9782895797661', 'Bayard_Canada', 'Bayard_Canada_Livres'),
 ('9782923342832', 'La_Bagnole', 'None'),
 ('9782895797173', 'Bayard_Canada', 'Bayard_Canada_Livres'),
 ('9782895795001', 'Bayard_Canada', 'Bayard_Canada_Livres'),
 ('9782895793502', 'Bayard_Canada', 'Bayard_Canada_Livres'),
 ('9782895794776', 'Bayard_Canada', 'Bayard_Canada_Livres'),
 ('9782895795391', 'Bayard_Canada', 'Bayard_Canada_Livres'),
 ('9782895796114', 'Bayard_Canada', 'Bayard_Canada_Livres'),
 ('9782897700843', 'Bayard_Canada', 'Bayard_Canada_Livres'),
 ('9782895797654', 'Bayard_Canada', 'Bayard_Canada_Livres'),
 ('9782895792819', 'Bayar

In [28]:
# Partition ISBNs into: in both Babelio and BTLF, only in Babelio, only in BTLF.
babelio_btlf = []
not_babelio_btlf = []
babelio_not_btlf = []
not_babelio_not_btlf = [] # should stay empty

for isbn_babelio in babelio_isbn_dict:
    if isbn_babelio in btlf_isbn_dict:
        babelio_btlf.append(isbn_babelio)
    else:
        babelio_not_btlf.append(isbn_babelio)

# Set copy of the intersection so the sanity check below is a constant-time
# lookup instead of a scan of the whole list for every BTLF ISBN.
babelio_btlf_lookup = set(babelio_btlf)
for isbn_btlf in btlf_isbn_dict:
    if isbn_btlf not in babelio_isbn_dict:
        not_babelio_btlf.append(isbn_btlf)
    elif isbn_btlf not in babelio_btlf_lookup:
        # Consistency check: an ISBN present on both sides must already be in
        # the intersection built from the Babelio side.
        print("error")

In [29]:
# Sizes of the three partitions: shared ISBNs, ISBNs in Babelio but not in
# BTLF, and ISBNs in BTLF but not in Babelio (labels are printed in French).
print("intersection", len(babelio_btlf))
print("dans babelio mais pas btlf", len(babelio_not_btlf))
print("dans btlf mais pas babelio", len(not_babelio_btlf))

intersection 2052
dans babelio mais pas btlf 138
dans btlf mais pas babelio 24868


# babelio: unclear why some Babelio books are not in BTLF

In [30]:
# Display the Babelio ISBNs that are absent from BTLF.
babelio_not_btlf

['9782764434956',
 '9782845384521',
 '9782221016299',
 '9781023501545',
 '9782375110027',
 '9781770911949',
 '9781021401229',
 '9782764429815',
 '9781021400413',
 '9782895023579',
 '9781023506779',
 '9781021401571',
 '9780375508264',
 '9781368013796',
 '9781023508407',
 '9781023503020',
 '9782020327213',
 '9782365085540',
 '9781021405708',
 '9781021404299',
 '9782203129443',
 '9782203119109',
 '9782203118782',
 '9782215172260',
 '9781096688082',
 '9781021404817',
 '9782075063869',
 '9781090597342',
 '9781091081147',
 '9781021403360',
 '9781021401236',
 '9782298037883',
 '9782960080612',
 '9782842312879',
 '9783806750911',
 '9782266058292',
 '9782210986015',
 '9782092822784',
 '9782298085532',
 '9782923898971',
 '9781092111188',
 '9782010145414',
 '9782924342152',
 '9781090425522',
 '9782266106535',
 '9781023508742',
 '9782894195208',
 '9782897114589',
 '9782020029445',
 '9782896954971',
 '9782895310075',
 '9781023501811',
 '9782253041214',
 '9782710303534',
 '9781023508391',
 '97820750

In [31]:
# Number of Babelio-only ISBNs that isbnlib does not accept as ISBN-13.
incorrect_isbn_counter = sum(
    1 for candidate in babelio_not_btlf if not isbnlib.is_isbn13(candidate)
)
incorrect_isbn_counter

0

In [32]:
# verifier proportion livres non francais dans (babelio not BTLF)
non_french_book_counter = 0
french_book_counter = 0

for isbn in babelio_not_btlf:
    if isbnlib.is_isbn13(isbn) and not is_language_french(babelio_isbn_dict[isbn][0].language):
        non_french_book_counter += 1
    elif isbnlib.is_isbn13(isbn) and is_language_french(babelio_isbn_dict[isbn][0].language):
        french_book_counter += 1
        print(isbn)

print("french", french_book_counter)
print("non french", non_french_book_counter)

9782764434956
9782221016299
9781023501545
9782375110027
9781021401229
9782764429815
9781021400413
9782895023579
9781023506779
9781021401571
9781368013796
9781023508407
9781023503020
9782020327213
9782365085540
9781021405708
9781021404299
9782203129443
9782203119109
9782203118782
9782215172260
9781096688082
9781021404817
9782075063869
9781090597342
9781091081147
9781021403360
9781021401236
9782298037883
9782960080612
9782842312879
9783806750911
9782266058292
9782210986015
9782092822784
9782298085532
9782923898971
9781092111188
9782010145414
9782924342152
9781090425522
9781023508742
9782894195208
9782897114589
9782896954971
9782895310075
9781023501811
9782253041214
9782710303534
9781023508391
9782075046077
9782896628520
9782897741822
9781023501224
9782700224733
9782890514201
9782211071864
9782921365253
9782896622191
9782266079334
9782897740610
9782912360380
9782914096713
9782895967606
9782013214827
9782075066426
9782072545597
9781023506588
9782890375062
9782896983605
9783314214684
978109