# Goal of notebook

- understand where the BTLF data comes from: BNF, Constellations, Babelio
- understand the reasons for missing data in each source

In [1]:
from rdflib import Graph, Namespace, Literal, URIRef
from rdflib import RDF, RDFS, OWL, XSD
import sys
sys.path.append('../')

sys.path.append('../andre')
import andre.utils as utils
from andre.utils import schema as SCHEMA
import collections
import isbnlib

# bnf

In [2]:
# Parse the BnF Turtle export into its own graph.
graph_bnf = Graph()
graph_bnf.parse("final_datasets/bnf.ttl", format="turtle")

# Index: ISBN -> list of the records extracted for that ISBN.
bnf_isbn_dict = collections.defaultdict(list)

for bnf_book in graph_bnf.subjects(RDF.type, SCHEMA.Book):
    book_data = utils.extract_data_bnf(graph_bnf, bnf_book)
    bnf_isbn_dict[book_data.isbn].append(book_data)

Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x00000254F2E6C7C0>
Traceback (most recent call last):
  File "C:\Users\dre\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\rdflib\term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
           ^^^^^^^^^^^^^^^^^^
  File "C:\Users\dre\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\isodate\isodates.py", line 203, in parse_date
    raise ISO8601Error('Unrecognised ISO 8601 date format: %r' % datestring)
isodate.isoerror.ISO8601Error: Unrecognised ISO 8601 date format: 'Port- d-es'
Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x00000254F2E6C7C0>
Traceback (most recent call last)

In [3]:
def is_language_french(language):
    """Return True when ``language`` is one of the two French markers.

    The accepted spellings are the label "Français" and the code "fre";
    the comparison is exact (case- and accent-sensitive).
    """
    if language == "Français":
        return True
    return language == "fre"

In [4]:
# Parse the BTLF Turtle export into its own graph.
graph_btlf = Graph()
graph_btlf.parse("final_datasets/grapheLivres_BTLF_new_data_EditeursConsolides.ttl", format="turtle")

# Index: ISBN -> list of the records extracted for that ISBN.
btlf_isbn_dict = collections.defaultdict(list)

for btlf_book in graph_btlf.subjects(RDF.type, SCHEMA.Book):
    book_data = utils.extract_data_btlf(graph_btlf, btlf_book)
    btlf_isbn_dict[book_data.isbn].append(book_data)

In [5]:
# Number of distinct ISBN keys in the BTLF index.
print(len(btlf_isbn_dict))

26920


In [6]:
# Partition the ISBN keys by their presence in the BnF and BTLF indexes.
# Iteration order of the source dicts is preserved in each list.
bnf_btlf = [isbn_bnf for isbn_bnf in bnf_isbn_dict if isbn_bnf in btlf_isbn_dict]
bnf_not_btlf = [isbn_bnf for isbn_bnf in bnf_isbn_dict if isbn_bnf not in btlf_isbn_dict]
not_bnf_btlf = [isbn_btlf for isbn_btlf in btlf_isbn_dict if isbn_btlf not in bnf_isbn_dict]
not_bnf_not_btlf = [] # should stay empty

# Sanity check: every ISBN present in both indexes must have landed in the
# intersection. A set makes each lookup O(1); testing membership against the
# list directly made this loop quadratic.
bnf_btlf_lookup = set(bnf_btlf)
for isbn_btlf in btlf_isbn_dict:
    if isbn_btlf in bnf_isbn_dict and isbn_btlf not in bnf_btlf_lookup:
        print("error")

In [7]:
# Report the size of each BnF/BTLF partition.
for label, isbns in (
    ("intersection", bnf_btlf),
    ("dans bnf mais pas btlf", bnf_not_btlf),
    ("dans btlf mais pas bnf", not_bnf_btlf),
):
    print(label, len(isbns))

intersection 20794
dans bnf mais pas btlf 9143
dans btlf mais pas bnf 6126


In [8]:
# Check the share of non-French books among the ISBNs that are in BnF but not in BTLF.
# Each ISBN falls into exactly one bucket: not an ISBN-13, French, or non-French.
french_book_counter = 0
non_french_book_counter = 0
non_isbn_13_counter = 0

for isbn in bnf_not_btlf:
    if not isbnlib.is_isbn13(isbn):
        non_isbn_13_counter += 1
    elif is_language_french(bnf_isbn_dict[isbn][0].language):
        # Only the first record stored under the ISBN is inspected.
        french_book_counter += 1
    else:
        non_french_book_counter += 1

print("french", french_book_counter)
print("non french", non_french_book_counter)
print("non isbn13", non_isbn_13_counter)

french 772
non french 1008
non isbn13 7363


In [9]:
# Re-print the number of ISBN keys in the BTLF index (unchanged by the lookups above).
print(len(btlf_isbn_dict))


26920


# constellations

In [10]:
# Parse the Constellations Turtle export into its own graph.
graph_constellations = Graph()
graph_constellations.parse("final_datasets/Constellations.ttl", format="turtle")

# Index: ISBN -> list of the records extracted for that ISBN.
constellations_isbn_dict = collections.defaultdict(list)

for constellations_book in graph_constellations.subjects(RDF.type, SCHEMA.Book):
    book_data = utils.extract_data_constellation(graph_constellations, constellations_book)
    constellations_isbn_dict[book_data.isbn].append(book_data)


In [11]:
# Number of ISBN keys per index, in the order: Constellations, BTLF, BnF.
for isbn_index in (constellations_isbn_dict, btlf_isbn_dict, bnf_isbn_dict):
    print(len(isbn_index))

11255
26920
29937


In [12]:
# Partition the ISBN keys by their presence in the Constellations and BTLF indexes.
# Iteration order of the source dicts is preserved in each list.
constellations_btlf = [
    isbn_constellations
    for isbn_constellations in constellations_isbn_dict
    if isbn_constellations in btlf_isbn_dict
]
constellations_not_btlf = [
    isbn_constellations
    for isbn_constellations in constellations_isbn_dict
    if isbn_constellations not in btlf_isbn_dict
]
not_constellations_btlf = [
    isbn_btlf for isbn_btlf in btlf_isbn_dict if isbn_btlf not in constellations_isbn_dict
]
not_constellations_not_btlf = [] # should stay empty

# Sanity check: every ISBN present in both indexes must have landed in the
# intersection. A set makes each lookup O(1); testing membership against the
# list directly made this loop quadratic.
constellations_btlf_lookup = set(constellations_btlf)
for isbn_btlf in btlf_isbn_dict:
    if isbn_btlf in constellations_isbn_dict and isbn_btlf not in constellations_btlf_lookup:
        print("error")

In [13]:
# Report the size of each Constellations/BTLF partition.
for label, isbns in (
    ("intersection", constellations_btlf),
    ("dans constellations mais pas btlf", constellations_not_btlf),
    ("dans btlf mais pas constellations", not_constellations_btlf),
):
    print(label, len(isbns))

intersection 6683
dans constellations mais pas btlf 4572
dans btlf mais pas constellations 20237


In [14]:
# Display the ISBNs found in both the Constellations and BTLF indexes.
# NOTE(review): this renders the entire list; a slice (e.g. constellations_btlf[:20])
# would keep the output readable — confirm the full dump is not needed.
constellations_btlf

['9782350007625',
 '9782742769681',
 '9782895291930',
 '9782848017860',
 '9782924277812',
 '9782352900658',
 '9782896071449',
 '9782081305410',
 '9782330039516',
 '9782211097734',
 '9781474907354',
 '9782747058261',
 '9782070626489',
 '9782203075344',
 '9782812600593',
 '9782070576418',
 '9782740427798',
 '9782748520637',
 '9782745980663',
 '9782362900617',
 '9782742785995',
 '9782745933188',
 '9782362900433',
 '9782070641536',
 '9782035861986',
 '9782253073093',
 '9782745968777',
 '9782373491746',
 '9782922585506',
 '9782070659043',
 '9782732443522',
 '9782812618840',
 '9782896080823',
 '9782764408520',
 '9782070589876',
 '9782266305914',
 '9782211092876',
 '9782211091336',
 '9782354132200',
 '9782355040627',
 '9782211089876',
 '9782215135302',
 '9782211078764',
 '9782020639507',
 '9782918689119',
 '9782733838730',
 '9782211208055',
 '9782070629923',
 '9782211207928',
 '9782882583888',
 '9782354883058',
 '9782812606830',
 '9782764445044',
 '9782211093750',
 '9782355041495',
 '97822661

In [15]:
# Number of BTLF ISBNs that are also in Constellations but absent from BnF.
len(set(not_bnf_btlf).intersection(constellations_btlf))

4915

In [16]:
# Check the share of non-French books among the ISBNs that are in Constellations but not in BTLF.
# Each ISBN falls into exactly one bucket: not an ISBN-13, French, or non-French.
# NOTE(review): is_language_french only recognises "Français" / "fre" — confirm that the
# Constellations language values use one of these spellings.
french_book_counter = 0
non_french_book_counter = 0
non_isbn_13_counter = 0

for isbn in constellations_not_btlf:
    if not isbnlib.is_isbn13(isbn):
        non_isbn_13_counter += 1
    elif is_language_french(constellations_isbn_dict[isbn][0].language):
        # Only the first record stored under the ISBN is inspected.
        french_book_counter += 1
    else:
        non_french_book_counter += 1

print("french", french_book_counter)
print("non french", non_french_book_counter)
print("non isbn13", non_isbn_13_counter)

french 0
non french 3045
non isbn13 1527


In [17]:
print("non isbn13 in constellations")
# Count the Constellations-only ISBNs that isbnlib does not accept as ISBN-13.
sum(1 for isbn in constellations_not_btlf if not isbnlib.is_isbn13(isbn))

non isbn13 in constellations


1527

# investigation of BTLF sources

In [18]:
# Attribute every BTLF ISBN to BnF, Constellations, both, or neither.
constellations_and_bnf_counter = 0
bnf_counter = 0
constellation_counter = 0
source_unknown_counter = 0

for btlf_book in btlf_isbn_dict:
    in_bnf = btlf_book in bnf_isbn_dict
    in_constellations = btlf_book in constellations_isbn_dict

    if in_bnf:
        if in_constellations:
            constellations_and_bnf_counter += 1
        else:
            bnf_counter += 1
    elif in_constellations:
        constellation_counter += 1
    else:
        # Found in neither source: list the ISBN for manual inspection.
        print(btlf_book)
        source_unknown_counter += 1

print("constellations and bnf", constellations_and_bnf_counter)
print("constellations only", constellation_counter)
print("bnf only", bnf_counter)
print("source unkown", source_unknown_counter)

9782924645536
9782897091279
9782896579006
9782895911210
9782924996003
9782896578382
9782890749658
9782896576654
9782895370642
9782896332083
9782890463660
9782890469105
9782896577422
9782894354889
9782894355619
9782895850892
9782897512538
9782890217461
9782890218147
9782890218383
9782897740153
9782896510955
9782890218239
9782896512553
9782923342191
9782897140090
9782923342344
9782892950120
9782923342030
9782896074136
9782897142872
9782897700133
9782922225723
9782923896151
9782896070008
9782897141257
9782922225990
9782896070077
9782896480562
9782923813745
9782896074259
9782897701499
9782893813127
9782922225013
9782924563953
9782922225518
9782896074013
9782896074464
9782896071272
9782896481125
9782897525385
9782762122183
9782897125776
9782762117363
9782897125417
9782895129783
9782897770532
9782760942219
9782896867196
9782895123897
9782760999169
9782922892970
9782895124504
9782897852948
9782760933521
9782895124658
9782760942141
9782895123989
9782760947719
9782760933385
9782923196084
978289

# babelio: last BTLF source

In [19]:
# Parse the Babelio Turtle export into its own graph.
graph_babelio = Graph()
# As the last expression of the cell, the value returned by parse() is displayed.
graph_babelio.parse("final_datasets/babelio.ttl", format="turtle")

<Graph identifier=N96882806c9e64510b8f395542d23d631 (<class 'rdflib.graph.Graph'>)>

In [20]:
# Index: ISBN -> list of the Babelio records extracted for that ISBN.
babelio_isbn_dict = collections.defaultdict(list)

for babelio_book in graph_babelio.subjects(RDF.type, SCHEMA.Book):
    book_data = utils.extract_data_babelio(graph_babelio, babelio_book)
    babelio_isbn_dict[book_data.isbn].append(book_data)

In [21]:
# Number of distinct ISBN keys in the Babelio index.
len(babelio_isbn_dict)

2215

In [22]:
# Inspect the raw ISBN keys of the Babelio index.
babelio_isbn_dict.keys()

dict_keys(['9782924332016', '', '9782330056032', '9782075191043', '9782266317078', '9782875575173', '9782344000595', '9782742755400', '9782211222600', '9782764434956', '9782895407102', '9782364745056', '9782382120347', '9782811649920', '9782413026570', '9782505076780', '9782373492590', '9782505115304', '9791032712856', '9782380710243', '9782382120446', '9791032704677', '9782369743255', '9782344005682', '9782369740315', '9782723499903', '9782351809082', '9782723494014', '9782818921647', '9782203062382', '9782302025479', '9782355924262', '9782302023192', '9782820309624', '9782355921582', '9782756016641', '9782811603441', '9782505012801', '9782811638771', '9782849656419', '9782356481665', '9782845998650', '9782759500581', '9782756005485', '9782849466841', '9782351002353', '9782811642242', '9782756001548', '9782847899429', '9782845384521', '9782847894516', '9782723447720', '9782845993372', '9782372871976', '9782382120361', '9782355929458', '9791035502553', '9782809450774', '9782362663574',

In [23]:
# Spot-check the authors of the first record stored under one Babelio ISBN.
babelio_isbn_dict['9782234006645'][0].book_authors

[]

In [24]:
# Number of Book subjects in the Babelio graph, before grouping by ISBN.
sum(1 for _ in graph_babelio.subjects(RDF.type, SCHEMA.Book))

4169

In [25]:
# Attribute every BTLF ISBN to BnF and/or Constellations first; Babelio is only
# consulted for ISBNs found in neither of those two sources.
constellations_and_bnf_counter = 0
bnf_counter = 0
constellation_counter = 0
babelio_counter = 0
source_unknown_counter = 0

for btlf_book in btlf_isbn_dict:
    in_bnf = btlf_book in bnf_isbn_dict
    in_constellations = btlf_book in constellations_isbn_dict

    if in_bnf:
        if in_constellations:
            constellations_and_bnf_counter += 1
        else:
            bnf_counter += 1
    elif in_constellations:
        constellation_counter += 1
    elif btlf_book in babelio_isbn_dict:
        babelio_counter += 1
    else:
        # Found in none of the three sources: list the ISBN for manual inspection.
        print(btlf_book)
        source_unknown_counter += 1

print("constellations and bnf", constellations_and_bnf_counter)
print("constellations only", constellation_counter)
print("bnf only", bnf_counter)
print("babelio only", babelio_counter)
print("source unkown", source_unknown_counter)

constellations and bnf 1768
constellations only 4915
bnf only 19026
babelio only 1211
source unkown 0


In [26]:
# Count how the BTLF ISBNs overlap with the three sources. The "and" counters are
# not mutually exclusive (an ISBN in all three sources increments each of them);
# the "only" counters require absence from the two other sources.
constellations_and_bnf_and_babelio_counter = 0
constellations_and_bnf_counter = 0
constellations_and_babelio_counter = 0
babelio_and_bnf_counter = 0
constellation_only_counter = 0
bnf_only_counter = 0
babelio_only_counter = 0

for btlf_book in btlf_isbn_dict:
    in_bnf = btlf_book in bnf_isbn_dict
    in_constellations = btlf_book in constellations_isbn_dict
    in_babelio = btlf_book in babelio_isbn_dict
    membership = (in_bnf, in_constellations, in_babelio)

    # Overlap counters.
    if membership == (True, True, True):
        constellations_and_bnf_and_babelio_counter += 1
    if in_bnf and in_constellations:
        constellations_and_bnf_counter += 1
    if in_constellations and in_babelio:
        constellations_and_babelio_counter += 1
    if in_bnf and in_babelio:
        babelio_and_bnf_counter += 1

    # Exclusive counters.
    if membership == (True, False, False):
        bnf_only_counter += 1
    if membership == (False, True, False):
        constellation_only_counter += 1
    if membership == (False, False, True):
        babelio_only_counter += 1
        print(btlf_book)


print("constellations and bnf and babelio", constellations_and_bnf_and_babelio_counter)
print("constellations and bnf", constellations_and_bnf_counter)
print("constellations and babelio", constellations_and_babelio_counter)
print("babelio and bnf", babelio_and_bnf_counter)
print("babelio only", babelio_only_counter)
print("bnf only", bnf_only_counter)
print("constellations only", constellation_only_counter)

9782924645536
9782897091279
9782896579006
9782895911210
9782924996003
9782896578382
9782890749658
9782896576654
9782895370642
9782896332083
9782890463660
9782890469105
9782896577422
9782894354889
9782894355619
9782895850892
9782897512538
9782890217461
9782890218147
9782890218383
9782897740153
9782896510955
9782890218239
9782896512553
9782923342191
9782897140090
9782923342344
9782892950120
9782923342030
9782896074136
9782897142872
9782897700133
9782922225723
9782923896151
9782896070008
9782897141257
9782922225990
9782896070077
9782896480562
9782923813745
9782896074259
9782897701499
9782893813127
9782922225013
9782924563953
9782922225518
9782896074013
9782896074464
9782896071272
9782896481125
9782897525385
9782762122183
9782897125776
9782762117363
9782897125417
9782895129783
9782897770532
9782760942219
9782896867196
9782895123897
9782760999169
9782922892970
9782895124504
9782897852948
9782760933521
9782895124658
9782760942141
9782895123989
9782760947719
9782760933385
9782923196084
978289

In [40]:
def are_authors_aligned_btlf(dict_a, isbn):
    """Tell whether a source's authors for ``isbn`` match the BTLF author.

    Args:
        dict_a: ISBN-indexed mapping for one source; each value is a list of
            records exposing ``book_authors`` (an iterable of names). Only the
            first record stored under ``isbn`` is inspected.
        isbn: ISBN key looked up in both ``dict_a`` and ``btlf_isbn_dict``.

    Returns:
        True as soon as one source author equals the BTLF author once both
        have gone through ``utils.preprocess_author_name``. False when no
        author matches, or when either side has no record or no author.

    Side effects:
        When both sides have authors but none match, prints the ISBN, the
        first source author and the BTLF author (both preprocessed).
    """
    # .get() instead of [] so an unknown ISBN does not insert an empty entry
    # into the defaultdict indexes (which would skew later len() counts).
    source_records = dict_a.get(isbn)
    btlf_records = btlf_isbn_dict.get(isbn)
    if not source_records or not btlf_records:
        return False

    source_authors = source_records[0].book_authors
    # The BTLF field is handled as a single string (it is split on ",").
    btlf_author = btlf_records[0].book_authors
    if not (source_authors and btlf_author):
        return False

    # A name with exactly one comma is reordered from "Last,First" to
    # "First Last"; any other shape is compared as-is. This is loop-invariant,
    # so it is done once up front.
    name_parts = btlf_author.split(",")
    if len(name_parts) == 2:
        btlf_author = name_parts[1] + " " + name_parts[0]
    # NOTE(review): assumes utils.preprocess_author_name is a pure function,
    # so computing it once is equivalent to calling it per comparison — confirm.
    btlf_normalized = utils.preprocess_author_name(btlf_author)

    for source_author in source_authors:
        if source_author and utils.preprocess_author_name(source_author) == btlf_normalized:
            return True

    print(isbn, utils.preprocess_author_name(source_authors[0]), "," , btlf_normalized)
    return False
    
# For each source: count the BTLF ISBNs also present in that source, and how many
# of them have an author aligned with BTLF. Mismatches are printed by
# are_authors_aligned_btlf under the corresponding banner.

print("###################### bnf")
bnf_shared_isbns = [btlf_book for btlf_book in btlf_isbn_dict if btlf_book in bnf_isbn_dict]
bnf_books = len(bnf_shared_isbns)
bnf_author_alignments = sum(
    1 for btlf_book in bnf_shared_isbns
    if are_authors_aligned_btlf(bnf_isbn_dict, btlf_book)
)

print("###################### constellations")
constellations_shared_isbns = [
    btlf_book for btlf_book in btlf_isbn_dict if btlf_book in constellations_isbn_dict
]
constellations_books = len(constellations_shared_isbns)
constellations_author_alignments = sum(
    1 for btlf_book in constellations_shared_isbns
    if are_authors_aligned_btlf(constellations_isbn_dict, btlf_book)
)

print("###################### babelio")
babelio_shared_isbns = [btlf_book for btlf_book in btlf_isbn_dict if btlf_book in babelio_isbn_dict]
babelio_books = len(babelio_shared_isbns)
babelio_author_alignments = sum(
    1 for btlf_book in babelio_shared_isbns
    if are_authors_aligned_btlf(babelio_isbn_dict, btlf_book)
)


print("bnf author alignement", bnf_author_alignments , "/", bnf_books)
print("constellations author alignement", constellations_author_alignments , "/", constellations_books)
print("babelio author alignement", babelio_author_alignments , "/", babelio_books)

###################### bnf
9782896862627 johanne raby , raby
9782924720264 marie barguirdjian bletton , marie barguirdjian
9782897990008 laeticia nkakou yoka , annette braconnemichoux
9782895408765 alex nogues otero , alex nogues
9782895407201 rodolfo j walsh , rodolfo walsh
9782897771140 cecile gariepy , simon drouin
9782895401841 jacques cartier , francoise ligier
9782895403234 andre marois , virginie egger
9782844555656 seungyeon chae , chae seungyeon
9791026402329 jana k kudrnova , eva bartova
9791092353549 latelier du trio , gilles abier
9782210966826 aleksandr nikolaevic afanasev , aleksandr nikolaevitch afanasiev
9782378011116 catherine louis , jihad darwiche
9782203215702 aleksandra woldanskapocinska , ola woldanskaplocinska
9782362663345 gabriellesuzanne de villeneuve , gabriellesuzanne villeneuve
9782344048115 charlottefleur cristofari , charlotte cristofari
9782745930828 paule paganon , dimitri casali
9782211094542 dominique mwankumi , isidore nadaywel e nziem
9782352890874 

In [28]:
# Partition the ISBN keys by their presence in the Babelio and BTLF indexes.
# Iteration order of the source dicts is preserved in each list.
babelio_btlf = [isbn_babelio for isbn_babelio in babelio_isbn_dict if isbn_babelio in btlf_isbn_dict]
babelio_not_btlf = [
    isbn_babelio for isbn_babelio in babelio_isbn_dict if isbn_babelio not in btlf_isbn_dict
]
not_babelio_btlf = [isbn_btlf for isbn_btlf in btlf_isbn_dict if isbn_btlf not in babelio_isbn_dict]
not_babelio_not_btlf = [] # should stay empty

# Sanity check: every ISBN present in both indexes must have landed in the
# intersection. A set makes each lookup O(1); testing membership against the
# list directly made this loop quadratic.
babelio_btlf_lookup = set(babelio_btlf)
for isbn_btlf in btlf_isbn_dict:
    if isbn_btlf in babelio_isbn_dict and isbn_btlf not in babelio_btlf_lookup:
        print("error")

In [29]:
# Report the size of each Babelio/BTLF partition.
for label, isbns in (
    ("intersection", babelio_btlf),
    ("dans babelio mais pas btlf", babelio_not_btlf),
    ("dans btlf mais pas babelio", not_babelio_btlf),
):
    print(label, len(isbns))

intersection 2052
dans babelio mais pas btlf 163
dans btlf mais pas babelio 24868


## babelio: unclear why some books are not in BTLF

In [30]:
# Display the Babelio ISBN keys that have no counterpart in the BTLF index.
babelio_not_btlf

['',
 '9782764434956',
 '9782845384521',
 '9782221016299',
 '9781023501545',
 '9782375110027',
 '9781770911949',
 '9781021401229',
 '978B088646CCV',
 '978B0B1JM9PG2',
 '9782764429815',
 '9781021400413',
 '978B0861F2QMR',
 '9782895023579',
 '9781023506779',
 '9781021401571',
 '978B09BXCDXHF',
 '9780375508264',
 '9781368013796',
 '9781023508407',
 '9781023503020',
 '9782020327213',
 '9782365085540',
 '9781021405708',
 '9781021404299',
 '9782203129443',
 '9782203119109',
 '9782203118782',
 '9782215172260',
 '9781096688082',
 '9781021404817',
 '9782075063869',
 '9781090597342',
 '9781091081147',
 '9781021403360',
 '9781021401236',
 '9782298037883',
 '9782960080612',
 '9782842312879',
 '9783806750911',
 '9782266058292',
 '9782210986015',
 '9782092822784',
 '978B08WPCJ8P2',
 '9782298085532',
 '9782923898971',
 '9781092111188',
 '978B009SNHAGC',
 '9782010145414',
 '9782924342152',
 '9781090425522',
 '9782266106535',
 '9781023508742',
 '9782894195208',
 '9782897114589',
 '9782020029445',
 '978

In [31]:
# Count the Babelio-only keys that isbnlib does not accept as ISBN-13.
incorrect_isbn_counter = sum(
    1 for isbn in babelio_not_btlf if not isbnlib.is_isbn13(isbn)
)
incorrect_isbn_counter

25

In [32]:
# Check the share of non-French books among the ISBNs that are in Babelio but not in BTLF.
# Keys that are not valid ISBN-13 are skipped; French ISBNs are printed for inspection.
french_book_counter = 0
non_french_book_counter = 0

for isbn in babelio_not_btlf:
    if not isbnlib.is_isbn13(isbn):
        continue
    # Only the first record stored under the ISBN is inspected.
    if is_language_french(babelio_isbn_dict[isbn][0].language):
        french_book_counter += 1
        print(isbn)
    else:
        non_french_book_counter += 1

print("french", french_book_counter)
print("non french", non_french_book_counter)

9782764434956
9782221016299
9781023501545
9782375110027
9781021401229
9782764429815
9781021400413
9782895023579
9781023506779
9781021401571
9781368013796
9781023508407
9781023503020
9782020327213
9782365085540
9781021405708
9781021404299
9782203129443
9782203119109
9782203118782
9782215172260
9781096688082
9781021404817
9782075063869
9781090597342
9781091081147
9781021403360
9781021401236
9782298037883
9782960080612
9782842312879
9783806750911
9782266058292
9782210986015
9782092822784
9782298085532
9782923898971
9781092111188
9782010145414
9782924342152
9781090425522
9781023508742
9782894195208
9782897114589
9782896954971
9782895310075
9781023501811
9782253041214
9782710303534
9781023508391
9782075046077
9782896628520
9782897741822
9781023501224
9782700224733
9782890514201
9782211071864
9782921365253
9782896622191
9782266079334
9782897740610
9782912360380
9782914096713
9782895967606
9782013214827
9782890062528
9782211232746
9781023502887
9782021110159
9781091416000
9781023508100
978289