In [1]:
from rdflib import Graph, Namespace, Literal, URIRef
from rdflib import RDF, RDFS, OWL, XSD
import sys
sys.path.append('../')

sys.path.append('../andre')
import andre.utils as utils
from andre.utils import schema as SCHEMA
import collections

In [2]:
publishers_btlf = Graph()
publishers_btlf.bind("schema","http://schema.org/", override=True, replace=True)
publishers_btlf.parse("../final_datasets/btlf_books/Graphes/grapheEditeurs_BTLF.ttl")
len(list(publishers_btlf))

1486

In [3]:
next(iter(publishers_btlf))

(rdflib.term.URIRef('http://schema.org/publisher75'),
 rdflib.term.URIRef('http://schema.org/name'),
 rdflib.term.Literal('Editions AKILEOS'))

In [4]:
publishers_bnf_constellations = Graph()
publishers_bnf_constellations.bind("schema","http://schema.org/", override=True, replace=True)
publishers_bnf_constellations.parse("../final_datasets/publishers.ttl")
len(list(publishers_bnf_constellations))

4690

In [5]:
def preprocess_name(name):
    preprocessed_name = raw_name.lower()
    preprocessed_name = utils.strip_special_chars(preprocessed_name)
    preprocessed_name = utils.strip_accents(preprocessed_name)
    preprocessed_name = preprocessed_name.replace("editions", "")
    preprocessed_name = preprocessed_name.replace("edition", "")
    preprocessed_name = preprocessed_name.replace(" ", "")
    preprocessed_name = preprocessed_name.strip()
    return preprocessed_name

class Publisher:
    def __init__(self, source, uri, raw_name, preprocessed_name):
        self.source = source
        self.uri = uri  
        self.raw_name = raw_name  
        self.preprocessed_name = preprocessed_name  

In [6]:
publishers_bnf_constellations_dict = {}
doublons = []
for publisher in publishers_bnf_constellations.subjects(RDF.type, SCHEMA.Publisher):
    raw_name = str(publishers_bnf_constellations.value(publisher, SCHEMA.name))
    preprocessed_name = preprocess_name(raw_name)

    
    if preprocessed_name in publishers_bnf_constellations_dict.keys():
        # print("doublons", name)
        doublons.append(preprocessed_name)
    else:
        publishers_bnf_constellations_dict[preprocessed_name] = Publisher(source="bnf_constellations", 
                                                                        uri=publisher, 
                                                                        raw_name=raw_name, 
                                                                        preprocessed_name=preprocessed_name)

print(len(doublons))
len(publishers_bnf_constellations_dict)

50


2245

In [7]:
doublons

['memo',
 'ekare',
 'denoel',
 'lelievredemars',
 'palette',
 'disneyhyperionbooks',
 'petitsgenies',
 'delachauxetniestle',
 'lapoulequipond',
 'northsouthbooks',
 'motus',
 'quatrefleuves',
 'format',
 '123soleil',
 'ruedelechiquier',
 'eyrolles',
 'ceres',
 'pika',
 'etre',
 'grenouille',
 'duberyl',
 'ankama',
 'duchene',
 'daralsaqi',
 'grund',
 'bamboo',
 'daraladab',
 'edicef',
 'dupepin',
 'pemf',
 'zebulo',
 'etre',
 '',
 'fleurus',
 'cadex',
 'daralhadaiq',
 'blackbox',
 'goelette',
 'koine',
 'micmac',
 'bld',
 'samir',
 'tsipika',
 '',
 'lemeac',
 'ndze',
 'daralhadaiq',
 'bartholdi',
 'h2t',
 '1018']

In [9]:
# assert(doublons == ['memo', 'ekare', 'denoel', 'le lievre de mars', 'palette', 'disneyhyperion books', 'petits genies', 'delachaux et niestle', 'la poule qui pond', 'motus', 'quatre fleuves', 'format', 'rue de lechiquier', 'eyrolles', 'ceres', 'pika', 'etre', 'grenouille', 'du beryl', 'ankama', 'du chene', 'dar alsaqi', 'grund', 'bamboo', 'dar aladab', 'edicef', 'du pepin', 'pemf', 'zebulo', 'etre', '', 'fleurus', 'cadex', 'dar alhadaiq', 'black box', 'goelette', 'koine', 'bld', 'samir', 'tsipika', '', 'lemeac', 'ndze', 'dar alhadaiq', 'bartholdi', 'h2t', '1018'])

In [10]:
import Levenshtein

def find_top_1_similar_strings_levenshtein(string_list, target_string):
    distances = []
    for string in string_list:
        distance = Levenshtein.distance(string, target_string)
        if distance <= 1:
            distances.append((string, distance))
    distances.sort(key=lambda x: x[1])
    if distances:
         return distances[0]
    else: 
        return None

In [27]:
direct_match_counter = 0
levenstein_match_counter = 0
btlf_publishers_dict = {}
matches_dict = {}
non_matches_dict = {}

for publisher in publishers_btlf.subjects(RDF.type, SCHEMA.Publisher):
    raw_name = str(publishers_btlf.value(publisher, SCHEMA.name))
    preprocessed_name = preprocess_name(raw_name)
    btlf_publishers_dict[preprocessed_name] = Publisher(source="btlf", 
                                                uri=publisher, 
                                                raw_name=raw_name, 
                                                preprocessed_name=preprocessed_name)
    if preprocessed_name in publishers_bnf_constellations_dict.keys():
        direct_match_counter +=1
        # print(preprocessed_name)
        matches_dict[preprocessed_name] = [btlf_publishers_dict[preprocessed_name], publishers_bnf_constellations_dict[preprocessed_name]]
    else:
        non_matches_dict[preprocessed_name] = publisher
        matched_key = find_top_1_similar_strings_levenshtein(publishers_bnf_constellations_dict.keys(), preprocessed_name)
        if matched_key:
            levenstein_match_counter += 1
            # print(matched_key, preprocessed_name)
print()
print("direct", direct_match_counter, "/", len(btlf_publishers_dict.keys()))
print("levenstein", levenstein_match_counter, "/", len(btlf_publishers_dict.keys()))


direct 458 / 741
levenstein 14 / 741


In [25]:
matches_dict

{'gallimard': [<__main__.Publisher at 0x29e46cfded0>,
  <__main__.Publisher at 0x29e2e90b290>],
 'lestroisourses': [<__main__.Publisher at 0x29e46cfe450>,
  <__main__.Publisher at 0x29e2e989690>],
 'corti': [<__main__.Publisher at 0x29e46cfe290>,
  <__main__.Publisher at 0x29e2e958990>],
 'mila': [<__main__.Publisher at 0x29e46cff6d0>,
  <__main__.Publisher at 0x29e2e8fffd0>],
 'etre': [<__main__.Publisher at 0x29e46cfce50>,
  <__main__.Publisher at 0x29e2e90aad0>],
 'pika': [<__main__.Publisher at 0x29e46cfda50>,
  <__main__.Publisher at 0x29e2e902ad0>],
 'chanok': [<__main__.Publisher at 0x29e46cfe6d0>,
  <__main__.Publisher at 0x29e2e97c9d0>],
 'hesse': [<__main__.Publisher at 0x29e46cfef90>,
  <__main__.Publisher at 0x29e2e96ac50>],
 'lelanvert': [<__main__.Publisher at 0x29e46cfe190>,
  <__main__.Publisher at 0x29e2e8ff210>],
 'mk2': [<__main__.Publisher at 0x29e46cffd10>,
  <__main__.Publisher at 0x29e2e97d890>],
 'delepure': [<__main__.Publisher at 0x29e46cfc250>,
  <__main__.Pu

In [29]:
len(non_matches_dict)

285

In [30]:
non_matches_dict

{'magnardvuibert': rdflib.term.URIRef('http://schema.org/publisher10'),
 'latelier': rdflib.term.URIRef('http://schema.org/publisher100'),
 'museedulouvre': rdflib.term.URIRef('http://schema.org/publisher103'),
 'soreda': rdflib.term.URIRef('http://schema.org/publisher105'),
 'hochecommunication': rdflib.term.URIRef('http://schema.org/publisher108'),
 'soleilproductions': rdflib.term.URIRef('http://schema.org/publisher11'),
 'dexia': rdflib.term.URIRef('http://schema.org/publisher110'),
 'lesduquotidien': rdflib.term.URIRef('http://schema.org/publisher112'),
 'alsegpresseetmultimedia': rdflib.term.URIRef('http://schema.org/publisher116'),
 'groupepaquet': rdflib.term.URIRef('http://schema.org/publisher117'),
 'epsilon': rdflib.term.URIRef('http://schema.org/publisher119'),
 'gulfstreamdu': rdflib.term.URIRef('http://schema.org/publisher12'),
 'cyr': rdflib.term.URIRef('http://schema.org/publisher122'),
 'graindesable': rdflib.term.URIRef('http://schema.org/publisher123'),
 'lireencaled