In [20]:
import rdflib
import rdflib.term
from rdflib import Graph, Namespace, RDF, OWL, RDFS, SDO

In [101]:
def calculate_similarity_percentage(list1, list2):
    # Convert lists to sets for efficient intersection calculation
    set1 = set(list1)
    set2 = set(list2)

    # Calculate the intersection (common elements) between the two sets
    common_elements = set1.intersection(set2)

    # Calculate the percentage of similarity
    similarity_percentage = (len(common_elements) / len(set1.union(set2))) * 100

    return similarity_percentage

In [102]:
csv = Graph()
csv.parse("CSV/modified_movie.ttl")

text = Graph()
text.parse('Extract_Text/extracted_text.ttl')
ns_film_data = Namespace('http://projet.fr/films_data/')
ns_film_schema = Namespace('http://projet.fr/films_schema/')
ns_perso_schema = Namespace('http://projet.fr/perso_schema/')
ns_text = Namespace('http://projet.fr/text/')

csv.bind('p_schema', ns_perso_schema)
csv.bind('f_schema', ns_film_schema)
csv.bind("", ns_film_data)

schema = Graph()
schema.parse('schema_film.ttl')

best = {'score': -1, 'text': '', 'csv': ''}

for subj, pred, obj in text.triples((None, RDF.type, None)):
    for t_subj, t_pred, t_obj in csv.triples((None, RDF.type, SDO.Movie)):

        cast = [o.split('/')[-1] for s, p, o in text.triples((subj, ns_text.castmember, None))]

        title = text.value(subj, ns_text.name)
        stars = [o.split('/')[-1] for s, p, o in csv.triples((t_subj, ns_film_schema.stars, None))]

        cast.sort()
        stars.sort()

        if len(stars) != 0 and len(cast) != 0:
            score = calculate_similarity_percentage(cast, stars)

            if best['score'] < score:
                best['score'] = score
                best['text'] = subj
                best['csv'] = t_subj

print(f'score {best}')

for subj, pred, obj in text.triples((None, ns_text.performer, None)):
    if len(list(text.triples((best['text'], None, subj)))) != 0 or len(
            list(text.triples((subj, None, best['text'])))) != 0:
        text.add((best['text'], ns_text.characters, subj))
        text.add((subj, ns_text.presentinwork, best['text']))

for subj, pred, obj in text.triples((None, ns_text.characters, None)):
    text.remove((subj, ns_text.name, None))

score {'score': 22.22222222222222, 'text': rdflib.term.URIRef('http://projet.fr/text/CaptainAmerica'), 'csv': rdflib.term.URIRef('http://projet.fr/films_data/26')}


In [103]:
text.serialize(destination="modified_text.ttl", format="turtle")

<Graph identifier=N4d5cfe9647484639a8a8f03271352a97 (<class 'rdflib.graph.Graph'>)>

In [104]:
def maxLen(l1, l2):
    if len(l1) > len(l2):
        return l1
    return l2


def merge(subj, pred, obj, old_pred, seen):
    if (subj, pred, obj) in seen:
        return seen

    seen.append((subj, pred, obj))

    if subj == best['text']:
        subject = best['csv']
    else:
        subject = ns_film_data[subj.split('/')[-1]]

    print(f'{obj}')

    if obj == ns_text.Node and subject != best['csv']:
        if old_pred == ns_text.characters:
            csv.add((subject, RDF.type, ns_perso_schema.Character))
        elif old_pred == ns_text.castmember:
            csv.add((subject, RDF.type, ns_perso_schema.Actor))
        else:
            csv.add((subject, RDF.type, OWL.onClass))
        return seen

    if pred.split('/')[-1] == 'name':
        property = SDO.name
    elif pred.split('/')[-1] == 'characters':
        property = SDO.character
    elif pred.split('/')[-1] == 'publicationdate':
        property = SDO.datePublished
    else:
        property = ns_film_schema[pred.split('/')[-1]]
        schema.add((property, RDF.type, OWL.ObjectProperty))

    if isinstance(obj, rdflib.term.Literal):
        csv.add((subject, property, obj))
        return seen

    if obj == best['text']:
        object = best['csv']
    else:
        object = ns_film_data[obj.split('/')[-1]]

    csv.add((subject, property, object))

    if pred == ns_text.castmember:
        for s, p, o in text.triples((obj, None, None)):
            seen = maxLen(merge(s, p, o, pred, seen), seen)
        return seen

    if pred == ns_text.characters:
        for s, p, o in text.triples((obj, None, None)):
            seen = maxLen(merge(s, p, o, pred, seen), seen)
        return seen

    for s, p, o in text.triples((obj, None, None)):
        seen = maxLen(merge(s, p, o, None, seen), seen)
    return seen


seen = []
for predicate in [ns_text.castmember, ns_text.characters, ns_text.publicationdate]:
    for subj, pred, obj in text.triples((best['text'], predicate, None)):
        print(seen)
        seen = merge(best['text'], pred, obj, pred, seen)

csv.remove((best['csv'], ns_film_schema.name, None))

[]
http://projet.fr/text/DominicCooper
http://projet.fr/text/Node
Dominic Cooper
[(rdflib.term.URIRef('http://projet.fr/text/CaptainAmerica'), rdflib.term.URIRef('http://projet.fr/text/castmember'), rdflib.term.URIRef('http://projet.fr/text/DominicCooper')), (rdflib.term.URIRef('http://projet.fr/text/DominicCooper'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://projet.fr/text/Node')), (rdflib.term.URIRef('http://projet.fr/text/DominicCooper'), rdflib.term.URIRef('http://projet.fr/text/name'), rdflib.term.Literal('Dominic Cooper'))]
http://projet.fr/text/HayleyAtwell
http://projet.fr/text/Node
Hayley Atwell
[(rdflib.term.URIRef('http://projet.fr/text/CaptainAmerica'), rdflib.term.URIRef('http://projet.fr/text/castmember'), rdflib.term.URIRef('http://projet.fr/text/DominicCooper')), (rdflib.term.URIRef('http://projet.fr/text/DominicCooper'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http

<Graph identifier=Ncf98c2872c804073bbe53d07e7d94029 (<class 'rdflib.graph.Graph'>)>

In [105]:
database = Graph()
database.parse("API/database.ttl")

ns_perso_data = Namespace('http://projet.fr/perso_data/')
csv.bind('p_data', ns_perso_data)

for subj, pred, obj in csv.triples((best['csv'], SDO.character, None)):
    for subj_d, pred_d, obj_d in database.triples((None, SDO.name, None)):
        #print(obj)
        name = csv.value(obj, SDO.name)
        #print([f'{subj} {pred} {obj}' for subj, pred, obj in csv.triples((obj, None, None))])
        #print(name)
        if obj_d == name:
            csv.add((obj, RDFS.seeAlso, ns_perso_data[subj_d.split('/')[-1]]))
            csv.add((obj, RDFS.seeAlso, ns_perso_data[subj_d.split('/')[-1].replace('P', 'H')]))

In [106]:
from unidecode import unidecode
from rdflib import URIRef

for subj, pred, obj in csv.triples((None, None, None)):
    if type(subj) != rdflib.Literal:
        csv.remove((subj, pred, obj))
        subj = URIRef(unidecode(subj))
        csv.add((subj, pred, obj))
    if type(obj) != rdflib.Literal:
        csv.remove((subj, pred, obj))
        obj = URIRef(unidecode(obj))
        csv.add((subj, pred, obj))

In [107]:
from SPARQLWrapper import SPARQLWrapper, JSON
from rdflib import Literal, XSD

endpoint = "http://dbpedia.org/sparql"
sparql = SPARQLWrapper(endpoint)


def search_resource_uri(title, director):
    # Construct the SPARQL query to search for the resource
    query = f"""
    SELECT ?subj
    WHERE {{
        ?subj rdfs:label ?name.
        ?subj dbo:director ?director.
        ?director rdfs:label ?named.
        FILTER(CONTAINS(?name, "%s"))
        FILTER(CONTAINS(?named, "%s"))
    }}
    LIMIT 1
    """ % (title, director)

    # Set the query and request JSON results
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    # Execute the query and parse the results
    results = sparql.query().convert()

    # Extract the URI of the resource, if available
    if "results" in results and "bindings" in results["results"]:
        bindings = results["results"]["bindings"]
        if bindings:
            b = bindings[0]["subj"]["value"]
            return b

    return None


for subj, pred, obj in csv.triples((None, RDF.type, SDO.Movie)):
    name = csv.value(subj, SDO.name)

    uri_director = csv.value(subj, SDO.director)

    director = csv.value(uri_director, SDO.name)

    print(f'{name} {director}')
    uri = search_resource_uri(name, director)

    if uri:
        print(f"URI {uri}")
        csv.add((subj, OWL.sameAs, Literal(uri, datatype=XSD.anyURI)))
    else:
        print(f"Resource '{name}' not found.")

Ghost Rider: Spirit of Vengeance Mark Neveldine
URI http://dbpedia.org/resource/Ghost_Rider:_Spirit_of_Vengeance
Batman & Robin Joel Schumacher
URI http://dbpedia.org/resource/Batman_&_Robin_(film)
Superman IV: The Quest for Peace Sidney J. Furie
URI http://dbpedia.org/resource/Superman_IV:_The_Quest_for_Peace
Avengers: Age of Ultron Joss Whedon
URI http://dbpedia.org/resource/Avengers:_Age_of_Ultron
The Punisher Jonathan Hensleigh
URI http://dbpedia.org/resource/The_Punisher_(2004_film)
Logan James Mangold
URI http://dbpedia.org/resource/Logan_(film)
The Wolverine James Mangold
URI http://dbpedia.org/resource/The_Wolverine_(film)
Spider-Man: Far from Home Jon Watts
URI http://dbpedia.org/resource/Spider-Man:_Far_From_Home
Captain America: The Winter Soldier Anthony Russo
Resource 'Captain America: The Winter Soldier' not found.
Captain America: Civil War Anthony Russo
Resource 'Captain America: Civil War' not found.
Deadpool Tim Miller
Resource 'Deadpool' not found.
X-Men: Apocalypse 

In [108]:
csv.serialize(destination="merge.ttl", format="turtle")
schema.serialize(destination="modified_schema_film.ttl", format="turtle")

<Graph identifier=N82ce41aeb035431d9c802b01a53d1de7 (<class 'rdflib.graph.Graph'>)>