In [1]:
import rdflib
import rdflib.term
from rdflib import Graph, Namespace, RDF, OWL

In [2]:
def calculate_similarity_percentage(list1, list2):
    """Return the Jaccard similarity of two collections as a percentage.

    Both inputs are reduced to sets, so duplicates and ordering are ignored.
    The score is ``|intersection| / |union| * 100``.

    Args:
        list1: Iterable of hashable items.
        list2: Iterable of hashable items.

    Returns:
        A float in ``[0, 100]``. When both inputs are empty there is nothing
        to compare, so ``0.0`` is returned instead of dividing by zero.
    """
    set1, set2 = set(list1), set(list2)
    all_elements = set1 | set2

    # Two empty inputs give an empty union; treat that as "no evidence of a match".
    if not all_elements:
        return 0.0

    common_elements = set1 & set2
    return (len(common_elements) / len(all_elements)) * 100

In [9]:
# Load the graph built from the CSV data and the graph extracted from text.
csv = Graph()
csv.parse("CSV/modified_movie.ttl")

text = Graph()
text.parse('Extract_Text/extracted_text.ttl')
ns_film_data = Namespace('http://projet.fr/films_data/')
ns_film_schema = Namespace('http://projet.fr/films_schema/')
ns_perso_schema = Namespace('http://projet.fr/perso_schema/')
ns_text = Namespace('http://projet.fr/text/')

csv.bind('p_schema', ns_perso_schema)
csv.bind('f_schema', ns_film_schema)
csv.bind("", ns_film_data)

schema = Graph()
schema.parse('schema_film.ttl')

# Best (text node, CSV film) pair found so far. The score is a percentage;
# -1 means no comparable pair has been seen yet.
best = {'score': -1, 'text': '', 'csv': ''}

# Star names of every CSV film, collected once instead of once per text node.
# Only the last path segment of each URI is compared.
film_stars = [
    (t_subj, [o.split('/')[-1] for s, p, o in csv.triples((t_subj, ns_film_schema.stars, None))])
    for t_subj, t_pred, t_obj in csv.triples((None, RDF.type, ns_film_schema.CreativeWork))
]

for subj, pred, obj in text.triples((None, RDF.type, None)):
    # The cast only depends on the text node, so compute it outside the film loop.
    cast = [o.split('/')[-1] for s, p, o in text.triples((subj, ns_text.castmember, None))]
    if not cast:
        continue

    for t_subj, stars in film_stars:
        if not stars:
            continue

        score = calculate_similarity_percentage(cast, stars)

        # Strict comparison: on equal scores the first pair encountered is kept.
        if best['score'] < score:
            best['score'] = score
            best['text'] = subj
            best['csv'] = t_subj

print(f'score {best}')

# Link every performer node that shares any triple with the matched work to
# that work, in both directions. Skipped when no match was found.
if best['score'] >= 0:
    # Snapshot the subjects first so `text` is not modified while one of its
    # own triple generators is still being consumed.
    performer_subjects = [s for s, p, o in text.triples((None, ns_text.performer, None))]

    for subj in performer_subjects:
        mentions_work = (
            next(text.triples((best['text'], None, subj)), None) is not None
            or next(text.triples((subj, None, best['text'])), None) is not None
        )
        if mentions_work:
            text.add((best['text'], ns_text.characters, subj))
            text.add((subj, ns_text.present_in_work, best['text']))

score {'score': 22.22222222222222, 'text': rdflib.term.URIRef('http://projet.fr/text/CaptainAmerica'), 'csv': rdflib.term.URIRef('http://projet.fr/films_data/26')}


In [10]:
# Write the text graph, including the links added above, to a Turtle file.
text.serialize(format="turtle", destination="modified_text.ttl")

<Graph identifier=N0e8b97565f794f8d9e60a35c4198b667 (<class 'rdflib.graph.Graph'>)>

In [11]:
def maxLen(l1, l2):
    """Return whichever argument is longer; on equal lengths return ``l2``."""
    return l1 if len(l1) > len(l2) else l2


def merge(subj, pred, obj, old_pred, seen):
    """Copy one triple of the ``text`` graph into ``csv`` and recurse on its object.

    Reads the module-level ``best``, ``text`` and namespace objects, and adds
    triples to the module-level ``csv`` and ``schema`` graphs.

    Args:
        subj: Subject of the text triple.
        pred: Predicate of the text triple.
        obj: Object of the text triple.
        old_pred: Predicate of the triple that led to ``subj`` (or ``None``);
            used to decide how a ``text:Node`` is typed in ``csv``.
        seen: List of triples already processed. It is appended to in place.

    Returns:
        The same ``seen`` list that was passed in.
    """
    triple = (subj, pred, obj)
    if triple in seen:
        return seen

    seen.append(triple)

    # The matched text node maps onto the matched CSV film; any other node maps
    # to a film-data URI built from the last path segment of its own URI.
    if subj == best['text']:
        subject = best['csv']
    else:
        subject = ns_film_data[subj.split('/')[-1]]

    print(f'{obj}')

    if obj == ns_text.Node:
        # The matched film keeps the types it already has in `csv`. Returning
        # here also stops its rdf:type triple from being copied below as a
        # bogus property named after the last segment of the rdf:type URI.
        if subject != best['csv']:
            if old_pred == ns_text.characters:
                csv.add((subject, RDF.type, ns_perso_schema.Person))
                csv.add((subject, RDF.type, ns_perso_schema.Fictional))
            elif old_pred == ns_text.castmember:
                csv.add((subject, RDF.type, ns_perso_schema.Person))
                csv.add((subject, RDF.type, ns_perso_schema.Real))
            else:
                # NOTE(review): owl:onClass is a property in OWL, not a class;
                # confirm which class was intended here.
                csv.add((subject, RDF.type, OWL.onClass))
        return seen

    target_property = ns_film_schema[pred.split('/')[-1]]

    if isinstance(obj, rdflib.term.Literal):
        # Literal-valued properties are datatype properties, not object properties.
        schema.add((target_property, RDF.type, OWL.DatatypeProperty))
        csv.add((subject, target_property, obj))
        return seen

    schema.add((target_property, RDF.type, OWL.ObjectProperty))

    if obj == best['text']:
        target = best['csv']
    else:
        target = ns_film_data[obj.split('/')[-1]]

    csv.add((subject, target_property, target))

    # Only castmember / characters are passed down as context for typing nodes.
    if pred == ns_text.castmember or pred == ns_text.characters:
        next_old_pred = pred
    else:
        next_old_pred = None

    # `seen` is mutated in place, so the recursive return value is the same list.
    for s, p, o in text.triples((obj, None, None)):
        merge(s, p, o, next_old_pred, seen)
    return seen


# Starting from the matched text node, copy its cast members, characters and
# publication date into `csv`; `seen` accumulates every triple processed.
seen = []
for predicate in (ns_text.castmember, ns_text.characters, ns_text.publicationdate):
    for subj, pred, obj in text.triples((best['text'], predicate, None)):
        print(seen)
        seen = merge(best['text'], pred, obj, pred, seen)

[]
http://projet.fr/text/DominicCooper
http://projet.fr/text/Node
[(rdflib.term.URIRef('http://projet.fr/text/CaptainAmerica'), rdflib.term.URIRef('http://projet.fr/text/castmember'), rdflib.term.URIRef('http://projet.fr/text/DominicCooper')), (rdflib.term.URIRef('http://projet.fr/text/DominicCooper'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://projet.fr/text/Node'))]
http://projet.fr/text/HayleyAtwell
http://projet.fr/text/Node
[(rdflib.term.URIRef('http://projet.fr/text/CaptainAmerica'), rdflib.term.URIRef('http://projet.fr/text/castmember'), rdflib.term.URIRef('http://projet.fr/text/DominicCooper')), (rdflib.term.URIRef('http://projet.fr/text/DominicCooper'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://projet.fr/text/Node')), (rdflib.term.URIRef('http://projet.fr/text/CaptainAmerica'), rdflib.term.URIRef('http://projet.fr/text/castmember'), rdflib.term.URIRef('http://projet.f

In [12]:
# Write the merged data graph and the extended schema graph to Turtle files.
csv.serialize(format="turtle", destination="merge.ttl")
schema.serialize(format="turtle", destination="modified_schema_film.ttl")

<Graph identifier=Ndcec6a62a20c4f8c9fa8cd4e456d3708 (<class 'rdflib.graph.Graph'>)>