# FilmOWLedge project
By CARASCO Mathias & PERRIN Nicolas

## Installs + Imports

In [None]:
# Run only if you need to run the "Enrich data" part
!py -m pip install -q git+https://github.com/boudinfl/pke.git
!py -m spacy download en_core_web_sm
!py -m pip install rdflib

In [1]:
from tqdm import tqdm
from time import sleep
import requests
import json
import re

In [2]:
import pke

In [3]:
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD

## Gathering data from API

In [None]:
API_KEY = "d6103800bdd84d0eedb89128368992ff"
LANGUAGE = "en-US"
BASE_URL = "https://api.themoviedb.org/3/discover/movie?"

MAX_PAGES = 500

def set_params(url: str, **kwargs):
    param_str = ""
    for key, value in kwargs.items():
        param_str += key+'='+str(value)+'&'
    return url+param_str[:-1]

pages_json = []

for page in tqdm(range(1, MAX_PAGES)):
    url = set_params(BASE_URL,
                     api_key=API_KEY,
                     language=LANGUAGE,
                     sort_by='popularity.desc',
                     include_adult='false',
                     include_video='false',
                     with_watch_monetization_types='flatrate',
                     page=page)
    r = requests.get(url)
    json_obj = r.json()
    pages_json.append(json_obj)
    file = open(f"api_dumps/page_{str(page).zfill(5)}.json", "w")
    file.write(json.dumps(json_obj, indent=4))
    sleep(0.1)

In [12]:
# Concatenation of individual movies to obtain a full list of movies
movies = []
for page in pages_json:
    movies.extend(page['results'])

In [31]:
# Saving data into a json file
f = open("movies.json", "w")
f.write(json.dumps(movies, indent=4))
f.close()

In [7]:
# Loading data from file saved in a previous session
movies = []
f = open("movies.json", "r")
movies = json.loads(f.read())
f.close()

In [8]:
list(map(lambda x: x['original_title'], movies))

['Puss in Boots: The Last Wish',
 'Avatar: The Way of Water',
 'Violent Night',
 'M3GAN',
 'The Chronicles of Narnia: The Lion, the Witch and the Wardrobe',
 'Troll',
 'Black Adam',
 'Glass Onion: A Knives Out Mystery',
 'Avatar',
 'The Woman King',
 'Strange World',
 "Roald Dahl's Matilda the Musical",
 'Savage Salvation',
 'Prey for the Devil',
 'High Heat',
 'Il mio nome è vendetta',
 'The Big 4',
 'Black Panther: Wakanda Forever',
 'Mal de ojo',
 'Encanto at the Hollywood Bowl',
 'Huevitos Congelados',
 "Guillermo del Toro's Pinocchio",
 'Detective Knight: Rogue',
 'Nanatsu no Taizai: Ensa no Edinburgh - Part 1',
 'Plan lekcji',
 'The Independent',
 'Avatar: The Deep Dive - A Special Edition of 20/20',
 'Night at the Museum: Kahmunrah Rises Again',
 'Medieval',
 'R.I.P.D. 2: Rise of the Damned',
 'Fall',
 'Terrifier 2',
 'The Boss Baby: Christmas Bonus',
 'Hex',
 'Снайпер. Білий ворон',
 '미용실 : 특별한 서비스 4',
 'The Menu',
 'Puss in Boots',
 'Top Gun: Maverick',
 'Abandoned',
 'Smile',

In [19]:
movies[0]

{'adult': False,
 'backdrop_path': '/r9PkFnRUIthgBp2JZZzD380MWZy.jpg',
 'genre_ids': [16, 28, 12, 35, 10751, 14],
 'id': 315162,
 'original_language': 'en',
 'original_title': 'Puss in Boots: The Last Wish',
 'overview': 'Puss in Boots discovers that his passion for adventure has taken its toll: He has burned through eight of his nine lives, leaving him with only one life left. Puss sets out on an epic journey to find the mythical Last Wish and restore his nine lives.',
 'popularity': 10563.563,
 'poster_path': '/1NqwE6LP9IEdOZ57NCT51ftHtWT.jpg',
 'release_date': '2022-12-21',
 'title': 'Puss in Boots: The Last Wish',
 'video': False,
 'vote_average': 8.6,
 'vote_count': 1067}

## Enrich data
The goal is now to enrich data by linking topics to movies. This will be achieved by using a Topic Modeling method to extract topics from the title of the movie and its overview (summary).

In [74]:
# initialize keyphrase extraction model, here TopicRank
extractor = pke.unsupervised.TopicRank()

def extract_keyphrases(extractor, text, n=20):
    # load the content of the document, here document is expected to be a simple 
    # test string and preprocessing is carried out using spacy
    extractor.load_document(input=text, language='en')

    # keyphrase candidate selection, in the case of TopicRank: sequences of nouns
    # and adjectives (i.e. `(Noun|Adj)*`)
    extractor.candidate_selection()

    # candidate weighting, in the case of TopicRank: using a random walk algorithm
    extractor.candidate_weighting()

    # N-best selection, keyphrases contains the 10 highest scored candidates as
    # (keyphrase, score) tuples
    keyphrases = extractor.get_n_best(n=n)

    return keyphrases

extract_keyphrases(extractor, movies[1]['original_title']+'. '+movies[1]['overview'])

[('jake', 0.09464521347108282),
 ('sully family', 0.08989761729212997),
 ('neytiri', 0.08618092059003643),
 ('way', 0.0744264160136417),
 ('water', 0.07197708073300846),
 ('kids', 0.07044810574474102),
 ('story', 0.06869781004088565),
 ('decade', 0.06467604488367745),
 ('events', 0.06388551888300528),
 ('trouble', 0.062358157270368884),
 ('avatar', 0.05586951509177656),
 ('alive', 0.05207573286481042),
 ('lengths', 0.051339732762403424),
 ('battles', 0.04885121937074795),
 ('tragedies', 0.04467091498768399)]

In [94]:
def cleanup_topic(topic):
    return re.sub(r'[^a-zA-Z0-9 ]', '', topic.replace('-', ' '))

cleanup_topic("epic-journey²*!")

'epic journey'

In [22]:
# We do the operation on each movie, so we have associated topics for each of them.
'''
for movie in tqdm(movies):
    movie["topics"] = list(map(lambda x: x[0],
                               extract_keyphrases(extractor, movie['original_title']+'. '+movie['overview'])
                              )
                          )
'''
for movie in tqdm(movies):
    list_of_topics = []
    for topic, _ in extract_keyphrases(extractor, movie['original_title']+'. '+movie['overview']):
        cleaned_topic = cleanup_topic(topic)
        if cleaned_topic.replace(' ', ''):
            list_of_topics.append(cleaned_topic)
    movie["topics"] = list_of_topics

100%|██████████████████████████████████████████████████████████████████████████| 10000/10000 [1:35:00<00:00,  1.75it/s]


In [95]:
@todo: remmove because already integrated to previous cell
# Postprocessing: we clean the topics (remove non alphanumeric caracters)
for movie in tqdm(movies):
    list_of_topics = []
    for topic in movie["topics"]:
        cleaned_topic = cleanup_topic(topic)
        if cleaned_topic.replace(' ', ''):
            list_of_topics.append(cleaned_topic)
    movie["topics"] = list_of_topics

100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 85721.55it/s]


In [97]:
# Saving the enriched data in the json format.
f = open("movies_enriched.json", "w")
f.write(json.dumps(movies, indent=4))
f.close()

In [18]:
# Loading enriched data from previous session
movies = []
f = open("movies_enriched.json", "r")
movies = json.loads(f.read())
f.close()

In [19]:
movies[1]

{'adult': False,
 'backdrop_path': '/s16H6tpK2utvwDtzZ8Qy4qm5Emw.jpg',
 'genre_ids': [878, 12, 28],
 'id': 76600,
 'original_language': 'en',
 'original_title': 'Avatar: The Way of Water',
 'overview': 'Set more than a decade after the events of the first film, learn the story of the Sully family (Jake, Neytiri, and their kids), the trouble that follows them, the lengths they go to keep each other safe, the battles they fight to stay alive, and the tragedies they endure.',
 'popularity': 3495.153,
 'poster_path': '/t6HIqrRAclMCA60NsSmeqe9RmNV.jpg',
 'release_date': '2022-12-16',
 'title': 'Avatar: The Way of Water',
 'video': False,
 'vote_average': 7.7,
 'vote_count': 3990,
 'topics': ['jake',
  'sully family',
  'neytiri',
  'way',
  'water',
  'kids',
  'story',
  'decade',
  'events',
  'trouble',
  'avatar',
  'alive',
  'lengths',
  'battles',
  'tragedies']}

In [20]:
# We show an overview of the different topics that have been collected and the number of movies each topic is associated with.

topics = {}
for movie in tqdm(movies):
    for topic in movie["topics"]:
        if topic in topics:
            topics[topic] += 1
        else:
            topics[topic] = 1

all_topics = list(topics.keys())
topics

100%|████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 141817.32it/s]


{'puss': 7,
 'boots': 9,
 'passion': 23,
 'adventure': 73,
 'lives': 242,
 'toll': 6,
 'life': 830,
 'epic journey': 12,
 'jake': 9,
 'sully family': 1,
 'neytiri': 1,
 'way': 391,
 'water': 26,
 'kids': 82,
 'story': 314,
 'decade': 14,
 'events': 125,
 'trouble': 66,
 'avatar': 13,
 'alive': 61,
 'lengths': 4,
 'battles': 6,
 'tragedies': 4,
 'team': 162,
 'wealthy family compound': 1,
 'christmas eve': 22,
 'santa claus': 15,
 'mercenaries': 13,
 'surprise combatant': 1,
 'hostage': 20,
 'grounds': 3,
 'nick': 22,
 'violent night': 1,
 'saint': 4,
 'life like doll': 2,
 'm3gan': 1,
 'artificial intelligence': 3,
 'new friend': 10,
 'brilliant toy company roboticist': 1,
 'programming works': 1,
 'overprotective': 1,
 'terrifying results': 1,
 'niece': 21,
 'narnia': 3,
 'witch': 14,
 'lion': 11,
 'wardrobe': 4,
 'siblings lucy': 1,
 'edmund': 3,
 'susan': 7,
 'children': 141,
 'aslan': 1,
 'climactic battle': 2,
 'spectacular': 5,
 'peter': 25,
 'wise': 7,
 'jadis': 1,
 'land': 88,


In [101]:
# We also define a function to play around and see with which movies one topic is associated with.
# Ultimately, this is just a POC, but we will later use RDF for our final application.

def search_keyword(keyword):
    if keyword in topics:
        print(f'{str(topics[keyword])} results found for "{keyword}"')
        print("------------------------------------------------------")
        for movie in movies:
            if keyword in movie["topics"]:
                print(movie['title'])

In [102]:
search_keyword('beasts')

4 results found for "beasts"
------------------------------------------------------
Transformers: Rise of the Beasts
Beasts of No Nation
Dragon Nest: Warriors' Dawn
L.O.R.D: Legend of Ravaging Dynasties


In [21]:
all_topics = []
for topic in topics:
    t_ = re.sub(r'[^a-zA-Z0-9 ]', '', topic)
    if t_.replace(' ', ''):
        all_topics.append(t_)

with open("all_topics.txt", "w") as f:
    f.write('\n'.join(all_topics))
all_topics_set = set(all_topics)
all_topics

['puss',
 'boots',
 'passion',
 'adventure',
 'lives',
 'toll',
 'life',
 'epic journey',
 'jake',
 'sully family',
 'neytiri',
 'way',
 'water',
 'kids',
 'story',
 'decade',
 'events',
 'trouble',
 'avatar',
 'alive',
 'lengths',
 'battles',
 'tragedies',
 'team',
 'wealthy family compound',
 'christmas eve',
 'santa claus',
 'mercenaries',
 'surprise combatant',
 'hostage',
 'grounds',
 'nick',
 'violent night',
 'saint',
 'life like doll',
 'm3gan',
 'artificial intelligence',
 'new friend',
 'brilliant toy company roboticist',
 'programming works',
 'overprotective',
 'terrifying results',
 'niece',
 'narnia',
 'witch',
 'lion',
 'wardrobe',
 'siblings lucy',
 'edmund',
 'susan',
 'children',
 'aslan',
 'climactic battle',
 'spectacular',
 'peter',
 'wise',
 'jadis',
 'land',
 'free',
 'eternal winter',
 'peaceful kingdom',
 'charming',
 'glacial powers',
 'dovre',
 'mountain',
 'capital',
 'creature',
 'norway',
 'gigantic awakens',
 'path',
 'years',
 'troll',
 'norwegian folklo

In [22]:
def find_similar_topics_2(topic, all_topics_):
    similar_topics = []
    for topic_tested in all_topics_:
        if topic_tested in (topic+'s', topic+topic[-1]+'y'):
            similar_topics.append(topic_tested)
    return similar_topics

def find_similar_topics(topic, all_topics_): # Much faster
    similar_topics = []
    for tested_topic in (topic+'s', topic+topic[-1]+'y'):
        if tested_topic in all_topics_:
            similar_topics.append(tested_topic)
    return similar_topics

k = 'sun'
#search_keyword(k)
find_similar_topics(k, all_topics_set)

['sunny']

## Lifting Topics to RDF
We will now lift our data to RDF, precisely our Topics, so we will later be able to link movies to each others from those topics. For that, we have to create a vocabulary that will allow this interoperability.

In [29]:
ttl_header = """@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix : <http://myvocab.org/> .

:Topic a skos:Concept ;
    skos:prefLabel "Topic"@en ;
    rdfs:label "Topic"@en ;
    skos:definition "A subject or area of interest or discussion."@en .

:similarTo a owl:SymmetricProperty ;
    rdfs:label "similar to"@en ;
    rdfs:comment "A word that is similar to another in terms of meaning and construction."@en ;
    rdfs:domain :Topic ;
    rdfs:range :Topic .
    
:relevance a owl:DatatypeProperty ;
    rdfs:label "relevance"@en ;
    rdfs:comment "relevance of a topic based on its specificity. The more semantically complex the more relevant."@en ;
    rdfs:domain :Topic ;
    rdfs:range xsd:integer .
"""


In [30]:
def format_str(string: str, **kwargs):
    for key, value in kwargs.items():
        string = string.replace('{{ '+key+' }}', str(value))
    return string

def to_camel_case(string):
    string = string.replace('-', ' ')
    words = string.split(" ")
    camel_case = ""
    for word in words[:]:
        camel_case += word.capitalize()
    return camel_case


def topic_to_rdf(topic, related_topics):
    text = ''':{{ ressource_name }} a :Topic ;
    skos:prefLabel "{{ topic }}"@en ;
    :relevance {{ relevance }}'''
    rdf = format_str(text,
                     ressource_name=to_camel_case(topic),
                     topic=topic,
                     relevance=len(topic.split(' ')))
    if related_topics:
        rdf += ' ;\n    :similarTo '+', '.join(map(lambda x: ':'+to_camel_case(x), related_topics))
    return rdf+" . \n\n"

print(topic_to_rdf("fun", ["funny"]))

:Fun a :Topic ;
    skos:prefLabel "fun"@en ;
    :relevance 1 ;
    :similarTo :Funny . 




In [31]:
ttl_body = ttl_header
for topic in tqdm(all_topics_set):
    ttl_body += topic_to_rdf(topic, find_similar_topics(topic, all_topics_set))
    
# Saving the ttl file.
f = open("topics.ttl", "w", encoding="utf-8")
f.write(ttl_body)
f.close()

100%|█████████████████████████████████████████████████████████████████████████| 49701/49701 [00:01<00:00, 36116.80it/s]


In [32]:
# Saving the ttl file.
f = open("topics.ttl", "w", encoding="utf-8")
f.write(ttl_body)
f.close()

## Interact with the system
Now, let us play around using our ontology to get recommended movies.

In [4]:
print("--------------------------------------------------------------")
print("----------------------- FILMOWLEDGE CLI ----------------------")
print("--------------------------------------------------------------")
print()
print("Please describe what kind of movies you are looking for today:")
textual_search = input()
print()
print("--------------------------------------------------------------")
print("Voici quelques suggestions de films pour vous...")
#keyphrases = extract_keyphrases(extractor, textual_search, n=2)
print("Topics of interest:")
for keyphrase in keyphrases:
    similar_topics = find_similar_topics(keyphrase, all_topics_set)
    print(similar_topics)

--------------------------------------------------------------
----------------------- FILMOWLEDGE CLI ----------------------
--------------------------------------------------------------

Please describe what kind of movies you are looking for today:
beast

--------------------------------------------------------------
Voici quelques suggestions de films pour vous...
Topics of interest:


NameError: name 'keyphrases' is not defined

In [33]:
def categorize_input(input_string, n_groups):
    input_string = re.sub(r'[^a-zA-Z0-9 ]', '', input_string.replace('-', ' ').lower())
    print(input_string)
    words = input_string.split(' ')
    clusters = []
    for i in range(n_groups):
        clusters.append([])
        k = 0
        while k+i < len(words):
            clusters[i].append(' '.join(words[k:min(k+i+1, len(words))]))
            k = k+1
    return clusters
    
print(categorize_input("hello today the sky is blue. And i am in an endless-love!", 4))

hello today the sky is blue and i am in an endless love
[['hello', 'today', 'the', 'sky', 'is', 'blue', 'and', 'i', 'am', 'in', 'an', 'endless', 'love'], ['hello today', 'today the', 'the sky', 'sky is', 'is blue', 'blue and', 'and i', 'i am', 'am in', 'in an', 'an endless', 'endless love'], ['hello today the', 'today the sky', 'the sky is', 'sky is blue', 'is blue and', 'blue and i', 'and i am', 'i am in', 'am in an', 'in an endless', 'an endless love'], ['hello today the sky', 'today the sky is', 'the sky is blue', 'sky is blue and', 'is blue and i', 'blue and i am', 'and i am in', 'i am in an', 'am in an endless', 'in an endless love']]


In [34]:
g = Graph()

# On charge les schémas
g.parse("../ontology/schema/MergedSchemas.ttl", format="turtle")
g.parse("topics.ttl", format="turtle")
g.parse("../Filmowledge/DataLifting/output.ttl", format="turtle")

<Graph identifier=Nd2a67351a92f4bf589fb0a8a0a313678 (<class 'rdflib.graph.Graph'>)>

In [10]:
def sparql_request_valid_topics(graph, topics):
    res = {i:[] for i in range(len(topics))}
    query = '''
        SELECT ?resource ?score
        WHERE {
        '''
    unions = []
    for i in range(len(topics)):
        topics_str = '"'+'"@en, "'.join(topics[i])+'"@en'
        unions.append(
        '''
          {
            ?resource skos:prefLabel ?label 
            VALUES ?score {'''+ str(i+1) +'''}
            FILTER (?label IN ('''+ topics_str +'''))
          }
          ''')
    query = query + ' UNION '.join(unions) + '}'
    t_ = graph.query(query)
    for _ in t_:
        res[i].append(_.resource)
    return res
        

def sparql_request_from_topics(graph, topics):
    '''
    Requests the graph to retrieve the movies from the given topics.
    Topics has to be a list : names = ["Action", "Romantic", "Thriller", ...]
    '''
    topics = [':'+topic for topic in topics] # Add ':' to match the format of topics in the graph
    sparql_query = """
    PREFIX movie: <http://example.com/movie#>
    SELECT *
    WHERE {
        ?movie movie:hasTopic ?topic.
        ?movie movie:Title ?title.
        ?movie movie:Popularity ?popularity.
        FILTER (?topic IN ( """ + ' '.join(topics) + """))
    }
    ORDER BY DESC(?popularity)
    """
    qres = g.query(sparql_query)
    for row in qres:
        print(f"{row.title} is about {row.topic} and has a popularity of {row.popularity}")
        
#Appel d'exemple, pour récupérer les films parlant de Criminals
#sparql_request_from_topics(g, ["Beast"])

r = sparql_request_valid_topics(g, categorize_input("hello today the sky is blue. And i am in an endless-love!", 4))
r
#sparql_request_from_topics(g, concat_res)

hello today the sky is blue and i am in an endless love


{0: [],
 1: [],
 2: [],
 3: [rdflib.term.URIRef('http://myvocab.org/Love'),
  rdflib.term.URIRef('http://myvocab.org/Today'),
  rdflib.term.URIRef('http://myvocab.org/Sky'),
  rdflib.term.URIRef('http://myvocab.org/Blue'),
  rdflib.term.URIRef('http://myvocab.org/EndlessLove')]}

In [None]:
sparql_request_from_topics(g, ["Beast"])