In [10]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")

In [11]:
from rdflib.namespace import Namespace, RDF, RDFS, XSD
from rdflib.term import URIRef, Literal
import csv
import json
import networkx as nx
import pandas as pd
import rdflib
from collections import defaultdict, Counter
import locale

In [12]:
graph = rdflib.Graph()
graph.parse('data/14_graph.nt', format='turtle')

<Graph identifier=N374aa996c5db4373b13cc5a72307b1c2 (<class 'rdflib.graph.Graph'>)>

In [13]:
# prefixes used in the graph
WD = Namespace('http://www.wikidata.org/entity/')
WDT = Namespace('http://www.wikidata.org/prop/direct/')
SCHEMA = Namespace('http://schema.org/')
DDIS = Namespace('http://ddis.ch/atai/')

In [14]:
entities = set(graph.subjects()) | {s for s in graph.objects() if isinstance(s,URIRef)}
predicates = set(graph.predicates())
literals = {s for s in graph.objects() if isinstance(s, Literal)}

In [15]:
genre_list = {str(uri):str(s) for uri,s in graph.query('''
PREFIX ddis: <http://ddis.ch/atai/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX schema: <http://schema.org/>
SELECT ?genre ?lbl WHERE {
?movie wdt:P31 wd:Q11424 .
?movie wdt:P136 ?genre .
?genre rdfs:label ?lbl .
}
''')}

len(genre_list)

364

In [16]:
people_list = {str(uri):str(s) for uri,s in graph.query('''
PREFIX ddis: <http://ddis.ch/atai/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX schema: <http://schema.org/>
SELECT ?person ?lbl WHERE {
?person wdt:P31 wd:Q5 .
?person rdfs:label ?lbl .
}
''')}

len(people_list)

100157

In [17]:
movie_list = {str(uri):str(s) for uri,s in graph.query('''
PREFIX ddis: <http://ddis.ch/atai/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX schema: <http://schema.org/>
SELECT ?movie ?lbl WHERE {
?movie wdt:P31 wd:Q11424 .
?movie rdfs:label ?lbl .
}
''')}

len(movie_list)

24384

In [18]:
predicate_dict = {}
prefix_string = "PREFIX ddis: <http://ddis.ch/atai/> PREFIX wd: <http://www.wikidata.org/entity/> PREFIX wdt: <http://www.wikidata.org/prop/direct/> PREFIX schema: <http://schema.org/> "
for uri in predicates:
    try:
        p = uri.split("/")[-1]
        query_string = "SELECT ?lbl WHERE { wdt:PREDICATE rdfs:label ?lbl . }".replace("PREDICATE",p)
        res = graph.query(prefix_string + query_string)
        label, = res
        predicate_dict[str(uri)] = str(label[0])
    except:
        pass

In [19]:
predicate_dict

{'http://www.wikidata.org/prop/direct/P931': 'place served by transport hub',
 'http://www.wikidata.org/prop/direct/P6658': 'RCQ classification',
 'http://www.wikidata.org/prop/direct/P915': 'filming location',
 'http://www.wikidata.org/prop/direct/P108': 'employer',
 'http://www.wikidata.org/prop/direct/P22': 'father',
 'http://www.wikidata.org/prop/direct/P5970': 'Medierådet rating',
 'http://www.wikidata.org/prop/direct/P3428': 'INCAA film rating',
 'http://www.wikidata.org/prop/direct/P461': 'opposite of',
 'http://www.wikidata.org/prop/direct/P4584': 'first appearance',
 'http://www.wikidata.org/prop/direct/P3095': 'practiced by',
 'http://www.wikidata.org/prop/direct/P466': 'occupant',
 'http://www.wikidata.org/prop/direct/P2408': 'set in period',
 'http://www.wikidata.org/prop/direct/P123': 'publisher',
 'http://www.wikidata.org/prop/direct/P7327': 'OFLC classification',
 'http://www.wikidata.org/prop/direct/P161': 'cast member',
 'http://www.wikidata.org/prop/direct/P26': 'spou

In [20]:
# Defining some dictionaries for mapping wrods to predicates
roles_ner = {
    "actor":"P161",
    "cast":"P161",
    "cast member":"P161",
    "director":"P57",
    "screenwriter":"P58",
    "producer":"P162",
    "developer" : "P178",
    "film editor" : "P1040",
    "director of photography" : "P344",
    "film crew member" : "P2079",
    "choreographer" : "P1809",
    #"father" : "P22",
    #"spouse" : "P26",
    "art director" : "P3174",
    #"winner" : "P1346",
    "author" : "P50",
    "presenter" : "P371",
    "narrator" : "P2438",
    "animator" : "P6942",
    "creator" : "P170",
    #"student" : "P802",
    #"mother" : "P25",
    "participant" : "P710",
    "member of the crew of" : "P5096",
    "voice actor" : "P725",
    #"employer" : "P108",
    "publisher" : "P123",
    "musical conductor" : "P3300",
    "operator" : "P137",
    "performer" : "P175"
}

actions_ner = {
    "acted":"P161",
    "directed":"P57",
    "screenwrote":"P58",
    "wrote":"P58",
    "written":"P58",
    "produced":"P162",
    "featured":"P161",
    "featuring":"P161", 
    "recorded" : None,
    #"appeared" : None,
    "appear" : None,
    "direct" : None,
    #"produced" : None,
    "produce" : None,
    "filmed" : None,
    "edit" : None,
    #"film" : None,
    "edited" : None,
    #"cast" : None,
    "shoot" : None,
    "premiere" : None,
    #"release" : None,
    "distribute" : None,
    "directing":None,
    "played": None,
    "made": None,
    "authored": None,
    "appears": None
}

predicates_ner = {
    "release" : "P577",
    "when" : "P577",
    #"release year": "P577",
    "date": "P577",
    "year": "P577",
    #"date of release": "P577",
    "genre": "P136",
    "type": "P136",
    "category": "P136",
    #"ratings": "P444",
    "rated": "P444",
    "rating": "P444",
    "review": "P444",
    "score":"P444",
    # Consider if this is the right place to have it
    "suggest": None,
    "recommend": None
}

numbers_ner = [
    "one",
    "two",
    "three",
    "four",
    "five",
    "six",
    "seven",
    "eight",
    "nine",
    "ten",
    "eleven",
    "twelve",
    "thirteen",
    "fourteen",
    "fifteen",
    "sixteen",
    "seventeen",
    "eighteen",
    "nineteen",
    "twenty"
]

nums = list(range(1, 21))
nums = list(map(str, nums))
numbers_ner.extend(nums)

years_ner = list(range(1678, 2025))
years_ner = list(map(str, years_ner))


genres_ner = {v.replace(" film",""):k for k,v in genre_list.items()}
genres_ner = {k.replace("genre","").strip():v for k,v in genres_ner.items()}
del(genres_ner['film score'])

In [21]:
numbers_ner

['one',
 'two',
 'three',
 'four',
 'five',
 'six',
 'seven',
 'eight',
 'nine',
 'ten',
 'eleven',
 'twelve',
 'thirteen',
 'fourteen',
 'fifteen',
 'sixteen',
 'seventeen',
 'eighteen',
 'nineteen',
 'twenty',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20']

In [33]:
sentence_list_1=[ #Person search
    "Give me the <role> of <movie_name>",
    "Give me the name of the <role> of <movie_name>",
    "I'd like to know the <role>'s name for <movie_name>",
    "I'm curious about the person who <directed/wrote> <movie_name>",
    "Identify the <role> of <movie_name>",
    "Tell me the <role> behind <movie_name>",
    "Tell me the <role> for <movie_name>",
    "Tell me the <role> of <movie_name>",
    "What is the <role>'s name for the movie <movie_name>?",
    "Who <role> in <movie_name>?",
    "Who <directed/wrote> <movie_name>?",
    "Who <directed/wrote> the movie <movie_name>?",
    "Who is the <role> of <movie_name>?",
    "Who is the <role> of the film <movie_name>?",
    "Who oversaw the making of <movie_name>",
    "Who was in charge of directing <movie_name>?",
    "Who was the <role> of <movie_name>"
]

sentence_list_2 = [ # Movie search
    "Give me a list of <genre> movies directed by <name>",
    "Give me a list of <genre> movies featuring <name>",
    "Give me a list of <genre> movies written by <name>",
    "Give me a list of movies directed by <name>",
    "Give me a list of movies featuring <name>",
    "Give me a list of movies written by <name>",
    "In which movies did <name> played a role?",
    "In which movies does <name> appear?",
    "List <genre> films written by <name>",
    "List movies which were directed by <name>",
    "List movies which were written by <name>",
    "List the films made by <name>",
    "List films authored by <name>",
    "Retrieve the films authored by <name>",
    "Retrieve the films directed by <name>",
    "Retrieve the <genre> films with screenplays authored by <name>",
    "Select all the movies of <name>",
    "Select all the movies with <name>",
    "Show me <genre> movies directed by <name>",
    "Show me <genre> movies written by <name>",
    "Show me the films directed by <name>",
    "Show me movies written by <name>",
    "Show me the films where <name> appears",
    "Show me the films with <name> in the cast",
    "Tell me the movies where <name> makes appearances",
    "What are the movies with <name> as a cast member?",
    "Which <genre> films has <name> directed?",
    "Which <genre> films has <name> written?",
    "Which films has <name> written?",
    "Which movies has <name> <acted/wrote>",
    "Which movies has <name> directed?"
]

sentence_list_3 = [ # retrieve release date
    "Could you let me know the release date for <movie_name>?",
    "Could you tell me when <movie_name> was first shown?",
    "Give me the release date for <movie_name>",
    "I'd like to know the release date of the movie <movie_name>",
    "Please provide the release year for <movie_name>",
    "Tell me the release date of <movie_name>",
    "What's the date of release for <movie_name>?",
    "What's the release year of the movie <movie_name>?",
    "What's the launch date of <movie_name>?",
    "What's the debut date for <movie_name>?",
    "What is the release year of <movie_name>?",
    "When did <movie_name> come out?",
    "When did <movie_name> hit the theaters?",
    "When was <movie_name> released?",
    "When was <movie_name> launched?",
    "When was the movie <movie_name> released?",
    "When did <movie_name> first appear in cinemas?"
]

sentence_list_4 = [ # genre
    "Give me the genre for the movie <movie_name>",
    "I'd like to know the genre of <movie_name>",
    "I'm curious about the genre of <movie_name>",
    "In which genre does '<movie_name>' fit?",
    "Identify the movie genre for <movie_name>",
    "Tell me the genre of <movie_name>",
    "Tell me the type of film that is <movie_name>",
    "What category does <movie_name> fall into?",
    "What genre is associated with <movie_name>?",
    "What is the movie genre for <movie_name>?",
    "What is the genre of <movie_name>?",
    "Which category does <movie_name> fit into?",
    "Which genre does the movie <movie_name> belong to?",
    "Which genre does <movie_name> belong to?"
]

sentence_list_5 = [ #ratings
    "Display the movies with the <highest/lowest> ratings",
    "Give me the <top/bottom> <number> movies based on rating",
    "Give me the <top/bottom> rated movies",
    "Give me the <highest/lowest>-rated films",
    "Give me the <top/bottom> <number> movies based on rating",
    "Give me the <top/bottom> rated movies",
    "Give me the movie that has the <highest/lowest> user rating",
    "I'd like to know the movie with the <highest/lowest> user rating",
    "I'm curious about the movie with the <highest/lowest> user rating",
    "Identify the film with the <highest/lowest> user rating",
    "I want to see the <top/bottom>-rated movies",
    "List the movies with the <best/worst> ratings",
    "Provide a list of the <top/bottom>-rated movies",
    "Retrieve the movies with the <highest/lowest> ratings",
    "Show me the <highest/lowest>-rated films",
    "Show the movies with the <best/worst> ratings",
    "Tell me the film that received the <highest/lowest> user rating",
    "Tell me the movie with the <highest/lowest> user rating",
    "What are the most <highest/lowest>-rated films?",
    "What is the movie with the <highest/lowest> user rating?",
    "What movie has the <highest/lowest> user rating?",
    "Which film received the <highest/lowest> user rating?",
    "Which movie has the <highest/lowest> user rating?",
    "Which movie is rated the <highest/lowest> by users?"
]

sentence_list_6 = [
    "Show me the <highest/lowest>-rated <genre> films",
    "List the <genre> movies with the <best/worst> ratings",
    "I want to see the <top/bottom>-rated <genre> movies",
    "Display the <genre> movies with the <highest/lowest> ratings",
    "What are the most highly-rated <genre> films?",
    "Provide a list of the <top/bottom>-rated <genre> movies",
    "Give me the <highest/lowest>-rated <genre> films",
    "Show the movies with the <best/worst> <genre> ratings",
    "Retrieve the movies with the <highest/lowest> <genre> ratings",
]

sentence_list_7 = [
    "Show me the <highest/lowest>-rated <genre> films <action> <name>",
    "List the <genre> movies with the <best/worst> ratings <action> <name>",
    "I want to see the <top/bottom>-rated <genre> movies <action> <name>",
    "Display the <genre> movies with the <highest/lowest> ratings <action> <name>",
    "What are the most highly-rated <genre> films <action> <name>?",
    "Provide a list of the <top/bottom>-rated <genre> movies <action> <name>",
    "Give me the <highest/lowest>-rated <genre> films <action> <name>",
    "Show the movies with the <best/worst> <genre> ratings <action> <name>",
    "Retrieve the movies with the <highest/lowest> <genre> ratings <action> <name>",

    "List the <genre> movies <action> <name> with the <best/worst> ratings",
    "Display the <genre> movies <action> <name> with the <highest/lowest> ratings",
    "Provide a list of the <top/bottom>-rated <genre> movies",
    "Show the movies <action> <name> with the <best/worst> <genre> ratings",
    "Retrieve the movies <action> <name> with the <highest/lowest> <genre> ratings",
]

sentence_list_8 = [ # year-related
    "List the movies released in <year>",
    "Which movies were directed by <name> in <year>?",
    "What movies were released in <year>?",
    "What movies did <name> appear in in <year>?",
    "List the films featuring <name> in <year>",
    "Provide the filmography of <name> in <year>",
    "Which movies starred <name> in <year>?",
    "Which roles were played by <name> in <year>?"
]

sentence_list_9 = [ #Top/lowest N (based on release date)
    "What are the <best/worst> <number> movies <name> directed?",
    "List the <best/worst> <number> movies directed by <name>",
    "What are the <best/worst> <number> movies <name> directed?",
    "What are the <best/worst> <number> <genre> movies <name> directed?",
    "Which are the <best/worst> movies where <name> appeared?",
    "Can you name the <best/worst> <number> <genre> movies directed by <name>?",
    "Can you list the <best/worst> <number> films with <name> in the cast?",
    "What are the <best/worst> <number> movies starring <name>"
]

sentence_list_10 = [ #Yes/No
    "Is <name> in the cast of <movie_name>?",
    "Did <name> direct <movie_name>?",
    "Was the <movie> released in <year>?",
    "Is <name> part of the cast of <movie_name>?",
    "Was <movie_name> released before <year>?",
    "Is <movie_name> classified as <genre>?",
    "Did <name> appear in <movie_name>?",
]

sentence_list_11 = [
    # Movie recommendation
    # Three movies
    "Given that I like <movie_name>, <movie_name>, and <movie_name>, can you recommend some movies?",
    "Recommend movies like <movie_name>, <movie_name>, and <movie_name>",
    "Since I enjoy <movie_name>, <movie_name>, and <movie_name>, do you have any movie recommendations for me?",
    "I'm a fan of <movie_name>, <movie_name>, and <movie_name>. Any suggestions for similar movies?",
    "Considering my liking for <movie_name>, <movie_name>, and <movie_name>, could you suggest some other films I might enjoy?",
    "Given my interest in <movie_name>, <movie_name>, and <movie_name>, can you recommend additional movies for me to watch?",
    "If I enjoy <movie_name>, <movie_name>, and <movie_name>, what other movies would you suggest?",
    "Being a fan of <movie_name>, <movie_name>, and <movie_name>, are there similar movies you'd recommend?",
    "Since I like <movie_name>, <movie_name>, and <movie_name>, what other movies do you think I might like?",
    "I've enjoyed <movie_name>, <movie_name>, and <movie_name>. Any recommendations for movies in a similar vein?",
    "Given my preference for <movie_name>, <movie_name>, and <movie_name>, what other films should I check out?",
    "If <movie_name>, <movie_name>, and <movie_name> are my preferences, do you have any movie recommendations for me?",
    "Suggest films similar to <movie_name>, <movie_name>, and <movie_name>",
    "Can you recommend movies akin to <movie_name>, <movie_name>, and <movie_name>?",
    "Looking for movies in the same vein as <movie_name>, <movie_name>, and <movie_name>. Any recommendations?",
    "Seeking recommendations for movies like <movie_name>, <movie_name>, and <movie_name>",
    "Do you know any movie similar to <movie_name>, <movie_name>, and <movie_name>",
    "I'm on the lookout for captivating films that share a vibe with <movie_name>, <movie_name>, and <movie_name>. Any suggestions that align with these favorites of mine?",
    "In the mood for something reminiscent of <movie_name>, <movie_name>, and <movie_name>. Any movie recommendations that capture a similar essence?",
    "If my taste for movies like <movie_name>, <movie_name>, and <movie_name> is any indication, what other films should I consider watching?",
    "Looking to expand my movie playlist based on the likes of <movie_name>, <movie_name>, and <movie_name>. Any hidden gems you'd recommend?",
    "If the cinematic style of <movie_name>, <movie_name>, and <movie_name> resonates with you, could you suggest other films that share a similar appeal?",
    "Enjoyed the storytelling in <movie_name>, <movie_name>, and <movie_name>. Any recommendations for movies that weave narratives in a comparable fashion?",
    "Based on the cinematic experience of <movie_name>, <movie_name>, and <movie_name>, are there any other films you'd suggest to keep the excitement alive?",
    "If the movie magic of <movie_name>, <movie_name>, and <movie_name> has left a lasting impression, what other films do you think would leave me equally enchanted?",
    "Immersed in the world of <movie_name>, <movie_name>, and <movie_name>, I'm curious about your recommendations for movies that provide a similar level of immersion.",
    "Exploring films akin to <movie_name>, <movie_name>, and <movie_name> — any suggestions to keep the cinematic journey going?",
    "Captivated by the storytelling in <movie_name>, <movie_name>, and <movie_name>. What other films do you think would tell a tale in a way that resonates with me?",
    
    
    
    #Two movies
    "Being a fan of <movie_name> and <movie_name>, are there similar movies you'd recommend?",
    "Based on the cinematic experience of <movie_name> and <movie_name>, are there any other films you'd suggest to keep the excitement alive?",
    "Can you recommend movies akin to <movie_name> and <movie_name>?",
    "Captivated by the storytelling in <movie_name> and <movie_name>. What other films do you think would tell a tale in a way that resonates with me?",
    "Considering my liking for <movie_name> and <movie_name>, could you suggest some other films I might enjoy?",
    "Do you know any movie similar to <movie_name> and <movie_name>?",
    "Enjoyed the storytelling in <movie_name> and <movie_name>. Any recommendations for movies that weave narratives in a comparable fashion?",
    "Given my interest in <movie_name> and <movie_name>, can you recommend additional movies for me to watch?",
    "Given my preference for <movie_name> and <movie_name>, what other films should I check out?",
    "I'm a fan of <movie_name> and <movie_name>. Any suggestions for similar movies?",
    "If I enjoy <movie_name> and <movie_name>, what other movies would you suggest?",
    "I've enjoyed <movie_name> and <movie_name>. Any recommendations for movies in a similar vein?",
    "Looking for movies in the same vein as <movie_name> and <movie_name>. Any recommendations?",
    "Recommend movies like <movie_name> and <movie_name>",
    "Seeking recommendations for movies like <movie_name> and <movie_name>",
    "Since I enjoy <movie_name> and <movie_name>, do you have any movie recommendations for me?",
    "Since I like <movie_name> and <movie_name>, what other movies do you think I might like?",
    "Suggest films similar to <movie_name> and <movie_name>",
    "If <movie_name> and <movie_name> are my preferences, do you have any movie recommendations for me?",
    "Can you recommend movies that share a similar theme or storyline with both <movie_name> and <movie_name>?",
    
    #One movie
    "Since I enjoy <movie_name>, do you have any movie recommendations for me?",
    "Recommend movies like <movie_name>",
    "I'm a fan of <movie_name>. Any suggestions for similar movies?",
    "Considering my liking for <movie_name>, could you suggest some other films I might enjoy?",
    "Given my interest in <movie_name>, can you recommend additional movies for me to watch?",
    "If I enjoy <movie_name>, what other movies would you suggest?",
    "Being a fan of <movie_name>, are there similar movies you'd recommend?",
    "Since I like <movie_name>, what other movies do you think I might like?",
    "I've enjoyed <movie_name>. Any recommendations for movies in a similar vein?",
    "Given my preference for <movie_name>, what other films should I check out?",
    "If <movie_name> is my preference, do you have any movie recommendations for me?",
    "Suggest films similar to <movie_name>",
    "Can you recommend movies akin to <movie_name>?",
    "Looking for movies in the same vein as <movie_name>. Any recommendations?",
    "Seeking recommendations for movies like <movie_name>",

]

# Repeated instances
sentence_list_12 = [
    # Names
    "Do <name> and <name> appear in <movie_name>",
    "Do <name> and <name> appear in the same movie",
    "Have <name> and <name> ever worked together?",
    "Can you list all the movies starring <name>, <name>, and <name>?",
    "Can you recommend me a movie with a similar style than the movies from <name> and <name>", 
    "Can you list all the movies starring both <name> and <name>?",
    "Are there any films where <name>, <name> and <name> collaborated as <role>",
    "Is there a movie that features both <name> and <name> in prominent roles?",
    "Have <name> and <name> ever co-directed movies?",
    "Do <name> and <name> share the screen in any movies?",
    "Are there any notable collaborations between <name>, <name>, and <name> in the film industry?",
    "Do <name> and <name> have any projects together?",
    "Can you recommend a movie that showcases the directorial styles of both <name> and <name>?",
    "Can you provide a list of films directed by <name> and starring <name>?",
    "Have <name> and <name> ever been involved in a movie project that received significant critical acclaim?",
    
    # Movies
    "Are there any staff members in common between <movie_name> and <movie_name>?",
    "Were <movie_name> and <movie_name> released the same year?",
    "Can you suggest films where the cast includes both <name> and <name>?",
    "Do <movie_name> and <movie_name> share any crew members or production staff?",
    "Which actors appeared in both <movie_name> and <movie_name>?",
    "Are there any common themes or genres between <movie_name> and <movie_name>?",
    "Do <movie_name> and <movie_name> belong to the same film franchise?",
    "Are there any notable connections between <movie_name> and <movie_name> in terms of directors or producers?",
    "Which actors from <movie_name> and <movie_name> have collaborated in other films?",
    "Are there any common filming locations between <movie_name> and <movie_name>?",
    "Which <role> or <role> contributed to the soundtracks of both <movie_name> and <movie_name>?",
    
    # genre
    "Recommend me movies with the genres <genre>, <genre>, and <genre>",
    "Are there any <genre> and <genre> movies?",
    "I am looking for <genre> and <genre> movies, anything you would recommend?",
    "Are there any movies that blend the genres of <genre> and <genre>?",
    "I am looking for movies that fall under both <genre> and <genre>. Anything you would recommend?",
    "Can you suggest films that seamlessly combine elements of <genre> and <genre>?",
    "Which movies successfully incorporate both <genre> and <genre> in their storytelling?",
    "Do you have recommendations for movies that showcase a mix of <genre> and <genre>?",
    "I'm interested in exploring the intersection of <genre> and <genre>. Any standout films in that category?",
    "Can you provide a list of movies that are known for their unique blend of <genre> and <genre>?",
    "Are there any critically acclaimed movies that successfully navigate the genres of both <genre> and <genre>?",
    "I enjoy movies that combine the elements of <genre> and <genre>. What are your top picks in this category?",
    "Can you recommend movies that span a wide range of genres, including <genre>, <genre>, and <genre>?",
    "I'm in the mood for a movie night with diverse genres. Any suggestions for films that include <genre> and <genre>?",
    "Do you have recommendations for movies that masterfully balance the genres of both <genre> and <genre>?"
    
    
]

In [23]:
i = 0
peoples = list(people_list.values())
genres = list(genres_ner.keys())
res_s2 = []
for name in peoples:
    i+=1
    if(i==20000):
        break
        
    genre = genres[i%len(genres)]
    sentence = sentence_list_2[i%len(sentence_list_2)]
    
    if(i%2==0):
        AoW = "acted" 
    else: 
        AoW = "wrote"
    sentence2 = sentence.replace("<name>",name).replace("<genre>",genre)
    sentence3 = sentence2.replace("<acted/wrote>",AoW)

    ents = [(sentence3.find(name),sentence3.find(name) + len(name), "name"),
            (sentence3.find(genre),sentence3.find(genre)+len(genre), "genre"),
            (sentence3.find(AoW),sentence3.find(AoW) + len(AoW), "action"),
            (sentence3.find("directed"),sentence3.find("directed") + len("directed"), "action"),
            (sentence3.find("written"),sentence3.find("written") + len("written"), "action"),
            (sentence3.find("featuring"),sentence3.find("featuring") + len("featuring"), "action"),
            (sentence3.find("played"),sentence3.find("played") + len("played"), "action"),
            (sentence3.find("appear"),sentence3.find("appear") + len("appear"), "action"),
            (sentence3.find("made"),sentence3.find("made") + len("made"), "action"),
            (sentence3.find("authored"),sentence3.find("authored") + len("authored"), "action"),
            (sentence3.find("appears"),sentence3.find("appears") + len("appears"), "action")]

    entities = []
    for ent in ents:
        if(ent[0]!=-1):
            entities.append(ent)


    res_s2.append((sentence3,{"entities":entities}))
    



len(res_s2)

19999

In [24]:
i = 0
roles = list(roles_ner.keys())
movies = list(movie_list.values())
res_s1 = []
for movie in movies:
    i+=1
    if(i==20000):
        break
        
    role = roles[i%len(roles)]
    sentence = sentence_list_1[i%len(sentence_list_1)]
    
    if(i%2==0):
        AoW = "directed" 
    else: 
        AoW = "wrote"
    sentence2 = sentence.replace("<role>",role).replace("<movie_name>",movie)
    sentence3 = sentence2.replace("<directed/wrote>",AoW)

    ents = [(sentence3.find(role),sentence3.find(role) + len(role), "role"),
            (sentence3.find(movie),sentence3.find(movie)+len(movie), "movie"),
            (sentence3.find(AoW),sentence3.find(AoW) + len(AoW), "action")]
    
    for action in actions_ner.keys():
        if action in sentence3 and action != AoW:
            ents.append((sentence3.find(action),sentence3.find(action) + len(action), "action"))
            break

    entities = []
    for ent in ents:
        if(ent[0]!=-1):
            entities.append(ent)


    res_s1.append((sentence3,{"entities":entities}))
    
len(res_s1)

19999

In [25]:
i = 0
movies = list(movie_list.values())
res_s3 = []
for movie in movies:
    i+=1
    if(i==10000):
        break
        
    sentence = sentence_list_3[i%len(sentence_list_3)]
    
    sentence3 = sentence.replace("<movie_name>",movie)

    ents = [(sentence3.find(movie),sentence3.find(movie)+len(movie), "movie")]
    
    for predicate in list(predicates_ner.keys()):
        if predicate in sentence3:
            ents.append((sentence3.find(predicate),sentence3.find(predicate) + len(predicate), "predicate"))

    entities = []
    for ent in ents:
        if(ent[0]!=-1):
            entities.append(ent)


    res_s3.append((sentence3,{"entities":entities}))
    
len(res_s3)

9999

In [26]:
i = 0
movies = list(movie_list.values())
res_s4 = []
for movie in movies:
    i+=1
    if(i==10000):
        break
        
    sentence = sentence_list_4[i%len(sentence_list_4)]
    
    sentence3 = sentence.replace("<movie_name>",movie)

    ents = [(sentence3.find(movie),sentence3.find(movie)+len(movie), "movie")]
    
    for predicate in list(predicates_ner.keys()):
        if predicate in sentence3:
            ents.append((sentence3.find(predicate),sentence3.find(predicate) + len(predicate), "predicate"))

    entities = []
    for ent in ents:
        if(ent[0]!=-1):
            entities.append(ent)


    res_s4.append((sentence3,{"entities":entities}))
    
len(res_s4)

9999

In [27]:
res_s5 = []
i = 0
action_list = list(actions_ner.keys())
genres = list(genres_ner.keys())

sentence_list_567 = sentence_list_5 + sentence_list_6 + sentence_list_7
for name in peoples:
    i+=1
    if(i%2==0):
        HoL = "highest"
        ToB = "top"
        BoW = "best"
    else:
        HoL = "lowest"
        ToB = "bottom"
        BoW = "worst"
    
    action = action_list[i%len(action_list)]
    sentence = sentence_list_567[i%len(sentence_list_567)]
    genre = genres[i%len(genres)]
    number = numbers_ner[i%len(numbers_ner)]
    is_number = False
    if "<number>" in sentence:
        is_number = True

    sentence2 = sentence.replace("<highest/lowest>",HoL).replace("<top/bottom>",ToB).replace("<best/worst>",BoW)
    sentence3 = sentence2.replace("<name>",name).replace("<action>",action).replace("<genre>",genre).replace("<number>", number)
    ents = [(sentence3.find(HoL),sentence3.find(HoL) + len(HoL), "order"),
            (sentence3.find("highly"),sentence3.find("highly") + len("highly"), "order"),
            (sentence3.find(ToB),sentence3.find(ToB)+len(ToB), "order"),
            (sentence3.find(genre),sentence3.find(genre) + len(genre), "genre"),
            (sentence3.find(BoW),sentence3.find(BoW) + len(BoW), "order"),
            (sentence3.find(name),sentence3.find(name) + len(name), "name"),
            #(sentence3.find(' '+number+' ')+1,sentence3.find(' '+number+' ')-1 + len(number), "numerical_entity")
           ]
    
    for action in actions_ner.keys():
        if action in sentence3:
            ents.append((sentence3.find(action),sentence3.find(action) + len(action), "action"))
            
    for predicate in list(predicates_ner.keys()):
        if predicate in sentence3:
            ents.append((sentence3.find(predicate),sentence3.find(predicate) + len(predicate), "predicate"))
    
    if is_number:
        ents.append((sentence3.find(' '+number+' ')+1,sentence3.find(' '+number+' ')+1 + len(number), "numerical_entity"))

    entities = []
    for ent in ents:
        if(ent[0]!=-1):
            entities.append(ent)

    res_s5.append((sentence3,{"entities":entities}))
    
    if(i==30000):
        break

len(res_s5)


30000

In [28]:
res_s6 = []
i = 0
genres = list(genres_ner.keys())

sentence_list_89 = sentence_list_8 + sentence_list_9
for name in peoples:
    i+=1
    if(i%2==0):
        BoW = "best"
    else:
        BoW = "worst"
    
    sentence = sentence_list_89[i%len(sentence_list_89)]
    genre = genres[i%len(genres)]
    number = numbers_ner[i%len(numbers_ner)]
    year = years_ner[i%len(years_ner)]
    is_number=False
    if "<number>" in sentence:
        is_number = True

    sentence2 = sentence.replace("<best/worst>",BoW).replace("<name>",name)
    sentence3 = sentence2.replace("<genre>",genre).replace("<year>",year).replace("<number>", number)
    ents = [(sentence3.find(name),sentence3.find(name) + len(name), "name"),
            (sentence3.find(BoW),sentence3.find(BoW) + len(BoW), "order"),
            (sentence3.find(genre),sentence3.find(genre) + len(genre), "genre"),
            (sentence3.find(year),sentence3.find(year) + len(year), "year"),
           ]
    
    for action in actions_ner.keys():
        if action in sentence3:
            ents.append((sentence3.find(action),sentence3.find(action) + len(action), "action"))
            
    for predicate in list(predicates_ner.keys()):
        if predicate in sentence3:
            ents.append((sentence3.find(predicate),sentence3.find(predicate) + len(predicate), "predicate"))
    
    if is_number:
        ents.append((sentence3.find(' '+number+' ')+1,sentence3.find(' '+number+' ')+1 + len(number), "numerical_entity"))


    entities = []
    for ent in ents:
        if(ent[0]!=-1):
            entities.append(ent)

    res_s6.append((sentence3,{"entities":entities}))
    
    if(i==10000):
        break

len(res_s6)

10000

In [44]:
import random
random.seed(42)

i = 0
predicates = list(predicates_ner.keys())
movies = list(movie_list.values())
movies_random = list(movie_list.values())
res_s7 = []
for movie in movies:
    i+=1
    if(i==20000):
        break
        
    sentence = sentence_list_11[i%len(sentence_list_11)]
    
    num_movies = sentence.count("<movie_name>")
    sentence = sentence.replace("<movie_name>", movie, 1)
    ents = [(sentence.find(movie),sentence.find(movie)+len(movie), "movie")]
    movies_random.remove(movie)
    chosen_movies = []
    
    for i in range(num_movies-1):
        next_movie = random.choice(movies_random)
        sentence = sentence.replace("<movie_name>", next_movie,1)
        ents.append((sentence.find(next_movie),sentence.find(next_movie)+len(next_movie), "movie"))
        movies_random.remove(next_movie)
        chosen_movies.append(next_movie)
    
    movies_random.append(movie)
    movies_random.extend(chosen_movies)

    for predicate in predicates:
        if predicate in sentence:
            ents.append((sentence.find(predicate),sentence.find(predicate) + len(predicate), "predicate"))
            break

    entities = []
    for ent in ents:
        if(ent[0]!=-1):
            entities.append(ent)


    res_s7.append((sentence,{"entities":entities}))
    
len(res_s7)

24384

In [45]:
import re

i = 0
predicates = list(predicates_ner.keys())
movies = list(movie_list.values())
peoples = list(people_list.values())
peoples_random = list(people_list.values())
genres = list(genres_ner.keys())
roles = list(roles_ner.keys())
res_s8 = []
for person in peoples:
    i+=1
    if(i==20000):
        break
        
    sentence = sentence_list_12[i%len(sentence_list_12)]
    
    # Find all the replaceable objects
    regex = re.compile('(?:<[a-z_]+>)')
    matches = re.finditer(regex, sentence)
    
    ents = []
    
    #Using random choice since we don't really know how many instances have of each one and would be more complex having multiple loops
    people_chosen=[]
    movies_chosen=[]
    genres_chosen=[]
    roles_chosen=[]
    for match in matches:
        if match.group() == '<name>':
            insertion = random.choice(peoples_random)
            insertion_type = 'name'
            people_chosen.append(insertion)
            peoples_random.remove(insertion)
        elif match.group() == '<movie_name>':
            insertion = random.choice(movies)
            insertion_type = 'movie'
            movies_chosen.append(insertion)
            movies.remove(insertion)
        elif match.group() == '<role>':
            insertion = random.choice(roles)
            insertion_type = 'role'
            roles_chosen.append(insertion)
            roles.remove(insertion)
        elif match.group() == '<genre>':
            insertion = random.choice(genres)
            insertion_type = 'genre'
            genres_chosen.append(insertion)
            genres.remove(insertion)
        
        sentence = sentence.replace(match.group(), insertion,1)
        ents.append((sentence.find(insertion),sentence.find(insertion)+len(insertion), insertion_type))
        
    peoples_random.extend(people_chosen)
    movies.extend(movies_chosen)
    genres.extend(genres_chosen)
    roles.extend(roles_chosen)

    for predicate in predicates:
        if predicate in sentence:
            ents.append((sentence.find(predicate),sentence.find(predicate) + len(predicate), "predicate"))
            break
    
    for action in actions_ner.keys():
        if action in sentence3:
            ents.append((sentence3.find(action),sentence3.find(action) + len(action), "action"))

    entities = []
    for ent in ents:
        if(ent[0]!=-1):
            entities.append(ent)


    res_s8.append((sentence,{"entities":entities}))
    
len(res_s8)

19999

In [46]:
TRAIN_DATA = res_s1 + res_s2 + res_s3 + res_s4 + res_s5 + res_s6 + res_s7 + res_s8
len(TRAIN_DATA)

144379

In [47]:
import pandas as pd
import os
from tqdm import tqdm
from spacy.tokens import DocBin

nlp = spacy.load("en_core_web_sm")

db = DocBin() # create a DocBin object

count = 0
for text, annot in tqdm(TRAIN_DATA): # data in previous format
    try:
        doc = nlp.make_doc(text) # create doc object from text
        ents = []
        for start, end, label in annot["entities"]: # add character indexes
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                #print("Skipping entity")
                count+=1
            else:
                ents.append(span)
        doc.ents = ents # label the text with the ents
        db.add(doc)
    except Exception as e:
        print(text)
        print(annot)
        print(e)
        count+=1
print(count)
#os.chdir(r'XXXX\XXXXX')
db.to_disk("train.spacy") # save the docbin object

 74%|█████████████████████████████████████████████████████▍                  | 107041/144379 [00:11<00:05, 6472.62it/s]

Since I enjoy One Breath, Breath, and Demonia, do you have any movie recommendations for me?
{'entities': [(14, 24, 'movie'), (18, 24, 'movie'), (38, 45, 'movie'), (69, 78, 'predicate')]}
[E1010] Unable to set entity information for token 4 which is included in more than one span in entities, blocked, missing or outside.


 78%|████████████████████████████████████████████████████████▍               | 113165/144379 [00:12<00:04, 6746.45it/s]

Since I enjoy Deepfrozen, Simple Simon, and Simon, do you have any movie recommendations for me?
{'entities': [(14, 24, 'movie'), (26, 38, 'movie'), (33, 38, 'movie'), (73, 82, 'predicate')]}
[E1010] Unable to set entity information for token 6 which is included in more than one span in entities, blocked, missing or outside.


 87%|██████████████████████████████████████████████████████████████▌         | 125376/144379 [00:14<00:02, 7156.06it/s]

Since I enjoy The Sea Inside, The Insult, and The Sea, do you have any movie recommendations for me?
{'entities': [(14, 28, 'movie'), (30, 40, 'movie'), (14, 21, 'movie'), (77, 86, 'predicate')]}
[E1010] Unable to set entity information for token 3 which is included in more than one span in entities, blocked, missing or outside.
Since I enjoy Resident Evil: Extinction, Short Skin, and Evil, do you have any movie recommendations for me?
{'entities': [(14, 39, 'movie'), (41, 51, 'movie'), (23, 27, 'movie'), (85, 94, 'predicate')]}
[E1010] Unable to set entity information for token 4 which is included in more than one span in entities, blocked, missing or outside.
Which voice actor or actor contributed to the soundtracks of both Il Mare and The Church?
{'entities': [(6, 17, 'role'), (12, 17, 'role'), (66, 73, 'movie'), (78, 88, 'movie')]}
[E1010] Unable to set entity information for token 2 which is included in more than one span in entities, blocked, missing or outside.


 89%|███████████████████████████████████████████████████████████████▉        | 128253/144379 [00:14<00:02, 7156.52it/s]

I'm in the mood for a movie night with diverse genres. Any suggestions for films that include mystery fiction and mystery?
{'entities': [(94, 109, 'genre'), (94, 101, 'genre'), (47, 52, 'predicate')]}
[E1010] Unable to set entity information for token 19 which is included in more than one span in entities, blocked, missing or outside.
Which movies successfully incorporate both historical fiction and fiction in their storytelling?
{'entities': [(43, 61, 'genre'), (54, 61, 'genre')]}
[E1010] Unable to set entity information for token 6 which is included in more than one span in entities, blocked, missing or outside.


 90%|█████████████████████████████████████████████████████████████████       | 130419/144379 [00:15<00:01, 7144.42it/s]

Can you provide a list of movies that are known for their unique blend of pseudo-documentary and documentary?
{'entities': [(74, 92, 'genre'), (81, 92, 'genre')]}
[E1010] Unable to set entity information for token 17 which is included in more than one span in entities, blocked, missing or outside.
Recommend me movies with the genres historical novel, psychological thriller, and thriller
{'entities': [(36, 52, 'genre'), (54, 76, 'genre'), (68, 76, 'genre'), (29, 34, 'predicate')]}
[E1010] Unable to set entity information for token 10 which is included in more than one span in entities, blocked, missing or outside.
Do you have recommendations for movies that showcase a mix of supernatural fiction and fiction?
{'entities': [(62, 82, 'genre'), (75, 82, 'genre'), (12, 21, 'predicate')]}
[E1010] Unable to set entity information for token 12 which is included in more than one span in entities, blocked, missing or outside.
Which director of photography or director contributed to the soundtrack

 92%|██████████████████████████████████████████████████████████████████▌     | 133364/144379 [00:15<00:01, 7213.68it/s]

I'm in the mood for a movie night with diverse genres. Any suggestions for films that include erotic thriller and erotic?
{'entities': [(94, 109, 'genre'), (94, 100, 'genre'), (47, 52, 'predicate')]}
[E1010] Unable to set entity information for token 19 which is included in more than one span in entities, blocked, missing or outside.
Are there any movies that blend the genres of television series based on a novel and novel?
{'entities': [(46, 80, 'genre'), (75, 80, 'genre'), (36, 41, 'predicate')]}
[E1010] Unable to set entity information for token 14 which is included in more than one span in entities, blocked, missing or outside.
I am looking for movies that fall under both comedy thriller and comedy. Anything you would recommend?
{'entities': [(45, 60, 'genre'), (45, 51, 'genre'), (92, 101, 'predicate')]}
[E1010] Unable to set entity information for token 9 which is included in more than one span in entities, blocked, missing or outside.


 94%|███████████████████████████████████████████████████████████████████▉    | 136270/144379 [00:16<00:01, 7036.98it/s]

Do you have recommendations for movies that masterfully balance the genres of both Japanese horror and horror?
{'entities': [(83, 98, 'genre'), (92, 98, 'genre'), (68, 73, 'predicate')]}
[E1010] Unable to set entity information for token 14 which is included in more than one span in entities, blocked, missing or outside.
I enjoy movies that combine the elements of crime-comedy and comedy. What are your top picks in this category?
{'entities': [(44, 56, 'genre'), (50, 56, 'genre'), (101, 109, 'predicate')]}
[E1010] Unable to set entity information for token 10 which is included in more than one span in entities, blocked, missing or outside.


 96%|█████████████████████████████████████████████████████████████████████   | 138389/144379 [00:16<00:00, 6935.14it/s]

Can you recommend movies that span a wide range of genres, including political satire, family drama, and drama?
{'entities': [(69, 85, 'genre'), (87, 99, 'genre'), (94, 99, 'genre'), (51, 56, 'predicate')]}
[E1010] Unable to set entity information for token 17 which is included in more than one span in entities, blocked, missing or outside.


 99%|███████████████████████████████████████████████████████████████████████▏| 142706/144379 [00:16<00:00, 7121.73it/s]

I enjoy movies that combine the elements of supernatural horror and horror. What are your top picks in this category?
{'entities': [(44, 63, 'genre'), (57, 63, 'genre'), (108, 116, 'predicate')]}
[E1010] Unable to set entity information for token 9 which is included in more than one span in entities, blocked, missing or outside.
I am looking for fantasy anime and manga and fantasy movies, anything you would recommend?
{'entities': [(17, 40, 'genre'), (17, 24, 'genre'), (80, 89, 'predicate')]}
[E1010] Unable to set entity information for token 4 which is included in more than one span in entities, blocked, missing or outside.


100%|████████████████████████████████████████████████████████████████████████| 144379/144379 [00:17<00:00, 8382.91it/s]

Recommend me movies with the genres action comedy, adventure anime and manga, and action
{'entities': [(36, 49, 'genre'), (51, 76, 'genre'), (36, 42, 'genre'), (29, 34, 'predicate')]}
[E1010] Unable to set entity information for token 6 which is included in more than one span in entities, blocked, missing or outside.
I enjoy movies that combine the elements of horror fiction and horror. What are your top picks in this category?
{'entities': [(44, 58, 'genre'), (44, 50, 'genre'), (103, 111, 'predicate')]}
[E1010] Unable to set entity information for token 8 which is included in more than one span in entities, blocked, missing or outside.
Recommend me movies with the genres science fiction anime and manga, anime, and neo-noir
{'entities': [(36, 67, 'genre'), (52, 57, 'genre'), (80, 88, 'genre'), (29, 34, 'predicate')]}
[E1010] Unable to set entity information for token 8 which is included in more than one span in entities, blocked, missing or outside.
Recommend me movies with the genres 




In [None]:
"Display the Peplum film genre movies with the lowest ratings"[22:27]

In [None]:
"I want to see the top-rated film score movies featured Vanraj Bhatia"[28:38]

In [None]:
predicates_ner.keys()

### Test

In [32]:
nlp_model = spacy.load('models/NER/loss_78')

ValueError: Unpack failed: incomplete input

In [None]:
TRAIN_DATA

In [None]:
total=0
correct=0
for sentence, entities in TRAIN_DATA:
    pred = nlp_model(sentence)
    for ent in pred.ents:
        total+=1
        if (ent.start_char, ent.end_char, ent.label_) in entities['entities']:
            correct+=1
            #print(ent.label_+" Correctly found")
        else:
            spacy.displacy.render(pred, style="ent", jupyter=True)
            print(sentence, entities)
            print("Missing {} label".format(ent.label_))
            
print(correct, total)

In [8]:
nlp1 = spacy.load('models/NER/loss_72/') #load the best model
doc = nlp1("Recommend movies like Nightmare on Elm Street, Friday the 13th, and Halloween.") # input sample text

spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter



In [54]:
doc = nlp1("Who is the director of Oppenheimer?") # input sample text

spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter


In [74]:
doc = nlp1("What are the best 5 movies of Christopher Nolan?") # input sample text

spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter


In [56]:
doc = nlp1("List the top ten horror movies") # input sample text

spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter


In [57]:
doc = nlp1("Which is the best rated horror movie?") # input sample text

spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter


In [100]:
doc = nlp1("Who is the screenwriter of The Masked Gang: Cyprus?") # input sample text

spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter


In [85]:
for ent in doc.ents:
    print(ent.label_, ent.start_char, ent.end_char)


year 30 34
