In [2]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")

In [3]:
from rdflib.namespace import Namespace, RDF, RDFS, XSD
from rdflib.term import URIRef, Literal
import csv
import json
import networkx as nx
import pandas as pd
import rdflib
from collections import defaultdict, Counter
import locale

In [14]:
graph = rdflib.Graph()
graph.parse('data/14_graph.nt', format='turtle')

<Graph identifier=Ne40908bdf5ec492db7cc759e535e42e1 (<class 'rdflib.graph.Graph'>)>

In [89]:
# prefixes used in the graph
WD = Namespace('http://www.wikidata.org/entity/')
WDT = Namespace('http://www.wikidata.org/prop/direct/')
SCHEMA = Namespace('http://schema.org/')
DDIS = Namespace('http://ddis.ch/atai/')

In [90]:
entities = set(graph.subjects()) | {s for s in graph.objects() if isinstance(s,URIRef)}
predicates = set(graph.predicates())
literals = {s for s in graph.objects() if isinstance(s, Literal)}

In [91]:
genre_list = {str(uri):str(s) for uri,s in graph.query('''
PREFIX ddis: <http://ddis.ch/atai/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX schema: <http://schema.org/>
SELECT ?genre ?lbl WHERE {
?movie wdt:P31 wd:Q11424 .
?movie wdt:P136 ?genre .
?genre rdfs:label ?lbl .
}
''')}

len(genre_list)

364

In [92]:
people_list = {str(uri):str(s) for uri,s in graph.query('''
PREFIX ddis: <http://ddis.ch/atai/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX schema: <http://schema.org/>
SELECT ?person ?lbl WHERE {
?person wdt:P31 wd:Q5 .
?person rdfs:label ?lbl .
}
''')}

len(people_list)

100157

In [93]:
movie_list = {str(uri):str(s) for uri,s in graph.query('''
PREFIX ddis: <http://ddis.ch/atai/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX schema: <http://schema.org/>
SELECT ?movie ?lbl WHERE {
?movie wdt:P31 wd:Q11424 .
?movie rdfs:label ?lbl .
}
''')}

len(movie_list)

24384

In [94]:
predicate_dict = {}
prefix_string = "PREFIX ddis: <http://ddis.ch/atai/> PREFIX wd: <http://www.wikidata.org/entity/> PREFIX wdt: <http://www.wikidata.org/prop/direct/> PREFIX schema: <http://schema.org/> "
for uri in predicates:
    try:
        p = uri.split("/")[-1]
        query_string = "SELECT ?lbl WHERE { wdt:PREDICATE rdfs:label ?lbl . }".replace("PREDICATE",p)
        res = graph.query(prefix_string + query_string)
        label, = res
        predicate_dict[str(uri)] = str(label[0])
    except:
        pass

In [102]:
predicate_dict

{'http://www.wikidata.org/prop/direct/P25': 'mother',
 'http://www.wikidata.org/prop/direct/P1056': 'product or material produced',
 'http://www.wikidata.org/prop/direct/P1552': 'has quality',
 'http://www.wikidata.org/prop/direct/P30': 'continent',
 'http://www.wikidata.org/prop/direct/P1455': 'list of works',
 'http://www.wikidata.org/prop/direct/P1657': 'MPAA film rating',
 'http://www.wikidata.org/prop/direct/P6886': 'writing language',
 'http://www.wikidata.org/prop/direct/P735': 'given name',
 'http://www.wikidata.org/prop/direct/P706': 'located on terrain feature',
 'http://www.wikidata.org/prop/direct/P1441': 'present in work',
 'http://www.wikidata.org/prop/direct/P1029': 'crew member(s)',
 'http://www.wikidata.org/prop/direct/P641': 'sport',
 'http://www.wikidata.org/prop/direct/P136': 'genre',
 'http://www.wikidata.org/prop/direct/P291': 'place of publication',
 'http://www.wikidata.org/prop/direct/P97': 'noble title',
 'http://www.wikidata.org/prop/direct/P1269': 'facet of'

In [95]:
# Defining some dictionaries for mapping wrods to predicates
roles_ner = {
    "actor":"P161",
    "cast":"P161",
    "cast member":"P161",
    "director":"P57",
    "screenwriter":"P58",
    "producer":"P162",
    "developer" : "P178",
    "film editor" : "P1040",
    "director of photography" : "P344",
    "film crew member" : "P2079",
    "choreographer" : "P1809",
    #"father" : "P22",
    #"spouse" : "P26",
    "art director" : "P3174",
    #"winner" : "P1346",
    "author" : "P50",
    "presenter" : "P371",
    "narrator" : "P2438",
    "animator" : "P6942",
    "creator" : "P170",
    #"student" : "P802",
    #"mother" : "P25",
    "participant" : "P710",
    "member of the crew of" : "P5096",
    "voice actor" : "P725",
    #"employer" : "P108",
    "publisher" : "P123",
    "musical conductor" : "P3300",
    "operator" : "P137",
    "performer" : "P175"
}

actions_ner = {
    "acted":"P161",
    "directed":"P57",
    "screenwrote":"P58",
    "wrote":"P58",
    "written":"P58",
    "produced":"P162",
    "featured":"P161",
    "featuring":"P161", 
    "recorded" : None,
    #"appeared" : None,
    "appear" : None,
    "direct" : None,
    #"produced" : None,
    "produce" : None,
    "filmed" : None,
    "edit" : None,
    #"film" : None,
    "edited" : None,
    "cast" : None,
    "shoot" : None,
    "premiere" : None,
    #"release" : None,
    "distribute" : None,
    "directing":None,
    "played": None,
    "made": None,
    "authored": None,
    "appears": None
}

predicates_ner = {
    "release" : "P577",
    "when" : "P577",
    #"release year": "P577",
    "date": "P577",
    "year": "P577",
    #"date of release": "P577",
    "genre": "P136",
    "type": "P136",
    "category": "P136",
    #"ratings": "P444",
    "rated": "P444",
    "rating": "P444",
    "review": "P444",
    "score":"P444"
}

numbers_ner = [
    "one",
    "two",
    "three",
    "four",
    "five",
    "six",
    "seven",
    "eight",
    "nine",
    "ten",
    "eleven",
    "twelve",
    "thirteen",
    "fourteen",
    "fifteen",
    "sixteen",
    "seventeen",
    "eighteen",
    "nineteen",
    "twenty"
]

nums = list(range(1, 21))
nums = list(map(str, nums))
numbers_ner.extend(nums)

years_ner = list(range(1678, 2025))
years_ner = list(map(str, years_ner))


genres_ner = {v.replace(" film",""):k for k,v in genre_list.items()}
genres_ner = {k.replace("genre","").strip():v for k,v in genres_ner.items()}
del(genres_ner['film score'])

In [96]:
numbers_ner

['one',
 'two',
 'three',
 'four',
 'five',
 'six',
 'seven',
 'eight',
 'nine',
 'ten',
 'eleven',
 'twelve',
 'thirteen',
 'fourteen',
 'fifteen',
 'sixteen',
 'seventeen',
 'eighteen',
 'nineteen',
 'twenty',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20']

In [97]:
sentence_list_1=[ #Person search
    "Give me the <role> of <movie_name>",
    "Give me the name of the <role> of <movie_name>",
    "I'd like to know the <role>'s name for <movie_name>",
    "I'm curious about the person who <directed/wrote> <movie_name>",
    "Identify the <role> of <movie_name>",
    "Tell me the <role> behind <movie_name>",
    "Tell me the <role> for <movie_name>",
    "Tell me the <role> of <movie_name>",
    "What is the <role>'s name for the movie <movie_name>?",
    "Who <role> in <movie_name>?",
    "Who <directed/wrote> <movie_name>?",
    "Who <directed/wrote> the movie <movie_name>?",
    "Who is the <role> of <movie_name>?",
    "Who is the <role> of the film <movie_name>?",
    "Who oversaw the making of <movie_name>",
    "Who was in charge of <directing> <movie_name>?",
    "Who was the <role> of <movie_name>"
]

sentence_list_2 = [ # Movie search
    "Give me a list of <genre> movies directed by <name>",
    "Give me a list of <genre> movies featuring <name>",
    "Give me a list of <genre> movies written by <name>",
    "Give me a list of movies directed by <name>",
    "Give me a list of movies featuring <name>",
    "Give me a list of movies written by <name>",
    "In which movies did <name> played a role?",
    "In which movies does <name> appear?",
    "List <genre> films written by <name>",
    "List movies which were directed by <name>",
    "List movies which were written by <name>",
    "List the films made by <name>",
    "List films authored by <name>",
    "Retrieve the films authored by <name>",
    "Retrieve the films directed by <name>",
    "Retrieve the <genre> films with screenplays authored by <name>",
    "Select all the movies of <name>",
    "Select all the movies with <name>",
    "Show me <genre> movies directed by <name>",
    "Show me <genre> movies written by <name>",
    "Show me the films directed by <name>",
    "Show me movies written by <name>",
    "Show me the films where <name> appears",
    "Show me the films with <name> in the cast",
    "Tell me the movies where <name> makes appearances",
    "What are the movies with <name> as a cast member?",
    "Which <genre> films has <name> directed?",
    "Which <genre> films has <name> written?",
    "Which films has <name> written?",
    "Which movies has <name> <acted/wrote>",
    "Which movies has <name> directed?"
]

sentence_list_3 = [ # retrieve release date
    "Could you let me know the release date for <movie_name>?",
    "Could you tell me when <movie_name> was first shown?",
    "Give me the release date for <movie_name>",
    "I'd like to know the release date of the movie <movie_name>.",
    "Please provide the release year for <movie_name>",
    "Tell me the release date of <movie_name>",
    "What's the date of release for <movie_name>?",
    "What's the release year of the movie <movie_name>?",
    "What's the launch date of <movie_name>?",
    "What's the debut date for <movie_name>?",
    "What is the release year of <movie_name>?",
    "When did <movie_name> come out?",
    "When did <movie_name> hit the theaters?",
    "When was <movie_name> released?",
    "When was <movie_name> launched?",
    "When was the movie <movie_name> released?",
    "When did <movie_name> first appear in cinemas?"
]

sentence_list_4 = [ # genre
    "Give me the genre for the movie <movie_name>",
    "I'd like to know the genre of <movie_name>",
    "I'm curious about the genre of <movie_name>",
    "In which genre does '<movie_name>' fit?",
    "Identify the movie genre for <movie_name>",
    "Tell me the genre of <movie_name>",
    "Tell me the type of film that is <movie_name>",
    "What category does <movie_name> fall into?",
    "What genre is associated with <movie_name>?",
    "What is the movie genre for <movie_name>?",
    "What is the genre of <movie_name>?",
    "Which category does <movie_name> fit into?",
    "Which genre does the movie <movie_name> belong to?",
    "Which genre does <movie_name> belong to?"
]

sentence_list_5 = [ #ratings
    "Display the movies with the <highest/lowest> ratings",
    "Give me the <top/bottom> <number> movies based on rating",
    "Give me the <top/bottom> rated movies",
    "Give me the <highest/lowest>-rated films",
    "Give me the <top/bottom> <number> movies based on rating",
    "Give me the <top/bottom> rated movies",
    "Give me the movie that has the <highest/lowest> user rating",
    "I'd like to know the movie with the <highest/lowest> user rating",
    "I'm curious about the movie with the <highest/lowest> user rating",
    "Identify the film with the <highest/lowest> user rating",
    "I want to see the <top/bottom>-rated movies",
    "List the movies with the <best/worst> ratings",
    "Provide a list of the <top/bottom>-rated movies",
    "Retrieve the movies with the <highest/lowest> ratings",
    "Show me the <highest/lowest>-rated films",
    "Show the movies with the <best/worst> ratings",
    "Tell me the film that received the <highest/lowest> user rating",
    "Tell me the movie with the <highest/lowest> user rating",
    "What are the most <highest/lowest>-rated films?",
    "What is the movie with the <highest/lowest> user rating?",
    "What movie has the <highest/lowest> user rating?",
    "Which film received the <highest/lowest> user rating?",
    "Which movie has the <highest/lowest> user rating?",
    "Which movie is rated the <highest/lowest> by users?"
]

sentence_list_6 = [
    "Show me the <highest/lowest>-rated <genre> films",
    "List the <genre> movies with the <best/worst> ratings",
    "I want to see the <top/bottom>-rated <genre> movies",
    "Display the <genre> movies with the <highest/lowest> ratings",
    "What are the most highly-rated <genre> films?",
    "Provide a list of the <top/bottom>-rated <genre> movies",
    "Give me the <highest/lowest>-rated <genre> films",
    "Show the movies with the <best/worst> <genre> ratings",
    "Retrieve the movies with the <highest/lowest> <genre> ratings",
]

sentence_list_7 = [
    "Show me the <highest/lowest>-rated <genre> films <action> <name>",
    "List the <genre> movies with the <best/worst> ratings <action> <name>",
    "I want to see the <top/bottom>-rated <genre> movies <action> <name>",
    "Display the <genre> movies with the <highest/lowest> ratings <action> <name>",
    "What are the most highly-rated <genre> films <action> <name>?",
    "Provide a list of the <top/bottom>-rated <genre> movies <action> <name>",
    "Give me the <highest/lowest>-rated <genre> films <action> <name>",
    "Show the movies with the <best/worst> <genre> ratings <action> <name>",
    "Retrieve the movies with the <highest/lowest> <genre> ratings <action> <name>",

    "List the <genre> movies <action> <name> with the <best/worst> ratings",
    "Display the <genre> movies <action> <name> with the <highest/lowest> ratings",
    "Provide a list of the <top/bottom>-rated <genre> movies",
    "Show the movies <action> <name> with the <best/worst> <genre> ratings",
    "Retrieve the movies <action> <name> with the <highest/lowest> <genre> ratings",
]

sentence_list_8 = [ # year-related
    "List the movies released in <year>",
    "Which movies were directed by <name> in <year>?",
    "What movies were released in <year>?",
    "What movies did <name> appear in in <year>?",
    "List the films featuring <name> in <year>",
    "Provide the filmography of <name> in <year>",
    "Which movies starred <name> in <year>?",
    "Which roles were played by <name> in <year>?"
]

sentence_list_9 = [ #Top/lowest N (based on release date)
    "What are the <best/worse> <number> movies <name> directed?",
    "List the <best/worse> <number> movies directed by <name>",
    "What are the <best/worse> <number> movies <name> directed?",
    "What are the <best/worse> <number> <genre> movies <name> directed?",
    "Which are the <best/worse> movies where <name> appeared?",
    "Can you name the <best/worse> <number> <genre> movies directed by <name>?",
    "Can you list the <best/worse> <number> films with <name> in the cast?",
    "What are the <best/worse> <number> movies starring <name>"
]

sentence_list_10 = [ #Yes/No
    "Is <name> in the cast of <movie_name>?",
    "Did <name> direct <movie_name>?",
    "Was the <movie> released in <year>?",
    "Is <name> part of the cast of <movie_name>?",
    "Was <movie_name> released before <year>?",
    "Is <movie_name> classified as <genre>?",
    "Did <actor_name> appear in <movie_name>?"
]

In [98]:
i = 0
peoples = list(people_list.values())
genres = list(genres_ner.keys())
res_s2 = []
for name in peoples:
    i+=1
    if(i==20000):
        break
        
    genre = genres[i%len(genres)]
    sentence = sentence_list_2[i%len(sentence_list_2)]
    
    if(i%2==0):
        AoW = "acted" 
    else: 
        AoW = "wrote"
    sentence2 = sentence.replace("<name>",name).replace("<genre>",genre)
    sentence3 = sentence2.replace("<acted/wrote>",AoW)

    ents = [(sentence3.find(name),sentence3.find(name) + len(name), "name"),
            (sentence3.find(genre),sentence3.find(genre)+len(genre), "genre"),
            (sentence3.find(AoW),sentence3.find(AoW) + len(AoW), "action"),
            (sentence3.find("directed"),sentence3.find("directed") + len("directed"), "action"),
            (sentence3.find("written"),sentence3.find("written") + len("written"), "action"),
            (sentence3.find("featuring"),sentence3.find("featuring") + len("featuring"), "action"),
            (sentence3.find("played"),sentence3.find("played") + len("played"), "action"),
            (sentence3.find("appear"),sentence3.find("appear") + len("appear"), "action"),
            (sentence3.find("made"),sentence3.find("made") + len("made"), "action"),
            (sentence3.find("authored"),sentence3.find("authored") + len("authored"), "action"),
            (sentence3.find("appears"),sentence3.find("appears") + len("appears"), "action")]

    entities = []
    for ent in ents:
        if(ent[0]!=-1):
            entities.append(ent)


    res_s2.append((sentence3,{"entities":entities}))
    



len(res_s2)

19999

In [99]:
i = 0
roles = list(roles_ner.keys())
movies = list(movie_list.values())
res_s1 = []
for movie in movies:
    i+=1
    if(i==20000):
        break
        
    role = roles[i%len(roles)]
    sentence = sentence_list_1[i%len(sentence_list_1)]
    
    if(i%2==0):
        AoW = "directed" 
    else: 
        AoW = "wrote"
    sentence2 = sentence.replace("<role>",role).replace("<movie_name>",movie)
    sentence3 = sentence2.replace("<directed/wrote>",AoW)

    ents = [(sentence3.find(role),sentence3.find(role) + len(role), "role"),
            (sentence3.find(movie),sentence3.find(movie)+len(movie), "movie"),
            (sentence3.find(AoW),sentence3.find(AoW) + len(AoW), "action")]
    
    for action in actions.keys():
        if action in sentence3 and action != AoW:
            ents.append((sentence3.find(action),sentence3.find(action) + len(action), "action"))
            break

    entities = []
    for ent in ents:
        if(ent[0]!=-1):
            entities.append(ent)


    res_s1.append((sentence3,{"entities":entities}))
    
len(res_s1)

NameError: name 'actions' is not defined

In [None]:
i = 0
movies = list(movie_list.values())
res_s3 = []
for movie in movies:
    i+=1
    if(i==10000):
        break
        
    sentence = sentence_list_3[i%len(sentence_list_3)]
    
    sentence3 = sentence.replace("<movie_name>",movie)

    ents = [(sentence3.find(movie),sentence3.find(movie)+len(movie), "movie")]
    
    for predicate in list(predicates_ner.keys()):
        if predicate in sentence3:
            ents.append((sentence3.find(predicate),sentence3.find(predicate) + len(predicate), "predicate"))

    entities = []
    for ent in ents:
        if(ent[0]!=-1):
            entities.append(ent)


    res_s3.append((sentence3,{"entities":entities}))
    
len(res_s3)

In [None]:
i = 0
movies = list(movie_list.values())
res_s4 = []
for movie in movies:
    i+=1
    if(i==10000):
        break
        
    sentence = sentence_list_4[i%len(sentence_list_4)]
    
    sentence3 = sentence.replace("<movie_name>",movie)

    ents = [(sentence3.find(movie),sentence3.find(movie)+len(movie), "movie")]
    
    for predicate in list(predicates_ner.keys()):
        if predicate in sentence3:
            ents.append((sentence3.find(predicate),sentence3.find(predicate) + len(predicate), "predicate"))

    entities = []
    for ent in ents:
        if(ent[0]!=-1):
            entities.append(ent)


    res_s4.append((sentence3,{"entities":entities}))
    
len(res_s4)

In [None]:
res_s5 = []
i = 0
action_list = list(actions_ner.keys())
genres = list(genres_ner.keys())

sentence_list_567 = sentence_list_5 + sentence_list_6 + sentence_list_7
for name in peoples:
    i+=1
    if(i%2==0):
        HoL = "highest"
        ToB = "top"
        BoW = "best"
    else:
        HoL = "lowest"
        ToB = "bottom"
        BoW = "worst"
    
    action = action_list[i%len(action_list)]
    sentence = sentence_list_567[i%len(sentence_list_567)]
    genre = genres[i%len(genres)]
    number = numbers_ner[i%len(numbers_ner)]
    is_number = False
    if "<number>" in sentence:
        is_number = True

    sentence2 = sentence.replace("<highest/lowest>",HoL).replace("<top/bottom>",ToB).replace("<best/worst>",BoW)
    sentence3 = sentence2.replace("<name>",name).replace("<action>",action).replace("<genre>",genre).replace("<number>", number)
    ents = [(sentence3.find(HoL),sentence3.find(HoL) + len(HoL), "order"),
            (sentence3.find("highly"),sentence3.find("highly") + len("highly"), "order"),
            (sentence3.find(ToB),sentence3.find(ToB)+len(ToB), "order"),
            (sentence3.find(genre),sentence3.find(genre) + len(genre), "genre"),
            (sentence3.find(BoW),sentence3.find(BoW) + len(BoW), "order"),
            #(sentence3.find(' '+number+' ')+1,sentence3.find(' '+number+' ')-1 + len(number), "numerical_entity")
           ]
    
    for action in actions_ner.keys():
        if action in sentence3:
            ents.append((sentence3.find(action),sentence3.find(action) + len(action), "action"))
            
    for predicate in list(predicates_ner.keys()):
        if predicate in sentence3:
            ents.append((sentence3.find(predicate),sentence3.find(predicate) + len(predicate), "predicate"))
    
    if is_number:
        ents.append((sentence3.find(' '+number+' ')+1,sentence3.find(' '+number+' ')+1 + len(number), "numerical_entity"))

    entities = []
    for ent in ents:
        if(ent[0]!=-1):
            entities.append(ent)

    res_s5.append((sentence3,{"entities":entities}))
    
    if(i==30000):
        break

len(res_s5)


In [None]:
res_s6 = []
i = 0
genres = list(genres_ner.keys())

sentence_list_89 = sentence_list_8 + sentence_list_9
for name in peoples:
    i+=1
    if(i%2==0):
        BoW = "best"
    else:
        BoW = "worst"
    
    sentence = sentence_list_89[i%len(sentence_list_89)]
    genre = genres[i%len(genres)]
    number = numbers_ner[i%len(numbers_ner)]
    year = years_ner[i%len(years_ner)]
    is_number=False
    if "<number>" in sentence:
        is_number = True

    sentence2 = sentence.replace("<best/worst>",BoW).replace("<name>",name)
    sentence3 = sentence2.replace("<genre>",genre).replace("<year>",year).replace("<number>", number)
    ents = [(sentence3.find(name),sentence3.find(name) + len(name), "name"),
            (sentence3.find(BoW),sentence3.find(BoW) + len(BoW), "order"),
            (sentence3.find(genre),sentence3.find(genre) + len(genre), "genre"),
            (sentence3.find(year),sentence3.find(year) + len(year), "year"),
           ]
    
    for action in actions_ner.keys():
        if action in sentence3:
            ents.append((sentence3.find(action),sentence3.find(action) + len(action), "action"))
            
    for predicate in list(predicates_ner.keys()):
        if predicate in sentence3:
            ents.append((sentence3.find(predicate),sentence3.find(predicate) + len(predicate), "predicate"))
    
    if is_number:
        ents.append((sentence3.find(' '+number+' ')+1,sentence3.find(' '+number+' ')+1 + len(number), "numerical_entity"))


    entities = []
    for ent in ents:
        if(ent[0]!=-1):
            entities.append(ent)

    res_s6.append((sentence3,{"entities":entities}))
    
    if(i==10000):
        break

len(res_s6)

In [None]:
res_s6

In [190]:
TRAIN_DATA = res_s1 + res_s2 + res_s3 + res_s4 + res_s5 + res_s6
len(train)

99996

In [192]:
import pandas as pd
import os
from tqdm import tqdm
from spacy.tokens import DocBin

nlp = spacy.load("en_core_web_sm")

db = DocBin() # create a DocBin object

count = 0
for text, annot in tqdm(TRAIN_DATA): # data in previous format
    try:
        doc = nlp.make_doc(text) # create doc object from text
        ents = []
        for start, end, label in annot["entities"]: # add character indexes
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                #print("Skipping entity")
                count+=1
            else:
                ents.append(span)
        doc.ents = ents # label the text with the ents
        db.add(doc)
    except Exception as e:
        print(text)
        print(annot)
        print(e)
        count+=1
print(count)
#os.chdir(r'XXXX\XXXXX')
db.to_disk("train.spacy") # save the docbin object

100%|██████████████████████████████████████████████████████████████████████████| 99996/99996 [00:10<00:00, 9182.80it/s]


20482


In [149]:
"Display the Peplum film genre movies with the lowest ratings"[22:27]

'genre'

In [178]:
"I want to see the top-rated film score movies featured Vanraj Bhatia"[28:38]

'film score'

In [164]:
predicates_ner.keys()

dict_keys(['release', 'when', 'date', 'year', 'genre', 'type', 'category', 'rated', 'rating', 'review', 'score'])

In [4]:
nlp_model = spacy.load('output/model-best')

In [53]:
nlp1 = spacy.load(r".\loss_76") #load the best model
doc = nlp1("What is the release date of Batman?") # input sample text

spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter



In [54]:
doc = nlp1("Who is the director of Oppenheimer?") # input sample text

spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter


In [74]:
doc = nlp1("What are the best 5 movies of Christopher Nolan?") # input sample text

spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter


In [56]:
doc = nlp1("List the top ten horror movies") # input sample text

spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter


In [57]:
doc = nlp1("Which is the best rated horror movie?") # input sample text

spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter


In [100]:
doc = nlp1("Who is the screenwriter of The Masked Gang: Cyprus?") # input sample text

spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter


In [85]:
for ent in doc.ents:
    print(ent.label_, ent.start_char, ent.end_char)


year 30 34
