In [1]:
import pyparsing
from speakeasypy import Speakeasy
import rdflib
from rdflib.namespace import split_uri
from rdflib.namespace import Namespace, RDF, RDFS, XSD
import pickle
import os.path
from chatbot.chatbot import ChatBot
from data.knowledge_graph import KnowledgeGraph
from language_processing.entity_relation_extraction import NamedEntityRecognizer, RelationExtractor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the project's KnowledgeGraph wrapper from the graph dump at
# data/14_graph.nt (path is relative to the notebook's working directory).
knowledge_graph = KnowledgeGraph("data/14_graph.nt")

Setting up knowledge graph...
Finished movie graph setup.


In [3]:
# Gather the predicate labels (the values of the URI -> label map) in sorted
# order, then show the raw map, the label count and the sorted labels.
predicates = sorted(knowledge_graph.predicate_map.values())
print(knowledge_graph.predicate_map)
print(len(predicates))
print(predicates)

{'http://www.wikidata.org/prop/direct/P18': 'image', 'http://www.wikidata.org/prop/direct/P57': 'director', 'http://www.wikidata.org/prop/direct/P161': 'cast member', 'http://www.wikidata.org/prop/direct/P462': 'color', 'http://www.wikidata.org/prop/direct/P3092': 'film crew member', 'http://www.wikidata.org/prop/direct/P361': 'part of', 'http://www.wikidata.org/prop/direct/P345': 'IMDb ID', 'http://www.wikidata.org/prop/direct/P750': 'distributed by', 'http://www.wikidata.org/prop/direct/P921': 'main subject', 'http://www.wikidata.org/prop/direct/P31': 'instance of', 'http://www.wikidata.org/prop/direct/P58': 'screenwriter', 'http://schema.org/description': 'node description', 'http://www.wikidata.org/prop/direct/P22': 'father', 'http://www.wikidata.org/prop/direct/P1981': 'FSK film rating', 'http://www.w3.org/2000/01/rdf-schema#label': 'node label', 'http://www.wikidata.org/prop/direct/P106': 'occupation', 'http://www.wikidata.org/prop/direct/P1441': 'present in work', 'http://www.wi

In [4]:
# Instantiate the project's named-entity recognizer with its default settings.
# NOTE(review): the log recorded under this cell reports that no `device` was
# passed to the pipeline, so the model runs on CPU even though an accelerator
# is available — confirm whether that is intended.
named_entity_recognizer = NamedEntityRecognizer()

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [5]:
# Build the relation extractor from the `predicates` list.
# NOTE(review): `predicates` is bound twice in this notebook — first to the
# sorted label strings from `knowledge_graph.predicate_map`, later to a list of
# rdflib URIRefs. This cell presumably expects the label strings; verify the
# execution order before re-running.
relation_extractor = RelationExtractor(predicates)

In [12]:
# Sample natural-language questions used to exercise the pipeline.
# `question` is not referenced by the cells visible in this notebook section;
# the cells that follow all operate on `question_new`.
question = "Who directed the movie Good Will Hunting?"
question_new = 'When was "The Godfather" released?'

In [13]:
# Run entity extraction on the sample question and show what was found.
# In the recorded run the result is a list of entity strings.
extracted_entities = named_entity_recognizer.extract_entities(question_new)
print("__", extracted_entities)

__ ['The Godfather']


In [16]:
# Ask the relation extractor which known predicate label the question refers to.
extracted_relations = relation_extractor.extract_relations(question_new, extracted_entities)
print(extracted_relations)

# As seen in this cell's recorded output, the result maps each candidate phrase
# from the question to a dict holding 'best_match' and 'edit_distance'.
# Looking the answer up under a fixed phrase only works for one specific
# question, so pick the candidate with the smallest edit distance instead.
if not extracted_relations:
    raise ValueError(f"No relation could be extracted from: {question_new!r}")
closest_candidate = min(
    extracted_relations.values(),
    key=lambda candidate: candidate['edit_distance'],
)
extracted_relation = closest_candidate['best_match']
print(extracted_relation)

{'released': {'best_match': 'replaces', 'edit_distance': 4}}
replaces


In [17]:
# Resolve the extracted entity strings to an entity URI in the knowledge graph.
# Note that the whole list of extracted entities is passed, not a single name.
entity_uri = knowledge_graph.match_entity(extracted_entities)
print("matched_entity:", entity_uri)

matched_entity: http://www.wikidata.org/entity/Q243556


In [18]:
# Echo the chosen relation label, then resolve it to its predicate URI.
print(extracted_relation)
relation_uri = knowledge_graph.match_predicate(extracted_relation)
print("relation_uri", relation_uri)

replaces
relation_uri http://www.wikidata.org/prop/direct/P1365


In [19]:
# Query the knowledge graph for the matched entity/predicate pair.
# NOTE(review): the meaning of the third positional argument (False) is not
# visible from this notebook — confirm against KnowledgeGraph.query_graph.
# NOTE(review): the recorded run prints [] here; the relation had been matched
# to 'replaces' rather than a release-date predicate, which likely explains the
# empty result.
print(knowledge_graph.query_graph(entity_uri, relation_uri, False))

[]


In [2]:
# Load the raw graph directly with rdflib, independent of KnowledgeGraph.
# NOTE(review): this reads "dataset/14_graph.nt" whereas the KnowledgeGraph
# cell reads "data/14_graph.nt" — confirm both paths exist or unify them.
# NOTE(review): the file has an .nt extension but is parsed with the Turtle
# parser; presumably intentional — verify before changing the format.
graph = rdflib.Graph()
graph.parse("dataset/14_graph.nt", format="turtle")

<Graph identifier=Nf7c6d06742a64776b33829c998a0244a (<class 'rdflib.graph.Graph'>)>

In [42]:
# Materialise every predicate yielded by the graph into a list.
# The list contains repeated URIs (the pickled copy printed later in this
# notebook shows the same predicate several times); it is not a distinct set.
# NOTE(review): this rebinds `predicates`, which an earlier cell uses for the
# sorted predicate *labels* passed to RelationExtractor — consider a distinct
# name to avoid hidden-state bugs on re-run.
predicates = list(graph.predicates())

In [23]:
# Cache the predicate list on disk so later sessions can skip re-parsing the graph.
with open("predicates.pkl", mode="wb") as pickle_file:
    pickle.dump(predicates, pickle_file)

In [25]:
# Read the cached predicate list back from disk and display it.
# Caution: unpickling executes arbitrary code, so only load a predicates.pkl
# that was produced by this notebook.
with open("predicates.pkl", mode="rb") as pickle_file:
    loaded_predicates = pickle.load(pickle_file)

print(loaded_predicates)

[rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P18'), rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P161'), rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P161'), rdflib.term.URIRef('http://schema.org/description'), rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P161'), rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P3216'), rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P1441'), rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P50'), rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P175'), rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#label'), rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P1346'), rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P5021'), rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P161'), rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P58'), rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P136'), rdflib.term.URIRef('http://www.wikidata.org/pro

In [45]:
# Look up an English rdfs:label for each of the first 300 predicate URIs.
#
# `result` stays positionally aligned with `predicates[:300]`: entry i is a
# list holding at most one label string, or an empty list when the graph has
# no English label for predicate i.
#
# The predicate is referenced by its full IRI. Rewriting it as
# `wdt:<local name>` is only valid for predicates in the Wikidata "direct"
# namespace; for anything else (e.g. http://schema.org/description or
# rdfs:label) it silently queries a non-existent subject and yields [].
label_query_template = """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT ?propertyLabel WHERE {{
      <{uri}> rdfs:label ?propertyLabel .
      FILTER(LANG(?propertyLabel) = "en")
    }}
    LIMIT 1
    """

# `predicates` contains many repeated URIs, so remember each answer and run
# the SPARQL query only once per distinct predicate.
labels_by_uri = {}
result = []
for uri in predicates[:300]:
    if uri not in labels_by_uri:
        rows = graph.query(label_query_template.format(uri=uri))
        # Each row is a 1-tuple (?propertyLabel,); unpack and stringify it.
        labels_by_uri[uri] = [str(label) for label, in rows]
    # Append a copy so every entry of `result` is an independent list.
    result.append(list(labels_by_uri[uri]))
print(result)

[['image'], ['cast member'], ['cast member'], [], ['cast member'], ['ClassInd rating'], ['present in work'], ['author'], ['performer'], [], ['winner'], ['assessment'], ['cast member'], ['screenwriter'], ['genre'], ['genre'], ['spouse'], ['genre'], ['instance of'], [], ['genre'], ['country of citizenship'], ['director'], ['from narrative universe'], ['cast member'], ['IMDb ID'], ['IMDb ID'], ['country of origin'], ['occupation'], ['IMDb ID'], [], ['occupation'], ['publication date'], ['director'], ['spouse'], ['country of citizenship'], ['voice actor'], ['field of work'], [], ['main subject'], ['country of origin'], ['occupation'], [], [], [], ['country'], ['production company'], ['instance of'], ['cast member'], ['instance of'], ['occupation'], ['cast member'], ['IMDb ID'], ['cast member'], ['occupation'], ['located in the administrative territorial entity'], ['screenwriter'], ['languages spoken, written or signed'], ['occupation'], ['IMDb ID'], ['occupation'], ['cast member'], [], ['s