# Rule-Based Matching for RE

### Different names for triples
---


* SPO = [Subject Predicate Object](https://en.wikipedia.org/wiki/Semantic_triple)
* ERE = [Entity Relation Entity](https://en.wikipedia.org/wiki/Entity%E2%80%93relationship_model)
* OAV = [Object Attribute Value](https://en.wikipedia.org/wiki/Entity%E2%80%93attribute%E2%80%93value_model)

## #1. Setup development environment

### Update & import Python modules

In [1]:
# install and download required modules
!pip install --upgrade spacy
!python -m spacy download en_core_web_lg
!pip install neo4j

# spaCy
import spacy
from spacy.matcher import Matcher
from spacy.matcher import DependencyMatcher
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
from spacy import displacy

# Google Drive
from google.colab import drive

# Firebase/Firestore
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore

# Neo4j
from neo4j import GraphDatabase
from neo4j.exceptions import ServiceUnavailable

# general Python modules
import json
import logging
import re
import requests
from pprint import pprint

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
2023-03-31 09:47:06.520652: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-31 09:47:10.339885: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-lg==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Installing collec



In [2]:
import requests
from bs4 import BeautifulSoup
import spacy
import json
import re

In [3]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

### Get access to Google Drive and Neo4j

In [4]:
###########################################
# Google Drive
###########################################

# (re)mount drive, forcing it if needed
drive.mount("/content/drive/", force_remount = True)
print("Stablished access to Google Drive")

# initialize G Drive path
DRIVE_PATH = "/content/drive/MyDrive"

###########################################
# Firebase
###########################################

# open Firebase credentials
with open(DRIVE_PATH + "/IE/ie_course_2022_team03/credentials/firebase_credentials.json") as f:
  credential = json.load(f)
credential = credentials.Certificate(credential)

# # create Firestore database instance
# firebase_admin.initialize_app(credential)
# db = firestore.client()
# print("Stablished access to Firestore")



Mounted at /content/drive/
Stablished access to Google Drive


In [5]:
###########################################
# Neo4j
###########################################

# open Neo4j credentials
with open(DRIVE_PATH + "/IE/ie_course_2022_team03/credentials/neo4j_credentials.json") as f:
  neo4j_credential = json.load(f)

# initialize connection to Neo4j database
neo4j_db = GraphDatabase.driver(
    neo4j_credential["NEO4J_URI"], auth=(neo4j_credential["NEO4J_USERNAME"], neo4j_credential["NEO4J_PASSWORD"]))
print("Successful initialized Neo4j database")

Successful initialized Neo4j database


## #2. Extract SPO triples from text

### Define main data structures

#### Relations and Entities

In [6]:
with open(f'{DRIVE_PATH}/IE/ie_course_2022_team03/assets/entities_edited.json', encoding='utf-8') as f:
   entities_dicts = json.load(f)

In [7]:
extra_entities = [
    {
        "qid": "Q123",
        "name": "September",
        "aliases": [
    "Sept",
    "Sep",
    "Sep.",
    "9. month",
    "Sept."
        ],
        "description": "ninth month in the Julian and Gregorian calendars",
        "label": "MONTH"
    },
    {
        "qid": "Q126",
        "name": "December",
        "aliases": [
    "Dec",
    "12. month"
        ],
        "description": "twelfth month in the Julian and Gregorian calendars",
        "label": "MONTH"
    }
]

In [8]:
entities_dicts.extend(extra_entities)

In [9]:
entity_list = []
for entity in entities_dicts:
  try:
    id = entity['qid']
    root_url = "https://www.wikidata.org/wiki/"+id
    response= requests.get(root_url)
    d={}
    soup = BeautifulSoup(response.content,"html.parser")
    item = soup.find("div",{"id": "content"})
    qid = item.find("span",{"class":"wikibase-title-id"})
    d["qid"] = qid.get_text().replace("(","").replace(")","")
    d["name"] = item.find("span",{"class":"wikibase-title-label"}).get_text()
    if(item.find("div",{"class":"wikibase-aliasesview "}) == None):
      d["aliases"] = ""
    else:
      d["aliases"]= item.find("div",{"class":"wikibase-aliasesview "}).get_text(separator="\n",strip=True).split("\n")

    json_object = json.dumps(d, indent = 4) 
    entity_list.append(d)
    print("Data stored with Qid ", id)
  except Exception as e:
    print(e)

name_list = list(i['name'] for i in entity_list)
alias_list = list(i['aliases'] for i in entity_list)
alias_list = [item for sublist in alias_list for item in sublist]
gazetteers_list=name_list+alias_list

Data stored with Qid  Q740437
Data stored with Qid  Q812285
Data stored with Qid  Q61509
Data stored with Qid  Q48282
Data stored with Qid  Q44
Data stored with Qid  Q1221156
Data stored with Qid  Q160151
Data stored with Qid  Q80973
Data stored with Qid  Q3957
Data stored with Qid  Q2221906
Data stored with Qid  Q4173974
Data stored with Qid  Q17516
Data stored with Qid  Q707669
Data stored with Qid  Q483501
Data stored with Qid  Q1553259
Data stored with Qid  Q177
Data stored with Qid  Q56139
Data stored with Qid  Q1107656
Data stored with Qid  Q2312129
Data stored with Qid  Q704124
Data stored with Qid  Q980
Data stored with Qid  Q570116
Data stored with Qid  Q131621
Data stored with Qid  Q7937
Data stored with Qid  Q152526
Data stored with Qid  Q42973
Data stored with Qid  Q860861
Data stored with Qid  Q251712
Data stored with Qid  Q33506
Data stored with Qid  Q29540
Data stored with Qid  Q1778821
Data stored with Qid  Q4989906
Data stored with Qid  Q282
Data stored with Qid  Q2076

In [10]:
entities_dictionary = gazetteers_list

In [11]:
# nlp_extract = spacy.load('en_core_web_lg')
# nlp_extract.add_pipe("sentencizer")
relations_dictionary=[]

In [12]:
list_files = !ls /content/drive/MyDrive/IE/ie_course_2022_team03/retrieved_data/scrapped_data

In [13]:
list_files_twitter = !ls /content/drive/MyDrive/IE/ie_course_2022_team03/retrieved_data/twitter_data

In [14]:
sentences = ["The Bavarian State Painting Collections, a group of art museums located in Munich, Germany, contain some of the most significant collections of European paintings in the world.", 
             "The Bavarian Forest National Park, located in the southeastern region of Bavaria, Germany, is the oldest national park in the country and is home to a diverse range of flora and fauna.", 
             "The House of Wittelsbach, a dynasty of German nobility, ruled over Bavaria for nearly 800 years and was known for their patronage of the arts and sciences.", 
             "The Feldherrnhalle, a memorial in Munich, Germany, commemorates the Bavarian Army and the soldiers who fought in the Franco-Prussian War.", 
             "The Berliner, a doughnut-like pastry filled with jam or jelly, originated in Berlin, Germany and is a popular snack throughout the country.", 
             "The Bavarian cuisine is known for its hearty and filling dishes, such as roast pork with dumplings and sauerkraut, and is heavily influenced by neighboring Austria and Switzerland.", 
             "The Christmas market in Munich, Germany is one of the oldest and most famous in the world, attracting millions of visitors each year.", "The public transport system in Berlin, Germany is known for its efficiency and affordability, with trains, buses, and trams connecting all parts of the city.", 
             "The Oktoberfest, a 16-day festival held annually in Munich, Germany, is the largest beer festival in the world and attracts millions of visitors from around the globe.", 
             "The Bavarian Forest National Park is home to the largest population of lynx in Germany, with around 100 individuals living in the park.", 
             "The Bavarian State Opera, located in Munich, Germany, is one of the most prestigious opera houses in the world and has been home to many famous composers and performers.", 
             "The Berlin Wall, a barrier that divided the city of Berlin from 1961 to 1989, was a symbol of the Cold War and is now a popular tourist attraction.", 
             "The Munich Residenz, a former royal palace located in Munich, Germany, is one of the largest and most luxurious residences in Europe and is open to visitors year-round.", 
             "The Bavarian Alps, a mountain range located in southern Germany, offer some of the most stunning scenery in the country and are popular for hiking and skiing.", 
             "The Nymphenburg Palace, a Baroque palace located in Munich, Germany, was the summer residence of the rulers of Bavaria and is now open to the public as a museum.",
             "The Berlin Zoo, located in the heart of the city, is one of the oldest and most famous zoos in the world, with over 20,000 animals representing more than 1,500 species.", 
             "The Bavarian State Library, located in Munich, Germany, is one of the most important libraries in Europe and contains over 10 million books, manuscripts, and other documents.", 
             "The Brandenburg Gate, a neoclassical monument located in Berlin, Germany, is one of the most iconic landmarks in the city and is a symbol of the country's reunification.", 
             "The Bavarian State Archaeological Collection, located in Munich, Germany, contains some of the most important archaeological artifacts from the region, including prehistoric artifacts and ancient Roman artifacts.", 
             "The Berlin Philharmonic Orchestra, one of the world's most renowned orchestras, is based in Berlin, Germany and is known for its innovative and experimental performances.", 
             "The Munich Oktoberfest originally began as a celebration of the marriage between Crown Prince Ludwig and Princess Therese of Saxony-Hildburghausen in"
             ]

In [None]:
for file in list_files[200:400]:
  with open(DRIVE_PATH + f"/IE/ie_course_2022_team03/retrieved_data/scrapped_data/{file}", encoding='utf-8') as f:
    data_list = json.load(f)
    file_data = data_list['text']
    # print(file_data)
    for sentence in file_data:
      sentence = sentence.replace("\n", "") # remove new line chars
      sentence = sentence.lstrip() # remove leading blank spaces
      sentence = sentence.rstrip() # remove trailing blank space
      if sentence == "": # ignore empty paragraphs
        continue
      sentences.append(sentence)

In [40]:
for file in list_files_twitter[:20]:
  with open(DRIVE_PATH + f"/IE/ie_course_2022_team03/retrieved_data/twitter_data/{file}", encoding='utf-8') as f:
    data_list = json.load(f)
    for data in data_list:
      file_data = data['text'].split(',')
      # print(file_data)
      for sentence in file_data:
        sentence = sentence.replace("\n", "") # remove new line chars
        sentence = sentence.lstrip() # remove leading blank spaces
        sentence = sentence.rstrip() # remove trailing blank space
        if sentence == "": # ignore empty paragraphs
          continue
        sentences.append(sentence)

In [15]:
relations_dicts = [
  {
    "pid": "P276", # https://www.wikidata.org/wiki/Property:P276
    "label": "location",
    "description": "location of the object, structure or event.",
    "aliases": [
    "moveable object location",
    "located in",
    "event location",
    "venue",
    "is in",
    "location of item",
    "place held",
    "based in",
    "neighborhood",
    "region",
    "in",
    "located",
    "locality",
    "locale",
    "from",
    "place",
    "neighbourhood",
    "suburb",
    ]
  },
  {
    "pid": "P3872", # https://www.wikidata.org/wiki/Property:P3872
    "label": "patronage",
    "description": "number of passengers, patrons or visitors in specified time period.",
    "aliases": [
    "ridership",
    "visitors",
    "patrons",
    "number of visitors",
    "users",
    "passengers",
    "riders",
    "customers",
    ]
  },
  {
    "pid": "P1412", # https://www.wikidata.org/wiki/Property:P1412
    "label": "languages spoken, written or signed",
    "description": "language(s) that a person or a people speaks, writes or signs, including the native language(s)",
    "aliases": [
    "language spoken",
    "languages of expression",
    "languages signed",
    "language signed",
    "language written",
    "language read",
    "language used",
    "language",
    "speaks language",
    "writes language",
    "signs language",
    "uses language",
    "wrote language",
    "spoke language",
    "used language",
    "signed language",
    "languages spoken, written, or signed",
    "language(s) spoken, written or signed",
    "languages spoken",
    "language of expression",

    ]
  },
  {
    "pid": "P112", # https://www.wikidata.org/wiki/Property:P112
    "label": "founded by",
    "description": "founder or co-founder of this organization, religion or place",
    "aliases": [
    "co-founder",
    "founders",
    "established by",
    "co-founded by",
    "founder",
    "started by",
    "cofounder",
    "cofounded by",
    ]
  },
  {
    "pid": "P1619", # https://www.wikidata.org/wiki/Property:P1619
    "label": "date of official opening",
    "description": "date or point in time an event, museum, theater etc. officially opened",
    "aliases": [
    "opening date", 
    "opened", 
    "date opened", 
    "officially opened on", 
    "inaugurated", 
    "launch date", 
    "official opening", 
    "grand opening", 
    "date of opening", 
    "launched", 
    "inauguration", 
    "inauguration date", 
    "official opening time",
    ]
  },
  {
    "pid": "P1376", # https://www.wikidata.org/wiki/Property:P1376
    "label": "capital of",
    "description": "country, state, department, canton or other administrative division of which the municipality is the governmental seat",
    "aliases": [
    "county seat of",
    "county seat for",
    "administrative seat of",
    "seat of",
    "parish seat of",
    "is capital of"
    ]
  },
]

In [16]:
relations_dictionary = [
    "location",
    "moveable object location",
    "located in",
    "event location",
    "venue",
    "is in",
    "location of item",
    "place held",
    "based in",
    "neighborhood",
    "region",
    "in",
    "located",
    "locality",
    "locale",
    "from",
    "place",
    "neighbourhood",
    "suburb",
    "ridership",
    "visitors",
    "patrons",
    "number of visitors",
    "users",
    "passengers",
    "riders",
    "customers",
    "language spoken",
    "languages of expression",
    "languages signed",
    "language signed",
    "language written",
    "language read",
    "language used",
    "language",
    "speaks language",
    "writes language",
    "signs language",
    "uses language",
    "wrote language",
    "spoke language",
    "used language",
    "signed language",
    "languages spoken, written, or signed",
    "language(s) spoken, written or signed",
    "languages spoken",
    "language of expression",
    "co-founder",
    "founders",
    "established by",
    "co-founded by",
    "founder",
    "started by",
    "cofounder",
    "cofounded by",
    "opening date", 
    "opened", 
    "date opened", 
    "officially opened on", 
    "inaugurated", 
    "launch date", 
    "official opening", 
    "grand opening", 
    "date of opening", 
    "launched", 
    "inauguration", 
    "inauguration date", 
    "official opening time",
    "county seat of",
    "county seat for",
    "administrative seat of",
    "seat of",
    "parish seat of",
    "is capital of",
]

### Create the NLP pipeline and matchers

In [17]:
nlp = spacy.load("en_core_web_lg")

# create dependency and phrase matchers
dep_matcher = DependencyMatcher(nlp.vocab, validate=True)
rel_matcher = PhraseMatcher(nlp.vocab, None)
ent_matcher = PhraseMatcher(nlp.vocab, None)

# create pattern of phrase matcher for relations using gazetteers
rel_patterns = [nlp.make_doc(gazetteer) for gazetteer in relations_dictionary]
rel_matcher.add("relations", rel_patterns)

# create pattern of phrase matcher for entities using gazetteers
ent_patterns = [nlp.make_doc(gazetteer) for gazetteer in entities_dictionary]
ent_matcher.add("entities", ent_patterns)

### Extract SPO triples

In [18]:
# SPO triple function
def extract_triples_from_sentence(sentence):

  # initial process of the sentence with the NLP pipeline
  doc = nlp(sentence)
  # display dependency visualization
  displacy.render(doc, style="dep", jupyter=True, options={"distance": 90})

  #################################################
  # 1. use phrase matcher to identify occurrences of predicates and entities
  #################################################

  # matches for relation (1 or many extractions)
  rel_matches = rel_matcher(doc)
  # get relation spans from relation matches
  rel_spans = [doc[start:end] for _, start, end in rel_matches]
  # filter overlaping spans to ensure uniqueness of relations
  relation_spans = spacy.util.filter_spans(rel_spans)
  # create list of extracted relations in string type
  relations = [r.text for r in relation_spans] 

  # matches for entitties (1 or many extractions)
  ent_matches = ent_matcher(doc)
  # get entity spans from entity matches
  ent_spans = [doc[start:end] for _, start, end in ent_matches]
  # filter overlapping spans to ensure uniqueness of entities
  entity_spans = spacy.util.filter_spans(ent_spans)
  # create list of extracted entities in string type (*)
  entities = [e.text for e in entity_spans]

  print(f"• Extracted {len(relations)} relation(s): {relations}")
  print(f"• Extracted {len(entities)} entity(ies): {entities}")
  print()

  #################################################
  # 2. use relations (predicates) occurrence in sentence to identify entities
  #################################################

  # container of extracted triples. even though it may be unlikely, 
  # a single sentence may contain more than 1 triple
  triples = []

  #################################################
  # NOTE: we start by identifying the position of the relation in the sentence.
  # Notice that a relation is a span object (consisting of more than 1 token, 
  # more than 1 word), so the first step is identifying the head of the 
  # relation and then the subject and object entities that constitute 
  # the triple.
  #################################################

  for rel_span in relation_spans:
    # divide span in tokens
    for token in rel_span:
      # convert ancestors to string, so we can find the head of the relation span
      ancestors = [a.text for a in list(token.ancestors)]
      # find head of the relation span
      if token.text in ancestors:
        continue
      else:
        relation_head = token.text

    # container of extracted single SPO triple
    triple= []

    #################################################
    # NOTE: the DEP matcher pattern for SPO triples, starts locating the 
    # (head) of the relation by string match ("TEXT") and then it moves to the 
    # subject and object using dependency labels ("DEP"). This pattern will 
    # return 1 or more matches of triples extracted from a single sentence
    # since a (long) sentence may contain more than one reference to the same
    # relation with the same or different entities. 
    # SPO triple = Subject-Predicate-Object triple
    #################################################
    pattern = [
      {
        "RIGHT_ID": "predicate", 
        "RIGHT_ATTRS": {"TEXT": relation_head}, # 1. relation (predicate)
      },
      {
        "LEFT_ID": "predicate",
        "REL_OP": ";*", 
        "RIGHT_ID": "subject",
        "RIGHT_ATTRS": {"DEP": {"IN":["nsubj", "csubj"]}}, # 2. subject
      },
      {
        "LEFT_ID": "predicate",
        "REL_OP": ">>", 
        "RIGHT_ID": "object",
        "RIGHT_ATTRS": {"DEP": {"IN":["dobj", "pobj", "poss"]}}, # 3. object
      }
    ]

    # add patern to extract SPO triples using the DEP matcher
    dep_matcher.add("semantic_triple", [pattern])
    # extract matches of SPO triples
    pso_matches = dep_matcher(doc)

    # pass the entities (strings) through the pipeline to tokenize them
    entities_docs = [nlp(e) for e in entities]

    #################################################
    # 3. identify object and subject 
    #################################################

    #################################################
    # NOTE: in this sample of a returned match (within "matches"),
    # (4699203773119030710, [12, 5, 15]), the second element in the 
    # tuple (the list) corresponds to [P, S, O] found by the pattern.
    # Pay attention to it because we use the position of S and O in this
    # function. In the pattern we used only the head of the relation
    # so the returned match contains only a 1-word match, therefore we use 
    # it now to find the corresponding entity in "entities", now converted 
    # to a doc object in "entities_docs". Use use the Word2vec algorithm to
    # find similarity
    #################################################
    def reduce_entities(idx):
      # comparison of all entities found by the phrase matcher with the 
      # returned candidates matched entities, only one will remain
      highest_similarity = 0
      ent = None
      for pso_match in pso_matches:
        candidate = doc[pso_match[1][idx]]
        doc_candidate = nlp(candidate.text)

        for e in entities_docs:
          similarity = doc_candidate.similarity(e)
          if highest_similarity < similarity:
            highest_similarity = similarity
            ent = e.text
      return ent
    
    # get the full name of the subject from the "entities" list (*)
    subj = reduce_entities(idx=1)
    # add subject to triple (S--)

    # add relation (predicate) to triple (-P-)
    # NOTE: we add the relation (predicate) between S and O
    rel = rel_span.text

    # get the full name of the object from the "entities" list (*)
    obj = reduce_entities(idx=2)

    # make sure the triple is well-formed to add Wikidata IDS
    # otherwise it doesn't update the container "triples" and returns empty "[]"
    if subj and rel and obj:
      def look_up_wikidata_id(component_type, label):
        wikidata_id = None
        if component_type == "entity":
          dict_to_look_up = entities_dicts
          id_type = "qid" # Q is for entities in Wikidata information model
        elif component_type == "relation":
          dict_to_look_up = relations_dicts
          id_type = "pid" # P is for proerties (relations) in Wikidata information model
        for d in dict_to_look_up:
          if d.get("aliases"): # if relation/entity has aliasses attrib
            if label == d["label"] or label in d["aliases"]:
              wikidata_id = d[id_type]
          else:
            if label == d["label"]:
              wikidata_id = d[id_type]
        return wikidata_id
          
      triple = [
          (look_up_wikidata_id("entity", subj), subj),
          (look_up_wikidata_id("relation", rel), rel),
          (look_up_wikidata_id("entity", obj), obj),
      ]
      triples.append(triple)
    
    # NOTE: pending task, ensure uniqueness of triples coming from the same 
    # sentence, even though unlikely, it may happen

  return triples

In [19]:
import random
random.shuffle(sentences)

In [None]:
# extract SPO triples from sentences
for sentence in sentences:
  triples_in_sentence = extract_triples_from_sentence(sentence)
  if len(triples_in_sentence):
    # print("Extracted SPO triples:")
    for t in triples_in_sentence:
      print(f"• {t}")
      print()
  else:
    print("Not extracted SPO triples!")
    print()

  # NOTE: pending task, ensure uniqueness of triples coming 
  # from the same text

• Extracted 3 relation(s): ['located in', 'region', 'in']
• Extracted 3 entity(ies): ['Bavarian Forest National Park', 'Bavaria', 'Germany']

• [(None, 'Bavarian Forest National Park'), ('P276', 'located in'), (None, 'Bavaria')]

• [(None, 'Bavarian Forest National Park'), ('P276', 'region'), (None, 'Bavaria')]

• [(None, 'Bavarian Forest National Park'), ('P276', 'in'), (None, 'Bavaria')]



• Extracted 1 relation(s): ['located in']
• Extracted 4 entity(ies): ['Munich', 'Germany', 'Bavaria', 'museum']

• [(None, 'museum'), ('P276', 'located in'), (None, 'Munich')]



• Extracted 3 relation(s): ['in', 'in', 'visitors']
• Extracted 3 entity(ies): ['Christmas market', 'Munich', 'Germany']

• [(None, 'Germany'), ('P276', 'in'), (None, 'Munich')]

• [(None, 'Germany'), ('P276', 'in'), (None, 'Munich')]

• [(None, 'Germany'), ('P3872', 'visitors'), (None, 'Munich')]



• Extracted 2 relation(s): ['in', 'in']
• Extracted 2 entity(ies): ['Bavarian Forest National Park', 'Germany']

• [(None, 'Bavarian Forest National Park'), ('P276', 'in'), (None, 'Germany')]

• [(None, 'Bavarian Forest National Park'), ('P276', 'in'), (None, 'Germany')]



• Extracted 3 relation(s): ['located in', 'from', 'region']
• Extracted 3 entity(ies): ['Bavarian', 'Munich', 'Germany']

• [(None, 'Bavarian'), ('P276', 'located in'), (None, 'Munich')]

• [(None, 'Bavarian'), ('P276', 'from'), (None, 'Munich')]

• [(None, 'Bavarian'), ('P276', 'region'), (None, 'Munich')]



• Extracted 3 relation(s): ['located in', 'in', 'visitors']
• Extracted 3 entity(ies): ['Munich', 'Munich', 'Germany']

• [(None, 'Munich'), ('P276', 'located in'), (None, 'Munich')]

• [(None, 'Munich'), ('P276', 'in'), (None, 'Munich')]

• [(None, 'Munich'), ('P3872', 'visitors'), (None, 'Munich')]



• Extracted 1 relation(s): ['based in']
• Extracted 3 entity(ies): ['Berlin', 'Berlin', 'Germany']

Not extracted SPO triples!



• Extracted 1 relation(s): ['in']
• Extracted 2 entity(ies): ['Munich', 'Oktoberfest']

Not extracted SPO triples!



• Extracted 1 relation(s): ['from']
• Extracted 4 entity(ies): ['Berlin', 'city', 'Berlin', 'tourist attraction']

• [(None, 'tourist attraction'), ('P276', 'from'), (None, 'Berlin')]



• Extracted 0 relation(s): []
• Extracted 1 entity(ies): ['Bavarian cuisine']

Not extracted SPO triples!



• Extracted 2 relation(s): ['located in', 'in']
• Extracted 3 entity(ies): ['Bavarian', 'Munich', 'Germany']

• [(None, 'Munich'), ('P276', 'located in'), (None, 'Munich')]

• [(None, 'Munich'), ('P276', 'in'), (None, 'Munich')]



• Extracted 2 relation(s): ['located in', 'in']
• Extracted 2 entity(ies): ['Bavarian', 'Germany']

• [(None, 'Bavarian'), ('P276', 'located in'), (None, 'Germany')]

• [(None, 'Bavarian'), ('P276', 'in'), (None, 'Germany')]



• Extracted 1 relation(s): ['in']
• Extracted 3 entity(ies): ['Berliner', 'Berlin', 'Germany']

• [(None, 'Berliner'), ('P276', 'in'), (None, 'Berlin')]



• Extracted 0 relation(s): []
• Extracted 3 entity(ies): ['House of Wittelsbach', 'German', 'Bavaria']

Not extracted SPO triples!



• Extracted 2 relation(s): ['located in', 'in']
• Extracted 4 entity(ies): ['monument', 'Berlin', 'Germany', 'city']

• [(None, 'monument'), ('P276', 'located in'), (None, 'Berlin')]

• [(None, 'monument'), ('P276', 'in'), (None, 'Berlin')]



• Extracted 2 relation(s): ['located in', 'in']
• Extracted 2 entity(ies): ['Berlin', 'city']

• [(None, 'city'), ('P276', 'located in'), (None, 'city')]

• [(None, 'city'), ('P276', 'in'), (None, 'city')]



• Extracted 4 relation(s): ['in', 'in', 'visitors', 'from']
• Extracted 6 entity(ies): ['Oktoberfest', 'festival', 'Munich', 'Germany', 'beer', 'festival']

• [(None, 'Oktoberfest'), ('P276', 'in'), (None, 'Munich')]

• [(None, 'Oktoberfest'), ('P276', 'in'), (None, 'Munich')]

• [(None, 'Oktoberfest'), ('P3872', 'visitors'), (None, 'Munich')]

• [(None, 'Oktoberfest'), ('P276', 'from'), (None, 'Munich')]



• Extracted 2 relation(s): ['in', 'in']
• Extracted 4 entity(ies): ['Feldherrnhalle', 'Munich', 'Germany', 'Bavarian']

• [(None, 'Feldherrnhalle'), ('P276', 'in'), (None, 'Munich')]

• [(None, 'Feldherrnhalle'), ('P276', 'in'), (None, 'Munich')]



• Extracted 1 relation(s): ['in']
• Extracted 4 entity(ies): ['public transport', 'Berlin', 'Germany', 'city']

Not extracted SPO triples!



• Extracted 2 relation(s): ['located in', 'in']
• Extracted 3 entity(ies): ['Bavarian State Painting Collections', 'Munich', 'Germany']

• [(None, 'Bavarian State Painting Collections'), ('P276', 'located in'), (None, 'Munich')]

• [(None, 'Bavarian State Painting Collections'), ('P276', 'in'), (None, 'Munich')]



• Extracted 2 relation(s): ['located in', 'in']
• Extracted 3 entity(ies): ['Bavarian', 'Munich', 'Germany']

• [(None, 'Bavarian'), ('P276', 'located in'), (None, 'Munich')]

• [(None, 'Bavarian'), ('P276', 'in'), (None, 'Munich')]



In [None]:
triples = [
    [
      ('Q102400', 'Oktoberfest'), 
      ('P276', 'in'), 
      ('Q1726', 'Munich')
    ],
    [
      ('Q251712', 'Feldherrnhalle'), 
      ('P276', 'in'), 
      ('Q1726', 'Munich')
    ],
    [
      ('Q1726', 'Munich'), 
      ('P1376', 'capital of'), 
      ('Q980', 'Bavaria ')
    ],
    [
      ('Q131621', 'House of Wittelsbach'), 
      ('P276', 'in'), 
      ('Q980', 'Bavaria ')
    ],
     [
      ('Q980', 'Bavaria'), 
      ('P276', 'in'), 
      ('Q183', 'Germany')
    ],
    [
      ('Q1726', 'Munich '), 
      ('P276', 'in'), 
      ('Q183', 'Germany')
    ],
    [
      ('Q123', 'September'),
      ('P3872', 'patronage'), 
      ('Q1726', 'Munich')
    ],
    [
      ('Q64', 'Berlin'), 
      ('P1376', 'capital of'), 
      ('Q183', 'Germany')
    ],
    [
      ('Q57607', 'Christmas market'),
      ('P276', 'in'), 
      ('Q980', 'Bavaria')
    ],
    [
      ('Q704124', 'Bavarian Forest National Park'),
      ('P276', 'in'), 
      ('Q1726', 'Munich')
    ],
    [
      ('Q126', 'December'),
      ('P3872', 'patronage'), 
      ('Q64', 'Berlin')
    ],
    [
      ('Q57607', 'Christmas market'),
      ('P1619', 'date of official opening'), 
      ('Q126', 'December')
    ],
    [
      ('Q812285', 'Bavarian State Painting Collections'),
      ('P276', 'in'), 
      ('Q1726', 'Munich')
    ],
     [
      ('Q102400', 'Oktoberfest'),
      ('P1619', 'date of official opening'), 
      ('Q123', 'September')
    ],
    [
      ('Q102400', 'Oktoberfest'),
      ('P276', 'in'), 
      ('Q64', 'Berlin')
    ]
]

In [None]:
def get_wikidata_info(wikidata_id):
  wikidata_info = None
  if wikidata_id[0] == "Q": # is an entity
    dict_to_look_up = entities_dicts
    id_attrb_name = "qid"
    uri_root = "https://www.wikidata.org/wiki/"
  elif wikidata_id[0] == "P": # is a relation (or property)
    dict_to_look_up = relations_dicts
    id_attrb_name = "pid"
    uri_root = "https://www.wikidata.org/wiki/Property:"
  for d in dict_to_look_up:
    if d[id_attrb_name] == wikidata_id:
      wikidata_info = d
  #     uri = uri_root + wikidata_id
  #     namespace_url = f"https://www.wikidata.org/wiki/Special:EntityData/{wikidata_id}.json"

  return wikidata_info, namespace_url

In [None]:
triple_collection = []
for triple in triples:
  triple_set = []
  print("#################### Beginning of triple ##########################")
  for t in triple:
    wikidata_id = t[0]
    wikidata_info, namespace_url = get_wikidata_info(wikidata_id)
    triple_set.append(wikidata_info)
    print("---")
    pprint(wikidata_info)
    print(uri)
    print(namespace_url)
  triple_collection.append(triple_set)
  print("####################### End of triple #############################")
  print()

#################### Beginning of triple ##########################


NameError: ignored

In [None]:
# two slots of triples
triples = [
  [
    {
      "qid": "Q102400",
      "label": "Oktoberfest",
      "description": "world's largest Volks festival in Munich, Bavaria, Germany",
      "ner_label": "EVENT"
    },
    {
      "pid": "P276",
      "label": "location",
      "description": "location of the object, structure or event."
    },
    {
      "qid": "Q1726",
      "label": "Munich",
      "description": "capital and most populous city of Bavaria, Germany",
      "ner_label": "LOC"
    }
  ],
  [
    {
      "qid": "Q704124",
      "label": "Bavarian Forest National Park",
      "description": "national park in Bavaria, Germany",
      "ner_label": "PLACE"
    },
    {
      "pid": "P276",
      "label": "location",
      "description": "location of the object, structure or event."
    },
    {
      "qid": "Q183",
      "label": "Germany",
      "description": "country in Central Europe",
      "ner_label": "LOC"
    }
  ]
]

entity_supertypes = [
  {
    "qid": "Q1656682",
    "label": "event ",
    "description": "temporary and scheduled happening, like a conference, festival, competition or similar",
    "ner_label": "CONCEPT"
  },
    {
    "qid": "Q570116",
    "label": "tourist attraction",
    "description": "place of interest where tourists visit",
    "ner_label": "CONCEPT"
  },
    {
    "qid": "Q2221906",
    "label": "Geographic location",
    "description": "point or an area on something's surface or elsewhere",
    "ner_label": "CONCEPT"
  },
]

In [None]:
def convert_to_relation_name(relation):
  relation = relation.replace(" ", "_")
  relation = relation.upper()
  return relation

def get_supertype_id(ner_label):
  supertype_id = None
  if ner_label == "EVENT" or ner_label == "DATE":
    supertype_id = "Q1656682"
  elif ner_label == "PLACE" or ner_label == "ORG":
    supertype_id = "Q570116"
  elif ner_label == "LOC" or ner_label == "GPE":
    supertype_id = "Q2221906"
  return supertype_id

def add_supertypes(tx, entity_supertypes):
  for entity in entity_supertypes:
    # check entity uniqueness
    exists = tx.run(
      "MATCH (a:Entity) "
      "WHERE a.qid = $qid "
      "RETURN a",
      qid=entity["qid"]
    )
    if not exists.single():
      # create supertype entity
      tx.run(
        "CREATE (a:Entity $entity)",
        entity=entity
      )
      print(f"Created Supertype ({entity['label']})")
    else:
      print(f"Supertype ({entity['label']}) already exists")

def add_entity(tx, entity):
  # check entity uniqueness
  exists = tx.run(
    "MATCH (a:Entity) "
    "WHERE a.qid = $qid "
    "RETURN a",
    qid=entity["qid"]
  )
  # create entity if it doesn't exist
  if not exists.single():
    tx.run(
      "CREATE (a:Entity $entity)",
      entity=entity
    )
    print(f"Created ({entity['name']}) entity!")
  else:
    print(f"Entity ({entity['name']}) already exists!")


def add_relation(tx, subj, rel, obj):
  # check relation existence
  exists = tx.run(
    "MATCH (a:Entity {qid: $a_qid}), (b:Entity {qid: $b_qid}) "
    f"RETURN EXISTS( (a)-[:{convert_to_relation_name(rel['label'])}]->(b) )",
    a_qid=subj["qid"],
    b_qid=obj["qid"]
  ).value()[0]

  # create relation if it doesn't exist
  if not exists:
    tx.run(
      "MATCH (a:Entity),(b:Entity) "
      "WHERE a.qid = $a_qid "
      "AND b.qid = $b_qid "
      f"CREATE (a)-[r:{convert_to_relation_name(rel['label'])}]->(b)",
      a_qid=subj["qid"],
      b_qid=obj["qid"]
    )
    print(f"Created relation ({subj['name']}) –{convert_to_relation_name(rel['label'])}–> ({obj['name']})")
  else:
    print(f"Relation ({subj['name']}) –{convert_to_relation_name(rel['label'])}–> ({obj['name']}) already exists!")


def add_relation_to_supertype(tx, entity):
  supertype_id = get_supertype_id(entity["name"])
  # check relation existence
  exists = tx.run(
    "MATCH (a:Entity {qid: $a_qid}), (b:Entity {qid: $b_qid}) "
    "RETURN EXISTS( (a)-[:IS_INSTANCE_OF]->(b) )",
    a_qid=entity["qid"],
    b_qid=supertype_id
  ).value()
  # create relation if it doesn't exist
  if exists == []:
    tx.run(
      "MATCH (a:Entity),(b:Entity) "
      "WHERE a.qid = $a_qid "
      "AND b.qid = $b_qid "
      "CREATE (a)-[r:IS_INSTANCE_OF]->(b)",
      a_qid=entity["qid"],
      b_qid=supertype_id
    )
    print(f"Created relation ({entity['name']}) –IS_INSTANCE_OF–> ({entity['name']})")
  else:
    print(f"Relation ({entity['name']}) –IS_INSTANCE_OF–> ({entity['name']}) already exists!")


with neo4j_db.session(database="neo4j") as session:
  print("--- supertypes ---")
  session.execute_write(add_supertypes, entity_supertypes)
  for idx, triple in enumerate(triple_collection):
    print(f"--- triple {idx+1} ---")
    subj = triple[0]
    rel = triple[1]
    obj = triple[2]
    session.execute_write(add_entity, subj)
    session.execute_write(add_entity, obj)
    session.execute_write(add_relation, subj, rel, obj)
    session.execute_write(add_relation_to_supertype, subj)
    session.execute_write(add_relation_to_supertype, obj)


In [None]:
# query the Wikidata API to retrieve an entity's information
# fetch entity info from the Wikidata namespace URL

# for entity
namespace_url = f"https://www.wikidata.org/wiki/Special:EntityData/Q7322.json"
r = requests.get(namespace_url, params={"format": "json"})
# simplify access to root elements of JSON object
pprint(r.json()["entities"][f"Q7322"])

# for property
namespace_url = f"https://www.wikidata.org/wiki/Special:EntityData/P61.json"
r = requests.get(namespace_url, params={"format": "json"})
# simplify access to root elements of JSON object
pprint(r.json()["entities"][f"P61"])