   # Turn a Harry Potter Book into a Knowledge Graph

  ## Developed_By_Jitendra_And_Suggested_From_NEO4J.

Learn how to combine Selenium and SpaCy to create a Neo4j knowledge graph of the Harry Potter universe.
        Visit for more information about the Harry Potter books: https://harrypotter.fandom.com/wiki/Main_Page

First, install the required dependencies for this project; they are listed in the requirements.txt file.

In [1]:
# Setup selenium
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
from selenium import webdriver
from selenium.webdriver.common.by import By
# Configure a headless Chrome instance for scraping.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
# Pass the options by keyword: the first positional parameter of
# webdriver.Chrome has not always been `options` across Selenium releases,
# so a positional argument can be interpreted as something else.
wd = webdriver.Chrome(options=chrome_options)

In [2]:
def enrich_single_item(item):
  """Return the text of one infobox field on the page currently loaded in `wd`.

  Looks up the ``<div data-source="<item>">`` element and reads the text of
  its first ``<a>`` descendant; when there is no link, or the link text is
  empty, the first nested ``<div>`` is used instead. Everything from the
  first ``[`` onwards is dropped and surrounding whitespace is stripped.

  Args:
    item: Value of the ``data-source`` attribute to look for (e.g. 'house').

  Returns:
    The cleaned text, or None when the field or a usable child element
    cannot be located.
  """
  # Imported locally so the function does not rely on an extra top-level import.
  from selenium.common.exceptions import WebDriverException

  try:
    # `find_element_by_xpath` was removed in Selenium 4.3; use the By-based
    # locator API that the rest of this file already uses.
    div = wd.find_element(By.XPATH, f"//div[@data-source = '{item}']")
  except WebDriverException:
    return None

  # Prefer the link text
  try:
    result = div.find_element(By.TAG_NAME, "a").text.split('[')[0].strip()
  except WebDriverException:
    result = ''
  if result:
    return result

  # No link or empty link text: fall back to the nested "div" tag
  try:
    return div.find_element(By.TAG_NAME, "div").text.split('[')[0].strip()
  except WebDriverException:
    return None

                                Harry Potter fandom page scraping
--------------------------------------------------------------------------------------------------------------------------------
We will use Selenium for web scraping. As mentioned, we will begin by scraping the characters in the Harry Potter and the Philosopher's Stone book. The list of characters by chapter is available under the CC-BY-SA license, so we don't have to worry about any copyright infringement. Additionally, each of the characters has a web page with detailed information about the character. For example, if you check out the Hermione Granger page, you can observe a structured table with additional information. We will use the alias section for the entity extraction and add other character details like house and blood type to enrich our knowledge graph. For more information, visit: https://harrypotter.fandom.com/wiki/Main_Page

In [3]:
import time

def _clean_infobox_text(text):
  """Drop everything from the first '[' or '(' onwards and strip whitespace."""
  return text.split('[')[0].split('(')[0].strip()


def _infobox_list_texts(data_source):
  """Return the text of each <li> under an infobox field, or None if it cannot be read."""
  # Imported locally so the helper does not rely on an extra top-level import.
  from selenium.common.exceptions import WebDriverException

  try:
    # `find_element_by_xpath` was removed in Selenium 4.3; use the By-based
    # locator API that the rest of this file already uses.
    section = wd.find_element(By.XPATH, f"//div[@data-source = '{data_source}']")
    return [li.text for li in section.find_elements(By.TAG_NAME, 'li')]
  except WebDriverException:
    return None


def get_characters(url):
  """Scrape the per-chapter character index and enrich every character.

  Args:
    url: Address of the character-index page. Every <table> inside the
      element with class "mw-parser-output" is treated as one chapter.

  Returns:
    Dict mapping 'chapter_<n>' (1-based) to a list of character dicts. Each
    dict has 'title' and 'url', the single-valued fields 'blood',
    'nationality', 'species', 'house' and 'gender' (None when missing), and,
    only when the corresponding infobox section could be read, the lists
    'aliases', 'loyalty' and 'family' ({'person', 'type'} entries).
  """
  # Get the list of characters by chapter
  wd.get(url)
  character_dict = {}
  elem = wd.find_element(By.CLASS_NAME, "mw-parser-output")

  # Locate character by chapter
  tables = elem.find_elements(By.TAG_NAME, 'table')
  for number, chapter_table in enumerate(tables, start=1):
    list_of_characters = []
    for link in chapter_table.find_elements(By.TAG_NAME, 'a'):
      title = link.get_attribute('title')
      # Links without a title attribute are skipped
      if not title:
        continue
      list_of_characters.append({'title': title, 'url': link.get_attribute('href')})
    character_dict['chapter_' + str(number)] = list_of_characters

  # Enrich characters with additional information
  for characters in character_dict.values():
    for character in characters:
      # Rate limit sleep
      time.sleep(1)
      # Get the character page with selenium
      wd.get(character['url'])

      # Enrich aliases
      alias_texts = _infobox_list_texts('alias')
      if alias_texts is not None:
        character['aliases'] = [
          _clean_infobox_text(text)
          for text in alias_texts
          # Skip entries whose text mentions a disguise or "the name he told"
          if "disguise" not in text and "the name he told" not in text
        ]

      # Enrich loyalties
      loyalty_texts = _infobox_list_texts('loyalty')
      if loyalty_texts is not None:
        character['loyalty'] = [_clean_infobox_text(text) for text in loyalty_texts]

      # Enrich family relationships
      family_texts = _infobox_list_texts('family')
      if family_texts is not None:
        character['family'] = [
          {
            'person': _clean_infobox_text(text),
            # Relationship type is the content of the last "(...)" group.
            # NOTE(review): when an entry has no parentheses this evaluates
            # to the entry text itself - confirm that is acceptable.
            'type': text.split('(')[-1].split(')')[0].split('[')[0],
          }
          for text in family_texts
        ]

      # Enrich the single-valued infobox fields
      for field in ('blood', 'nationality', 'species', 'house', 'gender'):
        character[field] = enrich_single_item(field)
  return character_dict

In [5]:
# Scrape the character index of the first book. get_characters sleeps one
# second before loading each character page, so this call takes a while.
character_dict = get_characters("https://harrypotter.fandom.com/wiki/Harry_Potter_and_the_Philosopher%27s_Stone_(character_index)")

In [6]:
from neo4j import GraphDatabase
import os

# Change the host and user/password combination to your neo4j, either by
# setting NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD in the environment or by
# editing the fallback values below.
# Will not work with a localhost bolt url
# NOTE(review): the fallbacks keep credentials in source for backward
# compatibility; prefer the environment variables and rotate this password.
host = os.environ.get('NEO4J_URI', 'bolt://54.92.169.42:7687')
user = os.environ.get('NEO4J_USER', 'neo4j')
password = os.environ.get('NEO4J_PASSWORD', 'dereliction-hisses-manuals')
driver = GraphDatabase.driver(host, auth=(user, password))

In [7]:
# Cypher import statement: one Character node per scraped row (merged on
# name), plus optional House, Group and family-member relationships.
entity_query = """
UNWIND $data as row
MERGE (c:Character{name:row.title})
SET c.url = row.url,
    c.aliases = row.aliases,
    c.blood = row.blood,
    c.nationality = row.nationality,
    c.species = row.species,
    c.gender = row.gender
FOREACH (h in CASE WHEN row.house IS NOT NULL THEN [1] ELSE [] END | MERGE (h1:House{name:row.house}) MERGE (c)-[:BELONGS_TO]->(h1))
FOREACH (l in row.loyalty | MERGE (g:Group{name:l}) MERGE (c)-[:LOYAL_TO]->(g))
FOREACH (f in row.family | MERGE (f1:Character{name:f.person}) MERGE (c)-[t:FAMILY_MEMBER]->(f1) SET t.type = f.type)    

"""
# Send the characters of each chapter as one batch.
with driver.session() as session:
  for chapter_characters in character_dict.values():
    session.run(entity_query, {'data': chapter_characters})

In [8]:
def get_character_dict(chapter):
  """Return the characters of chapters 1..`chapter` as one flat list.

  Reads the module-level `character_dict`, whose keys are 'chapter_<n>'.
  A character listed in several chapters appears once per chapter.
  """
  return [
    character
    for number in range(1, chapter + 1)
    for character in character_dict['chapter_' + str(number)]
  ]

In [9]:
import spacy
from spacy.matcher import Matcher
import neuralcoref

# Load the small English spaCy model and add neuralcoref to its pipeline;
# `nlp` is the shared pipeline used by the functions below.
nlp = spacy.load('en_core_web_sm')
neuralcoref.add_to_pipe(nlp)

# Quick check on a two-sentence example: print the coreference clusters found.
doc1 = nlp('My sister has a dog. She loves him.')
print(doc1._.coref_clusters)



[My sister: [My sister, She], a dog: [a dog, him]]


In [10]:
def coref_resolution(text):
    """Return `text` with coreferring mentions replaced by their cluster's main mention.

    A mention is rewritten only when it is not the main mention itself and
    shares no space-separated word with it (this avoids rewriting nested
    mentions). The replacement keeps the whitespace that followed the
    mention's last token.
    """
    doc = nlp(text)
    # One string per token, trailing whitespace included, so replacements can
    # be spliced in by token index and re-joined afterwards.
    pieces = [token.text_with_ws for token in doc]
    for cluster in doc._.coref_clusters:
        main = cluster.main
        main_words = set(main.text.split(' '))
        for mention in cluster:
            # The representative mention stays as it is
            if mention == main:
                continue
            if mention.text == main.text:
                continue
            # Any shared word with the main mention: leave untouched
            if not main_words.isdisjoint(mention.text.split(' ')):
                continue
            # First token of the mention carries the replacement ...
            pieces[mention.start] = main.text + doc[mention.end - 1].whitespace_
            # ... and the remaining tokens of the mention are blanked out
            for position in range(mention.start + 1, mention.end):
                pieces[position] = ""

    return "".join(pieces)

                    Entity recognition with SpaCy's rule-based matching
                ==========================================================
First, I wanted to be cool and use a Named Entity Recognition model. I've tried models from SpaCy, HuggingFace, Flair, and even Stanford NLP. None of them worked well enough to satisfy my requirements. So instead of training my own model, I decided to use SpaCy's rule-based pattern matching feature. We already know which characters we are looking for based on the data we scraped from the HP fandom site. Now we just have to find a way to match them in the text as perfectly as possible. We have to define the text patterns for each of the characters.

In [11]:
def get_matcher_patterns(character):
  """Build the Matcher token patterns for one character.

  The first pattern matches the full name (every space-separated part longer
  than two characters, title-cased, in order). Unless the title contains an
  apostrophe, each individual part that is not a stop word is added as a
  single-token pattern too, and "Ronald" additionally yields "Ron".

  Args:
    character: Dict with at least a 'title' key holding the character name.

  Returns:
    List of token patterns (each a list of token-attribute dicts). Empty when
    the title has no part longer than two characters, because a zero-token
    pattern cannot be added to a spaCy Matcher.
  """
  stop_words = {'of', 'the', 'at', 'family', 'keeper', 'wizard', 'fat', 'de', 'hogwarts', 'hotel', 'owner', 'express'}
  title = character['title']
  parts_of_name = [el for el in title.split(' ') if len(el) > 2]
  # Nothing usable in the title: return no patterns instead of an empty one
  if not parts_of_name:
    return []

  # Append the whole pattern
  matcher_pattern = [[{"LOWER": n.lower(), "IS_TITLE": True} for n in parts_of_name]]

  # Skip names like Vernon Dursley's secretary
  if "'" in title:
    return matcher_pattern

  # Append parts of names
  for n in parts_of_name:
    if n.lower() in stop_words: # Skip appending stop words
      continue
    matcher_pattern.append([{"LOWER": n.lower(), "IS_TITLE": True}])
    # Special case for Ronald Weasley -> Also add Ron
    if n == "Ronald":
      matcher_pattern.append([{"LOWER": "ron", "IS_TITLE": True}])
  return matcher_pattern

In [12]:
import random

# Manual fallbacks, keyed by the matched text, for mentions that cannot be
# resolved from nearby unambiguous mentions.
hardcoded_options = dict()
hardcoded_options['Malfoy'] = ['Draco Malfoy']
hardcoded_options['Patil'] = ['Padma Patil', 'Parvati Patil']
hardcoded_options['Tom'] = ['Tom']

def handle_multiple_options(result, doc):
  """Resolve mentions that matched more than one character, in place.

  For every entry of `result` whose 'string_id' holds several candidates:
    1. "Dursley" preceded (within three tokens) by "Mr." / "Mrs." becomes
       Vernon / Petunia Dursley.
    2. Otherwise the candidate of the nearest unambiguous mention (by 'end'
       offset) that is one of the candidates is used.
    3. Otherwise `hardcoded_options` is consulted; with several hard-coded
       choices one is picked at random.
    4. Otherwise a message is printed and 'string_id' is set to [].

  Args:
    result: List of entity dicts with 'string_id', 'start', 'end', 'text'.
    doc: The document the offsets refer to; slicing it must give an object
      with a `.text` attribute.

  Returns:
    The same `result` list, with the ambiguous entries rewritten.
  """
  needs_deduplication = [(i, x) for i, x in enumerate(result) if len(x['string_id']) > 1]
  for index, multiple_options in needs_deduplication:
    candidates = multiple_options['string_id']
    # Special logic for Dursleys, if there is Mr. then Vernon, if Mrs. then Petunia.
    # Clamp at 0: a negative start would be read as an index from the end of
    # the doc and yield an empty prefix for mentions in the first tokens.
    prefix = doc[max(multiple_options['start'] - 3, 0) : multiple_options['start']]
    if (multiple_options['text'] == 'Dursley') and ("Mr." in prefix.text):
      resolution = ["Vernon Dursley"]
    elif (multiple_options['text'] == 'Dursley') and ("Mrs." in prefix.text):
      resolution = ["Petunia Dursley"]
    else:
      # Find nearest entity: only unambiguous mentions naming one of the candidates count
      anchor = multiple_options['end']
      unambiguous = [
        option for option in result
        if len(option['string_id']) == 1 and option['string_id'][0] in candidates
      ]
      if unambiguous:
        # min() keeps the first of equally distant mentions
        nearest = min(unambiguous, key=lambda option: abs(anchor - option['end']))
        resolution = nearest['string_id']
      else:
        ho = hardcoded_options.get(multiple_options['text'])
        if ho is None:
          print(f"no way to disambiguate {multiple_options['text']} from options: {multiple_options['string_id']}")
          resolution = []
        elif len(ho) == 1:
          # Copy so the entity does not share the module-level list
          resolution = list(ho)
        else:
          resolution = [random.choice(ho)]

    result[index]['string_id'] = resolution
  return result

                             Infer relationships between characters
                           ===========================================
We are finished with the hard part. Inferring relationships between characters is very simple. First, we need to define the distance threshold of interaction or relation between two characters. As mentioned, we will use the same distance threshold as was used in the GoT extraction. That is, if two characters co-occur within the distance of 14 words, then we assume they must have interacted. I have also merged entities not to skew results. What do I mean by joining entities? Suppose we have the following two sentences: "Harry was having a good day. He went to talk to Dumbledore in the afternoon." Our entity extraction process will identify three entities, "Harry", "He" as a reference to Harry, and "Dumbledore". If we took the naive approach, we could infer two interactions between Harry and Dumbledore as two references of "Harry" are close to "Dumbledore". However, I want to avoid that, so I have merged entities in a sequence that refers to the same character as a single entity. Finally, we have to count the number of interactions between the pairs of characters.

In [13]:
from collections import Counter

def get_distances(result, distance_threshold):
  """Count interactions between characters that are mentioned close together.

  Entities are ordered by 'start'; consecutive mentions of the same
  character are merged into one span. Each entity is then paired with the
  entities after it until one of them is the same character or lies
  `distance_threshold` or more positions past the source's 'end'.

  Args:
    result: List of entity dicts with 'string_id' (list), 'start' and 'end'.
      Entities with an empty 'string_id' (disambiguation failed) are ignored.
      The dicts are not modified.
    distance_threshold: Exclusive upper bound on |source end - target start|.

  Returns:
    Counter mapping a sorted (name, name) tuple to the number of interactions.
  """
  # Drop unresolved entities (they would raise IndexError below), then sort by start offset
  ordered = sorted((entity for entity in result if entity['string_id']), key=lambda k: k['start'])
  compact_entities = []
  # Merge entities
  for entity in ordered:
    # If the same entity occurs, prolong the end
    if compact_entities and compact_entities[-1]['string_id'] == entity['string_id']:
      compact_entities[-1]['end'] = entity['end']
    else:
      # Copy so that prolonging the end never changes the caller's dicts
      compact_entities.append(dict(entity))
  distances = []
  # Iterate over all entities
  for index, source in enumerate(compact_entities[:-1]):
    # Compare with entities that come after the given one
    for target in compact_entities[index + 1:]:
      same_character = source['string_id'] == target['string_id']
      too_far = abs(source['end'] - target['start']) >= distance_threshold
      if same_character or too_far:
        break
      distances.append(tuple(sorted([source['string_id'][0], target['string_id'][0]])))
  # Count the number of interactions
  return Counter(distances)

                               Store results to Neo4j graph database
                            ===========================================
We have extracted the interaction network between characters, and the only thing left is to store the results in a graph database. The import query is very straightforward as we are dealing with a monopartite network. If you are using the Colab notebook I have prepared, then it would be easiest to create either a Neo4j Sandbox or Aura database instance to store the results.

In [14]:
def store_to_neo4j(distances):
  """Add interaction counts to the INTERACTS relationships in Neo4j.

  Args:
    distances: Mapping from a (source, target) pair of character names to
      the number of interactions; the count is added to any weight already
      stored on the relationship.
  """
  rows = []
  for pair, weight in distances.items():
    rows.append({'source': pair[0], 'target': pair[1], 'weight': weight})

  interaction_query = """
    UNWIND $data as row
    MERGE (c:Character{name:row.source})
    MERGE (t:Character{name:row.target})
    MERGE (c)-[i:INTERACTS]-(t)
    SET i.weight = coalesce(i.weight,0) + row.weight
    """
  with driver.session() as session:
    session.run(interaction_query, {'data': rows})

First of all, we have to get our hands on the text from the book. I've found a GitHub repository that contains the text of the first four Harry Potter books. There is no license attached to the data, so I will assume we can use the data for educational purposes within the limits of fair use. If you actually want to read the book, please go and buy it. Getting the text from a GitHub file is quite easy:

In [15]:
import requests
# Download the book text. The timeout keeps an unresponsive server from
# hanging the notebook indefinitely, and raise_for_status() stops an HTTP
# error page from being split up as if it were the book.
res = requests.get("https://raw.githubusercontent.com/amephraim/nlp/master/texts/J.%20K.%20Rowling%20-%20Harry%20Potter%201%20-%20Sorcerer's%20Stone.txt", timeout=30)
res.raise_for_status()
text = res.text
# Everything before the first "CHAPTER" marker is dropped; each remaining
# piece is the text that follows one "CHAPTER" marker.
chapters = text.split("CHAPTER")[1:]
def get_characters_in_chapter(chapter):
  """Find character mentions in one chapter of the book.

  Builds a Matcher from the characters of chapters 1..`chapter`, runs
  coreference resolution on the chapter text, matches the character patterns
  and keeps the longest match for overlapping spans that share a start or an
  end. Prints the chapter title as a progress indicator.

  Args:
    chapter: 1-based chapter number (index into the module-level `chapters`).

  Returns:
    List of dicts with 'string_id' (list of character titles), 'start' and
    'end' (offsets into the resolved document, as used for `doc[start:end]`)
    and 'text' (the matched text).
  """
  c = chapters[chapter - 1]
  # Prepare characters matcher
  matcher = Matcher(nlp.vocab)
  for character in get_character_dict(chapter):
    matcher.add(character['title'], get_matcher_patterns(character))

  # Prepare text: skip the rest of the "CHAPTER" line and all empty lines;
  # the first remaining line is the chapter title
  lines = [line for line in c.split('\n')[1:] if line]
  chapter_title = lines[0]
  print(chapter_title)
  text = " ".join(lines[1:])

  # Run coreference resolution
  text = coref_resolution(text)

  # Find matches
  doc = nlp(text)
  result = []
  for match_id, start, end in matcher(doc):
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = doc[start:end]  # The matched span
    new_entity = {'string_id': [string_id], 'start': start, 'end': end, 'text': span.text}

    # Get predicates for correct result appendment
    exists_longer = [(start == e['start'] and end < e['end']) or (start > e['start'] and end == e['end']) for e in result]
    same = [start == e['start'] and end == e['end'] for e in result]
    shorter_end = [start == e['start'] and end > e['end'] for e in result]
    shorter_start = [start < e['start'] and end == e['end'] for e in result]

    # Append to results
    if any(exists_longer):  # If there is a longer version of the given entity already in results
      continue

    if any(shorter_end):  # If there is any entity with the same start span but has shorter end
      del result[shorter_end.index(True)]
      result.append(new_entity)
    elif any(shorter_start):  # If there is any entity with the same end span but has shorter start
      del result[shorter_start.index(True)]
      # Call append; the original assigned to `result.append`, which raises AttributeError
      result.append(new_entity)
    elif not any(same):  # If not exists yet
      result.append(new_entity)
    else:  # Add more entities to a single span
      result[same.index(True)]['string_id'].append(string_id)

  # Handle results where there are multiple options (resolved in place)
  handle_multiple_options(result, doc)
  return result


Run the code for each chapter of the book.

In [16]:
# Process the chapters in order (1-based): extract the character mentions,
# count co-occurrences closer than 14 positions, and add them to the graph.
for c in range(1,len(chapters) + 1):
  # NOTE: despite its name, `end` holds the list of entity mentions for the chapter
  end = get_characters_in_chapter(c)
  distances = get_distances(end, 14)
  store_to_neo4j(distances)

THE BOY WHO LIVED
THE VANISHING GLASS
THE LETTERS FROM NO ONE
THE KEEPER OF THE KEYS
DIAGON ALLEY
THE JOURNEY FROM PLATFORM NINE AND THREE-QUARTERS
THE SORTING HAT
THE POTIONS MASTER
THE MIDNIGHT DUEL
HALLOWEEN
QUIDDITCH
THE MIRROR OF ERISED
NICOLAS FLAMEL
NORBERT THE NORWEGIAN RIDGEBACK
THE FORIBIDDEN FOREST
THROUGH THE TRAPDOOR
THE MAN WITH TWO FACES
