In [21]:
%load_ext autoreload
%autoreload 2

import sys
import re
import numpy as np
from tqdm import tqdm

import torch
import spacy
from transformers import pipeline
from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain.document_loaders import PyPDFLoader, TextLoader

from sklearn.metrics.pairwise import cosine_similarity

# sys.path.append("..")
# from itext2kg.documents_distiller import DocumentsDisiller, CV, Article
from itext2kg.models import Entity, KnowledgeGraph
from itext2kg.irelations_extraction import iRelationsExtractor
from itext2kg.graph_integration import GraphIntegrator

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Setup Models

Using Spacy transformer model for NER and Ollama LLM for relationship extraction

In [3]:
spacy.prefer_gpu()

True

In [4]:
nlp = spacy.load("en_core_web_trf")

  model.load_state_dict(torch.load(filelike, map_location=device))


In [5]:
llm = ChatOllama(
    # model="gemma2:9b-instruct-q8_0", # Took 
    model="gemma2:9b-instruct-q5_K_M", # Took 12 minutes to extract relationships
    # model="gemma2:9b-instruct-q5_K_S", # Took 13 minutes to extract relationships
    # model="gemma2:9b-instruct-q4_K_S", # Took <10 minutes to extract relationships
    temperature=0,
    max_retries=5,
    max_tokens=None,
    keep_alive=-1
)

embeddings = OllamaEmbeddings(
    model="bge-large"
)

## Extract Entities using Spacy

In [12]:
from langchain.document_loaders import PyPDFLoader, TextLoader

loader = PyPDFLoader(f"../datasets/scientific_articles/building_age.pdf")
pages = loader.load()
pages = [page.page_content.replace("{", "").replace("}", "") for page in pages]

In [13]:
global_ent = []
entities = []
page_ents = []
entity_types = ["PERSON", "GPE", "ORG"]

for page in pages:
    ents = []
    doc = nlp(page)
    for ent in doc.ents:
        if ent.label_ in entity_types:
            text = ent.text.replace("\n", "")
            text = re.split(r'\d', text)[0].rstrip()
            
            embed = np.array(embeddings.embed_query(text.lower()))
            entity = Entity(name=text, label=ent.label_)
            entity.properties.embeddings = embed
            
            ents.append(entity)
            if text.lower() not in entities:
                global_ent.append(entity)
                entities.append(text.lower())

    page_ents.append(ents)

len(global_ent)

  with torch.cuda.amp.autocast(self._mixed_precision):


203

## Use iText2KG for extracting relationships

In [14]:
relationships = []
irelations_extractor = iRelationsExtractor(llm_model=llm, 
                                           embeddings_model=embeddings,
                                           sleep_time=1)

for page, ents in tqdm(zip(pages, page_ents), total=len(pages)):
    rels = irelations_extractor.extract_relations(context=page, entities=ents)
    relationships.append(rels)

  0%|                                                                                                                 | 0/8 [00:00<?, ?it/s]

{'relationships': [{'startNode': {'label': 'PERSON', 'name': 'Zeng'}, 'endNode': {'label': 'ORG', 'name': 'University College London'}, 'name': 'affiliated with'}, {'startNode': {'label': 'PERSON', 'name': 'J. M. Goo'}, 'endNode': {'label': 'ORG', 'name': 'University College London'}, 'name': 'affiliated with'}, {'startNode': {'label': 'PERSON', 'name': 'X. Wang'}, 'endNode': {'label': 'ORG', 'name': 'University College London'}, 'name': 'affiliated with'}, {'startNode': {'label': 'PERSON', 'name': 'B. Chi'}, 'endNode': {'label': 'ORG', 'name': 'University College London'}, 'name': 'affiliated with'}, {'startNode': {'label': 'PERSON', 'name': 'M. Wang'}, 'endNode': {'label': 'ORG', 'name': 'University College London'}, 'name': 'affiliated with'}, {'startNode': {'label': 'PERSON', 'name': 'J. Boehm'}, 'endNode': {'label': 'ORG', 'name': 'University College London'}, 'name': 'affiliated with'}, {'startNode': {'label': 'GPE', 'name': 'London'}, 'endNode': {'label': 'ORG', 'name': 'Univers

 12%|█████████████                                                                                           | 1/8 [04:58<34:51, 298.72s/it]

{'relationships': [{'startNode': {'label': 'ORG', 'name': 'FI-London'}, 'endNode': {'label': 'PERSON', 'name': 'Despotovic'}, 'name': 'developed'}, {'startNode': {'label': 'PERSON', 'name': 'Riemenschneider'}, 'endNode': {'label': 'ORG', 'name': 'FI-London'}, 'name': 'developed'}, {'startNode': {'label': 'PERSON', 'name': 'Shalunts'}, 'endNode': {'label': 'ORG', 'name': 'FI-London'}, 'name': 'categorised'}, {'startNode': {'label': 'PERSON', 'name': 'Law'}, 'endNode': {'label': 'GPE', 'name': 'London'}, 'name': 'built'}, {'startNode': {'label': 'PERSON', 'name': 'Zeppelzauer'}, 'endNode': {'label': 'ORG', 'name': 'FI-London'}, 'name': 'developed'}, {'startNode': {'label': 'PERSON', 'name': 'Despotovic'}, 'endNode': {'label': 'ORG', 'name': 'FI-London'}, 'name': 'developed'}, {'startNode': {'label': 'PERSON', 'name': 'Ogawa'}, 'endNode': {'label': 'ORG', 'name': 'FI-London'}, 'name': 'developed'}, {'startNode': {'label': 'PERSON', 'name': 'Achiam'}, 'endNode': {'label': 'ORG', 'name': 'G

 25%|██████████████████████████                                                                              | 2/8 [06:39<18:12, 182.09s/it]

{'relationships': [{'startNode': {'label': 'GPE', 'name': 'London'}, 'endNode': {'label': 'ORG', 'name': 'FI-London'}, 'name': 'located_in'}, {'startNode': {'label': 'PERSON', 'name': 'Despotovic'}, 'endNode': {'label': 'ORG', 'name': 'FI-London'}, 'name': 'contributed_to'}, {'startNode': {'label': 'PERSON', 'name': 'Sun'}, 'endNode': {'label': 'ORG', 'name': 'FI-London'}, 'name': 'contributed_to'}, {'startNode': {'label': 'ORG', 'name': 'Google Street View'}, 'endNode': {'label': 'GPE', 'name': 'Amsterdam'}, 'name': 'used_in'}, {'startNode': {'label': 'GPE', 'name': 'London'}, 'endNode': {'label': 'ORG', 'name': 'FI-London'}, 'name': 'focus_of'}, {'startNode': {'label': 'GPE', 'name': 'Camden'}, 'endNode': {'label': 'ORG', 'name': 'FI-London'}, 'name': 'focus_of'}, {'startNode': {'label': 'GPE', 'name': 'Lon-'}, 'endNode': {'label': 'ORG', 'name': 'FI-London'}, 'name': 'focus_of'}, {'startNode': {'label': 'PERSON', 'name': 'Hudson'}, 'endNode': {'label': 'ORG', 'name': 'FI-London'}, '

 38%|███████████████████████████████████████                                                                 | 3/8 [07:34<10:19, 123.98s/it]

{'relationships': [{'startNode': {'label': 'ORG', 'name': 'FI-London'}, 'endNode': {'label': 'GPE', 'name': 'London'}, 'name': 'located_in'}]}
[INFO] Verification of invented entities
[INFO][INVENTED ENTITIES] Aie; the entities label='ORG' name='fi london' properties=EntityProperties(embeddings=None) and label='GPE' name='london' properties=EntityProperties(embeddings=None) are invented. Solving them ...


 50%|████████████████████████████████████████████████████                                                    | 4/8 [13:30<14:22, 215.68s/it]

[INFO] Wohoo! Entity was matched --- [fi london:ORG] --merged--> [FI-London:ORG]
[INFO] Wohoo! Entity was matched --- [london:GPE] --merged--> [London:GPE]
{'relationships': [{'startNode': {'label': 'GPE', 'name': 'London'}, 'endNode': {'label': 'ORG', 'name': 'the British Library'}, 'name': 'located in'}]}
[INFO] Verification of invented entities
[INFO][INVENTED ENTITIES] Aie; the entities label='GPE' name='london' properties=EntityProperties(embeddings=None) and label='ORG' name='the british library' properties=EntityProperties(embeddings=None) are invented. Solving them ...


 62%|█████████████████████████████████████████████████████████████████                                       | 5/8 [13:44<07:08, 142.97s/it]

[INFO] Wohoo! Entity was matched --- [london:GPE] --merged--> [London:GPE]
[INFO] Wohoo! Entity was matched --- [the british library:ORG] --merged--> [the British Library:ORG]


 75%|██████████████████████████████████████████████████████████████████████████████▊                          | 6/8 [13:55<03:16, 98.02s/it]

{'relationships': []}
[INFO] Verification of invented entities
{'relationships': [{'startNode': {'label': 'ORG', 'name': 'Blooms-bury Academic'}, 'endNode': {'label': 'PERSON', 'name': 'Kedron, P.'}, 'name': 'employs'}, {'startNode': {'label': 'GPE', 'name': 'England'}, 'endNode': {'label': 'ORG', 'name': 'google'}, 'name': 'located_in'}, {'startNode': {'label': 'GPE', 'name': 'Scotland'}, 'endNode': {'label': 'ORG', 'name': 'google'}, 'name': 'located_in'}, {'startNode': {'label': 'GPE', 'name': 'Wales'}, 'endNode': {'label': 'ORG', 'name': 'google'}, 'name': 'located_in'}]}
[INFO] Verification of invented entities
[INFO][INVENTED ENTITIES] Aie; the entities label='ORG' name='blooms bury academic' properties=EntityProperties(embeddings=None) and label='PERSON' name='kedron, p.' properties=EntityProperties(embeddings=None) are invented. Solving them ...
[INFO] Wohoo! Entity was matched --- [blooms bury academic:ORG] --merged--> [Blooms-bury Academic:ORG]
[INFO] Wohoo! Entity was matche

 88%|███████████████████████████████████████████████████████████████████████████████████████████▉             | 7/8 [14:18<01:13, 73.57s/it]

{'relationships': [{'startNode': {'label': 'PERSON', 'name': 'W.'}, 'endNode': {'label': 'PERSON', 'name': 'Donoser'}, 'name': 'COLLABORATED_WITH'}, {'startNode': {'label': 'PERSON', 'name': 'M.'}, 'endNode': {'label': 'PERSON', 'name': 'Havemann'}, 'name': 'COLLABORATED_WITH'}, {'startNode': {'label': 'PERSON', 'name': 'Fellner'}, 'endNode': {'label': 'PERSON', 'name': 'D.'}, 'name': 'COLLABORATED_WITH'}, {'startNode': {'label': 'PERSON', 'name': 'Bischof'}, 'endNode': {'label': 'PERSON', 'name': 'Roberts'}, 'name': 'COLLABORATED_WITH'}, {'startNode': {'label': 'PERSON', 'name': 'J.'}, 'endNode': {'label': 'PERSON', 'name': 'L'}, 'name': 'COLLABORATED_WITH'}, {'startNode': {'label': 'PERSON', 'name': '¨uddecke'}, 'endNode': {'label': 'PERSON', 'name': 'Das'}, 'name': 'COLLABORATED_WITH'}, {'startNode': {'label': 'PERSON', 'name': 'S.'}, 'endNode': {'label': 'PERSON', 'name': 'Han'}, 'name': 'COLLABORATED_WITH'}, {'startNode': {'label': 'ORG', 'name': 'arXiv'}, 'endNode': {'label': 'PE

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [20:06<00:00, 150.76s/it]


## Format Relationships to Visualize in Neo4j

### Get all new entities found related to Spacy Entities

In [16]:
for rels in relationships:
    for rel in rels:
        start_node = rel.startEntity.name
        end_node = rel.endEntity.name

        start_node = start_node.replace("\n", "")
        start_node = re.split(r'\d', start_node)[0].rstrip()

        end_node = end_node.replace("\n", "")
        end_node = re.split(r'\d', end_node)[0].rstrip()
        
        if start_node.lower() in entities or end_node.lower() in entities:
            if start_node.lower() not in entities and start_node != "":
                embed = np.array(embeddings.embed_query(start_node))
                entity = Entity(name=start_node, label="Unknown")
                entity.properties.embeddings = embed
                
                # ent_json = {
                #     'label': "Unknown",
                #     'name': start_node,
                #     'properties': {"embeddings": embed}
                # }
                
                entities.append(start_node.lower())
                global_ent.append(entity)
            elif end_node.lower() not in entities and end_node != "":
                embed = np.array(embeddings.embed_query(end_node))
                entity = Entity(name=end_node, label="Unknown")
                entity.properties.embeddings = embed
                
                # ent_json = {
                #     'label': "Unknown",
                #     'name': end_node,
                #     'properties': {"embeddings": embed}
                # }
                
                entities.append(end_node.lower())
                global_ent.append(entity)

### Create Entity Resolution Mapping based on embeddings cosine similarity

In [17]:
entity_mapping = {}
sim_threshold = 0.77
for i in range(len(global_ent)):
    best_sim = -1
    sim_ent = ""
    for j in range(i+1, len(global_ent)):
        embed1 = global_ent[i].properties.embeddings
        embed2 = global_ent[j].properties.embeddings

        cosine_sim = cosine_similarity(embed1.reshape(1, -1), embed2.reshape(1, -1))[0][0]

        if cosine_sim >= sim_threshold:
            if len(global_ent[i].name) > len(global_ent[j].name):
                entity_mapping[global_ent[j].name] = global_ent[i]
            else:
                entity_mapping[global_ent[i].name] = global_ent[j]

entity_mapping

{'Wang': Entity(name=M. Wang, label=PERSON, properties=embeddings=array([ 0.01313623, -0.00247302, -0.02136446, ..., -0.03578523,
        -0.0178745 , -0.00455471])),
 'M.': Entity(name=Sun, M., label=ORG, properties=embeddings=array([ 0.00724447,  0.01004348,  0.0231706 , ..., -0.03690347,
        -0.06039724,  0.00505535])),
 'UK': Entity(name=USA, label=GPE, properties=embeddings=array([-0.0229421 ,  0.01759524, -0.02084281, ..., -0.02347386,
        -0.02525458,  0.02751953])),
 'London': Entity(name=dublin, label=GPE, properties=embeddings=array([-0.02115863, -0.00792595, -0.00486335, ..., -0.04757677,
        -0.0316798 ,  0.01525229])),
 'Aksoezen': Entity(name=Aksoezen, M., label=ORG, properties=embeddings=array([ 0.0067305 , -0.00402309, -0.0162841 , ..., -0.07051509,
        -0.0407936 ,  0.01553599])),
 'Li': Entity(name=Yang, label=PERSON, properties=embeddings=array([-0.00577883, -0.00696133, -0.00554983, ..., -0.03419205,
        -0.03590113,  0.00493976])),
 'L': Entity(

### Get and Correct Relationships Based on Entity Mapping

In [18]:
global_rel = []
for rels in relationships:
    for rel in rels:
        start_node, end_node = rel.startEntity, rel.endEntity
        while start_node in entity_mapping:
            start_node = entity_mapping[start_node.name]
        
        while end_node in entity_mapping:
            end_node = entity_mapping[end_node.name]
        
        if start_node.name.lower() in entities and end_node.name.lower() in entities and start_node.name.lower() != end_node.name.lower():
            rel.name = rel.name.replace(" ", "_").replace("’", "")
            rel.startEntity = start_node
            rel.endEntity = end_node
            global_rel.append(rel)

print(f"Number of entities: {len(global_ent)} , relationships: {len(global_rel)}")

Number of entities: 203 , relationships: 70


In [19]:
triples = []
for rel in global_rel:
    triples.append([rel.startEntity.name, rel.name, rel.endEntity.name])

triples

[['London', 'located_in', 'University College London'],
 ['Aksoezen', 'affiliated_with', 'Google'],
 ['Law', 'affiliated_with', 'Google'],
 ['Zeng', 'affiliated_with', 'University College London'],
 ['Zeppelzauer', 'affiliated_with', 'Google'],
 ['University College London', 'located_in', 'UK'],
 ['J. Boehm', 'affiliated_with', 'University College London'],
 ['University College London', 'located_in', 'London'],
 ['M. Wang', 'affiliated_with', 'University College London'],
 ['Ogawa', 'affiliated_with', 'Google'],
 ['X. Wang', 'affiliated_with', 'University College London'],
 ['J. M. Goo', 'affiliated_with', 'University College London'],
 ['Sun', 'affiliated_with', 'Google'],
 ['London', 'located_in', 'UK'],
 ['Li', 'affiliated_with', 'Google'],
 ['B. Chi', 'affiliated_with', 'University College London'],
 ['Holler', 'developed', 'GPT'],
 ['Zhang', 'developed', 'GPT'],
 ['Law', 'built', 'London'],
 ['Achiam', 'developed', 'GPT'],
 ['Wang', 'developed', 'GPT'],
 ['Zeppelzauer', 'develope

Run: docker run -p7474:7474 -p7687:7687 -e NEO4J_AUTH=neo4j/secretgraph neo4j:latest in terminal

In [22]:
URI = "bolt://localhost:7687"
USERNAME = "neo4j"
PASSWORD = "secretgraph"

# new_graph = {}
# new_graph["nodes"] = global_ent
# new_graph["relationships"] = global_rel

kg = KnowledgeGraph()
kg.entities = global_ent
kg.relationships = global_rel
GraphIntegrator(uri=URI, username=USERNAME, password=PASSWORD).visualize_graph(kg)