In [1]:
%load_ext autoreload
%autoreload 2

import os
import re
import sys
import numpy as np
from tqdm import tqdm

import torch
import spacy
from transformers import pipeline
from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain.document_loaders import PyPDFLoader, TextLoader

from sklearn.metrics.pairwise import cosine_similarity

# sys.path.append("..")
# from itext2kg.documents_distiller import DocumentsDisiller, CV, Article
# from itext2kg.models import Entity, KnowledgeGraph
from itext2kg.irelations_extraction import iRelationsExtractor
from itext2kg.graph_integration import GraphIntegrator

# Setup Models

This notebook uses a spaCy transformer model for named-entity recognition (NER) and an Ollama-served LLM for relationship extraction.

In [2]:
# Ask spaCy to use the GPU; the call's return value is this cell's output
# (recorded as True below — presumably meaning a GPU was activated; see spaCy docs).
spacy.prefer_gpu()

True

In [3]:
# Load the "en_core_web_trf" spaCy pipeline; `nlp` is applied to each page below
# and its `doc.ents` supply the named entities.
nlp = spacy.load("en_core_web_trf")

  model.load_state_dict(torch.load(filelike, map_location=device))


In [4]:
# Chat model used for relationship extraction.
# Time noted for extracting relationships with each gemma2 quantization:
#   gemma2:9b-instruct-q8_0   -> (timing not recorded)
#   gemma2:9b-instruct-q5_K_M -> 12 minutes   <- currently selected
#   gemma2:9b-instruct-q5_K_S -> 13 minutes
#   gemma2:9b-instruct-q4_K_S -> under 10 minutes
llm_settings = dict(
    model="gemma2:9b-instruct-q5_K_M",
    temperature=0,
    max_retries=5,
    max_tokens=None,
    keep_alive=-1,
)
llm = ChatOllama(**llm_settings)

# Embedding model; used for entity embeddings and handed to the relation extractor.
embeddings = OllamaEmbeddings(model="bge-large")

## Extract Entities using Spacy

In [6]:
# Load the article PDF. Each loaded document exposes its text as `page_content`.
loader = PyPDFLoader("../datasets/scientific_articles/building_age.pdf")
page_documents = loader.load()

# `pages` holds plain strings with curly braces removed (presumably because
# braces would be read as template placeholders when the text is placed into an
# LLM prompt — TODO confirm).
pages = [
    document.page_content.replace("{", "").replace("}", "")
    for document in page_documents
]

In [7]:
# Collect PERSON / GPE / ORG entities from every page.
#   global_ent : one record per unique entity (name, label, embedding)
#   entities   : lower-cased names already recorded, used for de-duplication
#   page_ents  : cleaned entity names per page, in order of appearance
global_ent = []
entities = []
page_ents = []
entity_types = ["PERSON", "GPE", "ORG"]

for page in pages:
    ents = []
    doc = nlp(page)
    for ent in doc.ents:
        if ent.label_ not in entity_types:
            continue

        # Remove line breaks, then keep only the text before the first digit
        # (presumably to strip footnote/affiliation markers — TODO confirm).
        text = ent.text.replace("\n", "")
        text = re.split(r'\d', text)[0].rstrip()
        if not text:
            # A mention that starts with a digit cleans down to an empty name;
            # skip it instead of embedding and registering "" as an entity.
            continue

        ents.append(text)

        key = text.lower()
        if key in entities:
            continue

        # Embed only the first occurrence of an entity: repeated mentions would
        # otherwise trigger an embedding request whose result is thrown away.
        embed = np.array(embeddings.embed_query(key))
        global_ent.append({
            "name": text,
            "label": ent.label_,
            "properties": {"embeddings": embed},
        })
        entities.append(key)

    page_ents.append(ents)

len(global_ent)

  with torch.cuda.amp.autocast(self._mixed_precision):


203

## Use iText2KG for extracting relationships

In [8]:
# Run iText2KG relation extraction one page at a time, passing the page text as
# context together with the entity names collected for that page.
irelations_extractor = iRelationsExtractor(
    llm_model=llm,
    embeddings_model=embeddings,
    sleep_time=1,
)

relationships = []
page_inputs = zip(pages, page_ents)
# Took 20 minutes with latest itext2kg version
for page_text, page_entity_names in tqdm(page_inputs, total=len(pages)):
    relationships.append(
        irelations_extractor.extract_relations(context=page_text, entities=page_entity_names)
    )

  0%|                                                                                                                 | 0/8 [00:00<?, ?it/s]

{'relationships': [{'startNode': 'Zeng', 'endNode': 'University College London', 'name': 'affiliated with'}, {'startNode': 'J. M. Goo', 'endNode': 'University College London', 'name': 'affiliated with'}, {'startNode': 'X. Wang', 'endNode': 'University College London', 'name': 'affiliated with'}, {'startNode': 'B. Chi', 'endNode': 'University College London', 'name': 'affiliated with'}, {'startNode': 'M. Wang', 'endNode': 'University College London', 'name': 'affiliated with'}, {'startNode': 'J. Boehm', 'endNode': 'University College London', 'name': 'affiliated with'}, {'startNode': 'Aksoezen', 'endNode': 'energy consumption analysis', 'name': 'related to'}, {'startNode': 'Law', 'endNode': 'housing prices', 'name': 'related to'}, {'startNode': 'Sun', 'endNode': 'building age estimation', 'name': 'studied in the context of'}, {'startNode': 'Ogawa', 'endNode': 'disaster resilience analysis', 'name': 'contributed to'}, {'startNode': 'Li', 'endNode': 'Google', 'name': 'worked at'}, {'start

 12%|█████████████▏                                                                                           | 1/8 [00:46<05:25, 46.51s/it]

{'relationships': [{'startNode': 'Despotovic', 'endNode': 'FI-London', 'name': 'worked on'}, {'startNode': 'Riemenschneider', 'endNode': 'architectural styles', 'name': 'classified'}, {'startNode': 'Shalunts', 'endNode': 'construction styles', 'name': 'categorized'}, {'startNode': 'Law', 'endNode': 'house prices in London', 'name': 'estimated'}, {'startNode': 'Zeppelzauer', 'endNode': 'building age', 'name': 'predicted'}, {'startNode': 'Despotovic', 'endNode': 'heating energy demand', 'name': 'analyzed'}, {'startNode': 'Ogawa', 'endNode': 'building age', 'name': 'predicted'}, {'startNode': 'Achiam', 'endNode': 'GPT', 'name': 'developed'}, {'startNode': 'Qin', 'endNode': 'natural language processing tasks', 'name': 'achieved performance in'}, {'startNode': 'Roberts', 'endNode': 'geographic knowledge and reasoning', 'name': 'indicated capabilities in'}, {'startNode': 'Li', 'endNode': 'geographic knowledge and reasoning', 'name': 'indicated capabilities in'}, {'startNode': 'Wang', 'endNod

 25%|██████████████████████████▎                                                                              | 2/8 [01:38<04:57, 49.67s/it]

{'relationships': [{'startNode': 'London', 'endNode': 'FI-London', 'name': 'is_location_of'}, {'startNode': 'Despotovic', 'endNode': 'dataset', 'name': 'created'}, {'startNode': 'Sun', 'endNode': 'dataset', 'name': 'combined'}, {'startNode': 'Google Street View', 'endNode': 'dataset', 'name': 'data_source'}, {'startNode': 'Amsterdam', 'endNode': 'dataset', 'name': 'origin'}, {'startNode': 'London', 'endNode': 'FI-London', 'name': 'focus_area'}, {'startNode': 'Camden', 'endNode': 'FI-London', 'name': 'location_within'}, {'startNode': 'FI-London', 'endNode': 'building_age_epochs', 'name': 'contains_information_about'}, {'startNode': 'Hudson', 'endNode': 'FI-London', 'name': 'derived_from'}, {'startNode': 'London', 'endNode': 'case_study', 'name': 'selected_as'}, {'startNode': 'Jones', 'endNode': 'building_age_epochs', 'name': 'analyzed'}]}


 50%|████████████████████████████████████████████████████▌                                                    | 4/8 [02:25<02:04, 31.20s/it]

{'relationships': [{'startNode': 'FI-London', 'endNode': 'London', 'name': 'is located in'}, {'startNode': 'FI-London', 'endNode': 'OpenAI', 'name': 'uses'}]}


 62%|█████████████████████████████████████████████████████████████████▋                                       | 5/8 [02:37<01:13, 24.36s/it]

{'relationships': [{'startNode': 'London', 'endNode': 'British Library', 'name': 'located in'}, {'startNode': 'Rawte', 'endNode': 'British Library', 'name': 'mentioned in context about'}]}


 75%|██████████████████████████████████████████████████████████████████████████████▊                          | 6/8 [02:50<00:40, 20.28s/it]

{'relationships': [{'startNode': 'GPT-4 Vision', 'endNode': 'building facade images', 'name': 'classifies'}, {'startNode': 'age epoch', 'endNode': 'building facade images', 'name': 'is associated with'}]}


 88%|███████████████████████████████████████████████████████████████████████████████████████████▉             | 7/8 [02:59<00:16, 16.60s/it]

{'relationships': [{'startNode': '07 niigata-ken chuetsu-oki earthquake', 'endNode': 'Proc. 32nd Asian Conference on Remote Sensing', 'name': 'was presented at'}]}
{'relationships': [{'startNode': 'Ogawa, Y .', 'endNode': 'IEEE Journal ofSelected Topics in Applied Earth Observations and Remote', 'name': 'authored'}, {'startNode': 'Qin, C.', 'endNode': 'arXiv', 'name': 'published_in'}, {'startNode': 'Radford, A.', 'endNode': 'International conference on machine learning', 'name': 'presented_at'}, {'startNode': 'Rawte, V .', 'endNode': 'arXiv', 'name': 'published_in'}, {'startNode': 'Riemenschneider, H.', 'endNode': '2012 IEEE Conference on Computer Vision and Pattern Recognition', 'name': 'presented_at'}, {'startNode': 'Roberts, J.', 'endNode': 'arXiv', 'name': 'published_in'}, {'startNode': 'Shalunts', 'endNode': 'International Symposium on Visual Computing', 'name': 'presented_at'}, {'startNode': 'Stanley, S.', 'endNode': 'Energy Efficiency', 'name': 'published_in'}, {'startNode': 'Su

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [03:45<00:00, 28.24s/it]


## Format Relationships to Visualize in Neo4j

### Collect new entities that appear in relationships with spaCy-detected entities

In [12]:
# Register relationship endpoints that spaCy did not detect, but only when the
# other endpoint of the relationship is already a known entity. New entries are
# appended to `global_ent` (label "Unknown") and to `entities` (lower-cased).
for rels in relationships:
    for rel in rels:
        # Clean both endpoints the same way as the spaCy entities: drop line
        # breaks, then keep the text before the first digit.
        cleaned_nodes = []
        for raw_name in (rel["startNode"], rel["endNode"]):
            name = raw_name.replace("\n", "")
            cleaned_nodes.append(re.split(r'\d', name)[0].rstrip())

        # Ignore relationships that do not touch any already-known entity.
        if not any(name.lower() in entities for name in cleaned_nodes):
            continue

        # At least one endpoint is known here, so at most one can be new.
        for name in cleaned_nodes:
            key = name.lower()
            if not name or key in entities:
                continue

            # Embed the lower-cased name, matching how the spaCy entities are
            # embedded, so similarities are computed on comparable inputs.
            embed = np.array(embeddings.embed_query(key))
            global_ent.append({
                'label': "Unknown",
                'name': name,
                'properties': {"embeddings": embed},
            })
            entities.append(key)

### Create an entity-resolution mapping based on embedding cosine similarity

In [15]:
# Build `entity_mapping`: for every pair of entities whose embeddings have a
# cosine similarity of at least `sim_threshold`, map the shorter name to the
# longer one (on equal length, the earlier entity maps to the later one).
#
# Pairs are visited in (i, j) order with j > i and later matches overwrite
# earlier ones, so a name ends up mapped to its LAST match above the threshold,
# not necessarily its most similar one.
# NOTE(review): the recorded output contains merges such as 'UK' -> 'USA' and
# 'London' -> 'dublin'; the threshold / selection rule may deserve a second look.
entity_mapping = {}
sim_threshold = 0.77

if len(global_ent) > 1:
    # Compute all pairwise similarities in one call instead of one call per pair.
    embedding_matrix = np.vstack(
        [ent["properties"]["embeddings"] for ent in global_ent]
    )
    similarity = cosine_similarity(embedding_matrix)

    for i in range(len(global_ent)):
        name_i = global_ent[i]["name"]
        for j in range(i + 1, len(global_ent)):
            if similarity[i, j] >= sim_threshold:
                name_j = global_ent[j]["name"]
                if len(name_i) > len(name_j):
                    entity_mapping[name_j] = name_i
                else:
                    entity_mapping[name_i] = name_j

entity_mapping

{'Wang': 'Wang, X.',
 'X. Wang': 'Wang, X.',
 'M.': 'Sun, M.',
 'UK': 'USA',
 'London': 'dublin',
 'Aksoezen': 'Aksoezen, M.',
 'Ogawa': 'Ogawa, Y .',
 'Li': 'Yang',
 'L': 'L. H.',
 'Zeppelzauer': 'Zeppelzauer, M.',
 'et': 'et al.',
 'Radford': 'Radford, A.',
 'Tam': 'Tam, Tso, T. Y',
 'Stanley': 'Stanley, S.',
 'FI': 'FI-London',
 'Riemenschneider': 'Riemenschneider, H.',
 'Qin': 'Qin, C.',
 'Roberts': 'Roberts, J.',
 'Kedron': 'Kedron, P.',
 'Zhang': 'Zhang, Y .',
 'dublin': 'Amsterdam',
 'Hudson': 'Hudson, P.',
 'Jones': '.Jones',
 'British Library': 'the British Library',
 'Rawte': 'Rawte, V .',
 'L.': 'L. H.',
 'F.': 'F. L.',
 'D.': 'Smith, D.',
 'Energy': 'Energy Efficiency',
 'D¨oller': 'D ¨oller',
 'W.': 'J. W.',
 'Russell, T.': 'Russell, C.',
 'T.': 'T. Y',
 'Newcastle': 'Newcastle Uni-',
 'England': 'Scotland',
 'H.': 'L. H.',
 'Yang': 'Chen',
 'J.': 'J. W.',
 'N.': 'J.-N.',
 'Y .': 'Zhang, Y .',
 'Taipei': 'dublin',
 'International Journal of Applied Earth Obser-vation and G

### Get and Correct Relationships Based on Entity Mapping

In [19]:
def _resolve_entity_name(name, mapping):
    """Follow `mapping` from `name` until a name with no further mapping is reached.

    The walk also stops if a name repeats, so a cyclic mapping cannot hang the cell.
    """
    visited = set()
    while name in mapping and name not in visited:
        visited.add(name)
        name = mapping[name]
    return name


# Build `global_rel`: relationships whose endpoints, after applying
# `entity_mapping`, are both known entities and are not the same entity.
# Each kept relationship is a COPY, so `relationships` keeps the raw extractor
# output and this cell can be re-run (e.g. after changing the similarity
# threshold) without starting from already-rewritten names.
# NOTE(review): endpoint names are looked up as returned by the extractor, without
# the newline/digit cleaning applied when entities were registered — confirm
# whether that is intended.
global_rel = []
for rels in relationships:
    for rel in rels:
        start_node = _resolve_entity_name(rel["startNode"], entity_mapping)
        end_node = _resolve_entity_name(rel["endNode"], entity_mapping)

        both_known = start_node.lower() in entities and end_node.lower() in entities
        if both_known and start_node.lower() != end_node.lower():
            resolved_rel = dict(rel)
            # Relationship names: spaces become underscores, "’" is removed.
            resolved_rel["name"] = rel["name"].replace(" ", "_").replace("’", "")
            resolved_rel["startNode"] = start_node
            resolved_rel["endNode"] = end_node
            global_rel.append(resolved_rel)

print(f"Number of entities: {len(global_ent)} , relationships: {len(global_rel)}")

Number of entities: 231 , relationships: 63


In [21]:
# Flatten every resolved relationship into a [start, relation, end] triple
# so the result can be inspected at a glance.
triples = [
    [rel["startNode"], rel["name"], rel["endNode"]]
    for rel in global_rel
]

triples

[['Zeng', 'affiliated_with', 'University College London'],
 ['J. M. Goo', 'affiliated_with', 'University College London'],
 ['Wang, X.', 'affiliated_with', 'University College London'],
 ['B. Chi', 'affiliated_with', 'University College London'],
 ['M. Wang', 'affiliated_with', 'University College London'],
 ['J. Boehm', 'affiliated_with', 'University College London'],
 ['Aksoezen, M.', 'related_to', 'energy consumption analysis'],
 ['Law', 'related_to', 'house prices in London'],
 ['Sun', 'studied_in_the_context_of', 'building age estimation'],
 ['Ogawa, Y .', 'contributed_to', 'disaster resilience analysis'],
 ['Chen', 'worked_at', 'Google'],
 ['Zeppelzauer, M.', 'studied_in_the_context_of', 'building age estimation'],
 ['Despotovic', 'studied_in_the_context_of', 'building age estimation'],
 ['Sun', 'studied_in_the_context_of', 'building age estimation'],
 ['Ogawa, Y .', 'studied_in_the_context_of', 'building age estimation'],
 ['Hinton', 'developed', 'CLIP'],
 ['Radford, A.', 'devel

Before running the next cell, start a local Neo4j instance in a terminal: `docker run -p7474:7474 -p7687:7687 -e NEO4J_AUTH=neo4j/secretgraph neo4j:latest`

In [23]:
# Neo4j connection settings. Environment variables take precedence so real
# credentials never need to be written into the notebook; the fallbacks match
# the local container started with the `docker run` command above.
URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
PASSWORD = os.environ.get("NEO4J_PASSWORD", "secretgraph")

# Graph payload handed to iText2KG's integrator: entity records as nodes and
# the resolved relationships as edges.
new_graph = {
    "nodes": global_ent,
    "relationships": global_rel,
}

GraphIntegrator(uri=URI, username=USERNAME, password=PASSWORD).visualize_graph(new_graph)