In [None]:
%load_ext autoreload
%autoreload 2

import sys
import re
import numpy as np
from tqdm import tqdm

import torch
import spacy
from transformers import pipeline
from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint, HuggingFacePipeline
from langchain.document_loaders import PyPDFLoader, TextLoader

from sklearn.metrics.pairwise import cosine_similarity

sys.path.append("..")
from itext2kg.documents_distiller import DocumentsDisiller, CV, Article
from itext2kg.irelations_extraction import iRelationsExtractor

# Setup Models

This notebook uses a spaCy transformer model for named-entity recognition (NER) and an Ollama-served LLM for relationship extraction.

In [None]:
# Ask spaCy to use a GPU if one is usable; this runs before the pipeline is
# loaded in the next cell. (Presumably falls back to CPU otherwise -- see the
# spaCy docs for the exact contract of the returned value.)
spacy.prefer_gpu()

In [None]:
# Load the "en_core_web_trf" spaCy pipeline; its `doc.ents` are the source of
# the PERSON / GPE / ORG entities extracted further down.
nlp = spacy.load("en_core_web_trf")

In [None]:
# Chat model handed to iRelationsExtractor for relationship extraction.
llm = ChatOllama(
    model="gemma2:9b-instruct-q8_0",
    temperature=0,  # presumably chosen to make the extraction output as repeatable as possible
    max_retries=5,
    max_tokens=None,
    keep_alive=-1  # NOTE(review): looks like "keep the model loaded indefinitely" in Ollama -- confirm
)

# Embedding model used for the entity vectors below and also passed to iRelationsExtractor.
embeddings = OllamaEmbeddings(
    model="nomic-embed-text"
)

## Extract Entities Using spaCy

In [None]:
# Source document for the knowledge graph. PyPDFLoader and TextLoader are
# already imported in the first cell. To run on a plain-text input instead
# (e.g. "../datasets/cvs/Emily_Davis.txt" or
# "../datasets/scientific_articles/bertology.txt"), build the loader with
# TextLoader(<path>) -- the rest of the cell stays the same.
PDF_PATH = "../datasets/scientific_articles/bertology.pdf"

loader = PyPDFLoader(PDF_PATH)
documents = loader.load()

# Keep only the text of each loaded document and strip curly braces --
# presumably so the text cannot be mistaken for prompt-template placeholders
# when it is later passed to the LLM as context.
pages = [document.page_content.replace("{", "").replace("}", "") for document in documents]

In [None]:
# spaCy NER labels that are kept as graph nodes (hoisted: constant across pages).
entity_types = ("PERSON", "GPE", "ORG")

global_ent = []  # node dicts: {'label', 'name', 'properties': {'embeddings'}}
entities = []    # lower-cased names already in global_ent (a list: later cells append to it)
page_ents = []   # one list per page with the cleaned entity names, in order of appearance

for page in pages:
    doc = nlp(page)
    ents = []
    for ent in doc.ents:
        if ent.label_ not in entity_types:
            continue

        # Remove line breaks inside the mention, then keep only the part
        # before the first digit (drops trailing numbers glued to the name).
        text = ent.text.replace("\n", "")
        text = re.split(r'\d', text)[0].rstrip()
        if not text:
            # A mention that starts with a digit leaves nothing behind; an
            # empty name must not become a node or be sent to the LLM.
            continue

        ents.append(text)

        # De-duplicate case-insensitively; the embedding is computed on the
        # same lower-cased key that is used for the membership test.
        key = text.lower()
        if key not in entities:
            embed = np.array(embeddings.embed_query(key))
            global_ent.append({
                'label': ent.label_,
                'name': text,
                'properties': {"embeddings": embed},
            })
            entities.append(key)

    page_ents.append(ents)

len(global_ent)

## Use iText2KG for extracting relationships

In [None]:
# One entry per page: whatever iRelationsExtractor returns for that page's
# text and the entity names spaCy found on it.
relationships = []
irelations_extractor = iRelationsExtractor(
    llm_model=llm,
    embeddings_model=embeddings,
    sleep_time=1,
)

pages_with_entities = zip(pages, page_ents)
for page_text, page_entity_names in tqdm(pages_with_entities, total=len(pages)):
    page_relations = irelations_extractor.extract_relations(
        context=page_text,
        entities=page_entity_names,
    )
    relationships.append(page_relations)

## Format Relationships to Visualize in Neo4j

### Collect new entities that are related to the spaCy entities

In [None]:
def _clean_node_name(raw_name):
    """Apply the same clean-up used for spaCy mentions to a relationship node name.

    Line breaks are removed and everything from the first digit onwards is
    dropped; trailing whitespace is stripped.
    """
    return re.split(r'\d', raw_name.replace("\n", ""))[0].rstrip()


# Add every node that the LLM connected to an already-known entity but that
# spaCy did not report itself. Such nodes get the label "Unknown".
for rels in relationships:
    for rel in rels:
        node_names = [
            _clean_node_name(rel["startNode"]),
            _clean_node_name(rel["endNode"]),
        ]

        # Only relationships anchored on at least one known (non-empty) entity
        # may introduce a new node. Evaluated once, before anything is added.
        anchored = any(name and name.lower() in entities for name in node_names)
        if not anchored:
            continue

        for name in node_names:
            key = name.lower()
            if not name or key in entities:
                # Empty after clean-up, or already a node.
                continue

            # Embed the lower-cased key, consistent with how the spaCy
            # entities were embedded, so similarities are comparable.
            embed = np.array(embeddings.embed_query(key))
            entities.append(key)
            global_ent.append({
                'label': "Unknown",
                'name': name,
                'properties': {"embeddings": embed},
            })

### Create an entity-resolution mapping based on embedding cosine similarity

In [None]:
# Map each entity name to the name it should be replaced by. Two entities are
# treated as the same thing when the cosine similarity of their embeddings is
# at least `sim_threshold`; the shorter name is mapped to the longer one (on
# equal length the earlier entity is mapped to the later one).
entity_mapping = {}
sim_threshold = 0.75

if global_ent:  # np.vstack / cosine_similarity cannot handle an empty list
    # Stack the 1-D embeddings into an (n_entities, dim) matrix and compute
    # all pairwise similarities in one call instead of one call per pair.
    embedding_matrix = np.vstack(
        [node["properties"]["embeddings"] for node in global_ent]
    )
    similarity = cosine_similarity(embedding_matrix)

    for i in range(len(global_ent)):
        name_i = global_ent[i]["name"]
        for j in range(i + 1, len(global_ent)):
            if similarity[i, j] >= sim_threshold:
                name_j = global_ent[j]["name"]
                if len(name_i) > len(name_j):
                    entity_mapping[name_j] = name_i
                else:
                    entity_mapping[name_i] = name_j

In [None]:
# Display the name -> replacement-name mapping built in the previous cell.
entity_mapping

### Get and Correct Relationships Based on Entity Mapping

In [None]:
def _normalize_node(raw_name):
    """Remove line breaks, cut at the first digit and strip trailing whitespace.

    This is the same clean-up that was applied to entity names when the nodes
    were created, so the result is comparable with `entities` / `entity_mapping`.
    """
    return re.split(r'\d', raw_name.replace("\n", ""))[0].rstrip()


def _resolve_alias(name, mapping):
    """Follow `mapping` from `name` until a name with no further replacement is reached.

    A visited set stops the walk if the mapping ever contains a cycle, so the
    loop always terminates.
    """
    visited = set()
    while name in mapping and name not in visited:
        visited.add(name)
        name = mapping[name]
    return name


# Keep only relationships whose two end points are known entities, rewritten
# to use the resolved entity names.
global_rel = []
for rels in relationships:
    for rel in rels:
        # Normalize first: the keys of `entity_mapping` are normalized names,
        # so a raw node name containing digits or line breaks would never match.
        # Each end point is resolved through the mapping on its own name (the
        # end node was previously looked up with the start node's name).
        start_node = _resolve_alias(_normalize_node(rel["startNode"]), entity_mapping)
        end_node = _resolve_alias(_normalize_node(rel["endNode"]), entity_mapping)

        if start_node.lower() in entities and end_node.lower() in entities:
            # Relationship names are written with underscores instead of spaces.
            rel["name"] = rel["name"].replace(" ", "_")
            rel["startNode"] = start_node
            rel["endNode"] = end_node
            global_rel.append(rel)

print(f"Number of entities: {len(global_ent)} , relationships: {len(global_rel)}")

In [None]:
# Flatten every kept relationship into a [start, relation, end] list for a
# quick visual check of the graph content.
triples = [
    [edge["startNode"], edge["name"], edge["endNode"]]
    for edge in global_rel
]

triples

Before running the next cell, start a Neo4j instance by running the following command in a terminal: `docker run -p7474:7474 -p7687:7687 -e NEO4J_AUTH=neo4j/secretgraph neo4j:latest`

In [None]:
import os

from itext2kg.graph_integration import GraphIntegrator

# Neo4j connection settings. The defaults match the local instance described
# in the note before this cell; set NEO4J_URI / NEO4J_USERNAME / NEO4J_PASSWORD
# in the environment to target another instance without putting its
# credentials in the notebook.
URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
PASSWORD = os.environ.get("NEO4J_PASSWORD", "secretgraph")

# Graph payload: the entity nodes and the corrected relationships built above.
new_graph = {
    "nodes": global_ent,
    "relationships": global_rel,
}
GraphIntegrator(uri=URI, username=USERNAME, password=PASSWORD).visualize_graph(json_graph=new_graph)