In [None]:
%load_ext autoreload
%autoreload 2

import sys
import numpy as np
from tqdm import tqdm

import spacy
from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain.document_loaders import PyPDFLoader, TextLoader

from sklearn.metrics.pairwise import cosine_similarity

sys.path.append("..")
from itext2kg.documents_distiller import DocumentsDisiller, CV, Article
from itext2kg.irelations_extraction import iRelationsExtractor

# Setup Models

This notebook uses a spaCy transformer model for named-entity recognition (NER) and an Ollama-served LLM for relationship extraction.

In [None]:
# Ask spaCy to run on a GPU when it can; as the last expression in the cell,
# the call's return value is shown as the cell output.
spacy.prefer_gpu()

In [None]:
# Load the spaCy pipeline used for NER below (`nlp(page).ents`).
# NOTE(review): the "en_core_web_trf" package presumably has to be installed
# separately before this cell will run — confirm in the setup instructions.
nlp = spacy.load("en_core_web_trf")

In [None]:
# Chat model served through Ollama; it is handed to iRelationsExtractor later on.
llm = ChatOllama(
    model="gemma2:9b-instruct-q8_0",
    temperature=0,    # NOTE(review): presumably chosen for repeatable output — confirm
    keep_alive=-1,    # NOTE(review): presumably keeps the model loaded between calls — confirm
    max_retries=5,
    max_tokens=None,
)

# Embedding model: used to embed entity names and also passed to iRelationsExtractor.
embeddings = OllamaEmbeddings(model="nomic-embed-text")

## Extract Entities using spaCy

In [None]:
from langchain.document_loaders import PyPDFLoader, TextLoader

# Alternative plain-text inputs (uncomment these lines to use one instead of the PDF):
# loader = TextLoader("../datasets/cvs/Emily_Davis.txt")
# loader = TextLoader("../datasets/scientific_articles/bertology.txt")
# pages = loader.load()
# pages = [page.page_content.replace("{", "").replace("}", "") for page in pages]

# Load the article PDF. The path is a plain literal: it has no placeholders,
# so no f-string is needed.
loader = PyPDFLoader("../datasets/scientific_articles/bertology.pdf")

# `pages` is a list of strings: the text of each loaded document with curly
# braces stripped. It is built in one step so the name is never bound to the
# raw document objects.
# NOTE(review): braces are presumably removed so the text cannot be mistaken
# for prompt-template placeholders downstream — confirm.
pages = [
    loaded.page_content.replace("{", "").replace("}", "")
    for loaded in loader.load()
]

In [None]:
# Run NER over every page and collect:
#   global_ent - one node dict per unique entity name (label, name, embedding)
#   entities   - the unique cleaned entity names, in first-seen order
#   page_ents  - per page, every kept mention (duplicates included)
global_ent = []
entities = []
page_ents = []

# spaCy entity labels that are kept; all other labels are ignored.
entity_types = ["PERSON", "GPE", "ORG"]

for page in pages:
    doc = nlp(page)
    ents = []
    for ent in doc.ents:
        if ent.label_ not in entity_types:
            continue

        # Remove line breaks that fall inside a mention.
        # NOTE(review): the break is dropped rather than replaced by a space, so
        # words split across lines are joined together — confirm this is intended.
        text = ent.text.replace("\n", "")
        ents.append(text)

        if text in entities:
            continue

        # Embed each unique name once; the vector is stored on the node.
        embed = np.array(embeddings.embed_query(text))
        global_ent.append({
            'label': ent.label_,
            'name': text,
            'properties': {"embeddings": embed},
        })
        # Store the *cleaned* name, i.e. the same string that is tested above and
        # used as the node name. Storing the raw `ent.text` would let mentions
        # containing a newline slip past the duplicate check on every occurrence
        # and would make later `name in entities` lookups miss them.
        entities.append(text)

    page_ents.append(ents)

len(global_ent)

In [None]:
# Build a map from near-duplicate entity names to one preferred spelling.
# Every unordered pair of entities is compared once by the cosine similarity of
# their name embeddings. When the similarity reaches `sim_threshold`, the
# shorter name is mapped to the longer one (on equal length, the earlier entity
# is mapped to the later one).
# NOTE(review): mappings are not chained (a -> b and b -> c can both be
# present) and a later match overwrites an earlier one for the same key.
sim_threshold = 0.8
entity_mapping = {}

for i, ent_a in enumerate(global_ent):
    # The outer entity's vector does not change in the inner loop: shape it once.
    vec_a = ent_a["properties"]["embeddings"].reshape(1, -1)
    for ent_b in global_ent[i + 1:]:
        vec_b = ent_b["properties"]["embeddings"].reshape(1, -1)
        cosine_sim = cosine_similarity(vec_a, vec_b)[0][0]

        if cosine_sim >= sim_threshold:
            name_a, name_b = ent_a["name"], ent_b["name"]
            if len(name_a) > len(name_b):
                entity_mapping[name_b] = name_a
            else:
                entity_mapping[name_a] = name_b

In [None]:
# Inspect the merge map: each key is an entity name and its value is the name
# it was mapped to in the similarity pass.
entity_mapping

## Use iText2KG to create semantic blocks and extract relationships

In [None]:
# relationships[k] receives whatever extract_relations returns for pages[k].
# The list is filled incrementally, so results gathered so far remain
# available if the loop is interrupted.
relationships = []

irelations_extractor = iRelationsExtractor(
    llm_model=llm,
    embeddings_model=embeddings,
    sleep_time=1,
)

# Each page is processed together with the entity mentions found on that page.
for page, ents in tqdm(zip(pages, page_ents), total=len(pages)):
    page_rels = irelations_extractor.extract_relations(context=page, entities=ents)
    relationships.append(page_rels)

## Format Relationships to Visualize in Neo4j

In [None]:
# Flatten the per-page relationships into `global_rel`, keeping at most one
# relationship per (start, end) pair and only pairs whose endpoints are both
# known entities.
global_rel = []   # relationships kept for the graph
pairing = []      # [start, end] name pairs that have already been accepted

for page_rels in relationships:
    for rel in page_rels:
        # Resolve both endpoints through entity_mapping; unmapped names are used as-is.
        start_node = entity_mapping.get(rel["startNode"], rel["startNode"])
        end_node = entity_mapping.get(rel["endNode"], rel["endNode"])

        unseen_pair = [start_node, end_node] not in pairing
        if not (unseen_pair and start_node in entities and end_node in entities):
            continue

        # NOTE(review): the resolved names are only used for the checks above;
        # the stored rel keeps its original "startNode"/"endNode" values —
        # confirm that is intended.
        rel["name"] = rel["name"].replace(" ", "_")
        global_rel.append(rel)
        pairing.append([start_node, end_node])

len(global_rel)

In [None]:
# Preview the first ten entity nodes (each has a label, a name and an embedding).
global_ent[:10]

In [None]:
# Preview the first ten relationships kept after de-duplication.
global_rel[:10]

Before running the next cell, start a local Neo4j instance by running the following command in a terminal: `docker run -p7474:7474 -p7687:7687 -e NEO4J_AUTH=neo4j/secretgraph neo4j:latest`

In [None]:
import os

from itext2kg.graph_integration import GraphIntegrator

# Neo4j connection settings. Each value can be overridden with an environment
# variable so real credentials never have to be written into the notebook; the
# fallbacks are the values this notebook has always used for its local setup.
URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
PASSWORD = os.environ.get("NEO4J_PASSWORD", "secretgraph")

# Graph payload: the entity nodes and the de-duplicated relationships.
new_graph = {"nodes": global_ent, "relationships": global_rel}
GraphIntegrator(uri=URI, username=USERNAME, password=PASSWORD).visualize_graph(json_graph=new_graph)