In [1]:
%load_ext autoreload
%autoreload 2

import sys
import numpy as np
from tqdm import tqdm

import spacy
from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain.document_loaders import PyPDFLoader, TextLoader

from sklearn.metrics.pairwise import cosine_similarity

sys.path.append("..")
from itext2kg.documents_distiller import DocumentsDisiller, CV, Article
from itext2kg.irelations_extraction import iRelationsExtractor

# Setup Models

Using Spacy transformer model for NER and Ollama LLM for relationship extraction

In [2]:
spacy.prefer_gpu()

True

In [2]:
nlp = spacy.load("en_core_web_trf")

  model.load_state_dict(torch.load(filelike, map_location=device))


In [3]:
llm = ChatOllama(
    model="gemma2:9b-instruct-q8_0",
    temperature=0,
    max_retries=5,
    max_tokens=None,
    keep_alive=-1
)

embeddings = OllamaEmbeddings(
    model="nomic-embed-text"
)

## Extract Entities using Spacy

In [28]:
from langchain.document_loaders import PyPDFLoader, TextLoader

# loader = TextLoader("../datasets/cvs/Emily_Davis.txt")
# loader = TextLoader("../datasets/scientific_articles/bertology.txt")
# pages = loader.load()
# pages = [page.page_content.replace("{", "").replace("}", "") for page in pages]

loader = PyPDFLoader(f"../datasets/scientific_articles/bertology.pdf")
pages = loader.load()
pages = [page.page_content.replace("{", "").replace("}", "") for page in pages]

In [12]:
global_ent = []
entities = []
page_ents = []

for page in pages:
    doc = nlp(page)
    # page_entities = []
    entity_types = ["PERSON", "GPE", "ORG"]
    ents = []
    for ent in doc.ents:
        if ent.label_ in entity_types:
            text = ent.text.replace("\n", "")
            ents.append(text)
            if text not in entities:
                embed = np.array(embeddings.embed_query(text))
                ent_json = {
                'label': ent.label_,
                'name': text,
                'properties': {"embeddings": embed}
                }
                
                global_ent.append(ent_json)
                entities.append(ent.text)

    page_ents.append(ents)

len(global_ent)

  with torch.cuda.amp.autocast(self._mixed_precision):


433

In [29]:
entity_mapping = {}
sim_threshold = 0.8
for i in range(len(global_ent)):
    best_sim = -1
    sim_ent = ""
    for j in range(i+1, len(global_ent)):
        embed1 = global_ent[i]["properties"]["embeddings"]
        embed2 = global_ent[j]["properties"]["embeddings"]

        cosine_sim = cosine_similarity(embed1.reshape(1, -1), embed2.reshape(1, -1))[0][0]

        if cosine_sim >= sim_threshold:
            if len(global_ent[i]["name"]) > len(global_ent[j]["name"]):
                entity_mapping[global_ent[j]["name"]] = global_ent[i]["name"]
            else:
                entity_mapping[global_ent[i]["name"]] = global_ent[j]["name"]

In [30]:
entity_mapping

{'ICLR': 'ICLR 2021',
 'ICLR 2021': 'ICLR 2021',
 'Jesse Vig': 'Jesse Vig1Ali',
 'El': 'El-',
 'Gebali': 'Sara El-Gebali',
 'Vaswani': 'Ashish Vaswani',
 'et al': 'et al.',
 'Niven': 'Timothy Niven',
 'Kurita': 'Keita Kurita',
 'Devlin': 'Jacob Devlin',
 'Nguyen': 'Hai Nguyen',
 'S Henikoff': 'J G Henikoff',
 'J G Henikoff': 'Henikoff & Henikoff',
 'Elnaggar': 'Ahmed Elnaggar',
 'Yang': 'Yiming Yang',
 'Steinegger': 'Martin Steinegger',
 'Suzek': 'Baris E. Suzek',
 'Veldhoen': 'Sara Veldhoen',
 'Conneau': 'Alexis Conneau',
 'Reif': 'Emily Reif',
 'AlQuraishi': 'Mohammed AlQuraishi.',
 'Fox': 'E. Fox',
 'Berman': 'Helen M Berman',
 'Moult': 'John Moult',
 'the Protein Data Bank': 'the Protein Data Bank’s',
 'Grimsley': 'Christopher Grimsley',
 'Jain': 'Jain & Wallace',
 'Kinjo': 'Akira Kinjo',
 'Nakamura': 'Haruki Nakamura',
 'Raganato': 'Alessandro Raganato',
 'Jawahar': 'Ganesh Jawahar',
 'Mikolov': 'Tomas Mikolov',
 'Bepler': 'Tristan Bepler',
 'Riesselman': 'Adam J Riesselman',
 'Ma

## Use iText2KG for creating semantic blocks and extract relationships

In [76]:
relationships = []
irelations_extractor = iRelationsExtractor(llm_model=llm, 
                                               embeddings_model=embeddings,
                                               sleep_time=1)

for page, ents in tqdm(zip(pages, page_ents), total=len(pages)):
    rels = irelations_extractor.extract_relations(context=page, entities=ents)
    relationships.append(rels)

  0%|                                                                                                                | 0/24 [00:35<?, ?it/s]


KeyboardInterrupt: 

## Format Relationships to Visualize in Neo4j

In [71]:
global_rel = []
pairing = []
for rels in relationships:
    for rel in rels:
        start_node = entity_mapping[rel["startNode"]] if rel["startNode"] in entity_mapping else rel["startNode"]
        end_node = entity_mapping[rel["endNode"]] if rel["endNode"] in entity_mapping else rel["endNode"]
        
        if [start_node, end_node] not in pairing and start_node in entities and end_node in entities:
            rel["name"] = rel["name"].replace(" ", "_")
            global_rel.append(rel)
            pairing.append([start_node, end_node])

len(global_rel)

109

In [36]:
global_ent[:10]

[{'label': 'PERSON',
  'name': 'Zeng1',
  'properties': {'embeddings': array([ 3.34436450e-02,  9.43139200e-03, -1.66122530e-01, -4.25418500e-02,
           3.97841820e-02,  6.44107000e-02, -6.34345040e-03, -2.31915710e-02,
          -2.15638300e-02, -3.32576700e-02,  8.94801900e-02,  1.00202100e-02,
           8.86074200e-02,  8.17845500e-03,  3.07764340e-02, -2.84064600e-02,
           4.01265140e-02, -1.54194490e-02,  8.35615200e-03, -4.04722500e-02,
          -4.12085580e-02,  8.03790400e-03,  2.74109030e-03,  1.35523520e-02,
           1.03566445e-01,  5.88628240e-02,  3.48872320e-02, -5.84219800e-02,
           1.93469520e-02, -1.78288200e-02, -1.38082560e-02, -3.51555530e-02,
          -3.50225200e-02,  7.53667800e-03, -8.89150050e-03, -3.50485400e-02,
           6.30046800e-02,  2.35632600e-02, -4.92672100e-02,  1.89334080e-02,
           3.71154170e-02, -3.63429900e-02, -5.50409360e-03, -2.69919930e-02,
           3.50615980e-02,  2.78329600e-02,  3.57281420e-03, -1.46300650e-

In [37]:
global_rel[:10]

[{'startNode': 'Zeng1',
  'endNode': 'University College London',
  'name': 'is_affiliated_with',
  'properties': {'embeddings': array([ 0.00366466,  0.00390176, -0.15098307, -0.00661818,  0.03186678,
          -0.02526036, -0.01457634, -0.00625404,  0.03736493, -0.04053658,
          -0.02141453,  0.0404009 ,  0.06899558,  0.01968662, -0.03122638,
          -0.02945441, -0.00196858,  0.00790046, -0.02139869,  0.03799462,
          -0.02411652, -0.01695912,  0.00658534, -0.00152148,  0.13238417,
          -0.02705171,  0.04555077, -0.05815474, -0.01241038,  0.02802988,
           0.04506523,  0.02564681,  0.06493525, -0.02238502, -0.00475682,
          -0.02209762,  0.02272889,  0.03000464,  0.02618805,  0.03116077,
           0.01261721,  0.06732679, -0.0465389 , -0.05448991, -0.01179043,
          -0.0040398 ,  0.00866393,  0.00432013,  0.063346  ,  0.05586501,
          -0.0176185 ,  0.04233289, -0.01585811,  0.00817671,  0.05566334,
           0.01683855,  0.06308847,  0.05360553, 

Run: docker run -p7474:7474 -p7687:7687 -e NEO4J_AUTH=neo4j/secretgraph neo4j:latest in terminal

In [72]:
from itext2kg.graph_integration import GraphIntegrator

URI = "bolt://localhost:7687"
USERNAME = "neo4j"
PASSWORD = "secretgraph"

new_graph = {}
new_graph["nodes"] = global_ent
new_graph["relationships"] = global_rel
GraphIntegrator(uri=URI, username=USERNAME, password=PASSWORD).visualize_graph(json_graph=new_graph)