In [72]:
%load_ext autoreload
%autoreload 2

import sys

import spacy
from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain.document_loaders import PyPDFLoader, TextLoader


sys.path.append("..")
from itext2kg.documents_distiller import DocumentsDisiller, CV, Article
from itext2kg.irelations_extraction import iRelationsExtractor

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Setup Models

Using Spacy transformer model for NER and Ollama LLM for relationship extraction

In [3]:
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_trf")

In [4]:
llm = ChatOllama(
    model="llama3",
    temperature=0,
    max_retries=5,
    max_tokens=None,
)

embeddings = OllamaEmbeddings(
    model="llama3",
)

## Extract Entities using Spacy

In [88]:
from langchain.document_loaders import PyPDFLoader, TextLoader

# loader = TextLoader("../datasets/cvs/Emily_Davis.txt")
loader = TextLoader("../datasets/scientific_articles/bertology.txt")
pages = loader.load()
pages = [page.page_content.replace("{", "").replace("}", "") for page in pages]
pages

loader = PyPDFLoader(f"../datasets/scientific_articles/bertology.pdf")
pages = loader.load()
pages = [page.page_content.replace("{", "").replace("}", "") for page in pages]

pages

['Published as a conference paper at ICLR 2021\nBERT OLOGY MEETS BIOLOGY : INTERPRETING\nATTENTION IN PROTEIN LANGUAGE MODELS\nJesse Vig1Ali Madani1Lav R. Varshney1,2Caiming Xiong1\nRichard Socher1Nazneen Fatema Rajani1\n1Salesforce Research,2University of Illinois at Urbana-Champaign\njvig,amadani,cxiong,rsocher,nazneen.rajani@salesforce.com\nvarshney@illinois.edu\nABSTRACT\nTransformer architectures have proven to learn useful representations for pro-\ntein classiﬁcation and generation tasks. However, these representations present\nchallenges in interpretability. In this work, we demonstrate a set of methods for\nanalyzing protein Transformer models through the lens of attention. We show that\nattention: (1) captures the folding structure of proteins, connecting amino acids that\nare far apart in the underlying sequence, but spatially close in the three-dimensional\nstructure, (2) targets binding sites, a key functional component of proteins, and\n(3) focuses on progressively more co

In [93]:
global_ent = []
entities = []
for page in pages:
    doc = nlp(page)
    # page_entities = []
    entity_types = ["PERSON", "GPE", "LOC", "ORG"]
    for ent in doc.ents:
        if ent.label_ in entity_types and ent.text not in entities:
            # page_entities.append(ent.text)
            entities.append(ent.text)
            
            ent_json = {
                'label': ent.label_,
                'name': ent.text
            }
            global_ent.append(ent_json)

    # entities.append(page_entities)
    
global_ent

  with torch.cuda.amp.autocast(self._mixed_precision):


[{'label': 'ORG', 'name': 'ICLR 2021\n'},
 {'label': 'PERSON', 'name': 'Jesse Vig1Ali'},
 {'label': 'PERSON', 'name': 'El'},
 {'label': 'PERSON', 'name': 'Gebali'},
 {'label': 'PERSON', 'name': 'Rollins'},
 {'label': 'PERSON', 'name': 'Vaswani'},
 {'label': 'ORG', 'name': 'et al'},
 {'label': 'PERSON', 'name': 'Niven'},
 {'label': 'PERSON', 'name': 'Tan'},
 {'label': 'PERSON', 'name': 'Kurita'},
 {'label': 'PERSON', 'name': 'Rogers'},
 {'label': 'PERSON', 'name': 'Devlin'},
 {'label': 'PERSON', 'name': 'Rose'},
 {'label': 'PERSON', 'name': 'Nguyen'},
 {'label': 'ORG', 'name': 'Henikoff & Henikoff'},
 {'label': 'PERSON', 'name': 'Rao'},
 {'label': 'PERSON', 'name': 'Brik'},
 {'label': 'PERSON', 'name': 'El-'},
 {'label': 'ORG', 'name': 'ProtTrans'},
 {'label': 'PERSON', 'name': 'Elnaggar'},
 {'label': 'PERSON', 'name': 'Lan'},
 {'label': 'PERSON', 'name': 'Yang'},
 {'label': 'PERSON', 'name': 'Steinegger'},
 {'label': 'PERSON', 'name': 'Suzek'},
 {'label': 'PERSON', 'name': 'Veldhoen'},

## Use iText2KG for creating semantic blocks and extract relationships

In [90]:
from itext2kg.documents_distiller import DocumentsDisiller, CV

document_distiller = DocumentsDisiller(llm_model = llm)

In [91]:
# IE_query = '''
# # DIRECTIVES : 
# - Act like an experienced information extractor. 
# - You have a chunk of a CV.
# - If you do not find the right information, keep its place empty.
# '''

IE_query = '''
# DIRECTIVES : 
- Act like an experienced information extractor. 
- You have a chunk of a scientific paper.
- If you do not find the right information, keep its place empty.
'''
# we have replaced the curly braces with square brackets to avoid the error in the query
distilled_cv = document_distiller.distill(documents=pages, IE_query=IE_query, output_data_structure=Article)

[{'title': 'BERT OLOGY MEETS BIOLOGY : INTERPRETING ATTENTION IN PROTEIN LANGUAGE MODELS', 'authors': [{'name': 'Jesse Vig', 'affiliation': 'Salesforce Research'}, {'name': 'Ali Madani', 'affiliation': 'Salesforce Research'}, {'name': 'Lav R. Varshney', 'affiliation': 'University of Illinois at Urbana-Champaign'}, {'name': 'Caiming Xiong', 'affiliation': 'Salesforce Research'}, {'name': 'Richard Socher', 'affiliation': 'Salesforce Research'}, {'name': 'Nazneen Fatema Rajani', 'affiliation': 'Salesforce Research'}], 'abstract': 'Transformer architectures have proven to learn useful representations for protein classification and generation tasks. However, these representations present challenges in interpretability.', 'key_findings': 'Attention captures the folding structure of proteins, connecting amino acids that are far apart in the underlying sequence but spatially close in the three-dimensional structure. Attention targets binding sites, a key functional component of proteins. Focus

In [92]:
semantic_blocks = [f"{key} - {value}".replace("{", "[").replace("}", "]") for key, value in distilled_cv.items() if value !=[] and value != ""  and value != None]
semantic_blocks

["title - BERT OLOGY MEETS BIOLOGY : INTERPRETING ATTENTION IN PROTEIN LANGUAGE MODELS Published as a conference paper at ICLR 2021 Methodology What Does Attention Understand About Proteins? Analysis of Pretrained Language Models Why does attention target binding sites? A Study on Protein Language Models Consistent Correlations Conclusions and Future Work What you can cram into a single $&!#* vector: Probing sentence embeddings for linguistic properties. Do attention heads in BERT track syntactic dependencies? Interrogating the explanatory power of attention in neural machine translation NGL Viewer: a web application for molecular visualization Attention is not not explanation. A MODEL OVERVIEW Additional Experimental Details Contact Property Analysis Figure 9: Percentage of each head’s attention that is focused on Strand secondary structure. Figure 10: Percentage of each head’s attention that is focused on Turn/Bend secondary structure. C.2 Contact Maps: Statistical Significance Tests

In [94]:
relationships = []
# for block_entities, semantic_block in zip(entities, semantic_blocks):
#     if len(block_entities) == 0:
#         continue
    
#     irelations_extractor = iRelationsExtractor(llm_model=llm, 
#                                                embeddings_model=embeddings,
#                                                sleep_time=5)

for semantic_block in semantic_blocks:
    irelations_extractor = iRelationsExtractor(llm_model=llm, 
                                               embeddings_model=embeddings,
                                               sleep_time=5)

    rels = irelations_extractor.extract_relations(context=semantic_block, entities=entities)
    relationships.append(rels)

relationships

{'relationships': [{'startNode': 'BERT OLOGY MEETS BIOLOGY', 'endNode': 'H. Larochelle', 'name': 'Collaboration'}, {'startNode': 'A. Beygelzimer', 'endNode': "F. d'Alché-Buc", 'name': 'Co-authorship'}, {'startNode': 'E. Fox', 'endNode': 'R. Garnett', 'name': 'Collaboration'}, {'startNode': 'Adam J Riesselman', 'endNode': 'Jung-Eun Shin', 'name': 'Co-authorship'}, {'startNode': 'Aaron W Kollasch', 'endNode': 'Conor McMahon', 'name': 'Collaboration'}, {'startNode': 'Elana Simon', 'endNode': 'Chris', 'name': 'Co-authorship'}, {'startNode': 'Sander', 'endNode': 'Aashish Manglik', 'name': 'Collaboration'}, {'startNode': 'Andrew C Kruse', 'endNode': 'Debora S Marks', 'name': 'Co-authorship'}, {'startNode': 'bioRxiv', 'endNode': 'Alexander Rives', 'name': 'Publication'}, {'startNode': 'Siddharth Goyal', 'endNode': 'Joshua Meier', 'name': 'Collaboration'}, {'startNode': 'Demi Guo', 'endNode': 'Myle Ott', 'name': 'Co-authorship'}, {'startNode': 'C Lawrence Zitnick', 'endNode': 'Jerry', 'name': 

[[{'startNode': 'BERT OLOGY MEETS BIOLOGY',
   'endNode': 'H. Larochelle',
   'name': 'collaboration',
   'properties': {'embeddings': array([-0.00036769, -0.01258983,  0.00358374, ..., -0.00757058,
            0.01303176, -0.01043084])}},
  {'startNode': 'A. Beygelzimer',
   'endNode': "F. d'Alché-Buc",
   'name': 'co authorship',
   'properties': {'embeddings': array([-2.0373680e-03, -4.0053505e-02,  3.3196718e-03, ...,
           -1.6912600e-03,  6.9178840e-03,  8.1485580e-05])}},
  {'startNode': 'E. Fox',
   'endNode': 'R. Garnett',
   'name': 'collaboration',
   'properties': {'embeddings': array([-0.00036769, -0.01258983,  0.00358374, ..., -0.00757058,
            0.01303176, -0.01043084])}},
  {'startNode': 'Adam J Riesselman',
   'endNode': 'Jung-Eun Shin',
   'name': 'co authorship',
   'properties': {'embeddings': array([-2.0373680e-03, -4.0053505e-02,  3.3196718e-03, ...,
           -1.6912600e-03,  6.9178840e-03,  8.1485580e-05])}},
  {'startNode': 'Aaron W Kollasch',
   'e

In [95]:
# prev_rels = relationships
count = 0
for rels in relationships:
    count += len(rels)

count

191

## Format Relationships to Visualize in Neo4j

In [96]:
global_rel = []
for rels in relationships:
    for rel in rels:
        rel["name"] = rel["name"].replace(" ", "_")
        global_rel.append(rel)

global_rel

[{'startNode': 'BERT OLOGY MEETS BIOLOGY',
  'endNode': 'H. Larochelle',
  'name': 'collaboration',
  'properties': {'embeddings': array([-0.00036769, -0.01258983,  0.00358374, ..., -0.00757058,
           0.01303176, -0.01043084])}},
 {'startNode': 'A. Beygelzimer',
  'endNode': "F. d'Alché-Buc",
  'name': 'co_authorship',
  'properties': {'embeddings': array([-2.0373680e-03, -4.0053505e-02,  3.3196718e-03, ...,
          -1.6912600e-03,  6.9178840e-03,  8.1485580e-05])}},
 {'startNode': 'E. Fox',
  'endNode': 'R. Garnett',
  'name': 'collaboration',
  'properties': {'embeddings': array([-0.00036769, -0.01258983,  0.00358374, ..., -0.00757058,
           0.01303176, -0.01043084])}},
 {'startNode': 'Adam J Riesselman',
  'endNode': 'Jung-Eun Shin',
  'name': 'co_authorship',
  'properties': {'embeddings': array([-2.0373680e-03, -4.0053505e-02,  3.3196718e-03, ...,
          -1.6912600e-03,  6.9178840e-03,  8.1485580e-05])}},
 {'startNode': 'Aaron W Kollasch',
  'endNode': 'Conor McMaho

In [97]:
len(global_ent)

431

Run: docker run -p7474:7474 -p7687:7687 -e NEO4J_AUTH=neo4j/secretgraph neo4j:latest in terminal

In [98]:
from itext2kg.graph_integration import GraphIntegrator

URI = "bolt://localhost:7687"
USERNAME = "neo4j"
PASSWORD = "secretgraph"

new_graph = {}
new_graph["nodes"] = global_ent
new_graph["relationships"] = global_rel
GraphIntegrator(uri=URI, username=USERNAME, password=PASSWORD).visualize_graph(json_graph=new_graph)