In [14]:
import requests
import pdf2image
import pytesseract
pdf = requests.get('https://arxiv.org/pdf/2110.03526.pdf')
doc = pdf2image.convert_from_bytes(pdf.content)
# Get the article text
article = []
for page_number, page_data in enumerate(doc):
    txt = pytesseract.image_to_string(page_data).encode("utf-8")
# Sixth page are only references
    if page_number < 6:
        article.append(txt.decode("utf-8"))
article_txt = " ".join(article)
print(article_txt)

Mohammadreza Ahmadi

Tissue Engineering and Regeneration of Skin
and Hair Follicle Growth From Stem Cells

INTRODUCTION

Many people with skin diseases such as chronic wounds, non-healing and diabetic
ulcers need reconstruction and regeneration of their skin. In addition, the medical industry also
needed a method of skin rejuvenation and reconstruction for cosmetic purposes, even for
healthy people. Reconstructive medicine used the method to deliver pluripotent stem cells to the
targeted tissue.

33 years after the introduction of bone marrow stem cells, fat-derived stem cells have
become an excellent source for cell therapy. In 1961, two Canadian scientists first introduced
stem cells. These cells, later found to be hematopoietic stem cells, have been used successfully
to treat leukemia and some severe autoimmune diseases called bone marrow transplants. In
1968, another stem cell was introduced into the bone marrow, which has been shown to be
effective due to its high ability to regul

In [15]:
import nltk
nltk.download('punkt')

def clean_text(text):
    """Remove section titles and figure descriptions from text"""
    clean = "\n".join([row for row in text.split("\n") if
                      (len(row.split(" "))) > 3 and not (row.startswith("(a)"))
                      and not row.startswith("Figure")])
    return clean

text = article_txt.split("INTRODUCTION")[1]
ctext = clean_text(text)
sentences = nltk.tokenize.sent_tokenize(ctext)
print(sentences)

['Many people with skin diseases such as chronic wounds, non-healing and diabetic\nulcers need reconstruction and regeneration of their skin.', 'In addition, the medical industry also\nneeded a method of skin rejuvenation and reconstruction for cosmetic purposes, even for\nhealthy people.', 'Reconstructive medicine used the method to deliver pluripotent stem cells to the\n33 years after the introduction of bone marrow stem cells, fat-derived stem cells have\nbecome an excellent source for cell therapy.', 'In 1961, two Canadian scientists first introduced\nstem cells.', 'These cells, later found to be hematopoietic stem cells, have been used successfully\nto treat leukemia and some severe autoimmune diseases called bone marrow transplants.', 'In\n1968, another stem cell was introduced into the bone marrow, which has been shown to be\neffective due to its high ability to regulate immunity in many diseases, including skin, bone, joint\ndiseases, heart, brain, nerves, and kidney.', 'Nevert

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ilan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [23]:
import hashlib
import requests

def query_plain(text, url="http://bern2.korea.ac.kr/plain"):
    """Biomedical entity linking API"""
    return requests.post(url, json={'text': str(text)}).json()

entity_list = []

# The last sentence is invalid
for s in sentences[:-1]:
    entity_list.append(query_plain(s))

parsed_entities = []
for entities in entity_list:
    e = []
    
    # If there are no entities in the text
    if not entities.get('annotations'):
        print("No entities found in text:", entities['text'])
        parsed_entities.append({'text': entities['text'], 'text_sha256': hashlib.sha256(entities['text'].encode('utf-8')).hexdigest()})
        continue
    
    for entity in entities['annotations']:
        other_ids = [id for id in entity['id'] if not id.startswith("BERN")]
        entity_type = entity['obj']
        entity_name = entities['text'][entity['span']['begin']:entity['span']['end']]
        
        try:
            entity_id = [id for id in entity['id'] if id.startswith("BERN")][0]
        except IndexError:
            entity_id = entity_name
        
        e.append({'entity_id': entity_id, 'other_ids': other_ids, 'entity_type': entity_type, 'entity': entity_name})
    
    parsed_entities.append({'entities': e, 'text': entities['text'], 'text_sha256': hashlib.sha256(entities['text'].encode('utf-8')).hexdigest()})
    print("Processed entities for text:", entities['text'])
    print("Parsed entities:", parsed_entities)

# Add more print statements as needed to debug and identify issues


Processed entities for text: Many people with skin diseases such as chronic wounds, non-healing and diabetic ulcers need reconstruction and regeneration of their skin.
Parsed entities: [{'entities': [{'entity_id': 'people', 'other_ids': ['NCBITaxon:9606'], 'entity_type': 'species', 'entity': 'people'}, {'entity_id': 'skin diseases', 'other_ids': ['mesh:D012871'], 'entity_type': 'disease', 'entity': 'skin diseases'}, {'entity_id': 'chronic wounds', 'other_ids': ['mesh:D002908'], 'entity_type': 'disease', 'entity': 'chronic wounds'}, {'entity_id': 'non-healing and diabetic ulcers', 'other_ids': ['mesh:D003668'], 'entity_type': 'disease', 'entity': 'non-healing and diabetic ulcers'}], 'text': 'Many people with skin diseases such as chronic wounds, non-healing and diabetic ulcers need reconstruction and regeneration of their skin.', 'text_sha256': '95be1713dbb23e86525959e49ea5a196b49f3bbc1ce8c6cbfa373f176c9d39ae'}]
Processed entities for text: In addition, the medical industry also needed 

In [28]:
from neo4j import GraphDatabase
import pandas as pd

host = 'bolt://18.234.128.63:7687'
user = 'neo4j'
password = 'purchaser-preservers-pat'
driver = GraphDatabase.driver(host, auth=(user, password))

def neo4j_query(query, params=None):
    with driver.session() as session:
        result = session.run(query, params)
        # Fetch all records from the result object and store them in a list
        result_list = [record.values() for record in result]
    # Create DataFrame from the list of records
    return pd.DataFrame(result_list, columns=result.keys())


In [29]:
# import the and the article into the grap 
author = article_txt.split("\n")[0]
title = " ".join(article_txt.split("\n")[2:4])
neo4j_query("""
MERGE (a:Author{name:$author})
MERGE (b:Article{title:$title})
MERGE (a)-[:WROTE]->(b)
""", {'title':title, 'author':author})

In [30]:
# import the sentences and mentioned entities
neo4j_query("""
MATCH (a:Article)
UNWIND $data as row
MERGE (s:Sentence{id:row.text_sha256})
SET s.text = row.text
MERGE (a)-[:HAS_SENTENCE]->(s)
WITH s, row.entities as entities
UNWIND entities as entity
MERGE (e:Entity{id:entity.entity_id})
ON CREATE SET e.other_ids = entity.other_ids,
 e.name = entity.entity,
 e.type = entity.entity_type
MERGE (s)-[m:MENTIONS]->(e)
ON CREATE SET m.count = 1
ON MATCH SET m.count = m.count + 1
""", {'data': parsed_entities})


In [31]:
# example application 1 : search engine
neo4j_query("""
MATCH (e:Entity)<-[:MENTIONS]-(s:Sentence)
WHERE e.name = "autoimmune diseases"
RETURN s.text as result
""")
# example application 2 : co-occurrence
neo4j_query("""
MATCH (e1:Entity)<-[:MENTIONS]-()-[:MENTIONS]->(e2:Entity)
WHERE id(e1) < id(e2)
RETURN e1.name as entity1, e2.name as entity2, count(*) as 
cooccurrence
ORDER BY cooccurrence
DESC LIMIT 3
""")
# example application 3 : author expertise
neo4j_query("""
MATCH (a:Author)-[:WROTE]->()-[:HAS_SENTENCE]->()-[:MENTIONS]-
>(e:Entity)
RETURN a.name as author, e.name as entity, count(*) as count
ORDER BY count DESC
LIMIT 5
""")

Unnamed: 0,author,entity,count
0,Mohammadreza Ahmadi,collagen,9
1,Mohammadreza Ahmadi,fibroblasts,8
2,Mohammadreza Ahmadi,stem cell,7
3,Mohammadreza Ahmadi,ADSCs,7
4,Mohammadreza Ahmadi,mesenchymal stem cells,7


In [35]:
from transformers import AutoTokenizer
from zero_shot_re import RelTaggerModel, RelationExtractor
model = RelTaggerModel.from_pretrained("fractalego/fewrel-zero-shot")
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
relations = ['associated', 'interacts']
extractor = RelationExtractor(model, tokenizer, relations)


Downloading: 100%|████████████████████████████████████████████████████████████████| 1.34G/1.34G [01:05<00:00, 20.6MB/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████| 28.0/28.0 [00:00<00:00, 14.0kB/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████| 232k/232k [00:00<00:00, 1.47MB/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████| 466k/466k [00:00<00:00, 1.69MB/s]


In [38]:
import itertools

# Candidate sentence where there is more than a single entity present
candidates = [s for s in parsed_entities if (s.get('entities')) and (len(s['entities']) > 1)]
predicted_rels = []

for c in candidates:
    combinations = itertools.combinations([{'name': x['entity'], 'id': x['entity_id']} for x in c['entities']], 2)

    for combination in list(combinations):
        try:
            # Define threshold for the most probable relation
            ranked_rels = extractor.rank(text=c['text'].replace(",", " "), head=combination[0]['name'], tail=combination[1]['name'])
            
            if ranked_rels[0][1] > 0.85:
                predicted_rels.append({'head': combination[0]['id'], 'tail': combination[1]['id'], 'type': ranked_rels[0][0], 'source': c['text_sha256']})
        except Exception as e:
            # Handle the specific exception, print an error message, or take appropriate action
            print(f"An error occurred: {e}")
            pass

An error occurred: The entity "bone marrow stem cells" is not in the text.
An error occurred: The entity "bone marrow stem cells" is not in the text.
An error occurred: The entity "targeted tissue" is not in the text.
An error occurred: The entity "targeted tissue" is not in the text.
An error occurred: The entity "fibroblasts" is not in the text.
An error occurred: The entity "SVF" is not in the text.
An error occurred: The entity "macrophage" is not in the text.
An error occurred: The entity "-monocyte" is not in the text.
An error occurred: The entity "SVF" is not in the text.
An error occurred: The entity "SVF" is not in the text.
An error occurred: The entity "SVF" is not in the text.
An error occurred: The entity "SVF" is not in the text.
An error occurred: The entity "SVF" is not in the text.
An error occurred: The entity "SVF" is not in the text.
An error occurred: The entity "macrophage" is not in the text.
An error occurred: The entity "-monocyte" is not in the text.
An error

In [39]:
neo4j_query("""
UNWIND $data as row
MATCH (source:Entity {id: row.head})
MATCH (target:Entity {id: row.tail})
MATCH (text:Sentence {id: row.source})
MERGE (source)-[:REL]->(r:Relation {type: row.type})-[:REL]->(target)
MERGE (text)-[:MENTIONS]->(r)
""", {'data': predicted_rels})

# examine the extracted relationships
neo4j_query("""
MATCH (s:Entity)-[:REL]->(r:Relation)-[:REL]->(t:Entity), (r)<-[:MENTIONS]-(st:Sentence)
RETURN s.name as source_entity, t.name as target_entity, r.type as type, st.text as source_text
""")


Unnamed: 0,source_entity,target_entity,type,source_text
0,skin diseases,chronic wounds,associated,Many people with skin diseases such as chronic...
1,hematopoietic stem cells,leukemia,associated,"These cells, later found to be hematopoietic s..."
2,hematopoietic stem cells,autoimmune diseases,associated,"These cells, later found to be hematopoietic s..."
3,leukemia,autoimmune diseases,associated,"These cells, later found to be hematopoietic s..."
4,Mesenchymal stem cells,targeted,interacts,Mesenchymal stem cells are the most interestin...
5,collagenase enzymes,fibroblasts,interacts,"On one hand, by injecting collagenase enzymes ..."
6,collagenase enzymes,fibroblasts,interacts,"Fibrous, collagen fibers and fibroblasts On on..."
7,engineered stem cell,inflammatory cells,associated,"On the other hand, In the engineered stem cell..."
8,engineered stem cell,fibroblasts,associated,"On the other hand, In the engineered stem cell..."
9,fibroblasts,collagen,interacts,"Fibrous, collagen fibers and fibroblasts On on..."
