In [1]:
import spacy
# import crosslingual_coreference

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import sys
# Show which Python interpreter this kernel is running on.
print(sys.executable)


c:\Users\layas\anaconda3\envs\Python310\python.exe


In [3]:
# Add rebel component https://github.com/Babelscape/rebel/blob/main/spacy_component.py
import requests
import re
import hashlib
from spacy import Language
from typing import List

from spacy.tokens import Doc, Span

from transformers import pipeline

def call_wiki_api(item):
    """Search Wikidata for ``item`` and return the id of the first hit.

    Args:
        item: Free-text entity label to search for.

    Returns:
        The ``id`` field of the first entry in the response's ``search`` list,
        or the string ``'id-less'`` when the lookup fails for any reason
        (network error, HTTP error status, malformed response, no results).
    """
    try:
        # Let requests build the query string so that characters such as
        # '&', '#', '+' or spaces inside the label are URL-encoded instead of
        # corrupting the request.
        response = requests.get(
            "https://www.wikidata.org/w/api.php",
            params={
                "action": "wbsearchentities",
                "search": item,
                "language": "en",
                "format": "json",
            },
            timeout=10,  # seconds; never hang the pipeline on a stalled connection
        )
        response.raise_for_status()
        data = response.json()
        # Return the first id (Could upgrade this in the future)
        return data['search'][0]['id']
    except Exception:
        # Best-effort lookup: any failure maps to the placeholder id.
        # ``Exception`` (rather than a bare ``except``) keeps
        # KeyboardInterrupt / SystemExit working.
        return 'id-less'

def extract_triplets(text):
    """
    Parse the linearised model output into a list of triplet dicts.

    The text is split on whitespace after the ``<s>``, ``<pad>`` and ``</s>``
    markers are removed. ``<triplet>`` starts a new head entity, ``<subj>``
    starts a tail entity and ``<obj>`` starts a relation label; every other
    token is appended to whichever field is currently open.

    Returns a list of ``{'head': ..., 'type': ..., 'tail': ...}`` dicts.
    """
    cleaned = text.strip()
    for marker in ("<s>", "<pad>", "</s>"):
        cleaned = cleaned.replace(marker, "")

    # Words collected so far for each field of the triplet being built.
    fields = {'head': [], 'type': [], 'tail': []}
    target = None  # key of the field plain tokens currently go to
    found = []

    def snapshot():
        # Freeze the current field contents into an output record.
        return {key: ' '.join(words) for key, words in fields.items()}

    for token in cleaned.split():
        if token == "<triplet>":
            target = 'head'
            if fields['type']:
                found.append(snapshot())
                fields['type'] = []
            fields['head'] = []
        elif token == "<subj>":
            target = 'tail'
            # A pending relation is flushed but deliberately not cleared here;
            # it is only reset by the next <obj> or <triplet>.
            if fields['type']:
                found.append(snapshot())
            fields['tail'] = []
        elif token == "<obj>":
            target = 'type'
            fields['type'] = []
        elif target is not None:
            fields[target].append(token)

    # Flush the last triplet only if all three parts were seen.
    if all(fields.values()):
        found.append(snapshot())
    return found


@Language.factory(
    "rebel",
    requires=["doc.sents"],
    assigns=["doc._.rel"],
    default_config={
        "model_name": "Babelscape/rebel-large",
        "device": 0,
    },
)
class RebelComponent:
    """spaCy pipeline component that extracts relation triplets per sentence.

    Each sentence of the document is passed through a ``text2text-generation``
    pipeline; the generated text is parsed with ``extract_triplets`` and the
    resulting relations are stored in the custom ``doc._.rel`` dictionary,
    keyed by a SHA-1 hash of head + tail + relation type.
    """

    def __init__(
        self,
        nlp,
        name,
        model_name: str,
        device: int,
    ):
        """Load the generation pipeline and register the ``rel`` Doc extension.

        Args:
            nlp: Unused here; part of the factory signature.
            name: Unused here; part of the factory signature.
            model_name: Model (and tokenizer) identifier handed to ``pipeline``.
            device: Device index handed to ``pipeline``.
        """
        assert model_name is not None, "model_name must not be None"
        self.triplet_extractor = pipeline("text2text-generation", model=model_name, tokenizer=model_name, device=device)
        # Cache of entity text -> Wikidata id, to avoid repeated API calls.
        self.entity_mapping = {}
        # Register custom extension on the Doc
        if not Doc.has_extension("rel"):
            Doc.set_extension("rel", default={})

    def get_wiki_id(self, item: str):
        """Return the Wikidata id for ``item``, querying the API on a cache miss."""
        cached = self.entity_mapping.get(item)
        if cached:
            return cached
        wiki_id = call_wiki_api(item)
        self.entity_mapping[item] = wiki_id
        return wiki_id

    def _generate_triplets(self, sent: Span) -> List[dict]:
        """Run the model on one sentence and parse its output into triplets."""
        output_ids = self.triplet_extractor(sent.text, return_tensors=True, return_text=False)[0]["generated_token_ids"]["output_ids"]
        decoded = self.triplet_extractor.tokenizer.batch_decode(output_ids[0])
        return extract_triplets(decoded[0])

    def set_annotations(self, doc: Doc, triplets: List[dict]):
        """Store the given triplets on ``doc._.rel``.

        Triplets are skipped when head and tail are identical or when either
        entity string does not occur in the document text. Already stored
        relations (same hash) are left untouched.
        """
        text = doc.text
        for triplet in triplets:
            head, tail, rel_type = triplet['head'], triplet['tail'], triplet['type']

            # Remove self-loops (relationships that start and end at the entity)
            if head == tail:
                continue

            # Skip the relation if either the head or the tail entity is not
            # present in the text; sometimes the Rebel model hallucinates
            # entities. A literal substring test is used on purpose: the
            # entity text is model output, and treating it as a regex pattern
            # breaks on names containing characters such as '+', '(' or '.'.
            if head not in text or tail not in text:
                continue

            index = hashlib.sha1("".join([head, tail, rel_type]).encode('utf-8')).hexdigest()
            if index not in doc._.rel:
                # Get wiki ids and store results
                doc._.rel[index] = {
                    "relation": rel_type,
                    "head_span": {'text': head, 'id': self.get_wiki_id(head)},
                    "tail_span": {'text': tail, 'id': self.get_wiki_id(tail)},
                }

    def __call__(self, doc: Doc) -> Doc:
        """Annotate ``doc`` with relations extracted from each of its sentences."""
        for sent in doc.sents:
            sentence_triplets = self._generate_triplets(sent)
            self.set_annotations(doc, sentence_triplets)
        # Debug trace of the processed document.
        print("Doc:",doc)
        return doc

In [4]:
# Add coreference resolution model
coref = spacy.load(
    'en_core_web_sm',
    disable=['ner', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer'],
)
coref.add_pipe(
    "xx_coref", config={"chunk_size": 2500, "chunk_overlap": 2, "device": -1})

# Define rel extraction model
# The parser is left enabled here: the "rebel" component declares
# requires=["doc.sents"]. The component name is 'attribute_ruler'
# ('attribute_rules' does not name a pipeline component, so it was never
# actually disabled).
rel_ext = spacy.load(
    'en_core_web_sm',
    disable=['ner', 'lemmatizer', 'attribute_ruler', 'tagger'],
)
rel_ext.add_pipe("rebel", config={
    'device':-1, # Number of the GPU, -1 if want to use CPU
    'model_name':'Babelscape/rebel-large'} # Model used, will default to 'Babelscape/rebel-large' if not given
    )


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\layas\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\layas\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
error loading _jsonnet (this is expected on Windows), treating C:\Users\layas\AppData\Local\Temp\tmp753yry_b\config.json as plain json
Some weights of the model checkpoint at nreimers/mMiniLMv2-L12-H384-distilled-from-XLMR-Large were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRoberta

<__main__.RebelComponent at 0x19bccb7c3a0>

In [5]:
import PyPDF2 as pdf

def extract_text_from_pdf(pdf_path, page_number):
    """Return the extracted text of one page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to open.
        page_number: 1-based page number to read.

    Returns:
        Whatever ``page.extract_text()`` yields for that page; the text is
        returned as extracted, line breaks included.

    Raises:
        ValueError: If ``page_number`` is smaller than 1. Without this check
            0 or a negative number would silently index from the end of the
            document.
    """
    if page_number < 1:
        raise ValueError(f"page_number is 1-based and must be >= 1, got {page_number!r}")
    with open(pdf_path, 'rb') as file:
        pdf_reader = pdf.PdfReader(file)
        page = pdf_reader.pages[page_number - 1]  # Adjusting for 0-based indexing
        return page.extract_text()

# Forward slash instead of a backslash: '\D' is not a valid string escape
# (newer Python versions warn about it) and a backslash separator only works
# on Windows, whereas '/' is accepted on every platform.
pdf_path = 'Dataset/DEWA Sustainability Report 2022.pdf'
page_number = 17  # Example page number
extracted_text = extract_text_from_pdf(pdf_path, page_number)
print(extracted_text)


-  Digital X was formed in 
October 2019 to offer 
digital services, resource 
augmentation, intelligent 
automation solutions, robotics, 
advanced data analytics 
solutions for optimal decision-
making and mission-critical 
analytical modelling systems. 
DigitalX’s services assist 
companies with designing, 
implementing and managing 
technologies to enhance their 
business capabilities as well 
as accelerating their digital 
transformation by building 
cutting-edge and easy-to-use 
systems powered by AI.
-  Infra X: was formed in October 
2019 and focuses on connecting 
Digital DEWA’s value-added 
services from its data centres 
and cloud services to customers. 
InfraX leverages DEWA’s 
infrastructure to offer a secure, 
reliable, and independent super-
fast network that meets the 
future digital transformation 
demands. InfraX is considered 
to be the first non-telecom 
company in the UAE to receive a 
special purpose IoT license from 
the Telecommunications and 
Digital Government 

In [6]:
# Resolve coreferences first, then run relation extraction on the resolved text.
input_text = extracted_text
coref_text = coref(input_text)._.resolved_text
doc = rel_ext(coref_text)

# Print every stored relation (the hash keys are not needed here).
for rel_dict in doc._.rel.values():
    print(rel_dict)


Doc: -  Digital X was formed in 
October 2019 to offer 
digital services, resource 
augmentation, intelligent 
automation solutions, robotics, 
advanced data analytics 
solutions for optimal decision-
making and mission-critical 
analytical modelling systems. 
Digital X's services assist 
companies with designing, 
implementing and managing 
technologies to enhance companies's 
business capabilities as well 
as accelerating companies's digital 
transformation by building 
cutting-edge and easy-to-use 
systems powered by AI.
-  Infra X: was formed in October 
2019 and focuses on connecting 
Digital DEWA’s value-added 
services from Digital DEWA’s's data centres 
and cloud services to customers. 
Digital X leverages Digital DEWA’s's 
infrastructure to offer a secure, 
reliable, and independent super-
fast network that meets the 
future digital transformation 
demands. Digital X is considered 
to be the first non-telecom 
company in the UAE to receive a 
special purpose IoT license from 


In [7]:
# Extract data from the dictionaries: one parallel list per DataFrame column.
source, target, edge = [], [], []
for rel_id, rel in doc._.rel.items():
    # Echo the (hash, relation) pair as it is consumed.
    print((rel_id, rel))
    source.append(rel['head_span']['text'])
    target.append(rel['tail_span']['text'])
    edge.append(rel['relation'])

# Create DataFrame

('00578411323b021e2c95a3a4639c21992860605c', {'relation': 'inception', 'head_span': {'text': 'Digital X', 'id': 'Q114708567'}, 'tail_span': {'text': 'October 2019', 'id': 'Q47087606'}})
('4b6300a7d6d59e72bfdb62d52a073d58c305b6bb', {'relation': 'parent organization', 'head_span': {'text': 'Infra X', 'id': 'id-less'}, 'tail_span': {'text': 'DEWA', 'id': 'Q759430'}})
('fdad1c6a1d10e27c9ca9fc446c72679916134c66', {'relation': 'subsidiary', 'head_span': {'text': 'DEWA', 'id': 'Q759430'}, 'tail_span': {'text': 'Infra X', 'id': 'id-less'}})
('1458b926ddadc90890f50523a137f22d55a542ba', {'relation': 'part of', 'head_span': {'text': 'data centre', 'id': 'Q671224'}, 'tail_span': {'text': 'cloud services', 'id': 'Q116028986'}})
('3673e24d0ad564b278629a75c26b64e287d24ef3', {'relation': 'developer', 'head_span': {'text': 'Digital X', 'id': 'Q114708567'}, 'tail_span': {'text': 'Digital DEWA', 'id': 'Q39886348'}})
('a348af917c78d1688a11e4339ffeb1e04c092c23', {'relation': 'product or material produced',

In [8]:
import pandas as pd
# One row per extracted relation: head entity, tail entity and relation label.
kg_df = pd.DataFrame({'source': source, 'target': target, 'edge': edge})


In [9]:
# Display the edge table.
kg_df

Unnamed: 0,source,target,edge
0,Digital X,October 2019,inception
1,Infra X,DEWA,parent organization
2,DEWA,Infra X,subsidiary
3,data centre,cloud services,part of
4,Digital X,Digital DEWA,developer
5,Digital DEWA,Digital X,product or material produced
6,Digital X,UAE,country
7,Digital X,5G,product or material produced
8,Shuaa Energy 1,MBR Solar Park,part of
9,MBR Solar Park,Shuaa Energy 1,has part


In [None]:
from neo4j import GraphDatabase
import pandas as pd

# Connect to Neo4j
# Connection settings are read from the environment so that no secret is
# stored in the notebook. Set NEO4J_PASSWORD (and optionally NEO4J_URI /
# NEO4J_USER) before starting the kernel; with no password set the empty
# string is used and the connection attempt is expected to be rejected.
# The password that used to be hard-coded here should be considered
# compromised and rotated.
import os

uri = os.environ.get("NEO4J_URI", "neo4j+s://72992856.databases.neo4j.io:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "")
# Assuming you have your DataFrame already defined as kg_df

# Function to create nodes and relationships
def create_nodes_and_relationships(tx, source, target, edge):
    """Merge two ``Node`` nodes and a relationship between them.

    Args:
        tx: Object exposing ``run(query, **params)``; the query is executed on it.
        source: Value for the ``name`` property of the start node.
        target: Value for the ``name`` property of the end node.
        edge: Relationship type, used as a backtick-quoted identifier.

    Raises:
        TypeError: If ``edge`` is not a string.
        ValueError: If ``edge`` is empty or only whitespace.
    """
    if not isinstance(edge, str):
        raise TypeError(f"edge must be a string, got {type(edge).__name__}")
    if not edge.strip():
        raise ValueError("edge must be a non-empty relationship type")

    # The relationship type is spliced into the query text rather than passed
    # as a parameter like the node names. A backtick inside the quoted
    # identifier is therefore doubled so that the value cannot terminate the
    # identifier and alter the query.
    escaped_edge = edge.replace("`", "``")
    query = (
        "MERGE (s:Node {name: $source}) "
        "MERGE (t:Node {name: $target}) "
        "MERGE (s)-[:`" + escaped_edge + "`]->(t)"
    )
    tx.run(query, source=source, target=target)

# Create a Neo4j driver instance
driver = GraphDatabase.driver(uri, auth=(user, password))

# try/finally guarantees the driver is closed even when one of the writes
# raises; previously an exception skipped driver.close().
try:
    # Iterate through each row in the DataFrame and create nodes and relationships
    with driver.session() as session:
        for _, row in kg_df.iterrows():
            # NOTE(review): newer neo4j driver releases appear to deprecate
            # write_transaction in favour of execute_write - confirm the
            # installed version before switching.
            session.write_transaction(
                create_nodes_and_relationships,
                row['source'],
                row['target'],
                row['edge'],
            )
finally:
    # Close the driver
    driver.close()
