<h1>Import All the necessary libs

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoConfig, pipeline
from tqdm import tqdm_notebook
import json
import spacy

<h1>Loading the dataset

In [2]:
path = "/kaggle/input/text-corpus-dataset/Biology_Full.json"
with open(path) as f:
    data = json.load(f)
chunks = [value['text'] for value in data.values()]
print("Total chunks: ", len(chunks))

Total chunks:  4348


In [3]:
for chunk in chunks:
    if len(chunk)>1024:
        print(len(chunk))

1620
1620
1050
1050


In [4]:
def segment_text(text, maxLen):
    currLen = 0
    chunks = []
    currText = ""
    words = text.split()
    for word in words:
        if currLen + len(word) + (1 if currText else 0) <= maxLen:
            currText += " "+word
            currLen += len(word) + 1
        else:
            
            chunks.append(currText)
            currLen = len(word)
            currText = word
    if currText:
        chunks.append(currText) 
    return chunks

In [5]:
chunksT = []
for chu in chunks:
    # chunksT.extend(segment_text(chu, 512))
    chunksT.extend(segment_text(chu, 512))
print(len(chunksT))
print(chunksT[0])

10587
 Code, start stop codons and ultimate condition for cel division About the o rigin of the genetic c ode, the start and stop codons and the very first cell. Correspond ing author : Pierre F . Zöfel AEG Brussel retired Tel +3223806682 +32473/777801 Email; pierre.zofel@gmail.com/pierre.zofel@hotmail.com “The universality of the code argues that it must have been established very early in evolution. Originally , there may have been a stereochemical relationship between amino acids and the codons representing


In [6]:
chunksT[1]

'them” Benjamin Lewin Abbreviations: AAs; amino acids AAi: binding amino acids AAx: recognized amino acid Bi 1-3: codon Code, start stop codons and ultimate condition for cel division Abstract : In the world of living organisms, a few important molecules play prominent role s. They are grouped together under two categories: the bases and the amino acids (AAs) , both of which have the common property of forming long chains containing multiple repetitions of the same elements . The living world constantly'

<h1>Extracting the entities with the labels

<h2>Using LLM

In [7]:
# def extract_entities(chunks, model_name="dbmdz/bert-large-cased-finetuned-conll03-english"):
#     try:
#         tokenizer = AutoTokenizer.from_pretrained(model_name)
#         model = AutoModelForTokenClassification.from_pretrained(model_name)
#         config = AutoConfig.from_pretrained(model_name)
        
#         inputs = tokenizer(chunks, return_tensors="pt", max_length=1024, padding=True, truncation=True)

#         with torch.no_grad():
#             outputs = model(**inputs)
#         predictions = torch.argmax(outputs.logits, dim=2)

#         tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
#         token_labels = [config.id2label[p.item()] for p in predictions[0]]

#         results = []
#         current_entity = []
#         current_label = None

#         for token, label in zip(tokens[1:-1], token_labels[1:-1]):
#             # Handle subwords
#             if token.startswith("##"):
#                 if current_entity:
#                     current_entity[-1] += token[2:]
#                 continue
                
#             # Handle entity continuation
#             if label.startswith("B-") or label == "O":
#                 if current_entity:
#                     results.append((" ".join(current_entity), current_label))
#                     current_entity = []
#                 # Remove B- prefix
#                 if label != "O":
#                     current_entity = [token]
#                     current_label = label[2:]
#             elif label.startswith("I-"):
#                 if not current_entity:
#                     current_entity = [token]
#                     current_label = label[2:]
#                 else:
#                     current_entity.append(token)
        
#         # Add final entity if exists
#         if current_entity:
#             results.append((" ".join(current_entity), current_label))
            
#         return results
        
#     except Exception as e:
#         print(f"Error performing NER: {str(e)}")
#         return []

In [8]:
# Entities_data = extract_entities(chunksT, model_name="dbmdz/bert-large-cased-finetuned-conll03-english")
# uncased_entites = [entities[0].lower().replace(" ", "").strip() for entities in Entities_data]

In [9]:
# uncased_entites

<h2>Using NLTK and spacy

In [10]:
def extract_nlp(chunks):
    nlp = spacy.load("en_core_web_sm")
    entities = []
    for chu in tqdm_notebook(chunks, total = len(chunks), desc = "process chunks"):
        doc = nlp(chu)
        for ent in doc.ents:
            entities.append([ent.text, ent.label_])
    return entities

In [11]:
entities = extract_nlp(chunks)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chu in tqdm_notebook(chunks, total = len(chunks), desc = "process chunks"):


process chunks:   0%|          | 0/4348 [00:00<?, ?it/s]

In [12]:
entities[3]

['Benjamin Lewin Abbreviations', 'PERSON']

In [13]:
originalEntities = [ent[0] for ent in entities]

In [14]:
entityType_dict = {ent[0].lower().replace(" ", "").strip():ent[1] for ent in entities}
# entityType_dict
    

In [15]:
uncased_entites_ = [ent[0].lower().replace(" ", "").strip() for ent in entities]
print(len(uncased_entites_))
uncased_entites_[3]

77306


'benjaminlewinabbreviations'

<h1>Extracting the Relations with the labels

In [16]:
def extract_triplets(text):
    triplets = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text.strip()
    current = 'x'
    for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
    return triplets

In [17]:
triplet_extractor = pipeline('text2text-generation', model='Babelscape/rebel-large', tokenizer='Babelscape/rebel-large')

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/344 [00:00<?, ?B/s]

Device set to use cuda:0


In [18]:
outputs = triplet_extractor(chunksT, return_tensors=True, return_text=False,  truncation=True, max_length=1024)

In [19]:
extracted_texts = triplet_extractor.tokenizer.batch_decode([output["generated_token_ids"] for output in outputs])

In [20]:
Relation_data = [extract_triplets(ext_text) for ext_text in extracted_texts]

<h1>Filtering the Relations and Entites by finding the common entities between both the data

In [21]:
final_res = []
for re_elements in Relation_data:
    for re in re_elements:
        if re['head'].lower().replace(" ", "").strip() in uncased_entites_ and re['tail'].lower().replace(" ", "").strip() in uncased_entites_:
            final_res.append({
                'head': re['head'],
                'tail': re['tail'],
                'relation': re['type']
            })
print(len(final_res))

2772


In [22]:
final_res = list({tuple(d.items()): d for d in final_res}.values())
print(len(final_res))
final_res

1592


[{'head': 'binding', 'tail': 'binding', 'relation': 'use'},
 {'head': 'molecule', 'tail': 'RNA', 'relation': 'has part'},
 {'head': 'RNA', 'tail': 'nucleotide', 'relation': 'has part'},
 {'head': 'Mohamed A. Marahiel',
  'tail': 'Philipps University',
  'relation': 'employer'},
 {'head': 'recognition', 'tail': 'recognition', 'relation': 'instance of'},
 {'head': 'ATP', 'tail': 'biochemistry', 'relation': 'studied by'},
 {'head': 'biochemistry', 'tail': 'ATP', 'relation': 'studies'},
 {'head': 'Nature 395', 'tail': 'Nature 418', 'relation': 'followed by'},
 {'head': 'Bioessays', 'tail': 'USA', 'relation': 'country of origin'},
 {'head': 'Marahiel MA', 'tail': 'Biochemistry', 'relation': 'field of work'},
 {'head': 'Warsaw', 'tail': 'Ukraine', 'relation': 'country'},
 {'head': 'Ukraine',
  'tail': 'Warsaw',
  'relation': 'contains administrative territorial entity'},
 {'head': 'titin', 'tail': 'domain', 'relation': 'instance of'},
 {'head': 'DDFLN4', 'tail': 'domain', 'relation': 'instan

<h1>We will keep multiple entities between same nodes

In [23]:
allRelations = [res['relation'] for res in final_res]
allRelations = list(set(allRelations))

In [24]:
len(allRelations)

79

<h1> Get the embeddings for those relation ships

<h2>Using the sentence transformer

In [25]:
from sentence_transformers import SentenceTransformer

In [26]:
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [27]:
embeddings = model.encode(allRelations)
print(embeddings.shape)

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

(79, 384)


In [28]:
embd_dict = {rel: embed for rel, embed in zip(allRelations, embeddings)}

In [29]:
embd_dict.keys()

dict_keys(['country of origin', 'named after', 'different from', 'notable work', 'shares border with', 'subject has role', 'area', 'use', 'located in the administrative territorial entity', 'contains administrative territorial entity', 'number of articles', 'twinned administrative body', 'is a list of', 'practiced by', 'part', 'author', 'subsidiary', 'religious order', 'country', 'father', 'published in', 'capital of', 'publication date', 'facet of', 'date of birth', 'inception', 'student of', 'discoverer or inventor', 'has', 'owned by', 'studies', 'number of participants', 'depicts', 'spouse', 'main subject', 'programming language', 'followed by', 'parent taxon', 'event distance', 'conferred by', 'educated at', 'publisher', 'applies to jurisdiction', 'number of episodes', 'family', 'length', 'has parts of the class', 'work period (start)', 'has cause', 'employer', 'part of', 'location of formation', 'field of this occupation', 'studied by', 'has effect', 'parent organization', 'studen

<h1>Using those embeddings peform the KNN algo over them

In [30]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [31]:
def performOnlineKNN(embed_dict, th):
    centroids = []
    centroids_cluster = []
    centroid_counts = []
    for rel, embed in tqdm_notebook(embed_dict.items(), total= len(embed_dict), desc = "Processing Embeddings: "):
        if len(centroids) == 0:
            centroids.append(embed)
            centroids_cluster.append([rel])
            centroid_counts.append(1)
        else:
            update = False
            for i, cent in enumerate(centroids):
                sim = cosine_similarity(cent.reshape(1, -1), embed.reshape(1, -1))[0][0]
                if sim >= th:
                    centroids[i] = (centroids[i] * centroid_counts[i] + embed) / (centroid_counts[i] + 1)
                    centroid_counts[i] += 1
                    if len(centroids_cluster[i]) == 0:
                        centroids_cluster[i] = [rel]
                    else:
                        centroids_cluster[i].append(rel)
                    update = True
                    break
            if not update:
                centroids.append(embed)
                centroids_cluster.append([rel])
                centroid_counts.append(1)
    return centroids, centroids_cluster
                    

In [32]:
centroids, centroids_cluster = performOnlineKNN(embd_dict, th=0.7)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for rel, embed in tqdm_notebook(embed_dict.items(), total= len(embed_dict), desc = "Processing Embeddings: "):


Processing Embeddings:   0%|          | 0/79 [00:00<?, ?it/s]

In [33]:
centroids_cluster

[['country of origin'],
 ['named after'],
 ['different from'],
 ['notable work'],
 ['shares border with'],
 ['subject has role'],
 ['area'],
 ['use', 'uses'],
 ['located in the administrative territorial entity',
  'contains administrative territorial entity'],
 ['number of articles'],
 ['twinned administrative body'],
 ['is a list of'],
 ['practiced by'],
 ['part', 'part of', 'has part'],
 ['author'],
 ['subsidiary'],
 ['religious order'],
 ['country'],
 ['father'],
 ['published in'],
 ['capital of', 'capital'],
 ['publication date'],
 ['facet of'],
 ['date of birth'],
 ['inception'],
 ['student of', 'student'],
 ['discoverer or inventor'],
 ['has', 'has cause', 'has effect'],
 ['owned by', 'owner of'],
 ['studies', 'studied by'],
 ['number of participants'],
 ['depicts'],
 ['spouse'],
 ['main subject'],
 ['programming language'],
 ['followed by'],
 ['parent taxon'],
 ['event distance'],
 ['conferred by'],
 ['educated at'],
 ['publisher'],
 ['applies to jurisdiction'],
 ['number of ep

<h1>Based on the final clusters name the relations and replace the original relations with the cluster names in the dataset

In [34]:
def getName(embed_dict, centroids_cluster):
    rel_Cluster_Name = {}
    for clust in tqdm_notebook(centroids_cluster, total = len(centroids_cluster), desc = "Finding BestName"):
        most_sim = ""
        max_sum = 0
        if len(clust) == 1:
            rel_Cluster_Name[clust[0]] = clust[0]
            continue
        for ele in clust:
            sim_sum = 0
            for ele2 in clust:
                if ele == ele2: 
                    continue
                sim = cosine_similarity(embed_dict[ele].reshape(1, -1), embed_dict[ele2].reshape(1, -1))[0][0]
                sim_sum += sim
            if  max_sum < sim_sum:
                max_sum = sim_sum
                most_sim = ele
        for ele in clust:
            rel_Cluster_Name[ele] = most_sim
             
    return rel_Cluster_Name

In [35]:
rel_Cluster_Name = getName(embd_dict, centroids_cluster)
rel_Cluster_Name

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for clust in tqdm_notebook(centroids_cluster, total = len(centroids_cluster), desc = "Finding BestName"):


Finding BestName:   0%|          | 0/68 [00:00<?, ?it/s]

{'country of origin': 'country of origin',
 'named after': 'named after',
 'different from': 'different from',
 'notable work': 'notable work',
 'shares border with': 'shares border with',
 'subject has role': 'subject has role',
 'area': 'area',
 'use': 'use',
 'uses': 'use',
 'located in the administrative territorial entity': 'located in the administrative territorial entity',
 'contains administrative territorial entity': 'located in the administrative territorial entity',
 'number of articles': 'number of articles',
 'twinned administrative body': 'twinned administrative body',
 'is a list of': 'is a list of',
 'practiced by': 'practiced by',
 'part': 'part',
 'part of': 'part',
 'has part': 'part',
 'author': 'author',
 'subsidiary': 'subsidiary',
 'religious order': 'religious order',
 'country': 'country',
 'father': 'father',
 'published in': 'published in',
 'capital of': 'capital of',
 'capital': 'capital of',
 'publication date': 'publication date',
 'facet of': 'facet of',

In [36]:
rel_Cluster_Name

{'country of origin': 'country of origin',
 'named after': 'named after',
 'different from': 'different from',
 'notable work': 'notable work',
 'shares border with': 'shares border with',
 'subject has role': 'subject has role',
 'area': 'area',
 'use': 'use',
 'uses': 'use',
 'located in the administrative territorial entity': 'located in the administrative territorial entity',
 'contains administrative territorial entity': 'located in the administrative territorial entity',
 'number of articles': 'number of articles',
 'twinned administrative body': 'twinned administrative body',
 'is a list of': 'is a list of',
 'practiced by': 'practiced by',
 'part': 'part',
 'part of': 'part',
 'has part': 'part',
 'author': 'author',
 'subsidiary': 'subsidiary',
 'religious order': 'religious order',
 'country': 'country',
 'father': 'father',
 'published in': 'published in',
 'capital of': 'capital of',
 'capital': 'capital of',
 'publication date': 'publication date',
 'facet of': 'facet of',

<h1>Finally make a csv with the columns: head, tail, original relationship, cluster name

In [37]:
originalEntitiesEmbd = model.encode(originalEntities)
orgEntEm_dict = {orgEnt.lower().replace(" ", "").strip(): orgEmbed for orgEnt, orgEmbed in zip(originalEntities,originalEntitiesEmbd)}
# orgEntEm_dict

Batches:   0%|          | 0/2416 [00:00<?, ?it/s]

In [38]:
import pandas as pd
# columns to make: head, tail, orignal relations, cluster name and along with that we also need the embeddings of the relation and the embedding of head and tail
data_df = {
    "head": [],
    "tail":[],
    "relation":[],
    "relationCluster": [],
    "headType":[],
    "tailType":[],
    "headEmbed":[],
    "tailEmbed":[],
    "relEmbed":[]
}
# entityType_dict
# final_res
for res in tqdm_notebook(final_res, desc = "Making the final Df", total = len(final_res)):
    data_df['head'].append(res['head'])
    data_df['tail'].append(res['tail'])
    data_df['relation'].append(res['relation'])
    data_df['relationCluster'].append(rel_Cluster_Name[res['relation']])
    data_df['headType'].append(entityType_dict[res['head'].lower().replace(" ", "").strip()])
    data_df['tailType'].append(entityType_dict[res['tail'].lower().replace(" ", "").strip()])
    data_df['headEmbed'].append(str(orgEntEm_dict[res['head'].lower().replace(" ", "").strip()]))
    data_df['tailEmbed'].append(str(orgEntEm_dict[res['tail'].lower().replace(" ", "").strip()]))
    data_df['relEmbed'].append(str(embd_dict[res['relation']]))

data_df = pd.DataFrame(data_df)
data_df

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for res in tqdm_notebook(final_res, desc = "Making the final Df", total = len(final_res)):


Making the final Df:   0%|          | 0/1592 [00:00<?, ?it/s]

Unnamed: 0,head,tail,relation,relationCluster,headType,tailType,headEmbed,tailEmbed,relEmbed
0,binding,binding,use,use,PERSON,PERSON,[-1.21414684e-01 7.70534948e-02 -6.98211342e-...,[-1.21414684e-01 7.70534948e-02 -6.98211342e-...,[-4.45322357e-02 4.27974947e-02 -4.77293991e-...
1,molecule,RNA,has part,part,ORG,ORG,[-3.25336270e-02 -4.65905480e-02 -4.65750284e-...,[-2.83921994e-02 -4.77526076e-02 -2.52587777e-...,[ 2.79611517e-02 8.10583457e-02 9.62299332e-...
2,RNA,nucleotide,has part,part,ORG,PERSON,[-2.83921994e-02 -4.77526076e-02 -2.52587777e-...,[-6.87153712e-02 -3.47657092e-02 -5.22661991e-...,[ 2.79611517e-02 8.10583457e-02 9.62299332e-...
3,Mohamed A. Marahiel,Philipps University,employer,employer,PERSON,ORG,[ 2.43127402e-02 1.21844962e-01 -4.82097529e-...,[-1.81916431e-02 4.98085357e-02 -4.74214414e-...,[-8.25185329e-02 7.37527981e-02 1.93586424e-...
4,recognition,recognition,instance of,instance of,ORG,ORG,[-4.66626249e-02 3.92468609e-02 -1.46743702e-...,[-4.66626249e-02 3.92468609e-02 -1.46743702e-...,[-1.19809937e-02 -7.29214912e-03 3.41963544e-...
...,...,...,...,...,...,...,...,...,...
1587,1KFM,protein,instance of,instance of,PERSON,GPE,[-4.20423038e-03 4.03800830e-02 -5.87142520e-...,[-5.44746667e-02 -7.03664273e-02 -1.68157909e-...,[-1.19809937e-02 -7.29214912e-03 3.41963544e-...
1588,1QJ9,protein,instance of,instance of,CARDINAL,GPE,[-9.14737508e-02 -2.53125317e-02 -1.44129246e-...,[-5.44746667e-02 -7.03664273e-02 -1.68157909e-...,[-1.19809937e-02 -7.29214912e-03 3.41963544e-...
1589,1LSX,protein,instance of,instance of,DATE,GPE,[-6.63179606e-02 2.36366652e-02 1.50013287e-...,[-5.44746667e-02 -7.03664273e-02 -1.68157909e-...,[-1.19809937e-02 -7.29214912e-03 3.41963544e-...
1590,Biophysical Journal,79,number of articles,number of articles,ORG,CARDINAL,[-7.37022981e-02 -3.67033258e-02 -1.12545015e-...,[ 2.68745739e-02 1.45227686e-01 -6.88864291e-...,[ 6.02299124e-02 1.41815487e-02 -2.53492631e-...


In [39]:
from IPython.display import FileLink
data_df.to_csv("Biology_.csv",index = False)
link = FileLink('Biology_.csv')

In [40]:
link