# ExGI - Gene Interaction Extraction algorithm


Welcome to ExGI, an algorithm designed for extracting gene interactions from scientific articles. This algorithm employs a combination of natural language processing (NLP) techniques, named entity recognition (NER), and relation extraction to identify and refine gene interactions within the provided text.

## Overview:
The ExGI algorithm involves several key steps:

1. **Pre-processing:** Extract and preprocess article data, including abstract tokenization and sentence elimination.
2. **Relation Extraction:** Utilise a fine-tuned bioBERT model to extract gene interactions.
3. **Post-processing:** Refine entity names and compute confidence scores for extracted relations, considering multiple factors.

## Important Note:
Please be aware that the Named Entity Recognition functionality of this code relies on a RESTful API provided by http://bern2.korea.ac.kr/. The API may be intermittently unresponsive or slow, especially under multiple concurrent requests. If issues persist, refer to the BERN2 documentation (http://bern2.korea.ac.kr/documentation) for guidance on setting up a local installation.



The following two cells install and import the required packages, download the pretrained BioBERT weights, and load the pretrained BioBERT model.


In [None]:
# Extract BioBERT weights
!wget http://nlp.dmis.korea.edu/projects/biobert-2020-checkpoints/biobert_v1.1_pubmed.tar.gz
!tar -xvzf /content/biobert_v1.1_pubmed.tar.gz

# Install required packages
!pip install pytorch_transformers
!pip install transformers
!tar -xzf biobert_weights
!ls biobert_v1.1_pubmed/
!transformers-cli convert --model_type bert --tf_checkpoint biobert_v1.1_pubmed/model.ckpt-1000000 --config biobert_v1.1_pubmed/bert_config.json --pytorch_dump_output biobert_v1.1_pubmed/pytorch_model.bin
!ls biobert_v1.1_pubmed/
!mv biobert_v1.1_pubmed/bert_config.json biobert_v1.1_pubmed/config.json
!ls biobert_v1.1_pubmed/
!pip install datasets
from pytorch_transformers import BertModel
DATA_DIR="."
import os
import numpy as np
import pickle
import tensorflow as tf
# Installing additional packages
!pip install biopython
!pip install Bio
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# install BERT
!pip install pytorch_pretrained_bert pytorch-nlp

# BERT imports
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
!pip install Keras-Preprocessing
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt

# specify GPU device
# Use the GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# get_device_name(0) raises on CPU-only machines, so only query it when a GPU exists.
gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else None
model = BertForSequenceClassification.from_pretrained("biobert_v1.1_pubmed", num_labels=2)#binary classification
# Move the model to whichever device was selected above (model.cuda() fails without a GPU).
model.to(device)

In [None]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
from openpyxl import load_workbook
from Bio import Entrez
import nltk.data
import nltk
nltk.download('punkt')
import requests
import itertools

In [32]:
#Variable Initialisation

# E-mail address sent to NCBI with every PubMed Entrez request
mail = 'jyour_email@example.com' # Replace with your actual email ID

# Path to the saved weights of the sentence classifier ('sentence_extraction_f.pth')
sentence_classifier = 'sentence_extraction_f.pth'

# Path to the saved weights of the relation classifier ('relation_extraction_f.pth')
# (plain string: the literal has no placeholders, so no f-string prefix is needed)
relation_extraction_model = "relation_extraction_f.pth"

# Path to the spreadsheet of unwanted words ('unwanted_words.xlsx');
# it is read through its 'Unwanted_words' column
unwanted_words_list = 'unwanted_words.xlsx'

# Path to the RegulonDB regulation spreadsheet ('E_Coli_RegulonDB_regs.xlsx');
# it is read through its 'Agent' and 'Target' columns
E_Coli_Regulations  = "E_Coli_RegulonDB_regs.xlsx"

# Keywords for PubMed Entrez search query to extract regulatory relationships for E. Coli, based on the paper
# Please refer to the paper for the rationale behind selecting these keywords
Keywords = "E Coli Escherichia coli gene regulation gene expression transcriptional"

# Number of articles to fetch from PubMed
Article_count = 100

# Maximum number of tokens per sentence fed to the BioBERT models (longer inputs are truncated)
max_len = 256

# **Pre-processing - Search query with target specific keywords and retrieve relevant abstracts:** The following cell contains the methods to fetch literature abstracts as per selected keywords


In [4]:
# Function to search for articles on PubMed based on a given query and retrieve a specified number of results

def search(query,amt):
    """Run a relevance-sorted PubMed search and return the parsed Entrez result.

    Args:
        query: Search term passed to Entrez as ``term``.
        amt: Maximum number of records to return (``retmax``).

    Returns:
        The object produced by ``Entrez.read``; the caller reads its
        ``'IdList'`` entry.
    """
    # Set the email for PubMed API access
    Entrez.email = mail

    # Perform the PubMed search and retrieve results
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmax=amt,
                            retmode='xml',
                            term=query)
    try:
        results = Entrez.read(handle)
    finally:
        # Release the underlying connection even when parsing fails.
        handle.close()
    return results

# Function to generate a PubMed link for a given article ID
def make_a_link(id):
    """Return the PubMed URL of the article whose identifier is ``id``."""
    base_url = "https://pubmed.ncbi.nlm.nih.gov/"
    return f"{base_url}{id!s}/"

# Function to extract relevant information from a list of PubMed article IDs
def extract_articles(list_of_ids):
    """Scrape title, abstract, DOI link and source line for PubMed articles.

    Args:
        list_of_ids: Iterable of PubMed article identifiers.

    Returns:
        A DataFrame with one row per identifier and the columns ``Title``,
        ``abstract``, ``link to full article``, ``Type`` and
        ``pub_med_links``. A title, abstract or source line that is missing
        from the page is stored as ``None``; a missing DOI link is stored as
        the string ``"NA"``.
    """
    columns = ["Title", "abstract", "link to full article", "Type", "pub_med_links"]
    rows = []

    for article_id in list_of_ids:
        link = make_a_link(article_id)

        # The response is a context manager: close the connection as soon as
        # the page has been read instead of leaking it.
        with uReq(link) as uClient:
            page_html = uClient.read()
        page_soup = soup(page_html, "html.parser")

        # A page may lack any of these elements, so test each tag before
        # dereferencing it instead of raising AttributeError on None.
        title_tag = page_soup.find("h1",{"class":"heading-title"})
        title = None
        if title_tag is not None:
            title = title_tag.text.replace("\n","").strip()

        abstract_tag = page_soup.find("div",{"class":"abstract-content"})
        abstract = None
        if abstract_tag is not None:
            abstract = abstract_tag.text.replace("\n","").strip()

        # Link to the full article (if available)
        doi_tag = page_soup.find("a",{"data-ga-action":"DOI"})
        full_article_link = doi_tag["href"] if doi_tag is not None else "NA"

        # Text of the "article-source" span, stored in the 'Type' column
        source_tag = page_soup.find("span","article-source")
        article_type = source_tag.text.strip() if source_tag is not None else None

        rows.append((title, abstract, full_article_link, article_type, link))

    # Passing the column names explicitly keeps them present for an empty result.
    return pd.DataFrame(rows, columns=columns)

# **Pre-processing - Sentence Tokenisation**: The following cell contains a method to tokenize abstracts into sentences.

In [5]:
# Function to tokenize abstracts in a DataFrame using NLTK's sentence tokenizer
def tokenize_abs(df):
  """Split every abstract in ``df`` into sentences.

  Reads the ``'abstract'`` column and returns a new DataFrame with two
  columns: ``'id'`` (the positional row number of the source abstract in
  ``df``) and ``'Sentence'`` (one sentence per row).
  """
  # NLTK's pre-trained English Punkt sentence splitter
  splitter = nltk.data.load('tokenizers/punkt/english.pickle')

  # (row position, sentence) for every sentence of every abstract, in order
  numbered_sentences = [
      (row_number, piece)
      for row_number, abstract in enumerate(df['abstract'])
      for piece in splitter.tokenize(abstract)
  ]

  sentences_df = pd.DataFrame()
  sentences_df['id'] = [row_number for row_number, _ in numbered_sentences]
  sentences_df['Sentence'] = [piece for _, piece in numbered_sentences]
  return sentences_df

In [6]:

def filter_sentences_genes(df,list_of_genes):
  """Flag the sentences of ``df`` that mention any gene in ``list_of_genes``.

  Adds two columns to ``df`` in place and returns it: ``'Contain'`` (1 when
  at least one gene name occurs as a substring of the sentence, else 0) and
  ``'Gene'`` (the last gene of ``list_of_genes`` found in the sentence, or
  ``''`` when none is found).
  """
  flags = []
  found_genes = []

  for text in df['Sentence']:
    # Every listed gene that occurs in this sentence, in list order
    hits = [name for name in list_of_genes if name in text]
    flags.append(1 if hits else 0)
    # The whole list is scanned, so the last hit is the one recorded
    found_genes.append(hits[-1] if hits else '')

  df['Contain'] = flags
  df['Gene'] = found_genes
  return df

# **Pre-processing - Sentence Eliminator 1:** The following cell contains the method that eliminates sentences unrelated to gene regulation or expression. The method uses our pretrained classifier sentence_extraction_f.pth



In [8]:
#Sentence classifier 1
def sentence_classification_1(df, max_len):
  """Score every sentence of ``df`` with the sentence classifier.

  Loads the weights stored at ``sentence_classifier`` into the global
  ``model``, runs it over the ``'Sentence'`` column and adds three columns
  to ``df``:

  * ``'predictied'``   - 0 when the first logit is positive, otherwise 1
    (the misspelt column name is kept because the pipeline filters on it);
  * ``'prediction_0'`` - element-wise sigmoid of the first logit;
  * ``'prediction_1'`` - element-wise sigmoid of the second logit.

  Args:
      df: DataFrame with a ``'Sentence'`` column; it is modified in place.
      max_len: Number of token ids per sentence after padding/truncation.

  Returns:
      A copy of the extended DataFrame with NaN replaced by ``''``.
  """
  # map_location lets weights saved on a GPU be loaded on a CPU-only machine.
  model.load_state_dict(torch.load(sentence_classifier, map_location=device))
  model.eval()

  # Tokenize sentences using the BioBERT tokenizer and add the BERT markers
  tokenizer = BertTokenizer.from_pretrained('biobert_v1.1_pubmed', do_lower_case=True)
  tokenized_texts = [['[CLS]'] + tokenizer.tokenize(t) + ['[SEP]'] for t in df['Sentence']]

  # Convert tokens to ids once, then pad/truncate to max_len
  input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts],
                            maxlen=max_len, dtype="long", truncating="post", padding="post")

  validation_inputs = torch.tensor(input_ids)
  # Attention mask: 1 for real tokens, 0 for padding (padding id is 0)
  validation_masks = (validation_inputs > 0).long()

  batch_size = 8
  validation_data = TensorDataset(validation_inputs, validation_masks)
  validation_sampler = SequentialSampler(validation_data)
  validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

  all_predictions = []
  all_predictions_0 = []
  all_predictions_1 = []

  # Inference only: do not compute or store gradients
  with torch.no_grad():
    for batch in validation_dataloader:
      b_input_ids, b_input_mask = (t.to(device) for t in batch)
      logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

      # Label 0 when the first logit is positive, label 1 otherwise
      all_predictions.extend((~(logits[:, 0] > 0)).long().tolist())

      # Per-class sigmoid scores, computed with torch instead of TensorFlow
      scores = torch.sigmoid(logits).cpu().numpy()
      all_predictions_0.extend(scores[:, 0])
      all_predictions_1.extend(scores[:, 1])

  # Add predictions and scores to the DataFrame
  df['predictied']=all_predictions
  df['prediction_0']=all_predictions_0
  df['prediction_1']=all_predictions_1

  # Display information about the elimination process
  print("Sentence Eliminator 1: eliminated", len(df)-np.sum(df['predictied']), "out of ",len(df))

  # Fill NaN values in the DataFrame with empty strings
  df = df.fillna('')
  return df

# **Pre-processing: Sentence Eliminator 2** The following cell contains the method that eliminates sentences that do not mention any gene/protein entities. The method uses the pretrained NER model BERN2.0.

BERN2.0 web service call may disconnect at times due to exceeding connection calls. We recommend running the program in small batches for bigger datasets.


In [9]:
#Sentence classifier 2

# Function to extract genes from a sentence based on annotations
def return_genes(sentence,X):
    """Return the text of every gene annotation found in ``sentence``.

    ``X`` is an annotation result whose ``'annotations'`` entry is a sequence
    of records with an ``'obj'`` type and a ``'span'`` holding ``'begin'``
    and ``'end'`` character offsets. Only records whose type is ``"gene"``
    are kept; each one is turned into the slice of ``sentence`` it covers.
    """
    return [
        sentence[record['span']['begin']:record['span']['end']]
        for record in X['annotations']
        if record['obj'] == "gene"
    ]

# Function to check if a sentence contains multiple gene mentions
def if_contains_genes(df):
  """Mark the sentences that mention more than one gene.

  Every sentence in the ``'Sentence'`` column is sent to ``query_raw``; the
  gene mentions found are stored in ``'All_X'`` and ``'gene_mention'`` is
  set to 1 when there is more than one of them, otherwise 0.

  The caller's frame is left untouched: the columns are added to a copy, so
  passing a filtered slice no longer triggers pandas' SettingWithCopy
  behaviour.

  Args:
      df: DataFrame with a ``'Sentence'`` column.

  Returns:
      A new DataFrame with the two extra columns and NaN replaced by ``''``.
  """
  import warnings

  df = df.copy()
  target_gene = []
  all_genes = []

  for sentence in df['Sentence']:
      RET = query_raw(sentence)

      # A response without an 'annotations' list is treated as "no genes
      # found" (as remove_non_gene does), but is reported so that a failing
      # NER service does not go unnoticed.
      if isinstance(RET, dict) and 'annotations' in RET:
          genes = return_genes(sentence, RET)
      else:
          warnings.warn("NER response without annotations; sentence treated as containing no genes")
          genes = []
      all_genes.append(genes)

      # return_genes yields one entry per 'gene' annotation, so its length
      # is the number of gene mentions in the sentence.
      target_gene.append(1 if len(genes) > 1 else 0)

  # Add new columns 'gene_mention' and 'All_X' to the DataFrame
  df['gene_mention']=target_gene
  df['All_X']= all_genes

  # Display information about the elimination process
  print("Sentence Eliminator 2: eliminated", len(df)-np.sum(df['gene_mention']), "out of ",len(df))

  # Fill NaN values in the DataFrame with empty strings
  df = df.fillna('')
  return df

# **Relation Extraction: Named entity recognition and entity tagging.** The following cell contains the NER method to identify the gene/protein entities and tag sentences with all possible pairs.


In [25]:
# Named Entity Recognition (NER) functions

# Function to return gene pairs from a sentence based on annotations
def return_gene_pairs(sentence,X):
    """Return every ordered pair of gene mentions found in ``sentence``.

    Gene mentions are the slices of ``sentence`` covered by the records in
    ``X['annotations']`` whose ``'obj'`` is ``"gene"``. The result is the
    full cross product of those mentions as ``[first, second]`` lists, so it
    also contains each mention paired with itself.
    """
    mentions = []
    for record in X['annotations']:
        if record['obj'] != "gene":
            continue
        span = record['span']
        mentions.append(sentence[span['begin']:span['end']])

    return [[first, second] for first in mentions for second in mentions]


# Function to return gene positions from a sentence based on annotations
def return_gene_pos(sentence,X):
    """Return the ``[begin, end]`` span of every gene annotation.

    Spans are collected from the records in ``X['annotations']`` whose
    ``'obj'`` is ``"gene"`` and then passed through ``check_if_operon``
    together with ``sentence`` before being returned.
    """
    spans = [
        [record['span']['begin'], record['span']['end']]
        for record in X['annotations']
        if record['obj'] == "gene"
    ]
    return check_if_operon(sentence, spans)

# Function to return gene position pairs from a sentence based on annotations
def return_gene_pos_pairs(sentence,X):
    """Return every ordered pair of gene spans found in ``sentence``.

    The ``[begin, end]`` spans of the ``"gene"`` records in
    ``X['annotations']`` are passed through ``check_if_operon``; the result
    is the full cross product of the returned spans as ``[first, second]``
    lists, including each span paired with itself.
    """
    spans = []
    for record in X['annotations']:
        if record['obj'] != "gene":
            continue
        span = record['span']
        spans.append([span['begin'], span['end']])

    spans = check_if_operon(sentence, spans)
    return [[first, second] for first in spans for second in spans]

# Function to query raw annotations for a given text
def query_raw(text, url="http://bern2.korea.ac.kr/plain", max_retries=5, retry_delay=1.0, timeout=None):
  """POST ``text`` to the annotation service and return the decoded JSON.

  A failed request is retried a bounded number of times, and the result of
  a successful retry is returned to the caller. Previously a retry's result
  was discarded (the caller received ``None``), the retry ignored ``url``,
  and a bare ``except`` also swallowed ``KeyboardInterrupt``.

  Args:
      text: Text to annotate.
      url: Endpoint that accepts ``{"text": ...}`` as JSON.
      max_retries: Total number of attempts (at least one is always made).
      retry_delay: Base pause in seconds; attempt ``k`` waits ``k * retry_delay``.
      timeout: Passed to ``requests.post``; ``None`` waits indefinitely.

  Returns:
      The decoded JSON response.

  Raises:
      RuntimeError: If every attempt fails; the last error is chained.
  """
  import time

  attempts = max(1, max_retries)
  last_error = None
  for attempt in range(1, attempts + 1):
    try:
      return requests.post(url, json={'text': text}, timeout=timeout).json()
    except (requests.exceptions.RequestException, ValueError) as err:
      # RequestException: connection/HTTP problems; ValueError: body was not valid JSON
      last_error = err
      if attempt < attempts:
        time.sleep(retry_delay * attempt)
  raise RuntimeError("Annotation request to " + str(url) + " failed after " + str(attempts) + " attempts") from last_error

# Function to check if genes are part of an operon and modify positions accordingly
def check_if_operon(sentence,pos):
    """Merge gene spans that are joined by a single hyphen.

    Two consecutive spans are joined when the second begins exactly one
    character after the first ends and that character of ``sentence`` is
    ``"-"`` (e.g. ``lacZ-lacY``). Runs of joined spans collapse into a single
    ``[begin of first, end of last]`` span.

    The result keeps the order of ``pos``. The earlier implementation
    re-inserted merged spans at indices computed before the removals, which
    misplaced them when a sentence held more than one operon, and fused two
    separate operons that happened to be adjacent in ``pos``.

    Args:
        sentence: The text the spans refer to.
        pos: List of ``[begin, end]`` spans in order of occurrence.

    Returns:
        A new list of spans; ``pos`` and its spans are not modified.
    """
    merged = []
    previous_end = None
    for span in pos:
        begin, end = span[0], span[1]
        hyphen_joined = (
            previous_end is not None
            and begin - previous_end == 1
            and sentence[begin - 1] == "-"
        )
        if hyphen_joined:
            # Extend the current run with a fresh list so the caller's spans stay untouched.
            merged[-1] = [merged[-1][0], end]
        else:
            merged.append(span)
        previous_end = end
    return merged

# Function for Named Entity Recognition (NER) tasks
def NER_1(df):
  """Build one tagged sentence per ordered pair of distinct gene spans.

  Reads the ``"Sentence"`` and ``"id"`` columns of ``df``. Every unique
  sentence is annotated with ``query_raw``; for each ordered pair of gene
  spans returned by ``return_gene_pos_pairs`` (pairs of a span with itself
  are skipped) one output row is produced.

  Returns a new DataFrame with the columns ``ID``, ``ind``,
  ``Sent original``, ``Sent tagged``, ``Agent``, ``Target``, ``All_X``,
  ``all_non_interested_genes`` and ``new_sents_blanks``, with NaN replaced
  by ``''``.
  """
  # Initialize empty lists to store information
  ID = []
  sent_tagged = []
  sent_original = []
  pair = []  # not used anywhere in this function
  add_lengths = 0  # not used anywhere in this function
  agents=[]
  targets=[]
  All_X = []
  all_non_interested_genes = []
  new_sents = []

  # Extract unique sentences from the dataframe
  sentences = list(df["Sentence"].unique())

  # For each unique sentence, remember the "id" of its first occurrence in df
  IDs = []
  ind = []
  for sentence in sentences:
    # Extract document ID for the current sentence
    index = list(df[df['Sentence']==sentence]["id"])
    IDs.append(index[0])

  # Iterate through unique sentences
  for x in range(len(sentences)):
      # Query raw annotations for the current sentence
      X = query_raw(sentences[x])
      # Get gene position pairs for the current sentence
      # (these spans have been passed through check_if_operon)
      pos = return_gene_pos_pairs(sentences[x],X)

      # Iterate through gene position pairs
      for j in range(len(pos)):
          # Get gene pairs for the current sentence
          # NOTE(review): this call does not depend on j and could be hoisted out of the loop.
          # NOTE(review): return_gene_pairs does not go through check_if_operon, so when
          # spans were merged there `genes` and `pos` may have different lengths and
          # genes[j] may not describe pos[j] - confirm and align the two.
          genes = return_gene_pairs(sentences[x],X)
          sent = sentences[x]

          # Skip pairs made of a span with itself
          if(pos[j][0] != pos[j][1]):
              agent = genes[j][0]
              target = genes[j][1]

              # Placeholders are prefixes of fixed literals cut to the length of the
              # gene text, presumably so that the character offsets used for the
              # second replacement below stay valid after the first one.
              # NOTE(review): a name longer than the literal gets a shorter placeholder.
              agent_placeholder = "*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!"[0:len(agent)]
              target_placeholder = "^(#^(#^(#^(#^(#^(#^(#^(#^(#^(#^(#^(#^(#^(#^(#^(#^(#^(#^(#^(#^(#"[0:len(target)]

              # Overwrite the agent span, then the target span, with the placeholders,
              # and finally swap the placeholders for the GENE1 / GENE2 tags
              sent = sent[0:pos[j][0][0]]+agent_placeholder+sent[pos[j][0][1]:len(sent)]
              sent = sent[0:pos[j][1][0]]+target_placeholder+sent[pos[j][1][1]:len(sent)]
              sent = sent.replace(agent_placeholder,"GENE1").replace(target_placeholder,"GENE2")

          else:
              continue

          ID.append(IDs[x])
          All_X.append(X)

          agents.append(genes[j][0])
          targets.append(genes[j][1])
          sent_tagged.append(sent)

          # Store original sentence for reference
          sent_original.append(sentences[x])
          ind.append(x)

          # From here on `genes` is the flat list of gene mentions (not pairs) and
          # pos_2 the list of gene spans after check_if_operon
          genes = return_genes(sentences[x],X)
          pos_2 = return_gene_pos(sentences[x],X)
          all_non_interested_genes.append(genes)

          # Initialize variables for sentence modification
          cur_sent = sent
          temp_sent = cur_sent
          placeholders = []

          # Replace the gene mentions with digit placeholders
          for y in range(len(genes)):

              # True unless the gene text itself contains the literal "GENE1"/"GENE2"
              if("GENE1" not in genes[y] and "GENE2" not in genes[y]):

                  try:
                    # NOTE(review): pos_2 holds offsets into the original sentence while
                    # `sent` already carries the GENE1/GENE2 tags; only the length of
                    # this slice is used.
                    item = sent[pos_2[y][0]:pos_2[y][1]]
                    length = len(item)
                    # Placeholder: str(y) repeated once per character of the slice
                    placeholder = ""
                    for l in range(length):
                        placeholder = placeholder + str(y)
                    if(len(placeholder)>0):
                        placeholders.append(placeholder)
                  except:
                    # NOTE(review): bare except; it is hit e.g. when pos_2 is shorter
                    # than genes (IndexError), and the gene is then left untouched.
                    continue


                  # Replaces every occurrence of the gene text in the tagged sentence
                  temp_sent = temp_sent.replace(genes[y], placeholder)
          # Replace standardized tags for consistency
          temp_sent = temp_sent.replace("GENE1","$GENE_AGENT#").replace("GENE2","$GENE_TARGET#")
          # Replace remaining placeholders with a common tag
          # NOTE(review): str.replace also hits any identical digit run already in the sentence.
          for p in placeholders:
              temp_sent = temp_sent.replace(p,"BLANK")
          # Append the modified sentence to the list
          new_sents.append(temp_sent)

  # Create a new DataFrame to store the processed information
  new_df = pd.DataFrame()
  new_df["ID"]=ID
  new_df['ind']=ind
  new_df['Sent original'] = sent_original
  new_df['Sent tagged'] = sent_tagged
  new_df['Agent'] = agents
  new_df['Target'] = targets
  new_df["All_X"] =All_X
  new_df["all_non_interested_genes"]=all_non_interested_genes
  new_df["new_sents_blanks"]=new_sents

  # Fill NaN values in the DataFrame with empty strings
  new_df = new_df.fillna('')

  # Return the modified DataFrame
  return(new_df)


# **Relation extraction - Relation classification:** The following cell contains the method for relation extraction. The model uses our fine-tuned BioBERT classifier relation_extraction_f.pth.


In [11]:
# Function for relation extraction
def relation_extraction(df, max_len=256):
  """Classify every tagged sentence of ``df`` with the relation model.

  Loads the weights stored at ``relation_extraction_model`` into the global
  ``model``, runs it over the ``'new_sents_blanks'`` column and adds three
  columns to ``df``:

  * ``'prediction_RE'``   - 0 when the first logit is positive, otherwise 1;
  * ``'prediction_RE_0'`` - element-wise sigmoid of the first logit;
  * ``'prediction_RE_1'`` - element-wise sigmoid of the second logit.

  Args:
      df: DataFrame with a ``'new_sents_blanks'`` column; modified in place.
      max_len: Number of token ids per sentence after padding/truncation
          (defaults to the value that used to be hard-coded).

  Returns:
      A copy of the extended DataFrame with NaN replaced by ``''``.
  """
  # map_location lets weights saved on a GPU be loaded on a CPU-only machine,
  # and .to(device) replaces .cuda(), which fails without a GPU.
  model.load_state_dict(torch.load(relation_extraction_model, map_location=device))
  model.to(device)
  # Switch off dropout so predictions are deterministic even when this
  # function is run without the sentence classifier having run first.
  model.eval()

  feature = df['new_sents_blanks']

  # Tokenize and prepare input data for the model
  tokenizer = BertTokenizer.from_pretrained('biobert_v1.1_pubmed', do_lower_case=True)
  tokenized_texts = [['[CLS]'] + tokenizer.tokenize(t) + ['[SEP]'] for t in feature]

  # Convert tokens to ids once, then pad/truncate to max_len
  input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts],
                            maxlen=max_len, dtype="long", truncating="post", padding="post")

  validation_inputs = torch.tensor(input_ids)
  # Attention mask: 1 for real tokens, 0 for padding (padding id is 0)
  validation_masks = (validation_inputs > 0).long()

  batch_size = 8
  validation_data = TensorDataset(validation_inputs, validation_masks)
  validation_sampler = SequentialSampler(validation_data)
  validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

  all_predictions = []
  all_predictions_0 = []
  all_predictions_1 = []

  # Inference only: do not compute or store gradients
  with torch.no_grad():
    for batch in validation_dataloader:
      b_input_ids, b_input_mask = (t.to(device) for t in batch)
      logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

      # Label 0 when the first logit is positive, label 1 otherwise
      all_predictions.extend((~(logits[:, 0] > 0)).long().tolist())

      # Per-class sigmoid scores, computed with torch instead of TensorFlow
      scores = torch.sigmoid(logits).cpu().numpy()
      all_predictions_0.extend(scores[:, 0])
      all_predictions_1.extend(scores[:, 1])

  # Add relation extraction predictions and scores to the DataFrame
  df['prediction_RE']=all_predictions
  df['prediction_RE_0']=all_predictions_0
  df['prediction_RE_1']=all_predictions_1

  # Fill NaN values in the DataFrame with empty strings
  df = df.fillna('')
  return df

# **Post-processing - Refinement:** The following cell contains code to refine entity names and compute the confidence factor.

In [12]:
# Function to remove unwanted words from 'Agent' and 'Target' columns
def remove_unwanted_wds(df):
  """Strip unwanted words from multi-word agent and target names.

  The unwanted words are read from the ``'Unwanted_words'`` column of the
  spreadsheet at ``unwanted_words_list``. Single-word names are only
  stripped of surrounding whitespace; in multi-word names every
  space-separated word found in the unwanted list is dropped.

  The results are written to the new columns ``'Agent_uws'`` and
  ``'Target_uws'`` of a copy of ``df``, so a filtered slice passed by the
  caller is not written to.

  Args:
      df: DataFrame with string columns ``'Agent'`` and ``'Target'``.

  Returns:
      A new DataFrame with the two extra columns and NaN replaced by ``''``.
  """
  # Build the lookup once; previously the column was converted to a list for
  # every single word that was checked.
  unwanted = set(pd.read_excel(unwanted_words_list)['Unwanted_words'])

  def _strip_unwanted(name):
      """Return ``name`` without unwanted words (single words are kept as they are)."""
      words = name.split(" ")
      if len(words) == 1:
          return name.strip()
      return " ".join(word for word in words if word not in unwanted).strip()

  df = df.copy()
  df['Agent_uws'] = [_strip_unwanted(name) for name in df['Agent']]
  df['Target_uws'] = [_strip_unwanted(name) for name in df['Target']]

  # Fill any NaN values in the DataFrame with empty strings
  df = df.fillna('')
  return df

# Function to remove non-gene words from 'Agent_uws' and 'Target_uws' columns
def remove_non_gene(df):
  """Keep only the words of multi-word names that the NER service tags as genes.

  For every value in ``'Agent_uws'`` and ``'Target_uws'``: a name without a
  space is kept unchanged; otherwise each non-empty word is sent to
  ``query_raw`` and kept only when the first annotation returned for it has
  type ``"gene"``. The results are stored in the new columns
  ``'Agent_uws_gn'`` and ``'Target_uws_gn'`` of ``df``.

  Each distinct word is looked up once per call, which reduces the number
  of requests sent to the annotation service.

  Args:
      df: DataFrame with string columns ``'Agent_uws'`` and ``'Target_uws'``;
          the new columns are added to it in place.

  Returns:
      A copy of the extended DataFrame with NaN replaced by ``''``.
  """
  gene_lookup = {}

  def _is_gene(word):
      """Return True when the service's first annotation for ``word`` is a gene."""
      if word not in gene_lookup:
          X = query_raw(word)
          # A response without annotations means "not a gene".
          annotations = X.get('annotations') if isinstance(X, dict) else None
          gene_lookup[word] = bool(annotations) and annotations[0]['obj'] == "gene"
      return gene_lookup[word]

  def _gene_words_only(name):
      """Return ``name`` reduced to its gene words (single words are kept as they are)."""
      words = name.split(" ")
      if len(words) <= 1:
          return words[0]
      return " ".join(word for word in words if word and _is_gene(word)).strip()

  Refined_Agents = [_gene_words_only(name) for name in df['Agent_uws']]
  Refined_Target = [_gene_words_only(name) for name in df['Target_uws']]

  # Add refined 'Agent_uws_gn' and 'Target_uws_gn' columns to the DataFrame
  df['Agent_uws_gn']=Refined_Agents
  df['Target_uws_gn']=Refined_Target

  # Fill any NaN values in the DataFrame with empty strings
  df = df.fillna('')
  return df

# **Post-processing: Confidence computation:** The following cell contains the methods for computing confidence factor for extracted relations.

In [26]:
# Function to compute confidence scores at the relation level
def compute_confidence(df2):
  """Attach a sentence-level confidence score to every relation row.

  For each relation (value of the ``'Relation'`` column) the score is the
  sum, over the distinct ``'Sent original'`` values supporting it, of the
  average ``'prediction_RE_1'`` of that sentence's rows. Two columns are
  added to ``df2`` in place: ``'sentence_level_c'`` (the score) and
  ``'re_unique_sentences'`` (the number of distinct supporting sentences).

  The score depends only on the relation, so it is computed once per
  distinct relation and reused for every row carrying it.

  Args:
      df2: DataFrame with the columns ``'Relation'``, ``'Sent original'``
          and ``'prediction_RE_1'``.

  Returns:
      ``df2`` itself, with the two columns added.
  """
  per_relation = {}

  for pair in df2['Relation']:
      if pair in per_relation:
          continue
      rows = df2[df2['Relation']==pair][['Sent original','prediction_RE_1']]
      supporting = rows["Sent original"].unique()

      # Sum of the per-sentence average scores, in order of first appearance
      score = 0
      for sentence in supporting:
          score = score + np.average(rows[rows['Sent original']==sentence]['prediction_RE_1'])
      per_relation[pair] = (score, len(supporting))

  # Add 'sentence_level_c' and 're_unique_sentences' columns to the DataFrame
  df2['sentence_level_c'] = [per_relation[pair][0] for pair in df2['Relation']]
  df2['re_unique_sentences'] = [per_relation[pair][1] for pair in df2['Relation']] #sentences giving the same relationship
  return df2

# Function to check if relations are related to E. Coli regulations
def if_e_coli_reg(df2):
  """Flag the rows of ``df2`` that match a regulation listed in the spreadsheet.

  The spreadsheet at ``E_Coli_Regulations`` is read through its ``'Agent'``
  and ``'Target'`` columns. A row of ``df2`` is flagged with 1 in the new
  ``'E_Coli_regulation'`` column when some listed agent is a
  case-insensitive substring of its ``'Agent_uws_gn'`` and the listed target
  of that same entry is a case-insensitive substring of its
  ``'Target_uws_gn'``; otherwise it is flagged with 0. ``df2`` is modified
  in place and returned.
  """
  known = pd.read_excel(E_Coli_Regulations)
  flags = []

  for agent, target in zip(df2['Agent_uws_gn'], df2['Target_uws_gn']):
      # Stops at the first listed regulation matching both names
      listed = any(
          known_agent.lower() in agent.lower() and known_target.lower() in target.lower()
          for known_agent, known_target in zip(known['Agent'], known['Target'])
      )
      flags.append(1 if listed else 0)

  df2['E_Coli_regulation'] = flags
  return df2


# Function to check if agents are related to E. Coli regulons

def if_e_coli_regulon(df2):
  """Flag the rows of ``df2`` whose agent contains a listed regulator name.

  The regulator names are the distinct, lower-cased values of the
  ``'Agent'`` column of the spreadsheet at ``E_Coli_Regulations``. A row is
  flagged with 1 in the new ``'is_regulon'`` column when one of those names
  equals one of the space-separated, lower-cased words of its
  ``'Agent_uws_gn'``; otherwise it is flagged with 0. ``df2`` is modified in
  place and returned.
  """
  known = pd.read_excel(E_Coli_Regulations)
  regulators = [name.lower() for name in list(known['Agent'].unique())]

  flags = []
  for agent in df2['Agent_uws_gn']:
      # Whole-word comparison; stops at the first regulator found
      listed = any(
          regulator.lower() in agent.lower().split(" ")
          for regulator in regulators
      )
      flags.append(1 if listed else 0)

  df2['is_regulon'] = flags
  return df2


# Function to calculate the final confidence score
def final_conf(df2):
  """Combine the partial scores into the final confidence column ``'c'``.

  ``c = sentence_level_c + is_regulon + 3 * E_Coli_regulation``. The last two
  terms come from ``if_e_coli_regulon`` and ``if_e_coli_reg``, which also add
  their own columns to ``df2``. ``df2`` is modified in place and returned.
  """
  sentence_score = df2['sentence_level_c']
  regulon_flag = if_e_coli_regulon(df2)['is_regulon']
  regulation_flag = if_e_coli_reg(df2)['E_Coli_regulation']

  df2['c'] = sentence_score + regulon_flag + 3*regulation_flag
  return df2

In [None]:

# Search for articles based on keywords
E_Coli = search(Keywords,Article_count)

# Extract the articles and drop every row with a missing field (e.g. no abstract)
df_articles = extract_articles(E_Coli['IdList'])
df_articles = df_articles.dropna(axis=0)

# Tokenize abstracts and perform sentence classification
df_sents_1 = tokenize_abs(df_articles)
df_sents_2 = sentence_classification_1(df_sents_1, max_len)
df_sents_2 = df_sents_2.fillna('')

# Identify sentences containing genes.
# .copy() hands over an independent frame: the next step adds columns, and
# adding columns to a filtered slice triggers pandas' SettingWithCopy behaviour.
df_sents_3 = if_contains_genes(df_sents_2[df_sents_2["predictied"]==1].copy())

# Perform Named Entity Recognition (NER)
df_sents_NER = NER_1(df_sents_3[df_sents_3["gene_mention"]==1])

# Extract relations using relation extraction model
df_sents_RE = relation_extraction(df_sents_NER)

# Remove unwanted words from refined relations
df_RE_refined_1 = remove_unwanted_wds(df_sents_RE[df_sents_RE['prediction_RE']==1].copy())
df_RE_refined_1 = df_RE_refined_1.fillna('')

# Remove non-gene words from refined relations
df_RE_refined_2 = remove_non_gene(df_RE_refined_1)
df_RE_refined_2 = df_RE_refined_2.fillna('')

# Keep only the rows where both 'Agent_uws_gn' and 'Target_uws_gn' are non-empty
# (copied because columns are added to this frame below)
both_named = (df_RE_refined_2['Agent_uws_gn']!="") & (df_RE_refined_2['Target_uws_gn']!="")
df_RE_refined_2 = df_RE_refined_2[both_named].copy()

# Create a 'Relation' column by combining 'Agent_uws_gn' and 'Target_uws_gn'
df_RE_refined_2['Relation'] = df_RE_refined_2['Agent_uws_gn']+"--"+df_RE_refined_2['Target_uws_gn']

# Compute confidence scores for relations
df_RE_scored = compute_confidence(df_RE_refined_2)
# For E. Coli regulations
df_RE_scored_EColi = final_conf(df_RE_scored)

# Extract final relations and remove duplicates
Final_relations = df_RE_scored_EColi[['Agent_uws_gn','Target_uws_gn','Relation','sentence_level_c','re_unique_sentences','is_regulon','E_Coli_regulation','c']].drop_duplicates()

# Sort once, export the final relations to an Excel file and display them
print("Final Relations extracted as follows")
Final_relations_sorted = Final_relations.sort_values(['c'],ascending=False)
Final_relations_sorted.to_excel("ExGI_result.xlsx")
Final_relations_sorted