# Instructions
Run every cell of the notebook except the last one to load all the packages, models and functions. Then run the last cell and, when prompted, enter a sentence; it is stored in the `test_sentence` string variable. Your sentence must contain tags to indicate the entities of the relation you would like the model to predict. For example, your sentence may be: `'The <e1>bottle</e1> is filled with <e2>water</e2>.'`. The `'<e1>'` and `'</e1>'` tags indicate the first entity, `'bottle'`, and the `'<e2>'` and `'</e2>'` tags indicate the second entity, `'water'`.

In [None]:
import numpy as np
import pandas as pd
import nltk
import re
import math
import json
import pickle
import shutil
import os
import tensorflow as tf

from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.metrics import precision_score, recall_score

from sklearn.model_selection import KFold


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dropout, Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding, Bidirectional, SpatialDropout1D
from tensorflow.keras.layers import Concatenate, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau

import gensim.downloader as api


from google.colab import drive

# Base directory on the mounted Google Drive; model artifacts are loaded from beneath it.
path = '/content/drive/MyDrive/Text_Mining/BILSTM/'
# Mount Google Drive at /content/drive so that files under `path` can be read.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## **Data Pre-processing**

In [None]:
# method to turn a relation label into a one-hot vector
def one_hot_encode_relations(relation_label, num_classes=19):
  """Return a float vector of length ``num_classes`` marking ``relation_label``.

  Args:
    relation_label: index of the relation class to mark.
    num_classes: number of relation classes, i.e. the length of the vector.

  Returns:
    A 1-D NumPy array of zeros in which position ``relation_label`` is set to 1.
  """
  encoded = np.full(num_classes, 0.0)
  encoded[relation_label] = 1.0
  return encoded

# helper: relative position of every token with respect to one tagged entity
def _entity_relative_positions(tokens, open_tag, close_tag, entity_index):
    """Return one relative-position value per token for a single entity.

    Args:
        tokens: list of token strings (tags are separate tokens).
        open_tag: opening tag of the entity, e.g. ``"<e1>"``.
        close_tag: closing tag of the entity, e.g. ``"</e1>"``.
        entity_index: token index of the entity's opening tag.

    Returns:
        A list of ints, the same length as ``tokens``.
    """
    positions = []
    inside_entity = False

    for i, token in enumerate(tokens):
        if token == open_tag:
            positions.append(-1)
            inside_entity = True
        elif token == close_tag:
            positions.append(1)
            inside_entity = False
        elif inside_entity:
            # every word between the tags counts as the entity itself
            positions.append(0)
        elif positions and positions[-1] >= 0:
            # after the entity: keep counting upwards from the previous value
            positions.append(positions[-1] + 1)
        else:
            # before the entity: negative offset from the opening tag
            positions.append(i - entity_index)

    return positions


# method to produce relative position encodings for sentences
def get_relative_positions(sentence):
    """Tokenize ``sentence`` and compute token positions relative to e1 and e2.

    Tags and whitespace-separated words become tokens. An opening tag is split
    off even when glued to the following word, but a closing tag glued to the
    preceding word is absorbed into that word by the ``\\S+`` alternative of the
    pattern, so tags should be space-delimited beforehand (see
    ``replace_entity_markers``).

    Args:
        sentence: text containing ``<e1>...</e1>`` and ``<e2>...</e2>`` markers.

    Returns:
        A tuple ``(tokens, pos1, pos2)`` where ``tokens`` is the token list and
        ``pos1`` / ``pos2`` hold one int per token: negative offsets before the
        entity, -1 on the opening tag, 0 inside the entity, 1 on the closing
        tag, and increasing values afterwards.

    Raises:
        ValueError: if no ``<e1>`` or no ``<e2>`` token is found.
    """
    tokens = re.findall(r'<e\d+>|</e\d+>|\S+', sentence)

    # Locate the opening tags; if a tag occurs more than once the last one wins.
    e1_index = None
    e2_index = None
    for i, token in enumerate(tokens):
        if token == "<e1>":
            e1_index = i
        elif token == "<e2>":
            e2_index = i

    # make sure valid indices were found in the sentence
    if e1_index is None or e2_index is None:
        raise ValueError("Both <e1> and <e2> entities must be present.")

    pos1 = _entity_relative_positions(tokens, "<e1>", "</e1>", e1_index)
    pos2 = _entity_relative_positions(tokens, "<e2>", "</e2>", e2_index)

    return tokens, pos1, pos2

# method to turn a dataset of tagged sentences into padded model inputs and one-hot labels
def preprocess_dataset(dataset, tokenizer, max_seq_length=128, num_classes=19):
  """Encode every row of ``dataset`` as token ids, position sequences and a label.

  Args:
    dataset: DataFrame with a 'sentence' column (tagged text) and a 'relation'
      column (class index).
    tokenizer: object exposing ``texts_to_sequences``; it is given the token
      list produced by ``get_relative_positions``.
    max_seq_length: length every sequence is padded to (padding is appended
      at the end).
    num_classes: length of the one-hot label vectors.

  Returns:
    Four NumPy arrays: token ids, e1-relative positions, e2-relative
    positions, and one-hot labels, each with one entry per dataset row.
  """
  def _pad(sequence):
    # pad a single sequence at the end up to max_seq_length
    return pad_sequences([sequence], maxlen=max_seq_length, padding='post')[0]

  word_rows, e1_rows, e2_rows, label_rows = [], [], [], []

  for _, row in dataset.iterrows():
    text, label = row['sentence'], row['relation']

    tokens, e1_offsets, e2_offsets = get_relative_positions(text)

    # map tokens to vocabulary indices
    # NOTE(review): if the tokenizer drops unknown tokens, the id sequence can be
    # shorter than the position sequences — confirm against the tokenizer config.
    word_ids = tokenizer.texts_to_sequences([tokens])[0]

    padded_words = _pad(word_ids)
    padded_e1 = _pad(e1_offsets)
    padded_e2 = _pad(e2_offsets)
    label_vector = one_hot_encode_relations(label, num_classes)

    word_rows.append(padded_words)
    e1_rows.append(padded_e1)
    e2_rows.append(padded_e2)
    label_rows.append(label_vector)

  return np.array(word_rows), np.array(e1_rows), np.array(e2_rows), np.array(label_rows)

# method to surround each entity marker with spaces so it tokenizes as its own token
def replace_entity_markers(text):
    """Return ``text`` with every e1/e2 opening and closing tag padded by spaces.

    A space is inserted on both sides of each tag unconditionally, so text that
    is already spaced ends up with doubled spaces around the tags.
    """
    for marker in ("<e1>", "</e1>", "<e2>", "</e2>"):
        text = text.replace(marker, " " + marker + " ")
    return text

## **Interact with Model**

In [None]:
# Maps a predicted class index to its relation label. Each directed relation
# takes two consecutive indices: (e1,e2) on the even one, (e2,e1) on the odd one.
class_label_mapping = {
    2 * relation_idx + direction_idx: relation + "(" + direction + ")"
    for relation_idx, relation in enumerate((
        "Cause-Effect",
        "Component-Whole",
        "Content-Container",
        "Entity-Destination",
        "Entity-Origin",
        "Instrument-Agency",
        "Member-Collection",
        "Message-Topic",
        "Product-Producer",
    ))
    for direction_idx, direction in enumerate(("e1,e2", "e2,e1"))
}
# the undirected catch-all class comes last
class_label_mapping[18] = "Other"

def load_tokenizer(tokenizer_path):
    """Unpickle and return the object stored in the file at ``tokenizer_path``.

    NOTE: unpickling can execute arbitrary code, so only load files that come
    from a trusted source.
    """
    with open(tokenizer_path, 'rb') as tokenizer_file:
        return pickle.load(tokenizer_file)

def load_model(model_path):
    """Load and return the Keras model saved at ``model_path``."""
    return tf.keras.models.load_model(model_path)

def sentence__relation_prediction(sentence, tokenizer, model, max_seq_length=256):
  """Predict the relation between the two tagged entities of ``sentence``.

  Args:
    sentence: text with the entities wrapped in ``<e1>...</e1>`` and
      ``<e2>...</e2>`` tags.
    tokenizer: tokenizer passed through to ``preprocess_dataset``.
    model: model whose ``predict`` accepts the list
      ``[token ids, e1 positions, e2 positions]``.
    max_seq_length: padded sequence length for the inputs.
      NOTE(review): presumably must match the length the model was trained
      with — confirm.

  Returns:
    The label from ``class_label_mapping`` for the highest-scoring class, or
    ``None`` if the predicted index is not in the mapping.

  Raises:
    ValueError: propagated from ``get_relative_positions`` when the sentence
      lacks an ``<e1>`` or ``<e2>`` tag.
  """
  # space-delimit the tags so each one becomes its own token
  sentence = replace_entity_markers(sentence)

  # preprocess_dataset expects a DataFrame with a label column; 18 ("Other") is
  # only a placeholder here and the resulting one-hot label is discarded.
  sentence_df = pd.DataFrame([[sentence, 18]], columns=["sentence", "relation"])

  tokenized_sentence, e1_pos_encoding, e2_pos_encoding, _ = preprocess_dataset(sentence_df, tokenizer, max_seq_length)

  y_pred_prob = model.predict([tokenized_sentence, e1_pos_encoding, e2_pos_encoding])

  # single-sentence batch: take the arg-max class of the first (only) row
  predicted_class = np.argmax(y_pred_prob, axis=1)[0]

  return class_label_mapping.get(predicted_class)

In [None]:
# Load the saved model and the pickled tokenizer from the `models/` folder under `path`.
best_model = load_model(path + 'models/best_bilstm_model_75.keras')
tokenizer = load_tokenizer(path + 'models/best_bilstm_tokenizer_75.pickle')

In [None]:
# Example prediction on a hard-coded sentence whose entities are marked with <e1>/<e2> tags.
test_sentence = 'The <e1>bottle</e1> was filled with <e2>water</e2> and placed on the table.'
predicted_relation = sentence__relation_prediction(test_sentence, tokenizer, best_model)
print(predicted_relation)

19572
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
Content-Container(e2,e1)


In [None]:
# Interactive prediction: read a tagged sentence from the user and print the predicted relation.
test_sentence = input("Please Enter a sentence with two entities in the format provided in the previous cell: ")
predicted_relation = sentence__relation_prediction(test_sentence, tokenizer, best_model)
print(predicted_relation)

Please Enter a sentence with two entities in the format provided in the previous cell: The <e1>bottle</e1> was filled with <e2>water</e2> and placed on the table.
19572
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
Content-Container(e2,e1)
