For inference, please follow the instructions before the second last cell.


In [120]:
!pip install datasets torch numpy spacy joblib pandas



In [121]:
from datasets import load_dataset
from datasets import Dataset
from collections import Counter
import torch
import torch.nn.functional as F
import torch
import torch.nn as nn
import joblib
import numpy as np
import spacy
from spacy.tokenizer import Tokenizer
import re
import warnings
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd

In [122]:
warnings.filterwarnings("ignore")

In [123]:
dataset = load_dataset("sem_eval_2010_task_8")

In [124]:
nlp = spacy.load("en_core_web_sm")

special_tokens = ["<e1>", "</e1>", "<e2>", "</e2>"]

def custom_tokenizer(nlp):
    prefix_re = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes)
    infix_re = spacy.util.compile_infix_regex(nlp.Defaults.infixes)

    tokenizer = Tokenizer(nlp.vocab,
                          prefix_search=prefix_re.search,
                          suffix_search=suffix_re.search,
                          infix_finditer=infix_re.finditer,
                          token_match=None)

    for token in special_tokens:
        tokenizer.add_special_case(token, [{"ORTH": token}])

    return tokenizer

nlp.tokenizer = custom_tokenizer(nlp)

def tokenize(text):
    for token in special_tokens:
        text = text.replace(token, f" {token} ")  #Adding spaces to isolate markers

    return [token.text.lower() for token in nlp(text)]

In [125]:
from gensim.models import KeyedVectors
import gensim.downloader as api
import numpy as np

def load_word2vec(word2idx):
  word2vec = api.load("word2vec-google-news-300")
  vocab_size = len(word2idx)
  embedding_matrix = np.random.uniform(-0.25, 0.25, (vocab_size, embedding_dim))

  for word, idx in word2idx.items():
      if word in word2vec:
          embedding_matrix[idx] = word2vec[word]

  return embedding_matrix

In [126]:
def load_word_embeddings(word2idx, vecType):
  if vecType == "turian":
    embedding_file = "embeddings-scaled.50.gz"
  elif  vecType == "word2vec":
    word_embeddings = load_word2vec(word2idx)

  return word_embeddings


In [127]:
#creating a mapping of vocabulary

word_freq = Counter(word for sentence in dataset["train"]["sentence"] for word in tokenize(sentence))
word2idx = {"<PAD>": 0, "<UNK>": 1}

for i, (word, _) in enumerate(word_freq.most_common(), start=2):
    word2idx[word] = i

In [128]:
#Word2Vec embeddings are too large to share as a file (1.5GB). Please download. Will take ~5 min
word_embeddings=load_word_embeddings(word2idx, "word2vec")

In [129]:
entity_pos = { "inference":[]}
target_classes = 19
num_clusters = 5
batch_size = 32
max_num_clusters=15
max_sentence_len=80
embedding_dim=300

In [130]:
relation_names=[
"Cause-Effect(e1,e2)",
"Cause-Effect(e2,e1)",
"Component-Whole(e1,e2)",
"Component-Whole(e2,e1)",
"Content-Container(e1,e2)",
"Content-Container(e2,e1)",
"Entity-Destination(e1,e2)",
"Entity-Destination(e2,e1)",
"Entity-Origin(e1,e2)",
"Entity-Origin(e2,e1)",
"Instrument-Agency(e1,e2)",
"Instrument-Agency(e2,e1)",
"Member-Collection(e1,e2)",
"Member-Collection(e2,e1)",
"Message-Topic(e1,e2)",
"Message-Topic(e2,e1)",
"Product-Producer(e1,e2)",
"Product-Producer(e2,e1)",
"Other"
]

In [131]:

def encode_sentence(tokens, train_or_test, word2idx=word2idx, max_len=max_sentence_len):

    #saving positions of entities in each sentence for later use
    ei1_s = tokens.index("<e1>")
    ei1_e = tokens.index("</e1>")
    ei2_s = tokens.index("<e2>")
    ei2_e = tokens.index("</e2>")
    entity_pos[train_or_test].append(((ei1_s + 1, ei1_e),  (ei2_s + 1, ei2_e)))


    #getting unique indices for each word based on vocab dictionary
    indices = [word2idx.get(token, word2idx["<UNK>"]) for token in tokens]
    indices = indices[:max_len] + [word2idx["<PAD>"]] * (max_len - len(indices))

    return torch.tensor(indices, dtype=torch.long)

In [132]:
def get_data_loader(data, train_or_test, shuffle, entity_hidden_states = None, entity_features = None):
  encoded_sentences = [encode_sentence(tokenize(sentence), train_or_test) for sentence in data["sentence"]]
  labels = torch.tensor(data["relation"], dtype=torch.long)
  sentences_tensor = torch.stack(encoded_sentences)

  dataset_args = [sentences_tensor, labels]

  if entity_hidden_states is not None and entity_features is not None:
    dataset_args.append(entity_hidden_states)
    dataset_args.append(entity_features)

  torch_dataset = TensorDataset(*dataset_args)
  dataloader = DataLoader(torch_dataset, batch_size=batch_size, shuffle=shuffle)
  return dataloader

In [133]:
def get_hs_mean(sent_hidden_states, ep):
  start = ep[0]
  end = ep[1]
  if start == end:
    return sent_hidden_states[start]
  else:
    return torch.sum(sent_hidden_states[start:end], dim=0) / (end - start)

In [134]:
def get_entities_hs_from_sentence(lstm_out, pos_index, train_or_test):

  en_hs_list = []
  for sent_hid_states in (lstm_out):
    (e1p, e2p) = entity_pos[train_or_test][pos_index]
    h1 = get_hs_mean(sent_hid_states, e1p)
    h2 = get_hs_mean(sent_hid_states, e2p)
    en_pair = torch.stack([h1, h2]) #dim => 2 x hidden_dim * 2
    en_hs_list.append(en_pair)
    pos_index += 1


  return torch.stack(en_hs_list), pos_index #dim => batch_size x 2 x hidden_dim * 2

In [135]:
def get_entity_hidden_states(model, loader, train_or_test):

  #Getting the hidden states of entities based on position in sentence

  model.eval()
  en_hs_tensors_list = []
  with torch.no_grad():
      pos_index = 0
      for x_batch, y_batch in loader:
        _, lstm_out = model(x_batch)
        en_hs_list, pos_index = get_entities_hs_from_sentence(lstm_out, pos_index, train_or_test)
        en_hs_tensors_list.append(en_hs_list)

      return torch.cat(en_hs_tensors_list, dim=0)  #dim => batch_size, 2, hidden_dim * 2


In [136]:
def get_entity_type_features(hidden_states, clustering_type, model, hs_array=None):

  if hs_array is None:
    hs_array = np.array([e for tup in hidden_states for e in tup])

  if clustering_type == 'KMeans':
    entity_types = model.predict(hs_array)
  elif clustering_type == 'GaussianMixture':
    soft_labels = model.predict_proba(hs_array)
    component_indices = np.arange(num_clusters)
    entity_types = np.dot(soft_labels, component_indices)  # (number of entities,) weighted assignment


  entity_type_embeddings = nn.Embedding(num_clusters, embedding_dim=hidden_dim*2)
  entity_type_features = entity_type_embeddings(torch.tensor(entity_types, dtype=torch.long)) #shape = (num_of_entities, hidden_dim * 2)

  num_sentences = entity_type_features.shape[0] // 2
  entity_type_features = entity_type_features.view(num_sentences, 2, -1)  #dim => num_setences, num_entities per sentence (=2), hidden_dim * 2

  return entity_type_features


In [137]:
class RE_BiLSTM(nn.Module):
  def __init__(self, embeddings, hidden_dim, vocab_size, num_layers, dropout_rate, output_dim=target_classes, embedding_dim=embedding_dim):
    super(RE_BiLSTM, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=word2idx['<PAD>'])
    if embeddings is not None:
        self.embedding.weight.data.copy_(torch.from_numpy(embeddings))
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=True, batch_first=True, dropout=dropout_rate if num_layers > 1 else 0)
    self.embedding_dropout = nn.Dropout(dropout_rate)
    self.output_dropout = nn.Dropout(dropout_rate)


  def forward(self, x):
    embedded = self.embedding(x)  # [batch_size, seq_len, embedding_dim]
    embedded = self.embedding_dropout(embedded)
    lstm_out, _ = self.lstm(embedded)  # [batch_size, seq_len, hidden_dim*2]
    lstm_out = self.output_dropout(lstm_out)

    return lstm_out

In [138]:
class EntityAwareAttention(nn.Module):
  def __init__(self, hidden_dim, dropout_rate, w_norm_threshold=0.5):
    super(EntityAwareAttention, self).__init__()
    self.tanh_hs = nn.Tanh()
    self.w = nn.Parameter(torch.randn(hidden_dim * 2))
    self.tanh_sent = nn.Tanh()
    self.linear = nn.Linear(hidden_dim * 2, target_classes)
    self.dropout = nn.Dropout(dropout_rate)
    self.w_norm_threshold=w_norm_threshold

  def forward(self, hidden_states):
    #hidden_states dim => batch_size, T, d
    M = self.tanh_hs(hidden_states) #word_features dim => batch_size x T x d, where d = 2 * hidden_dim and T= max sentence length
    alpha = torch.softmax(torch.matmul(M, self.w), dim=1)  #attention scores, batch_size x T dimensional
    alpha = alpha.unsqueeze(1)  # Shape: [batch_size, 1, T]
    r = torch.bmm(alpha, hidden_states).squeeze(1)  #batch_size x d dimensional
    h_star = self.tanh_sent(r) #batch_size x d dimensional
    h_star = self.dropout(h_star)
    preds = self.linear(h_star) #batch_size x target_classes

    return preds



In [139]:
class RE_Model(nn.Module):
  def __init__(self, embeddings, hidden_dim, vocab_size, lstm_layers,
               dropout_rate, output_dim=target_classes, embedding_dim=embedding_dim,  use_entity_attention=True):
    super(RE_Model, self).__init__()
    self.lstm = RE_BiLSTM(embeddings, hidden_dim, vocab_size, num_layers=lstm_layers, dropout_rate=dropout_rate)
    self.WH = nn.Linear(hidden_dim * 2, hidden_dim * 2)
    self.WE = nn.Linear((hidden_dim * 2) * 4, hidden_dim * 2)
    self.ent_att = EntityAwareAttention(hidden_dim, dropout_rate)
    self.use_entity_attention = use_entity_attention
    self.layer_norm = nn.LayerNorm(hidden_dim * 2)
    self.dropout = nn.Dropout(dropout_rate)

  def forward(self, x, entity_hidden_states, entity_types):
    lstm_out = self.lstm(x)

    if self.use_entity_attention:
      entity_hidden_states = torch.cat((entity_hidden_states[:, 0, :], entity_hidden_states[:, 1, :]), dim=-1)
      entity_types = torch.cat((entity_types[:, 0, :], entity_types[:, 1, :]), dim=-1)
      entity_hidden_states.unsqueeze(1)
      entity_types.unsqueeze(1)

      #dim of both tensors above => batch_size, 1, (hidden_dim * 2) * 2

      entity_features = torch.cat([entity_hidden_states, entity_types], dim = -1) #dim =>[batch_size, 1, (hidden_dim * 2) * 2]

      entity_features = self.dropout(entity_features)
      e = self.layer_norm(self.WE(entity_features))
      l = self.WH(lstm_out)
      e = e.unsqueeze(1)

      word_features = e + l
      preds = self.ent_att(word_features)
    else:
      preds = self.ent_att(lstm_out)


    return preds


In [140]:
class EntityEncoderBiLSTM(nn.Module):
  def __init__(self, embeddings, hidden_dim, vocab_size, num_layers, dropout_rate, output_dim=target_classes, embedding_dim=embedding_dim):
    super(EntityEncoderBiLSTM, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=word2idx['<PAD>'])
    if embeddings is not None:
        self.embedding.weight.data.copy_(torch.from_numpy(embeddings))
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=True, batch_first=True, dropout=dropout_rate if num_layers > 1 else 0)
    self.embedding_dropout = nn.Dropout(dropout_rate)
    self.fc = nn.Linear(hidden_dim * 2, output_dim)
    self.dropout = nn.Dropout(dropout_rate)
    self.lstm_dropout = nn.Dropout(dropout_rate)

  def forward(self, x):
    embedded = self.embedding(x)
    embedded = self.embedding_dropout(embedded)
    lstm_out, _ = self.lstm(embedded)
    lstm_out = self.lstm_dropout(lstm_out)
    pooled = torch.mean(lstm_out, dim=1)
    output = self.dropout(pooled)
    output = self.fc(output)
    return output, lstm_out



In [141]:
hidden_dim = 256
dropout_rate = 0.44586702274269596
num_layers = 2
weight_decay=1e-05
epochs= 25
learning_rate=1


entity_lstm_model = EntityEncoderBiLSTM(embeddings=word_embeddings, hidden_dim=hidden_dim, vocab_size=len(word2idx), num_layers=num_layers,
                       dropout_rate=dropout_rate, output_dim=target_classes)
entity_lstm_model.load_state_dict(torch.load("encoder_weights.pth"))


<All keys matched successfully>

In [142]:
hidden_dim = 256
dropout_rate = 0.44586702274269596
learning_rate = 1
num_layers = 1
weight_decay= 1e-05
epochs= 15
re_model_wo_types = RE_Model(embeddings=word_embeddings, hidden_dim=hidden_dim, vocab_size=len(word2idx), lstm_layers=num_layers,
                      dropout_rate=dropout_rate, use_entity_attention=False)
re_model_wo_types.load_state_dict(torch.load("RE_without_entities_model_weights.pth"))

<All keys matched successfully>

In [143]:
kmeans = joblib.load("kmeans_model.pkl")
gmm = joblib.load("gmm_model.pkl")

In [144]:
hidden_dim = 256
dropout_rate = 0.55
learning_rate = 1
num_layers = 1
weight_decay= 1e-02
epochs= 15

re_model_kmeans = RE_Model(embeddings=word_embeddings, hidden_dim=hidden_dim, vocab_size=len(word2idx), lstm_layers=num_layers,
                      dropout_rate=dropout_rate, use_entity_attention=True)
re_model_kmeans.load_state_dict(torch.load("RE_with_kmeans_entities_model_weights.pth"))

<All keys matched successfully>

In [145]:
hidden_dim = 256
dropout_rate = 0.55
learning_rate = 1
num_layers = 1
weight_decay= 1e-02
epochs= 15

re_model_gmm = RE_Model(embeddings=word_embeddings, hidden_dim=hidden_dim, vocab_size=len(word2idx), lstm_layers=num_layers,
                      dropout_rate=dropout_rate, use_entity_attention=True)
re_model_gmm.load_state_dict(torch.load("RE_with_gmm_entities_model_weights.pth"))

<All keys matched successfully>

In [146]:
def get_prediction(model, sentences_tensor, entity_hidden_states, infer_entity_type_features):
  if (entity_hidden_states is None) and (infer_entity_type_features is None):
    preds, _ = model(sentences_tensor)
  else:
    preds = model(sentences_tensor, infer_entity_hidden_states, infer_entity_type_features)

  probs = torch.softmax(preds, dim=1)
  max_indices = torch.tensor(torch.argmax(probs, dim=1))
  predicted_relations = [relation_names[idx] for idx in max_indices.tolist()]

  combined_array = np.column_stack((max_indices.numpy(), np.array(predicted_relations, dtype=object)))

  return combined_array

Please enter a list of sentences in the cell below.


1.   The max length of the sentence is 80 characters.
2.   e1 and e2 should be marked as tags e.g. <e1>...</e1>
3.   We assume only one pair of entities per sentence

Then run the last cell. The code will display a Pandas Dataframe with the prediction for each sentence based on all 4 experimental LSTM models we developed

In [147]:
sentences =  [ "<e1>John</e1> works at <e2>Google</e2>.",  "The<e1>storm</e1> caused poor <e2>visibility</e2>."]
#e.g. sentence = "<e1>John</e1> works at <e2>Google</e2>."

In [148]:
inference_sentences = [{"sentence": sentence, "relation":-1} for sentence in sentences]

inference_dataset = Dataset.from_list(inference_sentences)
infer_loader = get_data_loader(inference_dataset, "inference", False)
infer_entity_hidden_states = get_entity_hidden_states(entity_lstm_model, infer_loader, "inference")
infer_entity_type_features = get_entity_type_features(infer_entity_hidden_states, "KMeans", kmeans)
encoded_sentences = [torch.tensor(encode_sentence(x, "inference"), dtype=torch.long) for x in inference_dataset["sentence"]]
sentences_tensor = torch.stack(encoded_sentences)

data1 = get_prediction(entity_lstm_model, sentences_tensor, None, None)
df1 = pd.DataFrame(data1, columns=["Relation Number", "Relation Name"])
df1["Model Name"]="BiLSTM"
df1 = df1[["Model Name", "Relation Number", "Relation Name"]]


data2 = get_prediction(re_model_wo_types, sentences_tensor, infer_entity_hidden_states, infer_entity_type_features)
df2 = pd.DataFrame(data2, columns=["Relation Number", "Relation Name"])
df2["Model Name"]="Att-LSTM (w/o Entity Types)"
df2 = df2[["Model Name", "Relation Number", "Relation Name"]]

data3 = get_prediction(re_model_kmeans, sentences_tensor, infer_entity_hidden_states, infer_entity_type_features)
df3 = pd.DataFrame(data3, columns=["Relation Number", "Relation Name"])
df3["Model Name"]="Att-LSTM (KMeans)"
df3 = df3[["Model Name", "Relation Number", "Relation Name"]]

data4 = get_prediction(re_model_gmm, sentences_tensor, infer_entity_hidden_states, infer_entity_type_features)
df4 = pd.DataFrame(data4, columns=["Relation Number", "Relation Name"])
df4["Model Name"]="Att-LSTM (GMMs)"
df4 = df4[["Model Name", "Relation Number", "Relation Name"]]

result = pd.concat([df1, df2, df3, df4], axis=0)

result

Unnamed: 0,Model Name,Relation Number,Relation Name
0,BiLSTM,6,"Entity-Destination(e1,e2)"
1,BiLSTM,6,"Entity-Destination(e1,e2)"
0,Att-LSTM (w/o Entity Types),18,Other
1,Att-LSTM (w/o Entity Types),18,Other
0,Att-LSTM (KMeans),18,Other
1,Att-LSTM (KMeans),0,"Cause-Effect(e1,e2)"
0,Att-LSTM (GMMs),18,Other
1,Att-LSTM (GMMs),0,"Cause-Effect(e1,e2)"
