In [None]:
!pip install datasets spacy gensim numpy torch scikit-learn tqdm matplotlib optuna



In [None]:
from datasets import load_dataset
dataset = load_dataset("sem_eval_2010_task_8")

In [None]:
entity_pos = {"train":[], "test":[], "inference":[]}
embedding_dim=300
entity_list = []
target_classes = 19
num_clusters = 5
batch_size = 32
max_num_clusters=15
max_sentence_len=80

In [None]:
relation_names=[
"Cause-Effect(e1,e2)",
"Cause-Effect(e2,e1)",
"Component-Whole(e1,e2)",
"Component-Whole(e2,e1)",
"Content-Container(e1,e2)",
"Content-Container(e2,e1)",
"Entity-Destination(e1,e2)",
"Entity-Destination(e2,e1)",
"Entity-Origin(e1,e2)",
"Entity-Origin(e2,e1)",
"Instrument-Agency(e1,e2)",
"Instrument-Agency(e2,e1)",
"Member-Collection(e1,e2)",
"Member-Collection(e2,e1)",
"Message-Topic(e1,e2)",
"Message-Topic(e2,e1)",
"Product-Producer(e1,e2)",
"Product-Producer(e2,e1)",
"Other"
]

In [None]:
import spacy
from spacy.tokenizer import Tokenizer
import re

nlp = spacy.load("en_core_web_sm")

special_tokens = ["<e1>", "</e1>", "<e2>", "</e2>"]

def custom_tokenizer(nlp):
    prefix_re = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes)
    infix_re = spacy.util.compile_infix_regex(nlp.Defaults.infixes)

    tokenizer = Tokenizer(nlp.vocab,
                          prefix_search=prefix_re.search,
                          suffix_search=suffix_re.search,
                          infix_finditer=infix_re.finditer,
                          token_match=None)

    for token in special_tokens:
        tokenizer.add_special_case(token, [{"ORTH": token}])

    return tokenizer

nlp.tokenizer = custom_tokenizer(nlp)

def tokenize(text):
    for token in special_tokens:
        text = text.replace(token, f" {token} ")  #Adding spaces to isolate markers

    return [token.text.lower() for token in nlp(text)]

In [None]:
from collections import Counter

word_freq = Counter(word for sentence in dataset["train"]["sentence"] for word in tokenize(sentence))
word2idx = {"<PAD>": 0, "<UNK>": 1}

for i, (word, _) in enumerate(word_freq.most_common(), start=2):
    word2idx[word] = i


In [None]:
print(len(word2idx))

19321


In [None]:
from gensim.models import KeyedVectors
import gensim.downloader as api
import numpy as np

def load_word2vec(word2idx):
  word2vec = api.load("word2vec-google-news-300")
  vocab_size = len(word2idx)
  embedding_matrix = np.random.uniform(-0.25, 0.25, (vocab_size, embedding_dim))

  for word, idx in word2idx.items():
      if word in word2vec:
          embedding_matrix[idx] = word2vec[word]

  return embedding_matrix

In [None]:
import gzip
import numpy as np

# Load embeddings from file
def load_turian_embeddings(filepath):
    word_vectors = {}
    with gzip.open(filepath, 'rt', encoding='utf-8') as f:
        for line in f:
            values = line.strip().split()
            word = values[0]  # First column is the word
            vector = np.array(values[1:], dtype=np.float32)  # Remaining columns are vector values
            word_vectors[word] = vector
    return word_vectors


In [None]:
def load_word_embeddings(word2idx, vecType):
  if vecType == "turian":
    embedding_file = "embeddings-scaled.50.gz"
    word_embeddings = load_turian_embeddings(embedding_file)
  elif  vecType == "word2vec":
    word_embeddings = load_word2vec(word2idx)

  return word_embeddings


In [None]:
word_embeddings=load_word_embeddings(word2idx, "word2vec")

In [None]:
import torch
import torch.nn.functional as F

def encode_sentence(tokens, train_or_test, word2idx=word2idx, max_len=max_sentence_len):

    #saving positions of entities in each sentence for later use
    ei1_s = tokens.index("<e1>")
    ei1_e = tokens.index("</e1>")
    ei2_s = tokens.index("<e2>")
    ei2_e = tokens.index("</e2>")
    entity_pos[train_or_test].append(((ei1_s + 1, ei1_e),  (ei2_s + 1, ei2_e)))


    #getting unique indices for each word based on vocab dictionary
    indices = [word2idx.get(token, word2idx["<UNK>"]) for token in tokens]
    indices = indices[:max_len] + [word2idx["<PAD>"]] * (max_len - len(indices))

    return torch.tensor(indices, dtype=torch.long)


In [None]:
import random
from sklearn.model_selection import train_test_split
from datasets import Dataset

train_dataset = dataset["train"]
test_dataset = dataset["test"]


In [None]:
print(train_dataset.shape)
print(test_dataset.shape)

(8000, 2)
(2717, 2)


In [None]:
from torch.utils.data import TensorDataset, DataLoader

def get_data_loader(data, train_or_test, shuffle, entity_hidden_states = None, entity_features = None):
  encoded_sentences = [encode_sentence(tokenize(sentence), train_or_test) for sentence in data["sentence"]]
  labels = torch.tensor(data["relation"], dtype=torch.long)
  sentences_tensor = torch.stack(encoded_sentences)

  dataset_args = [sentences_tensor, labels]

  if entity_hidden_states is not None and entity_features is not None:
    dataset_args.append(entity_hidden_states)
    dataset_args.append(entity_features)

  torch_dataset = TensorDataset(*dataset_args)
  dataloader = DataLoader(torch_dataset, batch_size=batch_size, shuffle=shuffle)
  return dataloader


train_loader = get_data_loader(train_dataset, "train", False)
test_loader = get_data_loader(test_dataset, "test", False)


In [None]:
import torch
import torch.nn as nn

class RE_BiLSTM(nn.Module):
  def __init__(self, embeddings, hidden_dim, vocab_size, num_layers, dropout_rate, output_dim=target_classes, embedding_dim=embedding_dim):
    super(RE_BiLSTM, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=word2idx['<PAD>'])
    if embeddings is not None:
        self.embedding.weight.data.copy_(torch.from_numpy(embeddings))
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=True, batch_first=True, dropout=dropout_rate if num_layers > 1 else 0)
    self.embedding_dropout = nn.Dropout(dropout_rate)
    self.output_dropout = nn.Dropout(dropout_rate)


  def forward(self, x):
    embedded = self.embedding(x)  # [batch_size, seq_len, embedding_dim]
    embedded = self.embedding_dropout(embedded)
    lstm_out, _ = self.lstm(embedded)  # [batch_size, seq_len, hidden_dim*2]
    lstm_out = self.output_dropout(lstm_out)

    return lstm_out

In [None]:
class EntityAwareAttention(nn.Module):
  def __init__(self, hidden_dim, dropout_rate, w_norm_threshold=0.5):
    super(EntityAwareAttention, self).__init__()
    self.tanh_hs = nn.Tanh()
    self.w = nn.Parameter(torch.randn(hidden_dim * 2))
    self.tanh_sent = nn.Tanh()
    self.linear = nn.Linear(hidden_dim * 2, target_classes)
    self.dropout = nn.Dropout(dropout_rate)
    self.w_norm_threshold=w_norm_threshold

  def forward(self, hidden_states):
    #hidden_states dim => batch_size, T, d
    M = self.tanh_hs(hidden_states) #word_features dim => batch_size x T x d, where d = 2 * hidden_dim and T= max sentence length
    alpha = torch.softmax(torch.matmul(M, self.w), dim=1)  #attention scores, batch_size x T dimensional
    alpha = alpha.unsqueeze(1)  # Shape: [batch_size, 1, T]
    r = torch.bmm(alpha, hidden_states).squeeze(1)  #batch_size x d dimensional
    h_star = self.tanh_sent(r) #batch_size x d dimensional
    h_star = self.dropout(h_star)
    preds = self.linear(h_star) #batch_size x target_classes

    return preds



In [None]:
class RE_Model(nn.Module):
  def __init__(self, embeddings, hidden_dim, vocab_size, lstm_layers,
               dropout_rate, output_dim=target_classes, embedding_dim=embedding_dim,  use_entity_attention=True):
    super(RE_Model, self).__init__()
    self.lstm = RE_BiLSTM(embeddings, hidden_dim, vocab_size, num_layers=lstm_layers, dropout_rate=dropout_rate)
    self.WH = nn.Linear(hidden_dim * 2, hidden_dim * 2)
    self.WE = nn.Linear((hidden_dim * 2) * 4, hidden_dim * 2)
    self.ent_att = EntityAwareAttention(hidden_dim, dropout_rate)
    self.use_entity_attention = use_entity_attention
    self.layer_norm = nn.LayerNorm(hidden_dim * 2)
    self.dropout = nn.Dropout(dropout_rate)

  def forward(self, x, entity_hidden_states, entity_types):
    lstm_out = self.lstm(x)

    if self.use_entity_attention:
      entity_hidden_states = torch.cat((entity_hidden_states[:, 0, :], entity_hidden_states[:, 1, :]), dim=-1)
      entity_types = torch.cat((entity_types[:, 0, :], entity_types[:, 1, :]), dim=-1)
      entity_hidden_states.unsqueeze(1)
      entity_types.unsqueeze(1)

      #dim of both tensors above => batch_size, 1, (hidden_dim * 2) * 2

      entity_features = torch.cat([entity_hidden_states, entity_types], dim = -1) #dim =>[batch_size, 1, (hidden_dim * 2) * 2]

      entity_features = self.dropout(entity_features)
      e = self.layer_norm(self.WE(entity_features))
      l = self.WH(lstm_out)
      e = e.unsqueeze(1)

      word_features = e + l
      preds = self.ent_att(word_features)
    else:
      preds = self.ent_att(lstm_out)


    return preds


In [None]:
class EntityEncoderBiLSTM(nn.Module):
  def __init__(self, embeddings, hidden_dim, vocab_size, num_layers, dropout_rate, output_dim=target_classes, embedding_dim=embedding_dim):
    super(EntityEncoderBiLSTM, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=word2idx['<PAD>'])
    if embeddings is not None:
        self.embedding.weight.data.copy_(torch.from_numpy(embeddings))
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=True, batch_first=True, dropout=dropout_rate if num_layers > 1 else 0)
    self.embedding_dropout = nn.Dropout(dropout_rate)
    self.fc = nn.Linear(hidden_dim * 2, output_dim)
    self.dropout = nn.Dropout(dropout_rate)
    self.lstm_dropout = nn.Dropout(dropout_rate)

  def forward(self, x):
    embedded = self.embedding(x)
    embedded = self.embedding_dropout(embedded)
    lstm_out, _ = self.lstm(embedded)
    lstm_out = self.lstm_dropout(lstm_out)
    pooled = torch.mean(lstm_out, dim=1)
    output = self.dropout(pooled)
    output = self.fc(output)
    return output, lstm_out



In [None]:
def get_hs_mean(sent_hidden_states, ep):
  start = ep[0]
  end = ep[1]
  if start == end:
    return sent_hidden_states[start]
  else:
    return torch.sum(sent_hidden_states[start:end], dim=0) / (end - start)

In [None]:
def get_entities_hs_from_sentence(lstm_out, pos_index, train_or_test):

  en_hs_list = []
  for sent_hid_states in (lstm_out):
    (e1p, e2p) = entity_pos[train_or_test][pos_index]
    h1 = get_hs_mean(sent_hid_states, e1p)
    h2 = get_hs_mean(sent_hid_states, e2p)
    en_pair = torch.stack([h1, h2]) #dim => 2 x hidden_dim * 2
    en_hs_list.append(en_pair)
    pos_index += 1


  return torch.stack(en_hs_list), pos_index #dim => batch_size x 2 x hidden_dim * 2

In [None]:
def get_entity_hidden_states(model, loader, train_or_test):

  #Getting the hidden states of entities based on position in sentence

  model.eval()
  en_hs_tensors_list = []
  with torch.no_grad():
      pos_index = 0
      for x_batch, y_batch in loader:
        _, lstm_out = model(x_batch)
        en_hs_list, pos_index = get_entities_hs_from_sentence(lstm_out, pos_index, train_or_test)
        en_hs_tensors_list.append(en_hs_list)

      return torch.cat(en_hs_tensors_list, dim=0)  #dim => batch_size, 2, hidden_dim * 2


In [None]:
from sklearn.metrics import f1_score

def evaluate_encoder_model(model, data_loader, train_or_test):
    model.eval()
    correct = total = 0
    pos_index = 0
    load_hs = False
    hs_list = []

    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for x_batch, y_batch in data_loader:
          outputs, lstm_out = model(x_batch)
          predictions = torch.argmax(outputs, dim=1)
          correct += (predictions == y_batch).sum().item()
          total += y_batch.size(0)

          all_predictions.extend(predictions.cpu().numpy())
          all_labels.extend(y_batch.cpu().numpy())

    acc = correct / total * 100
    macro_f1 = f1_score(all_labels, all_predictions, average="macro")
    print(f'{train_or_test} Accuracy: {acc}%')
    print(f"{train_or_test} Macro F1-score: {macro_f1:.4f}")

    return acc



In [None]:
def evaluate_RE_model(model, data_loader, train_or_test):
    model.eval()
    correct = total = 0
    pos_index = 0
    load_hs = False
    hs_list = []
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for x_batch, y_batch, entity_hidden_states, entity_type_features in data_loader:
          outputs = model(x_batch, entity_hidden_states, entity_type_features)
          predictions = torch.argmax(outputs, dim=1)
          correct += (predictions == y_batch).sum().item()
          total += y_batch.size(0)
          all_predictions.extend(predictions.cpu().numpy())
          all_labels.extend(y_batch.cpu().numpy())


    acc = correct / total * 100
    macro_f1 = f1_score(all_labels, all_predictions, average="macro")
    print(f'{train_or_test} Accuracy: {acc}%')
    print(f"{train_or_test} Macro F1-score: {macro_f1:.4f}")

    return acc



In [None]:
def evaluate_RE_wo_types_model(model, data_loader, train_or_test):
    model.eval()
    correct = total = 0
    pos_index = 0
    load_hs = False
    hs_list = []
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for x_batch, y_batch in data_loader:
          outputs = model(x_batch, None, None)
          predictions = torch.argmax(outputs, dim=1)
          correct += (predictions == y_batch).sum().item()
          total += y_batch.size(0)
          all_predictions.extend(predictions.cpu().numpy())
          all_labels.extend(y_batch.cpu().numpy())


    acc = correct / total * 100
    macro_f1 = f1_score(all_labels, all_predictions, average="macro")
    print(f'{train_or_test} Accuracy: {acc}%')
    print(f"{train_or_test} Macro F1-score: {macro_f1:.4f}")

    return acc



In [None]:
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from tqdm import tqdm

def train_RE_wo_types_model(model, train_loader, optimizer, criterion, epochs=10, lr=0.001):
    prev_acc = 0
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for x_batch, y_batch in tqdm(train_loader, desc=f'Epoch {epoch+1}'):
          optimizer.zero_grad()
          outputs = model(x_batch,  None, None)
          loss = criterion(outputs, y_batch)
          loss.backward()
          optimizer.step()
          total_loss += loss.item()

        print(f'Epoch {epoch+1} Loss: {total_loss / len(train_loader):.4f}')
        train_acc = 0
        prev_acc = train_acc
        train_acc = evaluate_RE_wo_types_model(model, train_loader, "train")



In [None]:
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from tqdm import tqdm

def train_RE_model(model, train_loader, optimizer, criterion, epochs=10, lr=0.001):
    prev_acc = 0
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for x_batch, y_batch, hidden_states, entity_type_features in tqdm(train_loader, desc=f'Epoch {epoch+1}'):
          optimizer.zero_grad()
          outputs = model(x_batch,  hidden_states.detach(), entity_type_features.detach())
          loss = criterion(outputs, y_batch)
          loss.backward()
          optimizer.step()
          total_loss += loss.item()

        print(f'Epoch {epoch+1} Loss: {total_loss / len(train_loader):.4f}')
        train_acc = 0
        prev_acc = train_acc
        train_acc = evaluate_RE_model(model, train_loader, "train")



In [None]:
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from tqdm import tqdm

def train_encoder_model(model, train_loader, optimizer, criterion, epochs=10):

    prev_acc = 0
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for x_batch, y_batch in tqdm(train_loader, desc=f'Epoch {epoch+1}'):
            optimizer.zero_grad()
            outputs, lstm_out = model(x_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f'Epoch {epoch+1} Loss: {total_loss / len(train_loader):.4f}')
        train_acc = 0
        prev_acc = train_acc
        train_acc = evaluate_encoder_model(model, train_loader, "train")



In [None]:
hidden_dim = 256
dropout_rate = 0.44586702274269596
num_layers = 2
weight_decay=1e-05
epochs= 25
learning_rate=1

entity_lstm_model = EntityEncoderBiLSTM(embeddings=word_embeddings, hidden_dim=hidden_dim, vocab_size=len(word2idx), num_layers=num_layers,
                       dropout_rate=dropout_rate, output_dim=target_classes)
optimizer = optim.Adadelta(entity_lstm_model.parameters(), lr=learning_rate, weight_decay=weight_decay)
loss_fn = torch.nn.CrossEntropyLoss()

train_encoder_model(entity_lstm_model, train_loader, optimizer, loss_fn, epochs=epochs)


Epoch 1: 100%|██████████| 250/250 [00:43<00:00,  5.69it/s]


Epoch 1 Loss: 2.6611
train Accuracy: 17.575%
train Macro F1-score: 0.0157


Epoch 2: 100%|██████████| 250/250 [00:43<00:00,  5.78it/s]


Epoch 2 Loss: 2.5766
train Accuracy: 17.5625%
train Macro F1-score: 0.0168


Epoch 3: 100%|██████████| 250/250 [00:43<00:00,  5.75it/s]


Epoch 3 Loss: 2.4614
train Accuracy: 18.0375%
train Macro F1-score: 0.0301


Epoch 4: 100%|██████████| 250/250 [00:43<00:00,  5.71it/s]


Epoch 4 Loss: 2.2853
train Accuracy: 23.6125%
train Macro F1-score: 0.0549


Epoch 5: 100%|██████████| 250/250 [00:43<00:00,  5.74it/s]


Epoch 5 Loss: 2.0627
train Accuracy: 25.637500000000003%
train Macro F1-score: 0.0896


Epoch 6: 100%|██████████| 250/250 [00:43<00:00,  5.78it/s]


Epoch 6 Loss: 1.8288
train Accuracy: 40.0%
train Macro F1-score: 0.2455


Epoch 7: 100%|██████████| 250/250 [00:43<00:00,  5.75it/s]


Epoch 7 Loss: 1.5829
train Accuracy: 44.3375%
train Macro F1-score: 0.2889


Epoch 8: 100%|██████████| 250/250 [00:43<00:00,  5.73it/s]


Epoch 8 Loss: 1.4007
train Accuracy: 55.012499999999996%
train Macro F1-score: 0.4527


Epoch 9: 100%|██████████| 250/250 [00:43<00:00,  5.71it/s]


Epoch 9 Loss: 1.2360
train Accuracy: 66.95%
train Macro F1-score: 0.5571


Epoch 10: 100%|██████████| 250/250 [00:43<00:00,  5.77it/s]


Epoch 10 Loss: 1.0869
train Accuracy: 70.8%
train Macro F1-score: 0.5834


Epoch 11: 100%|██████████| 250/250 [00:43<00:00,  5.73it/s]


Epoch 11 Loss: 0.9633
train Accuracy: 74.0125%
train Macro F1-score: 0.6124


Epoch 12: 100%|██████████| 250/250 [00:43<00:00,  5.74it/s]


Epoch 12 Loss: 0.8614
train Accuracy: 76.7%
train Macro F1-score: 0.6475


Epoch 13: 100%|██████████| 250/250 [00:43<00:00,  5.76it/s]


Epoch 13 Loss: 0.7600
train Accuracy: 76.825%
train Macro F1-score: 0.6664


Epoch 14: 100%|██████████| 250/250 [00:43<00:00,  5.74it/s]


Epoch 14 Loss: 0.6954
train Accuracy: 78.9375%
train Macro F1-score: 0.6967


Epoch 15: 100%|██████████| 250/250 [00:43<00:00,  5.75it/s]


Epoch 15 Loss: 0.6301
train Accuracy: 84.325%
train Macro F1-score: 0.7637


Epoch 16: 100%|██████████| 250/250 [00:43<00:00,  5.73it/s]


Epoch 16 Loss: 0.5805
train Accuracy: 85.82499999999999%
train Macro F1-score: 0.7853


Epoch 17: 100%|██████████| 250/250 [00:43<00:00,  5.80it/s]


Epoch 17 Loss: 0.5203
train Accuracy: 87.2125%
train Macro F1-score: 0.8068


Epoch 18: 100%|██████████| 250/250 [00:44<00:00,  5.67it/s]


Epoch 18 Loss: 0.4692
train Accuracy: 89.6375%
train Macro F1-score: 0.8377


Epoch 19: 100%|██████████| 250/250 [00:43<00:00,  5.74it/s]


Epoch 19 Loss: 0.4262
train Accuracy: 90.5875%
train Macro F1-score: 0.8472


Epoch 20: 100%|██████████| 250/250 [00:43<00:00,  5.79it/s]


Epoch 20 Loss: 0.3911
train Accuracy: 92.6875%
train Macro F1-score: 0.8700


Epoch 21: 100%|██████████| 250/250 [00:42<00:00,  5.81it/s]


Epoch 21 Loss: 0.3465
train Accuracy: 92.825%
train Macro F1-score: 0.8761


Epoch 22: 100%|██████████| 250/250 [00:44<00:00,  5.68it/s]


Epoch 22 Loss: 0.3259
train Accuracy: 91.9625%
train Macro F1-score: 0.8628


Epoch 23: 100%|██████████| 250/250 [00:43<00:00,  5.74it/s]


Epoch 23 Loss: 0.2967
train Accuracy: 94.2625%
train Macro F1-score: 0.8897


Epoch 24: 100%|██████████| 250/250 [00:43<00:00,  5.81it/s]


Epoch 24 Loss: 0.2637
train Accuracy: 95.6%
train Macro F1-score: 0.9025


Epoch 25: 100%|██████████| 250/250 [00:43<00:00,  5.80it/s]


Epoch 25 Loss: 0.2468
train Accuracy: 95.78750000000001%
train Macro F1-score: 0.9068


In [None]:
evaluate_encoder_model(entity_lstm_model, test_loader, "test")

test Accuracy: 70.26131762973868%
test Macro F1-score: 0.6631


70.26131762973868

In [None]:
torch.save(entity_lstm_model.state_dict(), "encoder_weights.pth")

In [None]:
train_entity_hidden_states = get_entity_hidden_states(entity_lstm_model, train_loader, "train")
test_entity_hidden_states = get_entity_hidden_states(entity_lstm_model, test_loader, "test")

In [None]:
print("train_entity_hidden_states", train_entity_hidden_states.shape)
print("test_entity_hidden_states", test_entity_hidden_states.shape)

train_entity_hidden_states torch.Size([8000, 2, 512])
test_entity_hidden_states torch.Size([2717, 2, 512])


In [None]:
def get_entity_type_features(hidden_states, clustering_type, model, hs_array=None):

  if hs_array is None:
    hs_array = np.array([e for tup in hidden_states for e in tup])

  if clustering_type == 'KMeans':
    entity_types = model.predict(hs_array)
  elif clustering_type == 'GaussianMixture':
    soft_labels = model.predict_proba(hs_array)
    component_indices = np.arange(num_clusters)
    entity_types = np.dot(soft_labels, component_indices)  # (number of entities,) weighted assignment


  entity_type_embeddings = nn.Embedding(num_clusters, embedding_dim=hidden_dim*2)
  entity_type_features = entity_type_embeddings(torch.tensor(entity_types, dtype=torch.long)) #shape = (num_of_entities, hidden_dim * 2)

  num_sentences = entity_type_features.shape[0] // 2
  entity_type_features = entity_type_features.view(num_sentences, 2, -1)  #dim => num_setences, num_entities per sentence (=2), hidden_dim * 2

  return entity_type_features


In [None]:
#collect all entities in np array for clustering
entity_hs_array = np.array([e for tup in train_entity_hidden_states for e in tup])

Attention LSTM (without Entity Awareness)

In [None]:
#get new dataloaders

train_loader = get_data_loader(train_dataset, "train", False, entity_hidden_states=None, entity_features=None)
test_loader = get_data_loader(test_dataset, "test", False, entity_hidden_states=None, entity_features=None)

In [None]:

hidden_dim = 256
dropout_rate = 0.44586702274269596
learning_rate = 1
num_layers = 1
weight_decay= 1e-05
epochs= 15
re_model = RE_Model(embeddings=word_embeddings, hidden_dim=hidden_dim, vocab_size=len(word2idx), lstm_layers=num_layers,
                      dropout_rate=dropout_rate, use_entity_attention=False)

optimizer = optim.Adadelta(re_model.parameters(), lr=learning_rate, weight_decay=weight_decay)
loss_fn = torch.nn.CrossEntropyLoss()

train_RE_wo_types_model(re_model, train_loader, optimizer, loss_fn, epochs=epochs, lr=learning_rate)


Epoch 1: 100%|██████████| 250/250 [00:25<00:00,  9.97it/s]


Epoch 1 Loss: 2.6852
train Accuracy: 17.625%
train Macro F1-score: 0.0158


Epoch 2: 100%|██████████| 250/250 [00:24<00:00, 10.08it/s]


Epoch 2 Loss: 2.6410
train Accuracy: 17.625%
train Macro F1-score: 0.0158


Epoch 3: 100%|██████████| 250/250 [00:25<00:00,  9.88it/s]


Epoch 3 Loss: 2.1744
train Accuracy: 39.0125%
train Macro F1-score: 0.2379


Epoch 4: 100%|██████████| 250/250 [00:25<00:00,  9.73it/s]


Epoch 4 Loss: 1.5234
train Accuracy: 58.225%
train Macro F1-score: 0.4902


Epoch 5: 100%|██████████| 250/250 [00:26<00:00,  9.47it/s]


Epoch 5 Loss: 1.2272
train Accuracy: 66.75%
train Macro F1-score: 0.5660


Epoch 6: 100%|██████████| 250/250 [00:26<00:00,  9.60it/s]


Epoch 6 Loss: 1.0729
train Accuracy: 72.425%
train Macro F1-score: 0.6256


Epoch 7: 100%|██████████| 250/250 [00:29<00:00,  8.47it/s]


Epoch 7 Loss: 0.9531
train Accuracy: 76.075%
train Macro F1-score: 0.6946


Epoch 8: 100%|██████████| 250/250 [00:28<00:00,  8.81it/s]


Epoch 8 Loss: 0.8324
train Accuracy: 80.825%
train Macro F1-score: 0.7531


Epoch 9: 100%|██████████| 250/250 [00:27<00:00,  9.04it/s]


Epoch 9 Loss: 0.7495
train Accuracy: 82.475%
train Macro F1-score: 0.7791


Epoch 10: 100%|██████████| 250/250 [00:27<00:00,  9.16it/s]


Epoch 10 Loss: 0.6775
train Accuracy: 85.6625%
train Macro F1-score: 0.8077


Epoch 11: 100%|██████████| 250/250 [00:28<00:00,  8.78it/s]


Epoch 11 Loss: 0.6178
train Accuracy: 86.05000000000001%
train Macro F1-score: 0.8150


Epoch 12: 100%|██████████| 250/250 [00:29<00:00,  8.38it/s]


Epoch 12 Loss: 0.5603
train Accuracy: 89.05%
train Macro F1-score: 0.8433


Epoch 13: 100%|██████████| 250/250 [00:30<00:00,  8.08it/s]


Epoch 13 Loss: 0.4993
train Accuracy: 91.525%
train Macro F1-score: 0.8650


Epoch 14: 100%|██████████| 250/250 [00:30<00:00,  8.29it/s]


Epoch 14 Loss: 0.4497
train Accuracy: 92.225%
train Macro F1-score: 0.8733


Epoch 15: 100%|██████████| 250/250 [00:29<00:00,  8.57it/s]


Epoch 15 Loss: 0.4076
train Accuracy: 94.1%
train Macro F1-score: 0.8881


In [None]:
evaluate_RE_wo_types_model(re_model, test_loader, "test")

test Accuracy: 74.97239602502759%
test Macro F1-score: 0.7223


74.97239602502759

In [None]:
torch.save(re_model.state_dict(), "RE_without_entities_model_weights.pth")

Attention Mechanism with Entity Awareness

Using KMeans for Entity Typing

In [None]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(entity_hs_array)

In [None]:
import torch.nn as nn

train_entity_type_features = get_entity_type_features(train_entity_hidden_states, "KMeans", kmeans, hs_array=entity_hs_array)
test_entity_type_features = get_entity_type_features(test_entity_hidden_states, "KMeans", kmeans)

In [None]:
#get new dataloaders, which include entity hidden states and type features

train_loader = get_data_loader(train_dataset, "train", False, entity_hidden_states=train_entity_hidden_states, entity_features=train_entity_type_features)
test_loader = get_data_loader(test_dataset, "test", False, entity_hidden_states=test_entity_hidden_states, entity_features=test_entity_type_features)

In [458]:
hidden_dim = 256
dropout_rate = 0.5
num_layers = 1
weight_decay = 5e-5
epochs = 15
learning_rate=5e-5

re_model = RE_Model(embeddings=word_embeddings, hidden_dim=hidden_dim, vocab_size=len(word2idx), lstm_layers=num_layers,
                      dropout_rate=dropout_rate, use_entity_attention=True)

optimizer = optim.AdamW(re_model.parameters(), lr=learning_rate, weight_decay=weight_decay)
loss_fn = torch.nn.CrossEntropyLoss()

train_RE_model(re_model, train_loader, optimizer, loss_fn, epochs=epochs, lr=learning_rate)


Epoch 1: 100%|██████████| 250/250 [00:51<00:00,  4.89it/s]


Epoch 1 Loss: 0.4055
train Accuracy: 92.475%
train Macro F1-score: 0.8716


Epoch 2: 100%|██████████| 250/250 [00:50<00:00,  4.93it/s]


Epoch 2 Loss: 0.2049
train Accuracy: 94.6375%
train Macro F1-score: 0.8882


Epoch 3: 100%|██████████| 250/250 [00:50<00:00,  4.90it/s]


Epoch 3 Loss: 0.1850
train Accuracy: 94.1875%
train Macro F1-score: 0.8744


Epoch 4: 100%|██████████| 250/250 [00:50<00:00,  4.93it/s]


Epoch 4 Loss: 0.1883
train Accuracy: 95.375%
train Macro F1-score: 0.8971


Epoch 5: 100%|██████████| 250/250 [00:50<00:00,  4.95it/s]


Epoch 5 Loss: 0.1699
train Accuracy: 95.92500000000001%
train Macro F1-score: 0.9046


Epoch 6: 100%|██████████| 250/250 [00:50<00:00,  4.93it/s]


Epoch 6 Loss: 0.1661
train Accuracy: 95.6875%
train Macro F1-score: 0.9023


Epoch 7: 100%|██████████| 250/250 [00:51<00:00,  4.87it/s]


Epoch 7 Loss: 0.1587
train Accuracy: 96.1%
train Macro F1-score: 0.9092


Epoch 8: 100%|██████████| 250/250 [00:50<00:00,  4.92it/s]


Epoch 8 Loss: 0.1584
train Accuracy: 95.5875%
train Macro F1-score: 0.9019


Epoch 9: 100%|██████████| 250/250 [00:50<00:00,  4.93it/s]


Epoch 9 Loss: 0.1584
train Accuracy: 96.0125%
train Macro F1-score: 0.9069


Epoch 10: 100%|██████████| 250/250 [00:50<00:00,  4.93it/s]


Epoch 10 Loss: 0.1531
train Accuracy: 96.1125%
train Macro F1-score: 0.9067


In [459]:
evaluate_RE_model(re_model, test_loader, "test")

test Accuracy: 68.49466323150534%
test Macro F1-score: 0.6451


68.49466323150534

In [None]:
torch.save(re_model.state_dict(), "RE_with_kmeans_entities_model_weights.pth")

Using GMM for Entity Typing

In [None]:
from sklearn.mixture import GaussianMixture
import numpy as np

gmm = GaussianMixture(n_components=num_clusters, covariance_type='tied', random_state=42)
gmm.fit(entity_hs_array)

In [None]:
train_entity_type_features = get_entity_type_features(train_entity_hidden_states, "GaussianMixture", gmm, hs_array=entity_hs_array)
test_entity_type_features = get_entity_type_features(test_entity_hidden_states, "GaussianMixture", gmm)

In [None]:
#get new dataloaders, which include entity hidden states and type features

train_loader = get_data_loader(train_dataset, "train", False, entity_hidden_states=train_entity_hidden_states, entity_features=train_entity_type_features)
test_loader = get_data_loader(test_dataset, "test", False, entity_hidden_states=test_entity_hidden_states, entity_features=test_entity_type_features)

In [None]:
hidden_dim = 256
dropout_rate = 0.5
num_layers = 1
weight_decay = 5e-5
epochs = 15
learning_rate=5e-5

re_model = RE_Model(embeddings=word_embeddings, hidden_dim=hidden_dim, vocab_size=len(word2idx), lstm_layers=num_layers,
                      dropout_rate=dropout_rate, use_entity_attention=True)

optimizer = optim.AdamW(re_model.parameters(), lr=learning_rate, weight_decay=weight_decay)
loss_fn = torch.nn.CrossEntropyLoss()

train_RE_model(re_model, train_loader, optimizer, loss_fn, epochs=epochs, lr=learning_rate)

Epoch 1: 100%|██████████| 250/250 [00:21<00:00, 11.76it/s]


Epoch 1 Loss: 1.0249
train Accuracy: 90.47500000000001%
train Macro F1-score: 0.7823


Epoch 2: 100%|██████████| 250/250 [00:21<00:00, 11.67it/s]


Epoch 2 Loss: 0.4005
train Accuracy: 93.7625%
train Macro F1-score: 0.8657


Epoch 3: 100%|██████████| 250/250 [00:21<00:00, 11.49it/s]


Epoch 3 Loss: 0.2800
train Accuracy: 95.0375%
train Macro F1-score: 0.8920


Epoch 4: 100%|██████████| 250/250 [00:23<00:00, 10.70it/s]


Epoch 4 Loss: 0.2276
train Accuracy: 95.575%
train Macro F1-score: 0.8997


Epoch 5: 100%|██████████| 250/250 [00:23<00:00, 10.67it/s]


Epoch 5 Loss: 0.1965
train Accuracy: 95.92500000000001%
train Macro F1-score: 0.9035


Epoch 6: 100%|██████████| 250/250 [00:23<00:00, 10.50it/s]


Epoch 6 Loss: 0.1859
train Accuracy: 96.1%
train Macro F1-score: 0.9048


Epoch 7: 100%|██████████| 250/250 [00:23<00:00, 10.68it/s]


Epoch 7 Loss: 0.1696
train Accuracy: 96.15%
train Macro F1-score: 0.9060


Epoch 8: 100%|██████████| 250/250 [00:23<00:00, 10.63it/s]


Epoch 8 Loss: 0.1608
train Accuracy: 96.275%
train Macro F1-score: 0.9062


Epoch 9: 100%|██████████| 250/250 [00:23<00:00, 10.79it/s]


Epoch 9 Loss: 0.1554
train Accuracy: 96.46249999999999%
train Macro F1-score: 0.9094


Epoch 10: 100%|██████████| 250/250 [00:23<00:00, 10.78it/s]


Epoch 10 Loss: 0.1460
train Accuracy: 96.26249999999999%
train Macro F1-score: 0.9053


Epoch 11: 100%|██████████| 250/250 [00:23<00:00, 10.73it/s]


Epoch 11 Loss: 0.1391
train Accuracy: 96.7%
train Macro F1-score: 0.9124


Epoch 12: 100%|██████████| 250/250 [00:23<00:00, 10.71it/s]


Epoch 12 Loss: 0.1397
train Accuracy: 96.89999999999999%
train Macro F1-score: 0.9139


Epoch 13: 100%|██████████| 250/250 [00:23<00:00, 10.73it/s]


Epoch 13 Loss: 0.1297
train Accuracy: 96.89999999999999%
train Macro F1-score: 0.9147


Epoch 14: 100%|██████████| 250/250 [00:23<00:00, 10.49it/s]


Epoch 14 Loss: 0.1264
train Accuracy: 96.8625%
train Macro F1-score: 0.9140


Epoch 15: 100%|██████████| 250/250 [00:23<00:00, 10.77it/s]


Epoch 15 Loss: 0.1258
train Accuracy: 96.75%
train Macro F1-score: 0.9130


In [None]:
evaluate_RE_model(re_model, test_loader, "test")

test Accuracy: 64.26205373573795%
test Macro F1-score: 0.6205


64.26205373573795

In [None]:
torch.save(re_model.state_dict(), "RE_with_gmm_entities_model_weights.pth")

In [453]:
import joblib
joblib.dump(gmm, "gmm_model.pkl")
joblib.dump(kmeans, "kmeans_model.pkl")

['gmm_model.pkl']