In [1]:
from configuration import BaseConfig
from datahandler import DataReader, DataWriter
import pandas as pd
from nltk.corpus import wordnet as wn
from transformers import AutoTokenizer, BertForMaskedLM
import torch
from tqdm.notebook import tqdm

config = BaseConfig(version=2).get_args(db_name="wn18rr")

# tokenizer = AutoTokenizer.from_pretrained('bert-large-uncased')
# model = BertForMaskedLM.from_pretrained('bert-large-uncased')

In [2]:
re_test  = DataReader.load_df(config.processed_test)
en_test  = DataReader.load_df(config.processed_entity_test)

print(f"REL shape:{re_test.shape}, ENT shape:‌{en_test.shape}")
en_test.head(2)

REL shape:(2995, 7), ENT shape:‌(5138, 3)


Unnamed: 0,entity,type,definition
0,__healthy_JJ_1,JJ,having or indicating good health in body or mi...
1,__unbelievable_JJ_1,JJ,"beyond belief or understanding; ""at incredible..."


# Functions

In [3]:
ent_templates = {
    "template-1": "The word [A] POS is a [MASK].",
    "template-2": "The word '[A]' POS is a [MASK].",
    "template-3": "[EXAMPLE]. The word [A] POS is a [MASK].",
    "template-4": "[EXAMPLE]. The word '[A]' POS is a [MASK]."
}

label_mapper = {"JJ":"adjective", "NN":"noun", "VB":"verb"}

wn_types_identifier = {"J": wn.ADJ, "V":wn.VERB, "N": wn.NOUN}

def make_prediction(sentence, top_n = 5):
    inputs = tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        token_logits = model(**inputs).logits
    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
    mask_token_logits = token_logits[0, mask_token_index, :]
    top_n_tokens = torch.topk(mask_token_logits, top_n, dim=1).indices[0].tolist()
    predictions = []
    for token in top_n_tokens:
        predictions.append(tokenizer.decode([token]))
    return predictions, mask_token_logits

def precision_at_k(actual, predicted):
    act_set = set(actual)
    pred_set = set(predicted)
    result = len(act_set & pred_set) / float(len(predicted))
    return result

def recall_at_k(actual, predicted):
    act_set = set(actual)
    pred_set = set(predicted)
    result = len(act_set & pred_set) / float(len(act_set))
    return result

def generate_samples_label_entity(sample, template="template-1"):
    concept, wn_type = " ".join(sample.split("_")[2:-2]), sample.split("_")[-2]
    label = [label_mapper[wn_type]]
    sentence_template = ent_templates[template]     
    sentence = sentence_template.replace("[A]", concept)
    if template == "template-3" or template == "template-4":
        prefix = concept
        synsets = wn.synsets("_".join(concept.split()), pos=wn_types_identifier[wn_type[0]])
        if len(synsets) != 0:
            example = synsets[0].examples()
            if len(example) != 0:
                prefix = example[0]
        sentence = sentence.replace("[EXAMPLE]", prefix)
    return sentence, label

def fill_mask_prediction(entity_list, template, top_n=5):
    P1, P5 = [], []
    R1, R5 = [], []
    for entity in tqdm(entity_list):
        sentence, label = generate_samples_label_entity(entity, template=template)
        predictions, _ = make_prediction(sentence, top_n=top_n)
        P1.append(precision_at_k(label, predictions[:1]))
        P5.append(precision_at_k(label, predictions))
        R1.append(recall_at_k(label, predictions[:1]))
        R5.append(recall_at_k(label, predictions))

    print(f"ENT-Template: {ent_templates[template]}, Test Size:{len(entity_list)} ,MP@1 = {sum(P1)/len(P1)},   MR@1 ={sum(R1)/len(R1)}")
    print(f"ENT-Template: {ent_templates[template]}, Test Size:{len(entity_list)} ,MP@{top_n} = {sum(P5)/len(P5)},   MR@{top_n} ={sum(R5)/len(R5)}")

In [71]:
top_n = 1
entity = "__covering_NN_1"

print("template-1")
sentence, label = generate_samples_label_entity(entity, template="template-1")
print("sent:", sentence)
print("label:", label)

print("template-2")
sentence, label = generate_samples_label_entity(entity, template="template-2")
print("sent:", sentence)
print("label:", label)

print("template-3")
sentence, label = generate_samples_label_entity(entity, template="template-3")
print("sent:", sentence)
print("label:", label)

print("template-4")
sentence, label = generate_samples_label_entity(entity, template="template-4")
print("sent:", sentence)
print("label:", label)

template-1
sent: The word covering POS is a [MASK].
label: ['noun']
template-2
sent: The word 'covering' POS is a [MASK].
label: ['noun']
template-3
sent: under a covering of dust. The word covering POS is a [MASK].
label: ['noun']
template-4
sent: under a covering of dust. The word 'covering' POS is a [MASK].
label: ['noun']


# 1) Template-1: `The word [A] POS is a [MASK].`

In [27]:
entities = en_test['entity'].tolist()

fill_mask_prediction(entity_list = entities, template="template-1", top_n=5)

  0%|          | 0/5138 [00:00<?, ?it/s]

ENT-Template: The word [A] POS is a [MASK]., Test Size:5138 ,MP@1 = 0.0017516543402101986,   MR@1 =0.0017516543402101986
ENT-Template: The word [A] POS is a [MASK]., Test Size:5138 ,MP@5 = 0.14332425068120483,   MR@5 =0.7166212534059946


# 2) Template-2: `The word '[A]' POS is a [MASK].`

In [32]:
entities = en_test['entity'].tolist()

fill_mask_prediction(entity_list = entities, template="template-2", top_n=5)

  0%|          | 0/5138 [00:00<?, ?it/s]

ENT-Template: The word '[A]' POS is a [MASK]., Test Size:5138 ,MP@1 = 0.00019462826002335538,   MR@1 =0.00019462826002335538
ENT-Template: The word '[A]' POS is a [MASK]., Test Size:5138 ,MP@5 = 0.14896847022188342,   MR@5 =0.7448423511093811


# 3) Template-3: `[EXAMPLE]. The word [A] POS is a [MASK].`

In [33]:
entities = en_test['entity'].tolist()

fill_mask_prediction(entity_list = entities, template="template-3", top_n=5)

  0%|          | 0/5138 [00:00<?, ?it/s]

ENT-Template: [EXAMPLE]. The word [A] POS is a [MASK]., Test Size:5138 ,MP@1 = 0.014207862981704943,   MR@1 =0.014207862981704943
ENT-Template: [EXAMPLE]. The word [A] POS is a [MASK]., Test Size:5138 ,MP@5 = 0.13219151420786637,   MR@5 =0.660957571039315


# 4) Template-4: `[EXAMPLE]. The word '[A]' POS is a [MASK].`

In [34]:
entities = en_test['entity'].tolist()

fill_mask_prediction(entity_list = entities, template="template-4", top_n=5)

  0%|          | 0/5138 [00:00<?, ?it/s]

ENT-Template: [EXAMPLE]. The word '[A]' POS is a [MASK]., Test Size:5138 ,MP@1 = 0.10685091475282212,   MR@1 =0.10685091475282212
ENT-Template: [EXAMPLE]. The word '[A]' POS is a [MASK]., Test Size:5138 ,MP@5 = 0.16340988711561968,   MR@5 =0.8170494355780459


# Check Number of entities with example in `WordNet`

In [84]:
def check_concept_in_wordnet(sample):
    synsets = wn.synsets("_".join(sample.split("_")[2:-2]), 
                         pos=wn_types_identifier[sample.split("_")[-2][0]])
    if len(synsets) != 0:
        if len(synsets[0].examples()) != 0:
            return True
    return False

entities = en_test['entity'].tolist()
contain_example = 0
for entity in tqdm(entities):
    if check_concept_in_wordnet(entity):
        contain_example += 1

print(f"WN contains {contain_example} example for {len(entities)} entities.")

  0%|          | 0/5138 [00:00<?, ?it/s]

WN contains 1903 example for 5138 entities.


# 5) FreqModel

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.metrics import classification_report

class WNFrqDataset:
    def __init__(self, df, label_mapper, template, wn_types_identifier, is_train):
        self.data = df
        self.label_mapper = label_mapper
        self.template = template
        self.wn_types_identifier = wn_types_identifier
        self.is_train = is_train
    
    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self, index):
        sample = self.data['entity'].tolist()[index]
        concept, wn_type = " ".join(sample.split("_")[2:-2]), sample.split("_")[-2]
        label = self.label_mapper[wn_type]
        sentence = self.template.replace("[A]", concept)
        prefix = concept
        synsets = wn.synsets("_".join(concept.split()), pos=self.wn_types_identifier[wn_type[0]])
        if len(synsets) != 0:
            example = synsets[0].examples()
            if len(example) != 0:
                prefix = example[0]
        sentence = sentence.replace("[EXAMPLE]", prefix)
        if self.is_train:
            sentence = sentence.replace("[MASK]", "'"+label+"'")
        return sentence, label


class FreqModel:
    def __init__(self, labels=['verb', 'adjective', 'noun']):
        self.transformer = CountVectorizer(ngram_range=(1,1))
        self.labels = labels
    
    def fit(self, dataset):
        texts = [text for text, _ in dataset]
        self.freq_matrix_labels = np.array([label for _, label in dataset])
        self.transformer.fit(texts)
        self.freq_matrix = self.__transform(texts)
    
    def predict(self, dataset):
        # texts = [text for text, _ in range(dataset)]
        X = [text for text, _ in dataset]
        y_true = [label for _, label in dataset]
        dataset_freq_matrix = self.__transform(X)
        proba_matrix = np.zeros((dataset_freq_matrix.shape[0], len(self.labels)))
        for index, label in enumerate(self.labels):
            class_wise_tokens_freq = self.freq_matrix[np.where(self.freq_matrix_labels == label)[0]].sum(axis=0)
            probas = dataset_freq_matrix/class_wise_tokens_freq
            predict_proba = np.ma.masked_invalid(probas).sum(axis=1)
            proba_matrix[:, index] = predict_proba.data[:]
        y_pred = [self.labels[pred] for pred in np.argmax(proba_matrix, axis=1)]
        return y_true, y_pred
    
    def __transform(self, X):
        return self.transformer.transform(X).toarray()
    
def calculate_p1(y_true, y_pred):
    p = [precision_at_k([true], [pred]) for true, pred in zip(y_true, y_pred)]
    return sum(p)/len(p)

In [5]:
en_train  = DataReader.load_df(config.processed_entity_train)
en_valid  = DataReader.load_df(config.processed_entity_valid)
en_train = en_train.append(en_valid).reset_index(drop=True)
en_test  = DataReader.load_df(config.processed_entity_test)

print(f"train shape:{en_train.shape}, test shape:‌{en_test.shape}")

freq_train_dataset = WNFrqDataset(df=en_train, 
                                  label_mapper=label_mapper, 
                                  template=ent_templates["template-4"], 
                                  wn_types_identifier=wn_types_identifier,
                                  is_train = True)

freq_test_dataset = WNFrqDataset(df=en_test, 
                                 label_mapper=label_mapper, 
                                 template=ent_templates["template-4"], 
                                 wn_types_identifier=wn_types_identifier,
                                 is_train = False)

freq_model = FreqModel()

freq_model.fit(freq_train_dataset)

y_true, y_pred = freq_model.predict(freq_test_dataset)

print("Test Set MP@1:", calculate_p1(y_true, y_pred)) 
print("Test Classification Report: \n", classification_report(y_true, y_pred))

y_true, y_pred = freq_model.predict(freq_train_dataset)
print("Train Set MP@1:", calculate_p1(y_true, y_pred)) 
print("Train Classification Report: \n", classification_report(y_true, y_pred))

  en_train = en_train.append(en_valid).reset_index(drop=True)


train shape:(23701, 3), test shape:‌(5138, 3)


  probas = dataset_freq_matrix/class_wise_tokens_freq
  probas = dataset_freq_matrix/class_wise_tokens_freq


Test Set MP@1: 0.4739198131568704
Test Classification Report: 
               precision    recall  f1-score   support

   adjective       0.09      0.87      0.17       105
        noun       0.82      0.48      0.60      3700
        verb       0.29      0.44      0.35      1333

    accuracy                           0.47      5138
   macro avg       0.40      0.59      0.37      5138
weighted avg       0.67      0.47      0.53      5138



  probas = dataset_freq_matrix/class_wise_tokens_freq
  probas = dataset_freq_matrix/class_wise_tokens_freq


Train Set MP@1: 0.5579089489894942
Train Classification Report: 
               precision    recall  f1-score   support

   adjective       0.14      0.84      0.24      1130
        noun       0.78      0.74      0.76     13634
        verb       0.56      0.25      0.34      8937

    accuracy                           0.56     23701
   macro avg       0.49      0.61      0.45     23701
weighted avg       0.67      0.56      0.58     23701



# Results

|#| Template |Model|Set | Test Size | MAP@1 | MAP@5 | |
|:---:|:---|:---:|:---:|:---:|:---:|:---:|:---|
|1|`The word [A] POS is a [MASK].`|BERT-Large|Entity Test| 5138 |  0.175|14.332||
|2|`The word '[A]' POS is a [MASK].`|BERT-Large|Entity Test| 5138 |  0.019 | 14.896 ||
|3|`[EXAMPLE]. The word [A] POS is a [MASK].`| BERT-Large | Entity Test | 5138 | 1.420 | 13.219 |Examples are from WordNet. WN contain examples for 1903 entities|
|4|`[EXAMPLE]. The word '[A]' POS is a [MASK].`| BERT-Large | Entity Test | 5138 | 10.685| 16.340 |Examples are from WordNet. WN contain examples for 1903 entities|
|5|`[EXAMPLE]. The word '[A]' POS is a [MASK].`| Freq Based Model | Entity Test | 5138 | 47.391 | | This is frequency based probability model. I just calculated tokens probaility for each class and sum them over appeared vocabulary probaility in samples for each class|

In [2]:
import numpy as np

mtx = np.array([[1.14835892],
 [1.07693035],
 [0.57433328],
 [0.04026681],
 [0.04684576],
 [1.54026681],
 [0.37433328],
 [0.62360014],
 [0.37360014],
 [0.37360014]])

In [8]:
mtx.reshape(1, -1)[0]

array([1.14835892, 1.07693035, 0.57433328, 0.04026681, 0.04684576,
       1.54026681, 0.37433328, 0.62360014, 0.37360014, 0.37360014])