In [None]:
import sys
!pip install transformers
sys.path.append('/content/drive/MyDrive/')

In [4]:
from sklearn.neighbors import NearestNeighbors
import torch.nn.functional as F
from tqdm import tqdm
from utils import *
import pickle
import torch

word2pos, words_noun, words_verb, words_adj, words_adv = load_pos_from_json('/content/drive/MyDrive/word2pos.json')
data, data_train, data_seen_500, data_unseen_500, data_test = load_data()

Num of Noun words: 71185 (78.26%)
Num of Verb words: 13830 (15.20%)
Num of Adj  words: 13445 (14.78%)
Num of Adv  words: 3567 (3.92%)


In [5]:
with open('/content/drive/MyDrive/neural/final_model', 'rb') as f:
    neural_model = pickle.load(f)
    neural_model.eval()
with open('/content/drive/MyDrive/neural/tokenizer', 'rb') as f:
    neural_tokenizer = pickle.load(f)
with open('/content/drive/MyDrive/neural/name', 'rb') as f:
    neural_y = pickle.load(f)
with open('/content/drive/MyDrive/neural/all-word2', 'rb') as f:
    neural_X_embed = pickle.load(f)  # (117659, 768)

In [6]:
seed =0
num = 500
noun_500 = np.random.RandomState(seed).choice(words_noun, num)
verb_500 = np.random.RandomState(seed).choice(words_verb, num)
adj_500 = np.random.RandomState(seed).choice(words_adj, num)
adv_500 = np.random.RandomState(seed).choice(words_adv, num)

### Fit data

In [7]:
# fit knn
knn = NearestNeighbors(metric='cosine')
knn.fit(neural_X_embed)

NearestNeighbors(metric='cosine')

In [8]:
def mean_pooling(model_output, attention_mask):
    # Mean Pooling - Take attention mask into account for correct averaging
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


def get_embed(y_def, model, tokenizer):
    # Tokenize sentences
    encoded_input = tokenizer(y_def, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
        # Perform pooling
        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
        # Normalize embeddings
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        return sentence_embeddings

### Noun

In [13]:
y_pred = []
y_gold = []
for word, defi in tqdm(data.items()):
    if word in noun_500:
        query = list(defi)[0]
        query = get_embed(query, neural_model, neural_tokenizer)

        prediction = search(query, knn, neural_y, n=1000)

        y_pred.append(prediction)
        y_gold.append(word)

evaluate(y_pred, y_gold)

100%|██████████| 93218/93218 [03:25<00:00, 453.87it/s]

acc@1: 0.63
acc@10: 0.73
acc@100: 0.79
median rank: 0
standard error of mean rank: 17





(0.6271929824561403,
 0.7324561403508771,
 0.7916666666666666,
 0.0,
 17.192169264802914)

### Verb

In [12]:
y_pred = []
y_gold = []
for word, defi in tqdm(data.items()):
    if word in verb_500:
        query = list(defi)[0]
        query = get_embed(query, neural_model, neural_tokenizer)

        prediction = search(query, knn, neural_y, n=1000)

        y_pred.append(prediction)
        y_gold.append(word)

evaluate(y_pred, y_gold)

100%|██████████| 93218/93218 [03:26<00:00, 450.49it/s] 

acc@1: 0.27
acc@10: 0.36
acc@100: 0.49
median rank: 140
standard error of mean rank: 22





(0.27114967462039047,
 0.3579175704989154,
 0.4945770065075922,
 140.0,
 22.26292442056393)

### Adj

In [10]:
y_pred = []
y_gold = []
for word, defi in tqdm(data.items()):
    if word in adj_500:
        query = list(defi)[0]
        query = get_embed(query, neural_model, neural_tokenizer)

        prediction = search(query, knn, neural_y, n=1000)

        y_pred.append(prediction)
        y_gold.append(word)

evaluate(y_pred, y_gold)

100%|██████████| 93218/93218 [03:36<00:00, 430.90it/s]  

acc@1: 0.31
acc@10: 0.43
acc@100: 0.54
median rank: 39
standard error of mean rank: 22





(0.3050847457627119,
 0.4322033898305085,
 0.5444915254237288,
 39.0,
 21.540076115360776)

### Noun

In [11]:
y_pred = []
y_gold = []
for word, defi in tqdm(data.items()):
    if word in adv_500:
        query = list(defi)[0]
        query = get_embed(query, neural_model, neural_tokenizer)

        prediction = search(query, knn, neural_y, n=1000)

        y_pred.append(prediction)
        y_gold.append(word)

evaluate(y_pred, y_gold)

100%|██████████| 93218/93218 [02:46<00:00, 560.85it/s] 

acc@1: 0.34
acc@10: 0.48
acc@100: 0.64
median rank: 12
standard error of mean rank: 21





(0.3423913043478261,
 0.4782608695652174,
 0.6385869565217391,
 12.5,
 21.483948458261235)