In [None]:
import json

import torch
import torch.nn as nn
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

# Seed PyTorch's random number generator with a fixed value.
# NOTE(review): only torch is seeded here; NumPy's RNG (used further down to draw
# the UNKNOWN_TOKEN vector) is not seeded by this call.
torch.manual_seed(7)

In [None]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('../uncased_L-12_H-768_A-12')

# Load pre-trained model (weights)
model = BertModel.from_pretrained('../uncased_L-12_H-768_A-12')

# Put the model in evaluation mode so dropout is disabled; without this the
# extracted hidden states differ from run to run for the same sentence.
# The trailing semicolon keeps the notebook from echoing the whole module repr.
model.eval();

In [None]:
def tokenize(sent):
    """Prepare a single sentence as BERT input.

    The sentence is wrapped in the ``[CLS]`` / ``[SEP]`` markers, split into
    tokens with the notebook-level ``tokenizer`` and mapped to vocabulary ids.

    Args:
        sent: the sentence text.

    Returns:
        A tuple ``(tokenized_text, indexed_tokens, segments_ids)`` where
        ``segments_ids`` holds one ``0`` per token (everything belongs to
        sentence A).
    """
    # Add the special boundary markers around the sentence.
    marked_text = " ".join(("[CLS]", sent, "[SEP]"))

    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Single-sentence input: every position gets segment id 0.
    segments_ids = [0 for _ in indexed_tokens]

    return tokenized_text, indexed_tokens, segments_ids


def get_representation(indexed_tokens, segments_ids):
    """Run the notebook-level BERT ``model`` on one tokenized sentence.

    Args:
        indexed_tokens: list of vocabulary ids for the sentence.
        segments_ids: list of segment ids, same length as ``indexed_tokens``.

    Returns:
        The first element of the encoder output returned by the model.
        Callers index ``[0]`` on it to drop the batch dimension and get one
        vector per token.
        NOTE(review): presumably this is the first encoder layer's hidden
        states rather than the last — confirm that is the intended layer.
    """
    # Use the GPU when one is present, otherwise fall back to the CPU so the
    # notebook still runs on machines without CUDA.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Wrap each id list in an outer list to add the batch dimension (batch size 1).
    tokens_tensor = torch.tensor([indexed_tokens], device=device)
    segments_tensors = torch.tensor([segments_ids], device=device)
    model.to(device)

    # Inference only: no gradients are needed for feature extraction.
    with torch.no_grad():
        encoded_layers, _ = model(tokens_tensor, segments_tensors)
    return encoded_layers[0]

In [None]:
# Read the dictionary JSON; the context manager closes the file handle
# as soon as parsing is done instead of leaving it open for the kernel's lifetime.
with open('data/dict.json', 'r', encoding='utf8') as dict_file:
    dictionary = json.load(dict_file)

In [None]:
# Flatten the nested dictionary (word -> pos -> list of sense entries) into two
# parallel lists: info_pairs[k] is (word, definition) and examples[k] is the
# first example sentence of that sense, or "" when the sense has none.
info_pairs, examples = [], []

for word in dictionary:
    for pos in dictionary[word]:
        for each in dictionary[word][pos]: # list
            info_pairs.append((word, each['definition']))
            try:
                examples.append(each['dic_examples'][0])
            except (KeyError, IndexError, TypeError):
                # Missing key, empty list or a null value: no example available.
                # Only these lookup failures are swallowed, so interrupts and
                # unrelated errors still surface.
                examples.append("")

# The two lists are indexed together later, so they must stay aligned.
assert len(info_pairs) == len(examples)

In [None]:
# Embed every example sentence and keep one vector per token.
# indices[k] records which example the k-th vector came from.
indices, vectors = [], []

for i, example in enumerate(examples):
    tokenized_text, indexed_tokens, segments_ids = tokenize(example.lower().strip())
    embeddings = get_representation(indexed_tokens, segments_ids)

    # embeddings[0] drops the batch dimension; iterating it yields one vector per token.
    token_vectors = list(embeddings[0])
    indices.extend([i] * len(token_vectors))
    vectors.extend(token_vectors)

In [None]:
# Combine the collected per-token vectors into a single tensor so the query
# vector can be compared against all of them in one call.
# NOTE(review): this rebinds `vectors` from a list to a tensor.
vectors = torch.stack(vectors)

In [None]:
# Query: embed one sentence, take the vector of the token 'try' and list every
# dictionary example token whose cosine similarity to it exceeds 0.7.
sents = ["I try to finish the project."]

sent = sents[0].lower().strip()
tokenized_text, indexed_tokens, segments_ids = tokenize(sent)
embeddings = get_representation(indexed_tokens, segments_ids)

# Position of the target token inside the tokenized query sentence.
index = tokenized_text.index('try')
target_emb = embeddings[0][index]

cos = nn.CosineSimilarity(dim=1, eps=1e-6)
# unsqueeze(0) gives the target a leading dimension so it is compared against every row of `vectors`.
output = cos(target_emb.unsqueeze(0), vectors)

matches = output > 0.7
for i, is_match in enumerate(matches):
    if not is_match:
        continue
    example_idx = indices[i]
    print(examples[example_idx])
    print(info_pairs[example_idx])
    print(output[i:i+5])

### Use word2vec

In [None]:
import gzip 
import numpy as np

# Build a word -> row-index mapping and the matching list of vectors from a
# gzip-compressed text embeddings file (one "word v1 v2 ..." entry per line).
# NOTE(review): `embeddings` was bound to a BERT output tensor in an earlier
# cell; it is rebound to a plain list here.
word2Idx = {}
embeddings = []
embeddingsDimension = None

# The context manager closes the gzip handle even if parsing fails part-way.
with gzip.open('../embeddings/word2vec.txt.gz', "rt", encoding='utf8') as embeddingsIn:
    for line in embeddingsIn:
        split = line.rstrip().split(" ")
        word = split[0]

        # The first line read fixes the expected dimensionality.
        if embeddingsDimension is None:
            embeddingsDimension = len(split) - 1

        if (len(split) - 1) != embeddingsDimension:  # Assure that all lines in the embeddings file are of the same length
            print("ERROR: A line in the embeddings file had more or less  dimensions than expected. Skip token.")
            continue

        if len(word2Idx) == 0:  # Add padding+unknown
            word2Idx["PADDING_TOKEN"] = len(word2Idx)
            vector = np.zeros(embeddingsDimension)
            embeddings.append(vector)

            word2Idx["UNKNOWN_TOKEN"] = len(word2Idx)
            vector = np.random.uniform(-0.25, 0.25, embeddingsDimension)  # Alternativ -sqrt(3/dim) ... sqrt(3/dim)
            embeddings.append(vector)

        # Only the first occurrence of a word is kept; parse the numbers only
        # when the vector is actually going to be stored.
        if word not in word2Idx:
            vector = np.array([float(num) for num in split[1:]])
            embeddings.append(vector)
            word2Idx[word] = len(word2Idx)

In [None]:
# Sanity check: show the index assigned to 'apple', which is also the position
# of its vector in the `embeddings` list built by the loader cell.
word2Idx['apple']

### Gensim

In [1]:
# from utils.Dictionary import Dictionary
from gensim.models import KeyedVectors

# dictionary = Dictionary()

In [24]:
#!/usr/bin/env python
# coding: utf-8

from collections import defaultdict
from utils.config import level_table


# ### Just lookup dictionary directly (ignore POS)

class Dictionary:
    """Vocabulary-level lookup built from a tab-separated word list plus word2vec vectors.

    Attributes:
        vocab_dict: ``vocab -> pos -> level`` as read from the word list.
        vocab_to_pos: ``vocab -> [pos, ...]`` in file order.
        pos_to_vocabs: ``pos -> [vocab, ...]`` in file order.
        word2vec: gensim ``KeyedVectors`` used to find similar words.
    """

    def __init__(self,
                 dict_path='./data/dict.slim.txt',
                 word2vec_path="/atom/word_vectors/GoogleNews-vectors-negative300.bin"):
        """Load the word list and the binary word2vec model.

        Args:
            dict_path: tab-separated file with five columns per line
                (vocab, level, part-of-speech list, gw, href).
            word2vec_path: word2vec model in the C binary format.

        Raises:
            ValueError: if a non-blank line does not have exactly five
                tab-separated fields.
        """
        self.vocab_dict = defaultdict(lambda: defaultdict(list))
        self.vocab_to_pos = defaultdict(list)
        self.pos_to_vocabs = defaultdict(list)
        self.word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)  # C bin format

        # Context manager so the word list handle is closed once it has been read.
        with open(dict_path, 'r', encoding='utf8') as dict_file:
            for line in dict_file:
                # Drop the line terminator so the last field is clean, and
                # skip blank lines instead of failing on the unpack below.
                line = line.rstrip('\r\n')
                if not line.strip():
                    continue

                vocab, level, poss, gw, href = line.split('\t')

                # The part-of-speech column may list several tags separated by ',' or ';'.
                for pos in poss.replace(";", ",").split(','):
                    pos = pos.strip().lower()
                    self.vocab_dict[vocab][pos] = level
                    self.vocab_to_pos[vocab].append(pos)
                    self.pos_to_vocabs[pos].append(vocab)

    def lookup(self, vocab):
        """Return the levels recorded for ``vocab``, one per part of speech.

        Returns:
            A list of levels in the order the parts of speech were read, or
            ``None`` when ``vocab`` is not in the word list.
        """
        if vocab not in self.vocab_dict:
            return None

        return [self.vocab_dict[vocab][pos] for pos in self.vocab_to_pos[vocab]]

    def recommend(self, vocab):
        """Print word-list entries that share a part of speech with ``vocab``
        and appear among its 100 nearest word2vec neighbours.

        For each such neighbour the word is printed, followed by a set that
        mixes its part-of-speech tags and levels.

        Returns:
            ``None`` in every case (also when ``vocab`` is missing from the
            word list or from the word2vec vocabulary).
        """
        if vocab not in self.vocab_dict:
            return None

        # Every word that shares at least one part of speech with `vocab`.
        # NOTE(review): tags and levels are flattened into one set, so the
        # pairing between a tag and its level is lost — confirm this is wanted.
        candidates = defaultdict(set)
        for pos in self.vocab_to_pos[vocab]:
            for word in self.pos_to_vocabs[pos]:
                candidates[word].update((pos, self.vocab_dict[word][pos]))

        try:
            neighbours = self.word2vec.similar_by_word(vocab, topn=100)
        except KeyError:
            # The word is in the word list but has no word2vec vector;
            # treat it like any other "nothing to recommend" case.
            return None

        for sim, score in neighbours:
            if sim in candidates:
                print(sim)
                print(candidates[sim])


In [25]:
# Build the lookup object; the constructor reads the word list and the word2vec model.
# NOTE(review): this rebinds `dictionary`, which an earlier cell bound to the
# parsed contents of data/dict.json.
dictionary = Dictionary()

In [26]:
# Show the levels recorded for 'try'.
# NOTE(review): the saved output under this cell is a series of printed words
# and sets, which looks like what recommend() prints rather than the list
# lookup() returns — re-run the cell to confirm.
dictionary.lookup('try')

want
{'verb', 'A1'}
attempt
{'verb', 'B2', 'B1', 'noun'}
let
{'verb', 'B1'}
do
{'verb', 'A2'}
help
{'verb', 'B2', 'noun'}
need
{'verb', 'B2', 'B1', 'noun'}
seek
{'verb', 'C2'}
hope
{'verb', 'A2', 'B1', 'noun'}
strive
{'verb', 'C2'}
can
{'A2', 'noun'}
chance
{'B2', 'noun'}
aim
{'B1', 'noun'}
intend
{'verb', 'B1'}
think
{'verb', 'B2'}
urge
{'verb', 'C2', 'noun'}


In [None]:
can