In [None]:
import json

import torch
import torch.nn as nn
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

# Fix PyTorch's global RNG seed so that any random operations in this notebook repeat across runs.
torch.manual_seed(7)

In [None]:
# Both the vocabulary and the weights are read from the same pre-trained model directory.
bert_dir = '../uncased_L-12_H-768_A-12'

# Tokenizer (vocabulary) first, then the encoder (weights).
tokenizer = BertTokenizer.from_pretrained(bert_dir)
model = BertModel.from_pretrained(bert_dir)

In [None]:
def tokenize(sent):
    """Turn one sentence into the inputs expected by the BERT model.

    The sentence is wrapped in the ``[CLS]`` / ``[SEP]`` markers, split with
    the module-level ``tokenizer`` and mapped to vocabulary ids.

    Args:
        sent: the sentence text (str).

    Returns:
        A ``(tokens, token_ids, segment_ids)`` tuple. ``segment_ids`` holds a
        zero for every token id, i.e. the whole input is treated as a single
        first sentence.
    """
    wrapped = "".join(("[CLS] ", sent, " [SEP]"))

    pieces = tokenizer.tokenize(wrapped)
    piece_ids = tokenizer.convert_tokens_to_ids(pieces)

    # Single-sentence input: every position belongs to segment 0.
    segment_ids = [0 for _ in piece_ids]

    return pieces, piece_ids, segment_ids


def get_representation(indexed_tokens, segments_ids):
    """Run one tokenized sentence through the module-level BERT ``model``.

    Args:
        indexed_tokens: vocabulary ids of a single sentence (list of int).
        segments_ids: segment id for every token, same length as
            ``indexed_tokens``.

    Returns:
        ``encoded_layers[0]``: the first element of the encoder outputs
        returned by ``model``. The tensor lives on the GPU when CUDA is
        available and on the CPU otherwise.
        NOTE(review): callers index the result as ``result[0][token_idx]``,
        so this is presumably the first encoder layer with a leading batch
        axis of size 1 -- confirm the first (not the last) layer is intended.
    """
    # Use the GPU only when one is actually present; the previous hard-coded
    # 'cuda' made the function unusable on CPU-only machines.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # The extra list level adds the batch dimension the model expects.
    tokens_tensor = torch.tensor([indexed_tokens]).to(device)
    segments_tensors = torch.tensor([segments_ids]).to(device)

    model.to(device)
    # Inference mode: disables dropout so the same input always yields the
    # same embedding (torch.no_grad() alone does not switch dropout off).
    model.eval()

    # Predict hidden states features for each layer
    with torch.no_grad():
        encoded_layers, _ = model(tokens_tensor, segments_tensors)

    return encoded_layers[0]

In [None]:
# Stack the collected embedding tensors in `vectors` into a single tensor.
# NOTE(review): none of the cells visible above assigns `vectors`; the only visible
# assignment is in a later cell, which sets it to an empty list. On a fresh kernel
# (Restart & Run All) this cell therefore fails -- confirm where `vectors` is filled.
vectors = torch.stack(vectors)

In [None]:
# Embed the query sentence and pick out the vector of the target word.
sents = ["I try to finish the project."]

sent = sents[0].lower().strip()
tokenized_text, indexed_tokens, segments_ids = tokenize(sent)
embeddings = get_representation(indexed_tokens, segments_ids)

index = tokenized_text.index('try')
target_emb = embeddings[0][index]

# Cosine similarity between the target vector and every row of `vectors`.
# NOTE(review): `vectors`, `indices`, `examples` and `info_pairs` are not defined in
# this cell -- they must already exist in the kernel.
cos = nn.CosineSimilarity(dim=1, eps=1e-6)
output = cos(target_emb.unsqueeze(0), vectors)

# Report every stored example whose similarity exceeds the 0.7 threshold.
matches = output > 0.7
for i, is_match in enumerate(matches.tolist()):
    if not is_match:
        continue
    example_idx = indices[i]
    print(examples[example_idx])
    print(info_pairs[example_idx])
    print(output[i:i+5])

In [None]:
#!/usr/bin/env python
# coding: utf-8
from utils.config import level_table
from itertools import product

class EVP:
    """Lookup table from a vocabulary entry to its level label.

    The table is loaded from a tab-separated file with five columns per line
    (vocab, level, part-of-speech, guide word, href). When a vocabulary entry
    occurs on several lines, the level with the smallest ``level_table`` value
    is kept.
    """

    def __init__(self, path='./data/cambridge.dict.slim.txt'):
        """Load the vocabulary/level table.

        Args:
            path: location of the tab-separated dictionary file. Defaults to
                the path that was previously hard-coded.

        Raises:
            ValueError: if a non-blank line does not have exactly five
                tab-separated fields.
            KeyError: if a level label is missing from ``level_table``.
        """
        self.vocab_level = {}

        # `with` guarantees the file handle is closed, also on errors.
        with open(path, 'r', encoding='utf8') as dict_file:
            for line in dict_file:
                line = line.rstrip('\n')
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue

                vocab, level, _poss, _gw, _href = line.split('\t')

                known = self.vocab_level.get(vocab)
                if known is None or level_table[level] < level_table[known]:
                    self.vocab_level[vocab] = level

    def lookup(self, vocab):
        """Return the level label stored for ``vocab``, or ``None`` if unknown."""
        return self.vocab_level.get(vocab)

    
def duplicate_sent(sent):
    """Expand slash-separated alternatives in a sentence into every variant.

    Backslashes are removed first, then the sentence is split on whitespace.
    A token such as ``a/b`` contributes the choices ``a`` and ``b``; the result
    lists every combination of choices, joined with single spaces, in
    ``itertools.product`` order.

    Args:
        sent: the sentence text (str).

    Returns:
        A list of sentence strings; a sentence without any ``/`` yields a
        one-element list.
    """
    cleaned = sent.replace('\\', '')
    # str.split('/') already returns a one-element list when there is no slash.
    choices = (word.split('/') for word in cleaned.split())
    return [' '.join(variant) for variant in product(*choices)]

from collections import defaultdict
import json

# For every sense in the dictionary, embed the headword inside its first example
# sentence and store the embedding together with the sense's level and definition.
vocab_level = defaultdict(list)

# `with` closes the file handle once the JSON has been parsed.
with open('data/cambridge.dict.json', 'r', encoding='utf8') as dict_file:
    dictionary = json.load(dict_file)

# NOTE(review): these two lists were re-created for every sense and never filled;
# they are kept (created once) because other cells read names `vectors`/`indices`.
# Confirm whether the embeddings were meant to be collected here.
indices, vectors = [], []

for vocab in dictionary:
    for pos in dictionary[vocab]:
        for each in dictionary[vocab][pos]:
            # A sense without an example sentence has nothing to embed.
            if not each['dic_examples']:
                continue

            # Use the first expansion of the first example sentence.
            example = duplicate_sent(each['dic_examples'][0])[0].lower().strip()
            tokenized_text, indexed_tokens, segments_ids = tokenize(example)

            # The headword may be absent from the tokens (inflected form,
            # multi-word entry, word-piece split); list.index() would raise
            # ValueError, so skip such senses before running the model.
            if vocab not in tokenized_text:
                continue

            embeddings = get_representation(indexed_tokens, segments_ids)

            index = tokenized_text.index(vocab)
            word_emb = embeddings[0][index]

            vocab_level[vocab].append({
                'level': each['level'],
                'definition': each['definition'],
                'emb': word_emb,
                'example': example
            })

In [None]:
# Show the mapping built in the previous cell.
# NOTE(review): this renders every entry, including the stored 'emb' values; consider
# displaying a small sample instead.
vocab_level

In [13]:
from gensim.models import KeyedVectors
from utils.EVP import Evp
from utils.config import level_table

# Load word2vec vectors in binary format for the similarity-based recommender below.
# NOTE(review): this rebinds the global `model`, which get_representation() reads as its
# BERT model; after this cell runs, the BERT cells need `model` reloaded first.
# NOTE(review): the path is absolute and machine-specific.
model = KeyedVectors.load_word2vec_format('/atom/word_vectors/GoogleNews-vectors-negative300.bin', binary=True)

In [22]:
def recommend_vocab(vocab, topn=100, max_results=10):
    """Recommend similar words that have a higher level than ``vocab``.

    Candidates are the ``topn`` nearest neighbours of ``vocab`` according to
    the module-level ``model``. A candidate is kept when it is known to
    ``Evp``, its ``level_table`` value is greater than that of ``vocab`` and
    it shares at least one part of speech with ``vocab``.

    Args:
        vocab: the word to find recommendations for.
        topn: number of nearest neighbours requested from ``model``
            (default 100, the previously hard-coded value).
        max_results: maximum number of recommendations returned
            (default 10, the previously hard-coded value).

    Returns:
        A list of ``{'vocab': ..., 'level': ...}`` dicts in the neighbour
        order returned by ``model``; empty when ``vocab`` is unknown to
        ``Evp`` or absent from ``model``.
    """
    if not Evp.vocab_exists(vocab):
        return []

    level, poss = Evp.get_level(vocab), Evp.get_pos(vocab)

    if vocab not in model:
        return []

    recs = []
    for sim, _score in model.similar_by_word(vocab, topn=topn):
        if not Evp.vocab_exists(sim):
            continue

        # Look the candidate's level up once and reuse it for the comparison and the result.
        sim_level = Evp.get_level(sim)
        if level_table[sim_level] > level_table[level] and len(Evp.get_pos(sim) & poss) > 0:
            recs.append({'vocab': sim, 'level': sim_level})

    return recs[:max_results]

In [25]:
# Example query: similar words to 'eat' that have a higher level than 'eat' itself.
recommend_vocab('eat')

[{'vocab': 'consume', 'level': 'B2'}, {'vocab': 'chew', 'level': 'B2'}]

In [None]:
# Example call. `Evp.lookup_by_sense` comes from utils.EVP and is not shown in this
# notebook -- presumably it resolves which sense of 'set' the sentence uses; verify.
Evp.lookup_by_sense('set', 'he sets up the house.')