In [12]:
import json

import torch
import torch.nn as nn
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

torch.manual_seed(7)

<torch._C.Generator at 0x7fc87b1e5f50>

In [13]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('../uncased_L-12_H-768_A-12')

# Load pre-trained model (weights)
model = BertModel.from_pretrained('../uncased_L-12_H-768_A-12')

In [14]:
def tokenize(sent):
    text = "[CLS] " + sent + " [SEP]"

    # Tokenize
    tokenized_text = tokenizer.tokenize(text)
    
    # Convert token to vocabulary indices
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
    segments_ids = [0]*len(indexed_tokens)

    return tokenized_text, indexed_tokens, segments_ids


def get_representation(indexed_tokens, segments_ids):
    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    # If you have a GPU, put everything on cuda
    tokens_tensor = tokens_tensor.to('cuda')
    segments_tensors = segments_tensors.to('cuda')
    model.to('cuda')
    
    # Predict hidden states features for each layer
    with torch.no_grad():
        encoded_layers, _ = model(tokens_tensor, segments_tensors)
        return encoded_layers[0]

In [None]:
vectors = torch.stack(vectors)

In [None]:
sents = ["I try to finish the project."]

sent = sents[0].lower().strip()
tokenized_text, indexed_tokens, segments_ids = tokenize(sent)
embeddings = get_representation(indexed_tokens, segments_ids)

index = tokenized_text.index('try')
target_emb = embeddings[0][index]

cos = nn.CosineSimilarity(dim=1, eps=1e-6)
output = cos(target_emb.unsqueeze(0), vectors)
    
matches = output > 0.7
if any(matches):
    for i, is_match in enumerate(matches):
        if is_match:
            example_idx = indices[i]
            print(examples[example_idx])
            print(info_pairs[example_idx])
            print(output[i:i+5])

In [21]:
#!/usr/bin/env python
# coding: utf-8
from utils.config import level_table
from itertools import product

class EVP:

    def __init__(self):
        self.vocab_level = {}

        for line in open('./data/cambridge.dict.slim.txt', 'r', encoding='utf8'):
            vocab, level, poss, gw, href = line.split('\t')

            if (vocab not in self.vocab_level or
               level_table[level] < level_table[self.vocab_level[vocab]]):
                self.vocab_level[vocab] = level

    def lookup(self, vocab):
        if vocab not in self.vocab_level:
            return None

        return self.vocab_level[vocab]

    
def duplicate_sent(sent):
    sent = sent.replace('\\', '')
    tokens = []
    for token in sent.split():
        tokens.append(token.split('/') if '/' in token else [token])

    composes = product(*tokens)
    sents = [' '.join(compose) for compose in composes]
    
    return sents

from collections import defaultdict
import json

vocab_level = defaultdict(list)
dictionary = json.load(open('data/cambridge.dict.json', 'r', encoding='utf8'))

for vocab in dictionary:
    for pos in dictionary[vocab]:
        for each in dictionary[vocab][pos]:
            indices, vectors = [], []

            example = each['dic_examples'][0] if each['dic_examples'] else ''
            example = duplicate_sent(example)[0].lower().strip()
            tokenized_text, indexed_tokens, segments_ids = tokenize(example)

            embeddings = get_representation(indexed_tokens, segments_ids)

            index = tokenized_text.index(vocab)
            word_emb = embeddings[0][index]
            
            vocab_level[vocab].append({
                'level': each['level'],
                'definition': each['definition'],
                'emb': word_emb,
                'example': example
            })

['[CLS]', 'she', "'", 's', 'got', 'a', 'boyfriend', '.', '[SEP]']
9
['[CLS]', 'they', 'were', 'forced', 'to', 'abandon', 'the', 'car', '.', '[SEP]']
10
['[CLS]', 'an', 'abandoned', 'house', '[SEP]']
5
['[CLS]', 'she', 'had', 'the', 'ability', 'to', 'explain', 'things', 'clearly', '.', '[SEP]']
11
['[CLS]', 'he', "'", 'll', 'be', 'able', 'to', 'help', 'you', '.', '[SEP]']
11
['[CLS]', 'abnormal', 'behavior', '[SEP]']
4
['[CLS]', 'once', 'everyone', 'was', 'aboard', 'the', 'plane', ',', 'the', 'doors', 'were', 'closed', 'and', 'the', 'crew', 'prepared', 'for', 'takeoff', '.', '[SEP]']
20
['[CLS]', 'i', 'think', 'bull', '##fighting', 'should', 'be', 'abolished', '.', '[SEP]']


ValueError: 'abolish' is not in list

In [7]:
vocab_level

defaultdict(list,
            {'a': [{'level': 'A1',
               'definition': 'used before a noun to refer to a single thing or person but not a particular thing or person or not one that you have referred to before',
               'example': "She's got a boyfriend."},
              {'level': 'A1',
               'definition': 'used to mean any or every thing or person of the type you are referring to',
               'example': 'Can you ride a bike?'},
              {'level': 'A1',
               'definition': 'one',
               'example': 'a hundred dollars'},
              {'level': 'A1',
               'definition': 'used to state what type of thing or person something or someone is',
               'example': "Sally's an engineer."},
              {'level': 'A2',
               'definition': 'used before some phrases saying how much of something there is',
               'example': 'a few days'},
              {'level': 'A2',
               'definition': 'used before some 