In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import numpy as np
import itertools
import json

In [2]:
def load_data_from_json(fname, use_examples=False):
    '''
    Load a word -> documents dictionary from a local JSON file.

    The file must hold a JSON array of objects, each with a 'word' string and
    a 'definitions' list of strings. An 'examples' list of strings is read
    only when `use_examples` is true.

    Parameters
    ----------
    fname : str or path-like
        Path of the JSON file; it is decoded as UTF-8.
    use_examples : bool, default False
        When true, the entry's example sentences are added to the word's
        documents after every occurrence of the word has been replaced by a
        single space (masking).

    Returns
    -------
    dict
        Maps each word to the set of its unique definitions (plus masked
        examples when requested). If the same word appears in several
        entries, the last entry replaces the earlier ones.

    Notes
    -----
    Prints the number of words and documents and, when at least one document
    exists, max/min/mean/median document length in whitespace-separated tokens.
    '''
    data = {}
    # Decode explicitly as UTF-8 instead of relying on the platform's locale default.
    with open(fname, 'r', encoding='utf-8') as f:
        entries = json.load(f)

    for entry in entries:
        word = entry['word']
        docs = set(entry['definitions'])
        if use_examples:
            # A missing or null 'examples' field is treated as "no examples".
            examples = entry.get('examples') or []
            # masking: plain substring replacement, so the word is also blanked
            # where it occurs inside a longer word
            docs.update(example.replace(word, ' ') for example in examples)
        data[word] = docs

    all_defi = list(itertools.chain.from_iterable(data.values()))
    print(f'num of words: {len(data)}')
    print(f'num of docs: {len(all_defi)}')
    # np.max / np.min raise ValueError on an empty array, so the length
    # statistics are only reported when there is at least one document.
    if all_defi:
        lengths = np.array([len(defi.split()) for defi in all_defi])
        print(f'max doc length: {np.max(lengths)}')
        print(f'min doc length: {np.min(lengths)}')
        print(f'mean doc length: {round(np.mean(lengths), 2)}')
        print(f'median doc length: {np.median(lengths)}')

    return data

In [3]:
# Load the word -> definition-set mapping from data_5d.json; example
# sentences are not added to the documents.
data_5d = load_data_from_json(
    fname='data_5d.json',
    use_examples=False,
)

num of words: 103874
num of docs: 733318
max doc length: 144
min doc length: 1
mean doc length: 11.6
median doc length: 9.0


In [4]:
# Flatten the word -> definitions mapping into two parallel lists:
# X[k] is one definition text and y[k] is the word that definition belongs to.
X = []
y = []
for word, defi_set in data_5d.items():
    # A set of str iterates in a hash-randomized order that can differ between
    # kernel sessions; sorting pins the row order so the indices into X / y /
    # X_tfidf are the same after every restart.
    for defi in sorted(defi_set):
        X.append(defi)
        y.append(word)

# TODO: try implementing the TF-IDF weighting by hand instead of using scikit-learn
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_tfidf = tfidf_vectorizer.fit_transform(X)
print(X_tfidf.shape) # (733318, 105176)

(733318, 105176)


In [5]:
# Free-text description to look up; uncomment one of the alternatives to try it.
query = 'a road that car can go through quickly'
# query = 'a place can hold patients'
# query = 'a place had a lot of books'
# query = 'a food tastes good'
# query = 'very beautiful'

# Map the description into the TF-IDF space fitted on the definitions.
# NOTE: `query` is rebound from the raw string to the transformed matrix here.
query = tfidf_vectorizer.transform([query])

# Index all definition vectors and fetch the 10 closest ones by cosine distance.
knn = NearestNeighbors(metric='cosine', n_neighbors=10).fit(X_tfidf)
dist, nbrs = knn.kneighbors(query)

# Report each hit as (word, cosine similarity) -----> matched definition,
# where similarity = 1 - cosine distance.
for i, score in zip(nbrs[0], 1 - dist[0]):
    word = y[i]
    print(f'({word:<15}, {round(score,3)}) -----> {X[i]}')

(boogie         , 0.694) -----> to move quickly boogied down the road in their car
(jet            , 0.588) -----> to move very quickly
(quickest       , 0.588) -----> most quickly
(zoom           , 0.588) -----> move along very quickly
(fastest        , 0.588) -----> most quickly
(faster         , 0.588) -----> more quickly
(quicker        , 0.588) -----> more quickly
(sharp          , 0.588) -----> quickly
(flirting       , 0.588) -----> to move quickly
(smartly        , 0.588) -----> quickly
