### This notebook demonstrates how to evaluate the accuracy of TF-IDF + nearest-neighbour word retrieval on the test set

In [1]:
from sklearn.neighbors import NearestNeighbors
from TfidfVectorizer import TfidfVectorizer
from tqdm import tqdm
from utils import *

# Load the main dataset from JSON with cleaning enabled.
# don't forget to set clean=True to clean the data!
data = load_data_from_json('data/data_5d.json', clean=True)
# NOTE(review): the test set is loaded WITHOUT clean=True, so it uses whatever
# the loader's default is — confirm that skipping cleaning here is intentional.
data_test = load_data_from_json('data/test_200.json')

100%|██████████| 103874/103874 [00:10<00:00, 9980.25it/s] 


In [2]:
# --- Index construction ------------------------------------------------------
# Fit the project's TF-IDF vectorizer on `data` and keep the resulting matrix.
vectorizer = TfidfVectorizer(stop_words=None, stemmer=None, norm=False)
X_tfidf = vectorizer.fit_transform(data)
# `words` is handed to `search` below alongside the fitted neighbour index.
words = vectorizer.words

# Nearest-neighbour index over the TF-IDF matrix, using cosine distance.
# `fit` returns the estimator itself, so construction and fitting can be chained.
knn = NearestNeighbors(metric='cosine').fit(X_tfidf)

# --- Evaluation loop ---------------------------------------------------------
# y_pred[i] is the output of `search` for the i-th test entry; y_gold[i] is the
# key of that entry in `data_test`.
y_pred, y_gold = [], []
for target_word, definitions in tqdm(data_test.items()):
    # Only the first element of each entry's value is used as the query.
    # NOTE(review): if `definitions` is an unordered collection (e.g. a set),
    # "first" is arbitrary — confirm the type returned by the loader.
    first_definition = list(definitions)[0]
    query_vector = vectorizer.transform(first_definition)

    # Ask `search` for up to n=1000 results — presumably a ranked list of
    # candidate words; verify against `utils.search`.
    y_pred.append(search(query_vector, knn, words, n=1000))
    y_gold.append(target_word)

# --- Metrics -----------------------------------------------------------------
# `evaluate` returns five values, unpacked in this order.
acc_1, acc_10, acc_100, median_rank, var_rank = evaluate(y_pred, y_gold)

# One line per metric, emitted with a single print call.
print(
    f'acc@1: {acc_1}',
    f'acc@10: {acc_10}',
    f'acc@100: {acc_100}',
    f'median rank: {median_rank}',
    f'var rank: {var_rank}',
    sep='\n',
)

100%|██████████| 95141/95141 [00:42<00:00, 2213.68it/s] 
100%|██████████| 200/200 [00:36<00:00,  5.49it/s]

acc@1: 0.015
acc@10: 0.065
acc@100: 0.29
median rank: 283.5
var rank: 406.42777461069267



