### This notebook demonstrates how to evaluate accuracy

In [7]:
from sklearn.neighbors import NearestNeighbors
from TfidfVectorizer import TfidfVectorizer
from tqdm import tqdm
from utils import *

# don't forget to set clean=True to clean the data!
data = load_data_from_json('data/data_5d.json', clean=True, use_examples=True)
data_test = load_data_from_json('data/test_200.json')

100%|██████████| 103874/103874 [00:04<00:00, 22702.00it/s]


In [8]:
data_train, data_seen_500, data_unseen_500 = split_seen_unseen(data)
print(len(list(itertools.chain(*data_train.values()))))
print(len(list(itertools.chain(*data_seen_500.values()))))
print(len(list(itertools.chain(*data_unseen_500.values()))))

100%|██████████| 90995/90995 [00:12<00:00, 7339.61it/s] 


576804
500
500


### Fit training data

In [9]:
# fit tfidf vectorizer
vectorizer = TfidfVectorizer(stop_words=None, stemmer=None, norm=False)
X_tfidf = vectorizer.fit_transform(data_train) # <--- fit train data here
words = vectorizer.words

# fit knn
knn = NearestNeighbors(metric='cosine')
knn.fit(X_tfidf)

100%|██████████| 96222/96222 [00:24<00:00, 3865.49it/s] 


NearestNeighbors(metric='cosine')

### Test seen data

In [10]:
y_pred = []
y_gold = []
for word, defi in tqdm(data_seen_500.items()): # <--- use seen_500 data here
    query = list(defi)[0]
    query = vectorizer.transform(query)

    prediction = search(query, knn, words, n=1000)

    y_pred.append(prediction)
    y_gold.append(word)

evaluate(y_pred, y_gold)

100%|██████████| 475/475 [00:57<00:00,  8.20it/s]

acc@1: 0.37
acc@10: 0.67
acc@100: 0.93
median rank: 2
standard error of mean rank: 3





(0.3684210526315789,
 0.6652631578947369,
 0.9284210526315789,
 2.0,
 3.226944942428781)

### Test unseen data

In [11]:
y_pred = []
y_gold = []
for word, defi in tqdm(data_unseen_500.items()): # <--- use unseen_500 data here
    query = list(defi)[0]
    query = vectorizer.transform(query)

    prediction = search(query, knn, words, n=1000)

    y_pred.append(prediction)
    y_gold.append(word)

evaluate(y_pred, y_gold)

100%|██████████| 485/485 [00:54<00:00,  8.85it/s]

acc@1: 0.11
acc@10: 0.32
acc@100: 0.6
median rank: 42
standard error of mean rank: 17





(0.10721649484536082,
 0.3237113402061856,
 0.6041237113402061,
 42.0,
 17.23568458253458)

### Test description set (200) data

In [12]:
# fit tfidf vectorizer
vectorizer = TfidfVectorizer(stop_words=None, stemmer=None, norm=False)
X_tfidf = vectorizer.fit_transform(data) # <--- fit all data here
words = vectorizer.words

# fit knn
knn = NearestNeighbors(metric='cosine')
knn.fit(X_tfidf)

y_pred = []
y_gold = []
for word, defi in tqdm(data_test.items()): # <--- use desc_200 data here
    query = list(defi)[0]
    query = vectorizer.transform(query)

    prediction = search(query, knn, words, n=1000)

    y_pred.append(prediction)
    y_gold.append(word)

evaluate(y_pred, y_gold)

100%|██████████| 96254/96254 [00:25<00:00, 3834.82it/s] 
100%|██████████| 200/200 [00:35<00:00,  5.71it/s]

acc@1: 0.02
acc@10: 0.06
acc@100: 0.28
median rank: 316
standard error of mean rank: 29





(0.02, 0.06, 0.285, 316.5, 28.616604585450037)