### This notebook demonstrates how to evaluate the accuracy of the TF-IDF + nearest-neighbour model on the seen, unseen, and description test sets

In [1]:
from sklearn.neighbors import NearestNeighbors
from TfidfVectorizer import TfidfVectorizer
from tqdm import tqdm
from utils import *

# Unpack the five dataset objects returned by load_data() (from utils).
(
    data,             # fitted as the "all data" corpus in the description-set cell
    data_train,       # fitted as the training corpus
    data_seen_500,    # iterated with .items() in the "seen" evaluation
    data_unseen_500,  # iterated with .items() in the "unseen" evaluation
    data_test,        # iterated with .items() in the description-set evaluation
) = load_data()

### Fit training data

In [2]:
# Build the TF-IDF matrix from the training split only.
vectorizer = TfidfVectorizer(stop_words=None, stemmer=None, norm=False)
X_tfidf = vectorizer.fit_transform(data_train)  # <--- train data is fitted here
words = vectorizer.words  # passed to search() alongside the index below

# Cosine-distance nearest-neighbour index over the TF-IDF matrix.
# fit() returns the estimator itself, so the call can be chained.
knn = NearestNeighbors(metric='cosine').fit(X_tfidf)

# Bare expression so the cell still displays the fitted estimator's repr.
knn

100%|██████████| 129297/129297 [01:52<00:00, 1145.67it/s] 


NearestNeighbors(metric='cosine')

### Test seen data

In [3]:
# Evaluate on data_seen_500: for every (word, entry) pair, query the index
# with the first element of the entry and record what search() returns.
y_pred, y_gold = [], []
for target, definitions in tqdm(data_seen_500.items()):  # <--- seen_500 split
    first_definition = list(definitions)[0]
    query_vec = vectorizer.transform(first_definition)

    # n=1000 is forwarded to search(); presumably the number of candidates
    # returned per query -- confirm against utils.search.
    y_pred.append(search(query_vec, knn, words, n=1000))
    y_gold.append(target)

# Last expression of the cell: the value returned by evaluate() is displayed.
evaluate(y_pred, y_gold)

100%|██████████| 498/498 [02:40<00:00,  3.10it/s]

acc@1: 0.26
acc@10: 0.57
acc@100: 0.85
median rank: 5
standard error of mean rank: 8





(0.25903614457831325,
 0.5662650602409639,
 0.8453815261044176,
 5.0,
 8.06893217546962)

### Test unseen data

In [4]:
# Evaluate on data_unseen_500: for every (word, entry) pair, query the index
# with the first element of the entry and record what search() returns.
y_pred, y_gold = [], []
for target, definitions in tqdm(data_unseen_500.items()):  # <--- unseen_500 split
    first_definition = list(definitions)[0]
    query_vec = vectorizer.transform(first_definition)

    # n=1000 is forwarded to search(); presumably the number of candidates
    # returned per query -- confirm against utils.search.
    y_pred.append(search(query_vec, knn, words, n=1000))
    y_gold.append(target)

# Last expression of the cell: the value returned by evaluate() is displayed.
evaluate(y_pred, y_gold)

100%|██████████| 494/494 [03:00<00:00,  2.74it/s]

acc@1: 0.22
acc@10: 0.47
acc@100: 0.76
median rank: 11
standard error of mean rank: 10





(0.21862348178137653,
 0.47368421052631576,
 0.757085020242915,
 11.0,
 10.06019615818346)

### Test description set (200 descriptions), with the model refit on all data

In [5]:
# Refit the TF-IDF vectorizer, this time on the full dataset.
# NOTE: this rebinds vectorizer / words / knn, which the seen and unseen
# evaluation cells also read. Re-running those cells after this one would
# query the all-data index instead of the train-only index.
vectorizer = TfidfVectorizer(stop_words=None, stemmer=None, norm=False)
X_tfidf = vectorizer.fit_transform(data)  # <--- all data is fitted here
words = vectorizer.words

# Cosine-distance nearest-neighbour index over the new TF-IDF matrix.
# fit() returns the estimator itself, so the call can be chained.
knn = NearestNeighbors(metric='cosine').fit(X_tfidf)

# Evaluate on data_test: for every (word, entry) pair, query the index with
# the first element of the entry and record what search() returns.
y_pred, y_gold = [], []
for target, descriptions in tqdm(data_test.items()):  # <--- desc_200 data
    first_description = list(descriptions)[0]
    query_vec = vectorizer.transform(first_description)

    # n=1000 is forwarded to search(); presumably the number of candidates
    # returned per query -- confirm against utils.search.
    y_pred.append(search(query_vec, knn, words, n=1000))
    y_gold.append(target)

# Last expression of the cell: the value returned by evaluate() is displayed.
evaluate(y_pred, y_gold)

100%|██████████| 129297/129297 [01:49<00:00, 1185.26it/s] 
100%|██████████| 200/200 [01:04<00:00,  3.11it/s]

acc@1: 0.01
acc@10: 0.02
acc@100: 0.19
median rank: 448
standard error of mean rank: 28





(0.005, 0.02, 0.19, 447.5, 27.65448954509918)