### This notebook demonstrates how to evaluate the accuracy of the BM25 + nearest-neighbour retrieval model

In [1]:
from sklearn.neighbors import NearestNeighbors
from BM25Vectorizer import BM25Vectorizer
from tqdm import tqdm
from utils import *

# Unpack the five datasets used below: `data` (everything) and `data_train` are
# each fitted by a BM25 vectorizer; the remaining three are iterated with
# `.items()` in the evaluation loops.
# NOTE(review): load_data is presumably provided by the star import from utils — confirm.
data, data_train, data_seen_500, data_unseen_500, data_test = load_data()

### Fit training data

In [2]:
# Fit the BM25 vectorizer on the training split only.
vectorizer = BM25Vectorizer(norm=False, stemmer=None, stop_words=None)
X_BM25 = vectorizer.fit_transform(data_train)  # <--- fit train data here
words = vectorizer.words  # handed to search() by the evaluation cells

# Fit the cosine-metric nearest-neighbour index on the BM25 output.
# fit() returns the estimator itself, so the chained form binds the fitted model.
knn = NearestNeighbors(metric='cosine').fit(X_BM25)
knn

100%|██████████| 129297/129297 [00:00<00:00, 320909.80it/s]


NearestNeighbors(metric='cosine')

### Test seen data

In [3]:
# Collect one prediction per entry of the seen_500 set.
y_pred, y_gold = [], []
for headword, definitions in tqdm(data_seen_500.items()):  # <--- use seen_500 data here
    # Only the first element of each entry's definition collection is used as the query.
    query_vec = vectorizer.transform(list(definitions)[0])

    # n=1000 is forwarded to search(); presumably the number of candidates returned — verify in utils.
    y_pred.append(search(query_vec, knn, words, n=1000))
    y_gold.append(headword)

# Last expression of the cell, so the returned metrics are also displayed.
evaluate(y_pred, y_gold)

100%|██████████| 498/498 [02:35<00:00,  3.20it/s]

acc@1: 0.28
acc@10: 0.62
acc@100: 0.87
median rank: 4
standard error of mean rank: 7





(0.2791164658634538,
 0.6204819277108434,
 0.8674698795180723,
 4.0,
 6.85071702179288)

### Test unseen data

In [4]:
# Collect one prediction per entry of the unseen_500 set.
y_pred, y_gold = [], []
for headword, definitions in tqdm(data_unseen_500.items()):  # <--- use unseen_500 data here
    # Only the first element of each entry's definition collection is used as the query.
    query_vec = vectorizer.transform(list(definitions)[0])

    # n=1000 is forwarded to search(); presumably the number of candidates returned — verify in utils.
    y_pred.append(search(query_vec, knn, words, n=1000))
    y_gold.append(headword)

# Last expression of the cell, so the returned metrics are also displayed.
evaluate(y_pred, y_gold)

100%|██████████| 494/494 [02:36<00:00,  3.16it/s]

acc@1: 0.26
acc@10: 0.52
acc@100: 0.81
median rank: 8
standard error of mean rank: 10





(0.25708502024291496,
 0.5182186234817814,
 0.8117408906882592,
 8.0,
 10.039759265323937)

### Test on the description set (200 items)

In [5]:
# Refit the BM25 vectorizer, this time on the complete dataset.
# NOTE(review): this rebinds `vectorizer`, `X_BM25`, `words` and `knn`, the same
# names the seen/unseen evaluation cells read. Re-running those cells after this
# one would silently use the all-data fit; restart and run top to bottom.
vectorizer = BM25Vectorizer(norm=False, stemmer=None, stop_words=None)
X_BM25 = vectorizer.fit_transform(data)  # <--- fit all data here
words = vectorizer.words  # handed to search() below

# Fit the cosine-metric nearest-neighbour index on the new BM25 output.
# fit() returns the estimator itself, so the chained form binds the fitted model.
knn = NearestNeighbors(metric='cosine').fit(X_BM25)

# Collect one prediction per entry of the description set.
y_pred, y_gold = [], []
for headword, definitions in tqdm(data_test.items()):  # <--- use desc_200 data here
    # Only the first element of each entry's definition collection is used as the query.
    query_vec = vectorizer.transform(list(definitions)[0])

    # n=1000 is forwarded to search(); presumably the number of candidates returned — verify in utils.
    y_pred.append(search(query_vec, knn, words, n=1000))
    y_gold.append(headword)

# Last expression of the cell, so the returned metrics are also displayed.
evaluate(y_pred, y_gold)

100%|██████████| 129297/129297 [00:00<00:00, 315248.11it/s]
100%|██████████| 200/200 [01:04<00:00,  3.10it/s]

acc@1: 0.01
acc@10: 0.04
acc@100: 0.32
median rank: 230
standard error of mean rank: 27





(0.005, 0.035, 0.315, 230.5, 27.220657759778693)