### This notebook demonstrates how to evaluate the accuracy of the bag-of-words (BOW) nearest-neighbour model at retrieving a word from its definition

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors
from BOW import BOW
from tqdm import tqdm
from utils import *

# Load every split in one call; the names below are used by the later cells.
(
    data,             # full data set (used when fitting on all data)
    data_train,       # training split used to fit the BOW + kNN model
    data_seen_500,    # "seen_500" evaluation split
    data_unseen_500,  # "unseen_500" evaluation split
    data_test,        # description set ("desc_200") evaluation split
) = load_data()

### Fit the model on the training data

In [2]:
# Turn the training split into text samples (X) and their parallel labels
# (words) -- presumably one target word per sample; confirm in BOW.
bow = BOW(stop_words=None, stemmer=None, norm=False)
X, words = bow.get_training_samples(data_train)

# Learn the vocabulary from the samples and build the count matrix.
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(X)

# Index the count vectors for cosine-distance nearest-neighbour lookup.
# fit() returns the estimator itself, so it can be chained; the bare `knn`
# on the last line keeps the estimator repr as the cell output.
knn = NearestNeighbors(metric="cosine").fit(X_bow)
knn

NearestNeighbors(metric='cosine')

### Test on seen data

In [3]:
# Evaluate retrieval on the seen_500 split. Each item pairs a gold word with
# its definitions (presumably a collection -- confirm against load_data);
# only the first element is used as the query text.
y_pred, y_gold = [], []
for target_word, definitions in tqdm(data_seen_500.items()):  # <--- use seen_500 data here
    first_definition = list(definitions)[0]
    query_vector = vectorizer.transform([bow.preprocess(first_definition)])

    # Request n=1000 candidates from the fitted kNN index for this query.
    y_pred.append(search(query_vector, knn, words, n=1000))
    y_gold.append(target_word)

# evaluate() prints acc@1/10/100, median rank and the standard error of the
# mean rank; as the last expression its returned tuple is displayed too.
evaluate(y_pred, y_gold)

100%|██████████| 498/498 [03:00<00:00,  2.76it/s]

acc@1: 0.23
acc@10: 0.32
acc@100: 0.47
median rank: 146
standard error of mean rank: 20





(0.22690763052208834,
 0.3232931726907631,
 0.4738955823293173,
 145.5,
 20.15444121229428)

### Test on unseen data

In [4]:
# Evaluate retrieval on the unseen_500 split. Each item pairs a gold word with
# its definitions (presumably a collection -- confirm against load_data);
# only the first element is used as the query text.
y_pred, y_gold = [], []
for target_word, definitions in tqdm(data_unseen_500.items()):  # <--- use unseen_500 data here
    first_definition = list(definitions)[0]
    query_vector = vectorizer.transform([bow.preprocess(first_definition)])

    # Request n=1000 candidates from the fitted kNN index for this query.
    y_pred.append(search(query_vector, knn, words, n=1000))
    y_gold.append(target_word)

# evaluate() prints acc@1/10/100, median rank and the standard error of the
# mean rank; as the last expression its returned tuple is displayed too.
evaluate(y_pred, y_gold)

100%|██████████| 494/494 [02:59<00:00,  2.76it/s]

acc@1: 0.2
acc@10: 0.31
acc@100: 0.41
median rank: 347
standard error of mean rank: 21





(0.19838056680161945,
 0.3076923076923077,
 0.4089068825910931,
 347.0,
 20.827406440544788)

### Test on the description set (200 items)

In [5]:
# Fit a separate BOW + kNN pipeline on ALL data for the description-set test.
# Distinct *_all names are used on purpose: rebinding `bow`, `vectorizer`,
# `knn` and `words` here would replace the train-only model, so re-running the
# seen/unseen cells afterwards would silently evaluate them against a model
# fitted on the full data set instead of the training split.
bow_all = BOW(stop_words=None, stemmer=None, norm=False)
X_all, words_all = bow_all.get_training_samples(data)  # <--- fit all data here
vectorizer_all = CountVectorizer()
X_bow_all = vectorizer_all.fit_transform(X_all)

# fit knn on the all-data count matrix (cosine distance)
knn_all = NearestNeighbors(metric='cosine')
knn_all.fit(X_bow_all)

# Evaluate on the description set; only the first description of each word
# is used as the query text.
y_pred = []
y_gold = []
for word, defi in tqdm(data_test.items()):  # <--- use desc_200 data here
    query = bow_all.preprocess(list(defi)[0])
    query = vectorizer_all.transform([query])

    # Request n=1000 candidates from the all-data kNN index.
    prediction = search(query, knn_all, words_all, n=1000)

    y_pred.append(prediction)
    y_gold.append(word)

# evaluate() prints acc@1/10/100, median rank and the standard error of the
# mean rank; as the last expression its returned tuple is displayed too.
evaluate(y_pred, y_gold)

100%|██████████| 200/200 [01:18<00:00,  2.54it/s]

acc@1: 0.01
acc@10: 0.04
acc@100: 0.14
median rank: 1000
standard error of mean rank: 26





(0.015, 0.04, 0.14, 1000.0, 26.4048114081506)