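### This notebook demonstrates how to evaluate the retrieval accuracy of a TF-IDF + nearest-neighbour model on unseen words across several dataset variants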

In [1]:
from sklearn.neighbors import NearestNeighbors
from TfidfVectorizer import TfidfVectorizer
from tqdm import tqdm
from utils import *

# Load the test file via the project helper (presumably 200 items, per the file name).
# NOTE(review): `data_test` is not referenced by any cell visible in this notebook —
# confirm it is still needed.
data_test = load_data_from_json('data/test_200.json')

### Data_5d

In [2]:
# --- Load the 5d dataset (examples disabled) and split it into train / seen / unseen ---
data = load_data_from_json(
    'data/data_5d.json',
    clean=True,
    use_examples=False,
)
data_train, data_seen_500, data_unseen_500 = split_seen_unseen(data)

# --- TF-IDF features: the vectorizer is fitted on the training split only ---
vectorizer = TfidfVectorizer(stop_words=None, stemmer=None, norm=False)
X_tfidf = vectorizer.fit_transform(data_train)
words = vectorizer.words

# --- Nearest-neighbour index over the training vectors, cosine metric ---
# fit() returns the estimator itself, so construction and fitting are chained.
knn = NearestNeighbors(metric='cosine').fit(X_tfidf)

# --- Query the index with every entry of the unseen split ---
y_pred, y_gold = [], []
for word, defi in tqdm(data_unseen_500.items()):
    # The query text is the first element of the entry's value
    # (presumably its first definition), vectorized with the fitted TF-IDF model.
    query = vectorizer.transform(list(defi)[0])
    prediction = search(query, knn, words, n=1000)

    y_pred.append(prediction)
    y_gold.append(word)

# Kept as the cell's last expression so the returned metrics tuple is displayed.
evaluate(y_pred, y_gold)

100%|██████████| 103874/103874 [00:03<00:00, 31574.02it/s]
100%|██████████| 89656/89656 [00:11<00:00, 7596.22it/s] 
100%|██████████| 95105/95105 [00:22<00:00, 4144.20it/s] 
100%|██████████| 486/486 [01:02<00:00,  7.79it/s]

acc@1: 0.1
acc@10: 0.29
acc@100: 0.57
median rank: 61
standard error of mean rank: 17





(0.102880658436214,
 0.294238683127572,
 0.5720164609053497,
 61.0,
 16.880152235877166)

### Data_5d with Examples

In [3]:
# --- Load the 5d dataset (examples enabled) and split it into train / seen / unseen ---
data = load_data_from_json(
    'data/data_5d.json',
    clean=True,
    use_examples=True,
)
data_train, data_seen_500, data_unseen_500 = split_seen_unseen(data)

# --- TF-IDF features: the vectorizer is fitted on the training split only ---
vectorizer = TfidfVectorizer(stop_words=None, stemmer=None, norm=False)
X_tfidf = vectorizer.fit_transform(data_train)
words = vectorizer.words

# --- Nearest-neighbour index over the training vectors, cosine metric ---
# fit() returns the estimator itself, so construction and fitting are chained.
knn = NearestNeighbors(metric='cosine').fit(X_tfidf)

# --- Query the index with every entry of the unseen split ---
y_pred, y_gold = [], []
for word, defi in tqdm(data_unseen_500.items()):
    # The query text is the first element of the entry's value
    # (presumably its first definition), vectorized with the fitted TF-IDF model.
    query = vectorizer.transform(list(defi)[0])
    prediction = search(query, knn, words, n=1000)

    y_pred.append(prediction)
    y_gold.append(word)

# Kept as the cell's last expression so the returned metrics tuple is displayed.
evaluate(y_pred, y_gold)

100%|██████████| 103874/103874 [00:05<00:00, 20464.17it/s]
100%|██████████| 90995/90995 [00:12<00:00, 7203.82it/s] 
100%|██████████| 96223/96223 [00:23<00:00, 4049.80it/s] 
100%|██████████| 488/488 [00:49<00:00,  9.83it/s]

acc@1: 0.1
acc@10: 0.3
acc@100: 0.57
median rank: 53
standard error of mean rank: 17





(0.09836065573770492,
 0.29508196721311475,
 0.5717213114754098,
 53.0,
 17.213963157183002)

### Data_merged

In [4]:
# --- Load the merged dataset and split it into train / seen / unseen ---
data = load_data_from_json(
    'data/data.json',
    clean=True,
)
data_train, data_seen_500, data_unseen_500 = split_seen_unseen(data)

# --- TF-IDF features: the vectorizer is fitted on the training split only ---
vectorizer = TfidfVectorizer(stop_words=None, stemmer=None, norm=False)
X_tfidf = vectorizer.fit_transform(data_train)
words = vectorizer.words

# --- Nearest-neighbour index over the training vectors, cosine metric ---
# fit() returns the estimator itself, so construction and fitting are chained.
knn = NearestNeighbors(metric='cosine').fit(X_tfidf)

# --- Query the index with every entry of the unseen split ---
y_pred, y_gold = [], []
for word, defi in tqdm(data_unseen_500.items()):
    # The query text is the first element of the entry's value
    # (presumably its first definition), vectorized with the fitted TF-IDF model.
    query = vectorizer.transform(list(defi)[0])
    prediction = search(query, knn, words, n=1000)

    y_pred.append(prediction)
    y_gold.append(word)

# Kept as the cell's last expression so the returned metrics tuple is displayed.
evaluate(y_pred, y_gold)

100%|██████████| 103874/103874 [00:05<00:00, 18255.25it/s]
100%|██████████| 93218/93218 [00:15<00:00, 5848.81it/s] 
100%|██████████| 113050/113050 [00:28<00:00, 3955.37it/s] 
100%|██████████| 480/480 [00:58<00:00,  8.26it/s]

acc@1: 0.13
acc@10: 0.34
acc@100: 0.64
median rank: 44
standard error of mean rank: 15





(0.13125, 0.3416666666666667, 0.6375, 44.5, 15.419463639567038)

### Data_merged with Examples

In [5]:
# --- Load the merged dataset with examples and split it into train / seen / unseen ---
data = load_data_from_json(
    'data/data_with_examples.json',
    clean=True,
)
data_train, data_seen_500, data_unseen_500 = split_seen_unseen(data)

# --- TF-IDF features: the vectorizer is fitted on the training split only ---
vectorizer = TfidfVectorizer(stop_words=None, stemmer=None, norm=False)
X_tfidf = vectorizer.fit_transform(data_train)
words = vectorizer.words

# --- Nearest-neighbour index over the training vectors, cosine metric ---
# fit() returns the estimator itself, so construction and fitting are chained.
knn = NearestNeighbors(metric='cosine').fit(X_tfidf)

# --- Query the index with every entry of the unseen split ---
y_pred, y_gold = [], []
for word, defi in tqdm(data_unseen_500.items()):
    # The query text is the first element of the entry's value
    # (presumably its first definition), vectorized with the fitted TF-IDF model.
    query = vectorizer.transform(list(defi)[0])
    prediction = search(query, knn, words, n=1000)

    y_pred.append(prediction)
    y_gold.append(word)

# Kept as the cell's last expression so the returned metrics tuple is displayed.
evaluate(y_pred, y_gold)

100%|██████████| 103874/103874 [00:05<00:00, 18163.59it/s]
100%|██████████| 94452/94452 [00:16<00:00, 5784.74it/s] 
100%|██████████| 117034/117034 [00:30<00:00, 3861.19it/s] 
100%|██████████| 478/478 [01:02<00:00,  7.65it/s]

acc@1: 0.1
acc@10: 0.27
acc@100: 0.51
median rank: 90
standard error of mean rank: 19





(0.10460251046025104,
 0.26778242677824265,
 0.5125523012552301,
 90.5,
 18.960784146456554)

### Data_augmented

In [6]:
# --- Load the augmented dataset and split it into train / seen / unseen ---
data = load_data_from_json(
    'data/data_with_augmented.json',
    clean=True,
)
data_train, data_seen_500, data_unseen_500 = split_seen_unseen(data)

# --- TF-IDF features: the vectorizer is fitted on the training split only ---
vectorizer = TfidfVectorizer(stop_words=None, stemmer=None, norm=False)
X_tfidf = vectorizer.fit_transform(data_train)
words = vectorizer.words

# --- Nearest-neighbour index over the training vectors, cosine metric ---
# fit() returns the estimator itself, so construction and fitting are chained.
knn = NearestNeighbors(metric='cosine').fit(X_tfidf)

# --- Query the index with every entry of the unseen split ---
y_pred, y_gold = [], []
for word, defi in tqdm(data_unseen_500.items()):
    # The query text is the first element of the entry's value
    # (presumably its first definition), vectorized with the fitted TF-IDF model.
    query = vectorizer.transform(list(defi)[0])
    prediction = search(query, knn, words, n=1000)

    y_pred.append(prediction)
    y_gold.append(word)

# Kept as the cell's last expression so the returned metrics tuple is displayed.
evaluate(y_pred, y_gold)

100%|██████████| 103874/103874 [00:50<00:00, 2054.64it/s]
100%|██████████| 97369/97369 [02:46<00:00, 583.85it/s] 
100%|██████████| 125217/125217 [01:38<00:00, 1276.86it/s] 
100%|██████████| 492/492 [02:30<00:00,  3.26it/s]

acc@1: 0.23
acc@10: 0.5
acc@100: 0.81
median rank: 10
standard error of mean rank: 10





(0.23373983739837398,
 0.4959349593495935,
 0.8089430894308943,
 10.0,
 9.64732386671763)

### Data_augmented with Examples

In [7]:
# --- Load the augmented dataset with examples and split it into train / seen / unseen ---
data = load_data_from_json(
    'data/data_with_augmented_examples.json',
    clean=True,
)
data_train, data_seen_500, data_unseen_500 = split_seen_unseen(data)

# --- TF-IDF features: the vectorizer is fitted on the training split only ---
vectorizer = TfidfVectorizer(stop_words=None, stemmer=None, norm=False)
X_tfidf = vectorizer.fit_transform(data_train)
words = vectorizer.words

# --- Nearest-neighbour index over the training vectors, cosine metric ---
# fit() returns the estimator itself, so construction and fitting are chained.
knn = NearestNeighbors(metric='cosine').fit(X_tfidf)

# --- Query the index with every entry of the unseen split ---
y_pred, y_gold = [], []
for word, defi in tqdm(data_unseen_500.items()):
    # The query text is the first element of the entry's value
    # (presumably its first definition), vectorized with the fitted TF-IDF model.
    query = vectorizer.transform(list(defi)[0])
    prediction = search(query, knn, words, n=1000)

    y_pred.append(prediction)
    y_gold.append(word)

# Kept as the cell's last expression so the returned metrics tuple is displayed.
evaluate(y_pred, y_gold)

100%|██████████| 103874/103874 [00:58<00:00, 1778.64it/s]
100%|██████████| 98481/98481 [03:04<00:00, 535.14it/s] 
100%|██████████| 129297/129297 [01:53<00:00, 1143.36it/s] 
100%|██████████| 496/496 [03:01<00:00,  2.73it/s]

acc@1: 0.25
acc@10: 0.51
acc@100: 0.77
median rank: 8
standard error of mean rank: 10





(0.2540322580645161,
 0.5100806451612904,
 0.7701612903225806,
 8.0,
 9.746020237446324)