### This notebook demonstrates how to evaluate the accuracy of TF-IDF + nearest-neighbour word retrieval on several dataset variants

In [8]:
from sklearn.neighbors import NearestNeighbors
from TfidfVectorizer import TfidfVectorizer
from tqdm import tqdm
from utils import *

# Load data/test_200.json into `data_test`. None of the evaluation cells
# visible below read it; they all evaluate on `data_unseen_500` instead.
data_test = load_data_from_json('data/test_200.json')

### Data_5d

In [9]:
# Data_5d, loaded with use_examples=False, then split into `data_train`,
# `data_seen_500` and `data_unseen_500`.
data = load_data_from_json('data/data_5d.json', clean=True, use_examples=False)
data_train, data_seen_500, data_unseen_500 = split_seen_unseen(data)

# The TF-IDF vectorizer is fitted on the training split only.
vectorizer = TfidfVectorizer(stop_words=None, stemmer=None, norm=False)
X_tfidf = vectorizer.fit_transform(data_train)
words = vectorizer.words

# Nearest-neighbour index over the training vectors, using the cosine metric.
knn = NearestNeighbors(metric='cosine').fit(X_tfidf)

# Query with one definition per entry of `data_unseen_500`, keeping the search
# output next to the gold word.
y_pred, y_gold = [], []
for word, defi in tqdm(data_unseen_500.items()):
    # NOTE(review): if `defi` is an unordered collection, which definition
    # comes first is not guaranteed -- confirm against load_data_from_json.
    first_definition = list(defi)[0]
    query = vectorizer.transform(first_definition)
    prediction = search(query, knn, words, n=1000)
    y_gold.append(word)
    y_pred.append(prediction)

# Kept as the last expression so the returned metrics tuple is displayed.
evaluate(y_pred, y_gold)

100%|██████████| 103874/103874 [00:07<00:00, 13123.72it/s]
100%|██████████| 95109/95109 [00:23<00:00, 4066.76it/s] 
100%|██████████| 489/489 [00:52<00:00,  9.36it/s]

acc@1: 0.08
acc@10: 0.28
acc@100: 0.54
median rank: 71
standard error of mean rank: 17





(0.081799591002045,
 0.278118609406953,
 0.5398773006134969,
 71.0,
 16.718255260206963)

### Data_5d with Examples

In [10]:
# Data_5d, loaded with use_examples=True, then split into `data_train`,
# `data_seen_500` and `data_unseen_500`.
data = load_data_from_json('data/data_5d.json', clean=True, use_examples=True)
data_train, data_seen_500, data_unseen_500 = split_seen_unseen(data)

# The TF-IDF vectorizer is fitted on the training split only.
vectorizer = TfidfVectorizer(stop_words=None, stemmer=None, norm=False)
X_tfidf = vectorizer.fit_transform(data_train)
words = vectorizer.words

# Nearest-neighbour index over the training vectors, using the cosine metric.
knn = NearestNeighbors(metric='cosine').fit(X_tfidf)

# Query with one definition per entry of `data_unseen_500`, keeping the search
# output next to the gold word.
y_pred, y_gold = [], []
for word, defi in tqdm(data_unseen_500.items()):
    # NOTE(review): if `defi` is an unordered collection, which definition
    # comes first is not guaranteed -- confirm against load_data_from_json.
    first_definition = list(defi)[0]
    query = vectorizer.transform(first_definition)
    prediction = search(query, knn, words, n=1000)
    y_gold.append(word)
    y_pred.append(prediction)

# Kept as the last expression so the returned metrics tuple is displayed.
evaluate(y_pred, y_gold)

100%|██████████| 103874/103874 [00:04<00:00, 23826.92it/s]
100%|██████████| 96224/96224 [00:24<00:00, 3863.51it/s] 
100%|██████████| 489/489 [00:54<00:00,  8.96it/s]

acc@1: 0.09
acc@10: 0.28
acc@100: 0.56
median rank: 62
standard error of mean rank: 17





(0.08997955010224949,
 0.2842535787321063,
 0.5644171779141104,
 62.0,
 17.204774367070524)

### Data_merged

In [11]:
# Merged dataset (data/data.json), split into `data_train`, `data_seen_500`
# and `data_unseen_500`.
data = load_data_from_json('data/data.json', clean=True)
data_train, data_seen_500, data_unseen_500 = split_seen_unseen(data)

# The TF-IDF vectorizer is fitted on the training split only.
vectorizer = TfidfVectorizer(stop_words=None, stemmer=None, norm=False)
X_tfidf = vectorizer.fit_transform(data_train)
words = vectorizer.words

# Nearest-neighbour index over the training vectors, using the cosine metric.
knn = NearestNeighbors(metric='cosine').fit(X_tfidf)

# Query with one definition per entry of `data_unseen_500`, keeping the search
# output next to the gold word.
y_pred, y_gold = [], []
for word, defi in tqdm(data_unseen_500.items()):
    # NOTE(review): if `defi` is an unordered collection, which definition
    # comes first is not guaranteed -- confirm against load_data_from_json.
    first_definition = list(defi)[0]
    query = vectorizer.transform(first_definition)
    prediction = search(query, knn, words, n=1000)
    y_gold.append(word)
    y_pred.append(prediction)

# Kept as the last expression so the returned metrics tuple is displayed.
evaluate(y_pred, y_gold)

100%|██████████| 103874/103874 [00:05<00:00, 19275.28it/s]
100%|██████████| 113059/113059 [00:27<00:00, 4077.40it/s] 
100%|██████████| 480/480 [00:57<00:00,  8.33it/s]

acc@1: 0.1
acc@10: 0.32
acc@100: 0.63
median rank: 34
standard error of mean rank: 17





(0.09583333333333334, 0.3229166666666667, 0.63125, 33.5, 17.00783086565852)

### Data_merged with Examples

In [12]:
# Merged dataset with examples (data/data_with_examples.json), split into
# `data_train`, `data_seen_500` and `data_unseen_500`.
data = load_data_from_json('data/data_with_examples.json', clean=True)
data_train, data_seen_500, data_unseen_500 = split_seen_unseen(data)

# The TF-IDF vectorizer is fitted on the training split only.
vectorizer = TfidfVectorizer(stop_words=None, stemmer=None, norm=False)
X_tfidf = vectorizer.fit_transform(data_train)
words = vectorizer.words

# Nearest-neighbour index over the training vectors, using the cosine metric.
knn = NearestNeighbors(metric='cosine').fit(X_tfidf)

# Query with one definition per entry of `data_unseen_500`, keeping the search
# output next to the gold word.
y_pred, y_gold = [], []
for word, defi in tqdm(data_unseen_500.items()):
    # NOTE(review): if `defi` is an unordered collection, which definition
    # comes first is not guaranteed -- confirm against load_data_from_json.
    first_definition = list(defi)[0]
    query = vectorizer.transform(first_definition)
    prediction = search(query, knn, words, n=1000)
    y_gold.append(word)
    y_pred.append(prediction)

# Kept as the last expression so the returned metrics tuple is displayed.
evaluate(y_pred, y_gold)

100%|██████████| 103874/103874 [00:05<00:00, 17429.15it/s]
100%|██████████| 117039/117039 [00:30<00:00, 3820.83it/s] 
100%|██████████| 478/478 [01:00<00:00,  7.91it/s]

acc@1: 0.1
acc@10: 0.28
acc@100: 0.57
median rank: 62
standard error of mean rank: 17





(0.09832635983263599,
 0.27615062761506276,
 0.5690376569037657,
 62.5,
 17.29531581329862)

### Data_augmented

In [13]:
# Augmented dataset (data/data_with_augmented.json), split into `data_train`,
# `data_seen_500` and `data_unseen_500`.
data = load_data_from_json('data/data_with_augmented.json', clean=True)
data_train, data_seen_500, data_unseen_500 = split_seen_unseen(data)

# The TF-IDF vectorizer is fitted on the training split only.
vectorizer = TfidfVectorizer(stop_words=None, stemmer=None, norm=False)
X_tfidf = vectorizer.fit_transform(data_train)
words = vectorizer.words

# Nearest-neighbour index over the training vectors, using the cosine metric.
knn = NearestNeighbors(metric='cosine').fit(X_tfidf)

# Query with one definition per entry of `data_unseen_500`, keeping the search
# output next to the gold word.
y_pred, y_gold = [], []
for word, defi in tqdm(data_unseen_500.items()):
    # NOTE(review): if `defi` is an unordered collection, which definition
    # comes first is not guaranteed -- confirm against load_data_from_json.
    first_definition = list(defi)[0]
    query = vectorizer.transform(first_definition)
    prediction = search(query, knn, words, n=1000)
    y_gold.append(word)
    y_pred.append(prediction)

# Kept as the last expression so the returned metrics tuple is displayed.
evaluate(y_pred, y_gold)

100%|██████████| 103874/103874 [00:50<00:00, 2046.97it/s]
100%|██████████| 125216/125216 [01:38<00:00, 1269.40it/s] 
100%|██████████| 495/495 [02:32<00:00,  3.26it/s]

acc@1: 0.27
acc@10: 0.56
acc@100: 0.84
median rank: 6
standard error of mean rank: 9





(0.2686868686868687,
 0.5575757575757576,
 0.8383838383838383,
 6.0,
 8.507769948839112)

### Data_augmented with Examples

In [14]:
# Augmented dataset with examples (data/data_with_augmented_examples.json),
# split into `data_train`, `data_seen_500` and `data_unseen_500`.
data = load_data_from_json('data/data_with_augmented_examples.json', clean=True)
data_train, data_seen_500, data_unseen_500 = split_seen_unseen(data)

# The TF-IDF vectorizer is fitted on the training split only.
vectorizer = TfidfVectorizer(stop_words=None, stemmer=None, norm=False)
X_tfidf = vectorizer.fit_transform(data_train)
words = vectorizer.words

# Nearest-neighbour index over the training vectors, using the cosine metric.
knn = NearestNeighbors(metric='cosine').fit(X_tfidf)

# Query with one definition per entry of `data_unseen_500`, keeping the search
# output next to the gold word.
y_pred, y_gold = [], []
for word, defi in tqdm(data_unseen_500.items()):
    # NOTE(review): if `defi` is an unordered collection, which definition
    # comes first is not guaranteed -- confirm against load_data_from_json.
    first_definition = list(defi)[0]
    query = vectorizer.transform(first_definition)
    prediction = search(query, knn, words, n=1000)
    y_gold.append(word)
    y_pred.append(prediction)

# Kept as the last expression so the returned metrics tuple is displayed.
evaluate(y_pred, y_gold)

100%|██████████| 103874/103874 [00:58<00:00, 1783.56it/s]
100%|██████████| 129297/129297 [01:49<00:00, 1175.67it/s] 
100%|██████████| 489/489 [02:55<00:00,  2.79it/s]

acc@1: 0.22
acc@10: 0.5
acc@100: 0.81
median rank: 9
standard error of mean rank: 10





(0.2249488752556237,
 0.5030674846625767,
 0.8098159509202454,
 9.0,
 10.189472655313182)