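### This notebook demonstrates how to evaluate accuracy

Each section below fits a TF-IDF vectorizer and a cosine nearest-neighbour index on one variant of the training data, queries it with the entries of `data/test_200.json`, and reports acc@1 / acc@10 / acc@100, the median rank, and the standard error of the mean rank.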

In [1]:
from sklearn.neighbors import NearestNeighbors
from TfidfVectorizer import TfidfVectorizer
from tqdm import tqdm
from utils import *

# Test set shared by every evaluation cell below, which iterate it with
# `.items()` as (word, defi) pairs and use the first element of `defi` as the query.
data_test = load_data_from_json('data/test_200.json')

### Data_5d

In [2]:
# Corpus for this run: data/data_5d.json, loaded with use_examples=False.
data = load_data_from_json('data/data_5d.json', clean=True, use_examples=False)

# Fit the project-local TF-IDF vectorizer on the whole corpus.
vectorizer = TfidfVectorizer(
    stop_words=None,
    stemmer=None,
    norm=False,
)
X_tfidf = vectorizer.fit_transform(data)
words = vectorizer.words  # handed to search() together with the fitted index

# Cosine-distance nearest-neighbour index over the TF-IDF matrix.
knn = NearestNeighbors(metric='cosine').fit(X_tfidf)

# One query per test entry: only the first element of each value is used.
y_pred, y_gold = [], []
for word, defi in tqdm(data_test.items()):
    query = vectorizer.transform(list(defi)[0])
    prediction = search(query, knn, words, n=1000)

    y_gold.append(word)
    y_pred.append(prediction)

# Kept as the cell's last expression so the returned metrics tuple is displayed.
evaluate(y_pred, y_gold)

100%|██████████| 103874/103874 [00:03<00:00, 30045.40it/s]
100%|██████████| 95141/95141 [00:23<00:00, 4034.34it/s] 
100%|██████████| 200/200 [00:19<00:00, 10.04it/s]


acc@1: 0.01
acc@10: 0.07
acc@100: 0.29
median rank: 284
standard error of mean rank: 29


(0.015, 0.065, 0.29, 283.5, 28.73878354897785)

### Data_5d with Examples

In [3]:
# Corpus for this run: data/data_5d.json, loaded with use_examples=True.
data = load_data_from_json('data/data_5d.json', clean=True, use_examples=True)

# Fit the project-local TF-IDF vectorizer on the whole corpus.
vectorizer = TfidfVectorizer(
    stop_words=None,
    stemmer=None,
    norm=False,
)
X_tfidf = vectorizer.fit_transform(data)
words = vectorizer.words  # handed to search() together with the fitted index

# Cosine-distance nearest-neighbour index over the TF-IDF matrix.
knn = NearestNeighbors(metric='cosine').fit(X_tfidf)

# One query per test entry: only the first element of each value is used.
y_pred, y_gold = [], []
for word, defi in tqdm(data_test.items()):
    query = vectorizer.transform(list(defi)[0])
    prediction = search(query, knn, words, n=1000)

    y_gold.append(word)
    y_pred.append(prediction)

# Kept as the cell's last expression so the returned metrics tuple is displayed.
evaluate(y_pred, y_gold)

100%|██████████| 103874/103874 [00:03<00:00, 29091.20it/s]
100%|██████████| 96254/96254 [00:24<00:00, 3981.00it/s] 
100%|██████████| 200/200 [00:20<00:00,  9.71it/s]

acc@1: 0.02
acc@10: 0.06
acc@100: 0.28
median rank: 316
standard error of mean rank: 29





(0.02, 0.06, 0.285, 316.5, 28.616604585450037)

### Data_merged

In [4]:
# Corpus for this run: data/data.json (the "merged" dataset of this section).
data = load_data_from_json('data/data.json', clean=True)

# Fit the project-local TF-IDF vectorizer on the whole corpus.
vectorizer = TfidfVectorizer(
    stop_words=None,
    stemmer=None,
    norm=False,
)
X_tfidf = vectorizer.fit_transform(data)
words = vectorizer.words  # handed to search() together with the fitted index

# Cosine-distance nearest-neighbour index over the TF-IDF matrix.
knn = NearestNeighbors(metric='cosine').fit(X_tfidf)

# One query per test entry: only the first element of each value is used.
y_pred, y_gold = [], []
for word, defi in tqdm(data_test.items()):
    query = vectorizer.transform(list(defi)[0])
    prediction = search(query, knn, words, n=1000)

    y_gold.append(word)
    y_pred.append(prediction)

# Kept as the cell's last expression so the returned metrics tuple is displayed.
evaluate(y_pred, y_gold)

100%|██████████| 103874/103874 [00:05<00:00, 19926.14it/s]
100%|██████████| 113086/113086 [00:27<00:00, 4062.69it/s] 
100%|██████████| 200/200 [00:24<00:00,  8.18it/s]

acc@1: 0.01
acc@10: 0.06
acc@100: 0.34
median rank: 218
standard error of mean rank: 27





(0.015, 0.06, 0.34, 217.5, 27.069715493056073)

### Data_merged with Examples

In [5]:
# Corpus for this run: data/data_with_examples.json.
data = load_data_from_json('data/data_with_examples.json', clean=True)

# Fit the project-local TF-IDF vectorizer on the whole corpus.
vectorizer = TfidfVectorizer(
    stop_words=None,
    stemmer=None,
    norm=False,
)
X_tfidf = vectorizer.fit_transform(data)
words = vectorizer.words  # handed to search() together with the fitted index

# Cosine-distance nearest-neighbour index over the TF-IDF matrix.
knn = NearestNeighbors(metric='cosine').fit(X_tfidf)

# One query per test entry: only the first element of each value is used.
y_pred, y_gold = [], []
for word, defi in tqdm(data_test.items()):
    query = vectorizer.transform(list(defi)[0])
    prediction = search(query, knn, words, n=1000)

    y_gold.append(word)
    y_pred.append(prediction)

# Kept as the cell's last expression so the returned metrics tuple is displayed.
evaluate(y_pred, y_gold)

100%|██████████| 103874/103874 [00:04<00:00, 21714.51it/s]
100%|██████████| 117063/117063 [00:31<00:00, 3671.13it/s] 
100%|██████████| 200/200 [00:25<00:00,  7.89it/s]

acc@1: 0.02
acc@10: 0.05
acc@100: 0.33
median rank: 235
standard error of mean rank: 27





(0.02, 0.05, 0.325, 235.0, 26.97302761788153)

### Data_augmented

In [6]:
# Corpus for this run: data/data_with_augmented.json.
data = load_data_from_json('data/data_with_augmented.json', clean=True)

# Fit the project-local TF-IDF vectorizer on the whole corpus.
vectorizer = TfidfVectorizer(
    stop_words=None,
    stemmer=None,
    norm=False,
)
X_tfidf = vectorizer.fit_transform(data)
words = vectorizer.words  # handed to search() together with the fitted index

# Cosine-distance nearest-neighbour index over the TF-IDF matrix.
knn = NearestNeighbors(metric='cosine').fit(X_tfidf)

# One query per test entry: only the first element of each value is used.
y_pred, y_gold = [], []
for word, defi in tqdm(data_test.items()):
    query = vectorizer.transform(list(defi)[0])
    prediction = search(query, knn, words, n=1000)

    y_gold.append(word)
    y_pred.append(prediction)

# Kept as the cell's last expression so the returned metrics tuple is displayed.
evaluate(y_pred, y_gold)

100%|██████████| 103874/103874 [00:49<00:00, 2103.52it/s]
100%|██████████| 125218/125218 [01:37<00:00, 1278.84it/s] 
100%|██████████| 200/200 [01:01<00:00,  3.25it/s]

acc@1: 0.01
acc@10: 0.03
acc@100: 0.23
median rank: 446
standard error of mean rank: 28





(0.005, 0.025, 0.225, 446.0, 27.562574262938504)

### Data_augmented with Examples

In [7]:
# Corpus for this run: data/data_with_augmented_examples.json.
data = load_data_from_json('data/data_with_augmented_examples.json', clean=True)

# Fit the project-local TF-IDF vectorizer on the whole corpus.
vectorizer = TfidfVectorizer(
    stop_words=None,
    stemmer=None,
    norm=False,
)
X_tfidf = vectorizer.fit_transform(data)
words = vectorizer.words  # handed to search() together with the fitted index

# Cosine-distance nearest-neighbour index over the TF-IDF matrix.
knn = NearestNeighbors(metric='cosine').fit(X_tfidf)

# One query per test entry: only the first element of each value is used.
y_pred, y_gold = [], []
for word, defi in tqdm(data_test.items()):
    query = vectorizer.transform(list(defi)[0])
    prediction = search(query, knn, words, n=1000)

    y_gold.append(word)
    y_pred.append(prediction)

# Kept as the cell's last expression so the returned metrics tuple is displayed.
evaluate(y_pred, y_gold)

100%|██████████| 103874/103874 [00:57<00:00, 1795.68it/s]
100%|██████████| 129297/129297 [01:48<00:00, 1194.70it/s] 
100%|██████████| 200/200 [01:12<00:00,  2.76it/s]

acc@1: 0.01
acc@10: 0.02
acc@100: 0.19
median rank: 448
standard error of mean rank: 28





(0.005, 0.02, 0.19, 447.5, 27.65448954509918)