In [1]:
from IPython.display import clear_output 
!pip install nmslib spacy tqdm
!python -m spacy download en_core_web_md
clear_output()

# Generate some vectors 

Download [Jeopardy dataset](https://github.com/hsu-ai-course/hsu.ai/blob/master/code/datasets/nlp/JEOPARDY_CSV.zip). Read it as CSV. Embed `Question`s with spacy model.

In [2]:
MAX = 30000
DATASET_FILENAME = '../datasets/JEOPARDY_CSV.csv'

In [3]:
import csv
import numpy as np

strings = []

with open(DATASET_FILENAME, 'r', encoding='utf8') as f:
    dr = csv.DictReader(f, delimiter=',')
    for i, line in enumerate(dr):
        if i == MAX: break
        strings.append(line)

In [4]:
import spacy
import tqdm
import pickle
import os

nlp = spacy.load('en_core_web_md')
vectors = np.zeros((MAX, 300))

if os.path.exists('vectors'):
    with open('vectors', 'rb') as f:
        vectors = pickle.load(f)
else:
    for i, line in enumerate(tqdm.tqdm(strings)):
        # TODO your code here to find `Question` embedding and write it to i'th string of the matrix
        vectors[i, :] =  ...
    with open('vectors', 'wb') as f:
        pickle.dump(vectors, f)

In [5]:
print("Vectors:", vectors.shape, "| strings:", len(strings))

Vectors: (30000, 300) | strings: 30000


# Index construction

- index methods: `'nsw'`, `'hnsw'` and [others](https://github.com/nmslib/nmslib/blob/master/manual/methods.md). We will use `hnsw` and `brute_force`.
- space: `'cosinesimil'` and [many others](https://github.com/nmslib/nmslib/blob/master/manual/spaces.md). We will use `cosinesimil`.
- `post` parameters of `createIndex()`: optimization after HNSW built. [Read more here](https://github.com/nmslib/nmslib/blob/master/manual/methods.md#graph-based-search-methods-sw-graph-and-hnsw).
- `ef`: the size of the dynamic list for the nearest neighbors (used during the search). `efConstruction` improves the quality of a constructed graph and leads to higher accuracy of search, also leads to longer indexing times. Typically `ef=100..2000`. Similarly, increasing the value of `efSearch` improves recall at the expense of longer retrieval time.
- `M`: defines the maximum number of neighbors. The reasonable range of values for these parameters is `5..100`.

Here implement `find_for_vector` and `find_for_batch` methods according to the quick start documentation.

In [6]:
import nmslib

def get_index(
        vectors, 
        method='hnsw', 
        space='cosinesimil', 
        postprocessing_optimization_level=2,
        efConstruction=100,
        efSearch=100,
        M=16,
        cutoff=None
):
    import gc
    gc.collect()
    
    if cutoff is not None:
        vectors = vectors[:cutoff]
    
    index = nmslib.init(method=method, space=space, space_params = {
        "efConstruction": efConstruction,
        "efSearch": efSearch,
        "M": M,
    })
    index.addDataPointBatch(vectors)
    if method == 'hnsw':
        # this parameter appies to HNSW only
        index.createIndex({'post': postprocessing_optimization_level}, print_progress=True)
    else:
        index.createIndex(print_progress=True)
    return index


# should return the list of ids, sorted by dist from closest
def find_for_vector(index, vector, nn) -> list:
    # TODO your code here
    ...

# should return list of lists of indices. Outer list is len(batch), inner lists are `nn`
def find_for_batch(index, batch, nn) -> list:
    # TODO your code here
    ...

## Construct buckets with ground truth

In [7]:
bucket = ['opera', 'math', 'geography', 'films', 'cats', 'zebra', 
          'cooking', 'armory', 'war', 'history', 'money', 'books', 
          'quantum physics', 'snakes', 'sea', 'plants', 'chemistry', 'stars', 'science fiction',
          'languages', 'clothes']
bucket_vectors = [nlp(t).vector for t in bucket]
bucket += [s['Question'] for s in strings[150:14000:7]]
bucket_vectors += [v for v in vectors[150:14000:7]]
print(len(bucket))

2000


## Let's collect real nearst neighbours with flat index (no index)

In [8]:
# dataset sizes
DS = [2000, 5000, 10000, 20000, 30000]
# M values
MS = [8, 16, 32, 64]
# ef values
EFS = [100, 200, 400]

In [9]:
flats = {}
for cutoff in DS:
    flats[cutoff] = get_index(vectors, method='brute_force', cutoff=cutoff)

In [10]:
ground_truth = {}
for cutoff in DS:
    bucket_results = find_for_batch(flats[cutoff], bucket_vectors, nn=1000)
    ground_truth[cutoff] = bucket_results

## Compute recall @N

Implement recall@N computation. Discuss the values.

In [11]:
def recall(sorted_result : list, sorted_ground_truth: list, N: int) -> float:
    # your code here

In [12]:
for b in ground_truth[10000][:5]:
    print(recall(b, b, 700), recall(b[:500], b, 700), recall(b[:350], b, 700), recall([], b, 700))

1.0 0.7142857142857143 0.5 0.0
1.0 0.7142857142857143 0.5 0.0
1.0 0.7142857142857143 0.5 0.0
1.0 0.7142857142857143 0.5 0.0
1.0 0.7142857142857143 0.5 0.0


Disciss the values.
- Why do you see these numbers? 
- Why are they equal?

# Test HNSW

Create an index with default settings and assess its recall numbers.

In [13]:
hnsw = get_index(vectors, method='hnsw')

In [14]:
vector_id = 5
vector = bucket_vectors[vector_id]
reference = ground_truth[DS[-1]][vector_id]

for n in [100, 1000, 2000]:
    ids = find_for_vector(hnsw, bucket_vectors[5], nn=n)
    print(f"recall@{n} =", recall(ids, ground_truth[DS[-1]][5], n))

recall@100 = 0.58
recall@1000 = 0.168
recall@2000 = 0.084


Why recall drops with the growth of N?

In [15]:
import time

def test(index, bucket_vectors, nn=100):
    start = time.time()
    result = find_for_batch(index, bucket_vectors, nn=nn)
    return time.time() - start, result

# Assessment

Let's vary parameters of index to see how it influences the following properties:
- construction time.
- search time (for a bucket in total).
- `recall@100`, `recall@1000`, and `recall@100` for 1000 results.

What do we vary? Dataset size `D`, `M`, and `ef`.

In [16]:
from itertools import product
import tqdm
import time

# full combination of parameters for grid search
hypers = list(product(MS, DS, EFS))
result = []

for M, D, ef in tqdm.tqdm(hypers):
    start = time.time()
    
    # todo write here the code to construct index object given provided hyperparameters
    I = get_index( ... )
    
    build_time = time.time() - start
    
    t100, ids100 = test(I, bucket_vectors, nn=100)
    t1000, ids1000 = test(I, bucket_vectors, nn=1000)
    r100, r1000, r100_ = 0., 0., 0.
    
    for i, r in enumerate(ground_truth[D]):
        r100 += recall(ids100[i], r, 100)
        r1000 += recall(ids1000[i], r, 1000)
        r100_ += recall(ids1000[i], r, 100)
    
    r100 /= len(bucket_results)
    r1000 /= len(bucket_results)
    r100_ /= len(bucket_results) 
    
    result.append(
        [M, D, ef, build_time, t100, t1000, r100, r1000, r100_]
    )

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 60/60 [12:09<00:00, 12.15s/it]


In [17]:
import pandas as pd
headers = ["M", "D", "ef", "built_time", "search 100", "search 1000", "recall@100", "recall@1000", "recall@100*"]
d = {}
for i, h in enumerate(headers):
    d[headers[i]] = [r[i] for r in result]

df = pd.DataFrame(d)
df.to_pickle('results.pkl')
df

Unnamed: 0,M,D,ef,built_time,search 100,search 1000,recall@100,recall@1000,recall@100*
0,8,2000,100,0.536024,0.072502,0.138012,0.88118,0.307296,0.88118
1,8,2000,200,0.522583,0.070411,0.140841,0.88047,0.306122,0.88047
2,8,2000,400,0.5376,0.069945,0.146034,0.881765,0.312111,0.881765
3,8,5000,100,1.778082,0.111281,0.199486,0.84259,0.319985,0.84259
4,8,5000,200,1.756466,0.122839,0.193428,0.841125,0.32149,0.841125
5,8,5000,400,1.771538,0.120865,0.213542,0.84282,0.324111,0.84282
6,8,10000,100,5.13015,0.151198,0.243996,0.813515,0.313156,0.813515
7,8,10000,200,6.657881,0.181639,0.348991,0.814,0.315297,0.814
8,8,10000,400,6.492734,0.181638,0.260796,0.81376,0.315052,0.81376
9,8,20000,100,21.512939,0.230073,0.336297,0.786395,0.303318,0.786395


Discuss, how parameters (using the table) affect perfromance?

## Load results.

(for some reason `pyplot` does not survive together with `nmslib` in Windows. So we save and load)

Lets display them now!

Discuss the graphs.

In [3]:
import pandas as pd
headers = ["M", "D", "ef", "built_time", "search 100", "search 1000", "recall@100", "recall@1000", "recall@100*"]
DS = [2000, 5000, 10000, 20000, 30000]
# M values
MS = [8, 16, 32, 64]
# ef values
EFS = [100, 200, 400]
df = pd.read_pickle('results.pkl')

In [None]:
%matplotlib inline 
import matplotlib.pyplot as plt

fig, ax = plt.subplots(len(headers) - 3, sharex=True, figsize=(15, 30))

for j in range(len(MS)):
    for k in range(3, len(headers)):
        subdata = df[(df['M'] == MS[j]) & (df['ef'] == EFS[0])]
        ax[k-3].plot(subdata['D'], subdata[headers[k]], label=f"M={MS[j]}")
        ax[k-3].set_title(f"{headers[k]}")
        ax[k-3].legend()
        
plt.tight_layout()
plt.show()