# Train index

Trains a FAISS index on the passage embeddings so that the index can be stored in compressed form.

In [1]:
# Notebook-wide setup: plotting libraries, data-science imports and display options.

# plotly standard imports
import plotly.graph_objs as go
import chart_studio.plotly as py

# Cufflinks wrapper on plotly
import cufflinks

# Data science imports
import pandas as pd
import numpy as np

# Options for pandas: display at most 30 columns of a frame
pd.options.display.max_columns = 30

# Display all cell outputs (every expression statement in a cell, not only the last one)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Switch cufflinks and plotly to offline rendering inside the notebook
from plotly.offline import iplot, init_notebook_mode
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)

# Set global theme
# NOTE(review): world_readable=True presumably marks charts as public if they are
# ever uploaded to Chart Studio — confirm this is intended.
cufflinks.set_config_file(world_readable=True, theme='pearl')

## Save simple index

Build a simple flat index without compression and save it to disk.

In [2]:
import numpy as np

# Layout of the embedding matrix stored on disk: one row per passage.
num_rows = 17_553_713
# NOTE(review): despite its name, `batch_size` is used here as the second
# dimension of the matrix (the width of each stored vector), not as a batch size.
batch_size = 128

filename = './indexes/wiki40b_passages_reps_32_l-8_h-768_b-512-512.dat'

# Map the file read-only instead of loading it, so the rows are paged in on demand.
reps_shape = (num_rows, batch_size)
passages_reps = np.memmap(filename, dtype='float32', mode='r', shape=reps_shape)

In [6]:
import faiss

# Uncompressed ("flat") inner-product index; `batch_size` is passed as the
# vector dimensionality expected by the index.
index = faiss.IndexFlatIP(batch_size)

In [7]:
# Add every passage vector from the memory-mapped matrix to the index.
index.add(passages_reps)

In [9]:
# Persist the uncompressed index to disk.
faiss.write_index(index, './indexes/flat_wiki40b_num_17553713_vector_128_no_compress.index')

## Train index with compression

In [3]:
import faiss

# Explanation https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
#
# The factory string must contain integer sizes. Plain `/` produces floats, which
# would render as 'OPQ16.0_64.0,...,PQ16.0' instead of 'OPQ16_64,...,PQ16', so
# integer division is used throughout.
D = batch_size // 2  # output dimension of the OPQ transform (half the input width)
M = D // 4           # number of PQ sub-quantizers / OPQ sub-vectors
K = 262144           # number of IVF lists (2**18)
index = faiss.index_factory(batch_size, f'OPQ{M}_{D},IVF{K}_HNSW32,PQ{M}') # High compression, speed 0.017760442368741334 p/s
# index = faiss.index_factory(batch_size, f'OPQ{M}_{D},IVF{K}_HNSW32,PQ{M}x4fsr') # Low compression, speed 0.016993699426944037 p/s

In [4]:
# Sanity check: number of rows (passage vectors) in the memory-mapped matrix.
len(passages_reps)

17553713

In [5]:
# Move the index onto a GPU. The resources object is kept in a notebook-level
# variable so it stays referenced while the GPU index is in use.
gpu_device = 0
faiss_res = faiss.StandardGpuResources()
index = faiss.index_cpu_to_gpu(faiss_res, gpu_device, index)

In [7]:
# Take at most 100 vectors per K as the training sample. The slice is simply
# clipped to the available rows when the matrix is shorter than that.
train_size = K * 100
train_reps = passages_reps[0:train_size]
train_reps.shape[0]

17553713

Training started at 21.08.2021 16:45.

In [None]:
import time
from datetime import datetime

# Wall-clock timestamps; `start` and `end` are reused by the reporting cell below.
start = time.time()
print('Started at', datetime.fromtimestamp(start))
print('Will train', len(train_reps))

# Train on the prepared sample, so the size printed above and the speed computed
# afterwards describe the data that was actually used for training.
index.train(train_reps)

end = time.time()
print('Took seconds:', end - start)

Started at 2021-08-21 13:45:34.781853
Will train 17553713


In [None]:
# Summarise the training run using the timestamps captured by the training cell.
seconds = end - start
hours = seconds / 60 / 60
print('Training took', seconds, 'seconds')
print('Hours', hours)

# Throughput in training vectors per second.
speed = len(train_reps) / seconds
print('Speed', speed, 'p/s')

In [None]:
# Add every passage vector to the trained index.
index.add(passages_reps)

In [None]:
# Copy the index back from the GPU to the CPU
# (presumably required before it can be written to disk — confirm).
index = faiss.index_gpu_to_cpu(index)

In [None]:
# Persist the trained, populated index to disk.
faiss.write_index(index, './indexes/ivf_wiki40b_num_17553713_vector_128_high_compress_17M_train.index')