# Train index

Trains a FAISS index on the passage representations in order to compress it.

In [1]:
# plotly standard imports
import plotly.graph_objs as go
import chart_studio.plotly as py

# Cufflinks wrapper on plotly
import cufflinks

# Data science imports
import pandas as pd
import numpy as np

# Options for pandas: display up to 30 columns of a DataFrame
pd.options.display.max_columns = 30

# Display all cell outputs (every expression statement in a cell, not only the last one)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Put cufflinks and plotly into offline notebook mode
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of
# embedding it in the notebook — confirm
from plotly.offline import iplot, init_notebook_mode
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)

# Set global theme
# NOTE(review): none of the visible cells draws a plot, so this plotting setup
# may be unnecessary here — confirm before removing
cufflinks.set_config_file(world_readable=True, theme='pearl')

## Load simple index

Load the simple index (the raw passage representations) without compression

In [2]:
import numpy as np

# File with the uncompressed passage representations
filename = './indexes/wiki40b_passages_reps_32_l-8_h-768_b-512-512.dat'

# Layout of the file: num_rows vectors with batch_size float32 values each.
# Despite its name, batch_size is the second dimension of the mapped array
# (the length of one vector), not a number of samples per batch.
num_rows = 17_553_713
batch_size = 128

# Open the file as a read-only memory map
passages_reps = np.memmap(filename, mode='r', dtype='float32', shape=(num_rows, batch_size))

# Train index with compression

In [3]:
import faiss

# Explanation https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
D = batch_size  # vector dimension passed to the index constructors (128)
M = D // 4  # only referenced by the commented-out OPQ/PQ factory strings below
K = 262144  # value interpolated after "IVF" in the factory strings (number of inverted lists)
n_bits = 2 * D  # only referenced by the commented-out IndexLSH alternative below

# Active configuration: IVF with K lists followed by a PQ128 product quantizer.
# NOTE(review): no metric argument is passed to index_factory, while the uncompressed
# alternative below is IndexFlatIP (inner product) — confirm which metric is intended.
index = faiss.index_factory(D, f'IVF{K},PQ128')

# Alternatives that were tried; uncomment exactly one to replace the index above.
# index = faiss.index_factory(D, f'IVF{K},PQ32') # high compress
# index = faiss.index_factory(D, f'IVF{K},SQfp16') # compress two times
# index = faiss.index_factory(D, 'PCA64,Flat')
# index = faiss.index_factory(D, 'PCAR64,Flat')
# index = faiss.IndexLSH(D, n_bits) # hashing, high compression
# index = faiss.IndexFlatIP(batch_size) # no compression
# index = faiss.IndexScalarQuantizer(D, getattr(faiss.ScalarQuantizer, 'QT_fp16')) # minimal compression
# index = faiss.index_factory(batch_size, f'OPQ{M}_{D},IVF{K}_HNSW32,PQ{M}') # High compression, speed 0.017760442368741334 p/s
# index = faiss.index_factory(batch_size, f'OPQ{M}_{D},IVF{K}_HNSW32,PQ{M}x4fsr') # Low compression, speed 0.016993699426944037 p/s

# index = faiss.IndexBinaryFlat(D)
# if binary
# (the first dimension must be a row count; the earlier draft passed the array itself)
# passages_reps = np.empty((len(passages_reps), D // 8), dtype='uint8')



In [4]:
# Destination path for the index written at the end of the notebook.
# The name repeats values set in other cells (IVF262144 = K, PQ128 from the factory
# string, 17553713 = num_rows, nprobe 2048) — keep it in sync by hand when those change.
save_index_filename = './indexes/IVF262144_PQ128_wiki40b_num_17553713_nprobe_2048.index'

In [5]:
# Sanity check: number of rows in the memory-mapped array (expected to equal num_rows)
len(passages_reps)

17553713

In [6]:
# Training state of the index; the training cells below only run while this is False
index.is_trained

False

In [7]:
# Set nprobe only on indexes that actually have it.
# Comparing `index.nprobe` with None never worked as a guard: on an index type without
# that attribute (e.g. the flat / LSH / scalar-quantizer alternatives tried above) the
# attribute read itself raises AttributeError. hasattr() skips those indexes instead.
if hasattr(index, 'nprobe'):
    index.nprobe = 2048
    print('Nprobe', index.nprobe)

Nprobe 2048


In [8]:
# if not index.is_trained:
#     faiss_res = faiss.StandardGpuResources()

#     index = faiss.index_cpu_to_gpu(faiss_res, 0, index)

#     params = faiss.GpuParameterSpace()
#     params.initialize(index)

In [9]:
# Select the vectors used for training; skipped when the index is already trained.
if not index.is_trained:
    # 100 * K = 26,214,400 is larger than the 17,553,713 available rows, so the slice
    # below ends up covering the whole array (see the length printed under this cell).
    train_size = 100 * K
    train_reps = passages_reps[:train_size]
    # NOTE(review): this nested expression is presumably displayed only because
    # ast_node_interactivity is set to 'all' in the first cell — confirm
    len(train_reps)

17553713

In [10]:
import time
from datetime import datetime

# Train the index on train_reps (skipped when it is already trained) and report the
# elapsed time in seconds and hours plus the throughput in vectors per second.
if not index.is_trained:
    # The wall clock is used only for the human-readable start message.
    print('Started at', datetime.now())
    print('Will train', len(train_reps))

    # Measure the duration with a monotonic clock: time.time() follows the system
    # clock, which can be adjusted while a run that lasts hours is in progress.
    start = time.perf_counter()

    index.train(train_reps)

    end = time.perf_counter()
    seconds = end - start
    print('Training took', seconds, 'seconds')
    print('Hours', seconds / 60 / 60)

    speed = len(train_reps) / seconds
    print('Speed', speed, 'p/s')

#     opi = params.explore(index, xq, crit)
#     opi.display()

Started at 2021-08-22 20:35:21.162023
Will train 17553713


KeyboardInterrupt: 

In [None]:
# Add every passage representation to the index.
# NOTE(review): the recorded output of the training cell ends in KeyboardInterrupt, so on
# that run the index was presumably still untrained at this point — confirm that training
# has finished before running this cell.
index.add(passages_reps)

In [None]:
# index = faiss.index_gpu_to_cpu(index)

In [None]:
# Write the index to save_index_filename (a path under ./indexes/)
faiss.write_index(index, save_index_filename)

In [None]:
# Show the nprobe value currently set on the index
index.nprobe