In [1]:
import os
import sys
sys.path.insert(0, '../')

from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert.data import Queries, Collection
from colbert import Indexer, Searcher

In [2]:
fld_pth = "/mnt/vdb1/Datasets/18_categories/metrics_data"
models = '/mnt/vdb1/Datasets/18_categories/metrics_data/models.tsv'
offers = '/mnt/vdb1/Datasets/18_categories/metrics_data/offers.tsv'

offers = Queries(path=offers)
models = Collection(path=models)

f'Loaded {len(offers)} queries and {len(models):,} passages'

[Jan 09, 13:46:05] #> Loading the queries from /mnt/vdb1/Datasets/18_categories/metrics_data/offers.tsv ...
[Jan 09, 13:46:05] #> Got 135896 queries. All QIDs are unique.

[Jan 09, 13:46:05] #> Loading collection...
0M 


'Loaded 135896 queries and 103,214 passages'

## Indexing

For efficient search, we can pre-compute the ColBERT representation of each passage and index them.

Below, the `Indexer` take a model checkpoint and writes a (compressed) index to disk. We then prepare a `Searcher` for retrieval from this index.

(With four Titan V GPUs, indexing should take about 13 minutes. The output is fairly long/ugly at the moment!)

In [3]:
nbits = 2   # encode each dimension with 2 bits
doc_maxlen = 300   # truncate passages at 300 tokens

# checkpoint = 'downloads/colbertv2.0'
# checkpoint = "/mnt/vdb1/ColBERT/experiments/18_categories/none/2023-12/21/15.54.25/checkpoints/colbert"
# checkpoint = "/mnt/vdb1/ColBERT/experiments/triples_X3/none/2023-12/29/08.33.53/checkpoints/colbert"
# checkpoint = "/mnt/vdb1/ColBERT/experiments/triples_X1/none/2023-12/29/14.52.59/checkpoints/colbert"
checkpoint = "/mnt/vdb1/ColBERT/experiments/triples_X1_5epochs/none/2024-01/09/12.18.56/checkpoints/colbert"
index_name = f'models.18_categories.{nbits}bits'

with Run().context(RunConfig(nranks=1, experiment='notebook')):  # nranks specifies the number of GPUs to use.
    config = ColBERTConfig(doc_maxlen=doc_maxlen, nbits=nbits)

    indexer = Indexer(checkpoint=checkpoint, config=config)
    indexer.index(name=index_name, collection=models, overwrite=True)
indexer.get_index() # You can get the absolute path of the index, if needed.



[Jan 09, 13:46:05] #> Note: Output directory /mnt/vdb1/ColBERT/docs/experiments/notebook/indexes/models.18_categories.2bits already exists


[Jan 09, 13:46:05] #> Will delete 26 files already at /mnt/vdb1/ColBERT/docs/experiments/notebook/indexes/models.18_categories.2bits in 20 seconds...
#> Starting...
nranks = 1 	 num_gpus = 1 	 device=0
{
    "query_token_id": "[unused0]",
    "doc_token_id": "[unused1]",
    "query_token": "[Q]",
    "doc_token": "[D]",
    "ncells": null,
    "centroid_score_threshold": null,
    "ndocs": null,
    "load_index_with_mmap": false,
    "index_path": null,
    "nbits": 2,
    "kmeans_niters": 4,
    "resume": false,
    "similarity": "cosine",
    "bsize": 64,
    "accumsteps": 1,
    "lr": 1e-5,
    "maxsteps": 50000,
    "save_every": 1000,
    "warmup": null,
    "warmup_bert": null,
    "relu": false,
    "nway": 2,
    "use_ib_negatives": false,
    "reranker": false,
    "distillation_alpha": 1.0,
    "ignore_scores": false,
    "model_name":

0it [00:00, ?it/s]

[Jan 09, 13:47:27] [0] 		 #> Saving chunk 0: 	 25,000 passages and 842,718 embeddings. From #0 onward.


1it [00:17, 17.30s/it]

[Jan 09, 13:47:27] [0] 		 #> Encoding 25000 passages..
[Jan 09, 13:47:40] [0] 		 #> Saving chunk 1: 	 25,000 passages and 550,405 embeddings. From #25,000 onward.
[Jan 09, 13:47:40] [0] 		 #> Encoding 25000 passages..


2it [00:30, 14.97s/it]

[Jan 09, 13:47:49] [0] 		 #> Saving chunk 2: 	 25,000 passages and 254,364 embeddings. From #50,000 onward.
[Jan 09, 13:47:49] [0] 		 #> Encoding 25000 passages..


3it [00:39, 12.04s/it]

[Jan 09, 13:48:01] [0] 		 #> Saving chunk 3: 	 25,000 passages and 642,034 embeddings. From #75,000 onward.
[Jan 09, 13:48:02] [0] 		 #> Encoding 3214 passages..


4it [00:51, 12.23s/it]

[Jan 09, 13:48:03] [0] 		 #> Saving chunk 4: 	 3,214 passages and 27,297 embeddings. From #100,000 onward.
[Jan 09, 13:48:03] [0] 		 #> Checking all files were saved...
[Jan 09, 13:48:03] [0] 		 Found all files!
[Jan 09, 13:48:03] [0] 		 #> Building IVF...
[Jan 09, 13:48:03] [0] 		 #> Loading codes...
[Jan 09, 13:48:03] [0] 		 Sorting codes...


5it [00:52, 10.54s/it]
100%|██████████| 5/5 [00:00<00:00, 815.44it/s]


[Jan 09, 13:48:03] [0] 		 Getting unique codes...
[Jan 09, 13:48:03] #> Optimizing IVF to store map from centroids to list of pids..
[Jan 09, 13:48:03] #> Building the emb2pid mapping..
[Jan 09, 13:48:03] len(emb2pid) = 2316818


100%|██████████| 16384/16384 [00:00<00:00, 28386.78it/s]


[Jan 09, 13:48:04] #> Saved optimized IVF to /mnt/vdb1/ColBERT/docs/experiments/notebook/indexes/models.18_categories.2bits/ivf.pid.pt
[Jan 09, 13:48:04] [0] 		 #> Saving the indexing metadata to /mnt/vdb1/ColBERT/docs/experiments/notebook/indexes/models.18_categories.2bits/metadata.json ..
#> Joined...


'/mnt/vdb1/ColBERT/docs/experiments/notebook/indexes/models.18_categories.2bits'

## Search

Having built the index and prepared our `searcher`, we can search for individual query strings.

We can use the `queries` set we loaded earlier — or you can supply your own questions. Feel free to get creative! But keep in mind this set of ~300k lifestyle passages can only answer a small, focused set of questions!

In [4]:
with Run().context(RunConfig(experiment='notebook')):
    searcher = Searcher(index=index_name)

[Jan 09, 13:48:05] #> Loading collection...
0M 
[Jan 09, 13:48:08] #> Loading codec...
[Jan 09, 13:48:08] Loading decompress_residuals_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
[Jan 09, 13:48:09] Loading packbits_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
[Jan 09, 13:48:09] #> Loading IVF...
[Jan 09, 13:48:09] #> Loading doclens...


100%|██████████| 5/5 [00:00<00:00, 944.41it/s]

[Jan 09, 13:48:09] #> Loading codes and residuals...



100%|██████████| 5/5 [00:00<00:00, 70.97it/s]


In [5]:
offers[37]

'Смартфон Samsung Galaxy Z Flip3 F711B 128Gb Violet'

In [6]:
query = offers[37]   # or supply your own query

# Find the top-3 passages for this query
results = searcher.search(query, k=5)

# Print out the top-k retrieved passages
for passage_id, passage_rank, passage_score in zip(*results):
    print(f"\t [{passage_rank}] \t\t {passage_score:.1f} \t\t {searcher.collection[passage_id]}")


#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . Смартфон Samsung Galaxy Z Flip3 F711B 128Gb Violet, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  1196, 29745, 10260, 16856, 22919, 29749, 14150, 18947,
        19102,  9088,  1062, 11238,  2509,  1042,  2581, 14526,  2497, 11899,
        18259,  8766,   102,   103,   103,   103,   103,   103,   103,   103,
          103,   103])
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0])



	 [1] 		 18.8 		 Samsung Galaxy Z Flip3 5G 128Gb
	 [2] 		 18.7 		 Samsung Galaxy Tab S7 11 128Gb
	 [3] 		 18.7 		 Samsung Galaxy F13 128Gb
	 [4] 		 18.7 		 Samsung Galaxy A13 128Gb
	 [5] 		 18.2 		 Samsung Galaxy Z Flip4 128Gb


## Batch Search

In many applications, you have a large batch of queries and you need to maximize the overall throughput. For that, you can use the `searcher.search_all(queries, k)` method, which returns a `Ranking` object that organizes the results across all queries.

(Batching provides many opportunities for higher-throughput search, though we have not implemented most of those optimizations for compressed indexes yet.)

In [7]:
import time
start_time = time.time()
rankings = searcher.search_all(offers, k=5).todict()
print(f"time_spent = {time.time() - start_time}")

135896it [19:20, 117.09it/s]


time_spent = 1197.1275715827942


In [8]:
rankings

{0: [(102118, 1, 23.703125),
  (73325, 2, 23.6875),
  (42113, 3, 17.90625),
  (10713, 4, 17.78125),
  (4930, 5, 17.75)],
 1: [(53818, 1, 16.390625),
  (53668, 2, 16.328125),
  (56936, 3, 16.265625),
  (56922, 4, 16.234375),
  (53684, 5, 16.203125)],
 2: [(18044, 1, 19.015625),
  (12570, 2, 18.953125),
  (12384, 3, 18.953125),
  (15813, 4, 18.5625),
  (8493, 5, 18.546875)],
 3: [(74426, 1, 23.125),
  (77648, 2, 22.5),
  (75973, 3, 22.453125),
  (76344, 4, 22.34375),
  (76343, 5, 22.296875)],
 4: [(49481, 1, 26.421875),
  (45541, 2, 25.234375),
  (49504, 3, 23.625),
  (49341, 4, 22.71875),
  (49416, 5, 22.328125)],
 5: [(73253, 1, 26.796875),
  (102047, 2, 26.796875),
  (72230, 3, 24.59375),
  (101058, 4, 24.578125),
  (72214, 5, 23.734375)],
 6: [(38204, 1, 24.5),
  (98644, 2, 18.59375),
  (64285, 3, 18.1875),
  (70368, 4, 18.171875),
  (38060, 5, 18.0)],
 7: [(98752, 1, 17.3125),
  (88612, 2, 16.5),
  (92617, 3, 16.375),
  (84646, 4, 16.171875),
  (90303, 5, 16.109375)],
 8: [(73339, 1

In [9]:
searcher.collection[27946]

'Hoco Задняя накладка для iP 13 mini (5.4) Light силикон притемненная'

In [10]:
rankings[30]  # For query 30, a list of (passage_id, rank, score) for the top-k passages

[(89986, 1, 28.171875),
 (90749, 2, 26.765625),
 (89884, 3, 26.59375),
 (89997, 4, 26.0625),
 (92083, 5, 25.5)]

In [11]:
type(rankings)

dict

In [12]:
import json

with open('/mnt/vdb1/Datasets/18_categories/metrics_data/triples_X1_5epochs.json', 'w') as fp:
    json.dump(rankings, fp)