In [35]:
import os
import sys
# Put the parent directory first on the import path so that `colbert` resolves
# to the local checkout (presumably the ColBERT repository root; confirm).
sys.path.insert(0, '../')
# Must be set before the `colbert` imports below so that it is already in
# effect when they run. A value of '-1' is presumably meant to hide all CUDA
# devices and force CPU execution; confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert.data import Queries, Collection
from colbert import Indexer, Searcher

In [36]:
# Directory holding the evaluation data; `fld_pth` is reused later in the
# notebook when the rankings are written to disk.
fld_pth = "/home/sondors/Documents/price/BERT_data/data/metrics_data"

# Keep the file paths under their own names instead of rebinding `offers` /
# `models` from a str to a loaded object: each name now refers to exactly one
# kind of thing for the whole notebook.
models_path = f'{fld_pth}/models.tsv'
offers_path = f'{fld_pth}/offers.tsv'

# Offers act as the queries; models are the passages that get indexed.
offers = Queries(path=offers_path)
models = Collection(path=models_path)

f'Loaded {len(offers)} queries and {len(models):,} passages'

[Dec 21, 18:04:33] #> Loading the queries from /home/sondors/Documents/price/BERT_data/data/metrics_data/offers.tsv ...
[Dec 21, 18:04:34] #> Got 135896 queries. All QIDs are unique.

[Dec 21, 18:04:34] #> Loading collection...
0M 


'Loaded 135896 queries and 103,214 passages'

## Indexing

For efficient search, we can pre-compute the ColBERT representation of each passage and index them.

Below, the `Indexer` takes a model checkpoint and writes a (compressed) index to disk. We then prepare a `Searcher` for retrieval from this index.

(With four Titan V GPUs, indexing should take about 13 minutes. The output is fairly long/ugly at the moment!)

In [37]:
nbits = 2   # encode each dimension with 2 bits
doc_maxlen = 180   # truncate passages at 180 tokens

# Local path of the trained ColBERT checkpoint, and the name under which the
# index is stored (the bit-width is embedded in the name).
checkpoint = '/home/sondors/colbert'
index_name = f'models.18_categories.{nbits}bits'

# NOTE(review): gpus=0 is passed here and CUDA_VISIBLE_DEVICES is set to '-1'
# in the first cell, so this presumably runs as a single CPU process; confirm.
with Run().context(RunConfig(nranks=1, experiment='notebook', gpus=0)):  # nranks specifies the number of GPUs to use.
    config = ColBERTConfig(doc_maxlen=doc_maxlen, nbits=nbits)

    indexer = Indexer(checkpoint=checkpoint, config=config)
    # overwrite=True: an existing index with this name is presumably replaced
    # rather than reused, so re-running this cell repeats the full build.
    indexer.index(name=index_name, collection=models, overwrite=True)
indexer.get_index() # You can get the absolute path of the index, if needed.



[Dec 21, 18:04:34] #> Note: Output directory /home/sondors/Documents/price/ColBERT/docs/experiments/notebook/indexes/models.18_categories.2bits already exists


#> Starting...


 98%|█████████▊| 49/50 [02:19<00:03,  3.10s/it]

{
    "query_token_id": "[unused0]",
    "doc_token_id": "[unused1]",
    "query_token": "[Q]",
    "doc_token": "[D]",
    "ncells": null,
    "centroid_score_threshold": null,
    "ndocs": null,
    "load_index_with_mmap": false,
    "index_path": null,
    "nbits": 2,
    "kmeans_niters": 4,
    "resume": false,
    "similarity": "cosine",
    "bsize": 64,
    "accumsteps": 1,
    "lr": 1e-5,
    "maxsteps": 500000,
    "save_every": 1,
    "warmup": null,
    "warmup_bert": null,
    "relu": false,
    "nway": 2,
    "use_ib_negatives": false,
    "reranker": false,
    "distillation_alpha": 1.0,
    "ignore_scores": false,
    "model_name": null,
    "query_maxlen": 32,
    "attend_to_mask_tokens": false,
    "interaction": "colbert",
    "dim": 128,
    "doc_maxlen": 180,
    "mask_punctuation": true,
    "checkpoint": "\/home\/sondors\/colbert",
    "triples": "\/mnt\/vdb1\/Datasets\/18_categories\/ColBERT_dataset\/triples.json",
    "collection": "\/home\/sondors\/Documents\/pr

100%|██████████| 50/50 [02:22<00:00,  2.86s/it]


[Dec 21, 18:04:43] [0] 		 # of sampled PIDs = 56310 	 sampled_pids[:3] = [54607, 96034, 1332]
[Dec 21, 18:04:43] [0] 		 #> Encoding 56310 passages..


100%|██████████| 50/50 [05:09<00:00,  6.19s/it]
100%|██████████| 50/50 [06:21<00:00,  7.62s/it]
100%|██████████| 50/50 [06:12<00:00,  7.45s/it]
100%|██████████| 50/50 [06:22<00:00,  7.65s/it]
100%|██████████| 50/50 [05:38<00:00,  6.77s/it]
100%|██████████| 50/50 [04:17<00:00,  5.15s/it]
100%|██████████| 50/50 [04:09<00:00,  5.00s/it]
100%|██████████| 50/50 [05:03<00:00,  6.07s/it]
 14%|█▍        | 7/50 [00:43<04:33,  6.35s/it]]

 16%|█▌        | 8/50 [00:48<04:06,  5.88s/it]]

## Search

Having built the index and prepared our `searcher`, we can search for individual query strings.

We can use the `offers` queries we loaded earlier — or you can supply your own query strings. Feel free to get creative! But keep in mind that the `models` collection of ~103k passages can only answer a small, focused set of queries!

In [None]:
# Open the index by name, under the same 'notebook' experiment that the
# indexing cell used when writing it.
with Run().context(RunConfig(experiment='notebook')):
    searcher = Searcher(index=index_name)

In [None]:
offers[37]

'Смартфон Samsung Galaxy Z Flip3 F711B 128Gb Violet'

In [None]:
query = offers[37]   # or supply your own query

# Find the top-5 passages for this query
results = searcher.search(query, k=5)

# Print out the top-k retrieved passages
# `results` is unpacked into three parallel sequences (ids, ranks, scores);
# zip(*results) walks them in lockstep, one retrieved passage per iteration.
for passage_id, passage_rank, passage_score in zip(*results):
    print(f"\t [{passage_rank}] \t\t {passage_score:.1f} \t\t {searcher.collection[passage_id]}")

## Batch Search

In many applications, you have a large batch of queries and you need to maximize the overall throughput. For that, you can use the `searcher.search_all(queries, k)` method, which returns a `Ranking` object that organizes the results across all queries.

(Batching provides many opportunities for higher-throughput search, though we have not implemented most of those optimizations for compressed indexes yet.)

In [None]:
import time

# Use perf_counter() for the elapsed-time measurement: it is a monotonic,
# high-resolution clock, whereas time.time() follows the wall clock and can
# jump if the system time is adjusted during this long-running search.
start_time = time.perf_counter()
# Retrieve the top-5 passages for every query in `offers`; .todict() turns the
# returned ranking object into a plain dict for inspection and JSON export.
rankings = searcher.search_all(offers, k=5).todict()
print(f"time_spent = {time.perf_counter() - start_time}")

In [None]:
# Show only the first few entries: `rankings` presumably holds one entry per
# query in `offers`, so echoing the whole dict would flood the cell output and
# bloat the saved notebook.
{qid: rankings[qid] for qid in list(rankings)[:3]}

In [None]:
searcher.collection[27946]

In [None]:
rankings[30]  # For query 30, a list of (passage_id, rank, score) for the top-k passages

In [None]:
type(rankings)

In [None]:
import json

# Persist the batch-search results alongside the input data so they can be
# evaluated later without re-running the search.
rankings_file = f'{fld_pth}/rankings_1_epoch.json'
with open(rankings_file, 'w') as out:
    json.dump(rankings, out)

 94%|█████████▍| 47/50 [02:13<00:08,  2.82s/it]