# ColBERTv2: Indexing & Search Notebook

In [50]:
!git -C ColBERT/ pull || git clone https://github.com/stanford-futuredata/ColBERT.git
import sys; sys.path.insert(0, 'ColBERT/')


Already up to date.


In [51]:
try: # When on google Colab, let's install all dependencies with pip.
    import google.colab
    !pip install -U pip
    !pip install -e ColBERT/['faiss-gpu','torch']
except Exception:
  import sys; sys.path.insert(0, 'ColBERT/')
  try:
    from colbert import Indexer, Searcher
  except Exception:
    print("If you're running outside Colab, please make sure you install ColBERT in conda following the instructions in our README. You can also install (as above) with pip but it may install slower or less stable faiss or torch dependencies. Conda is recommended.")
    assert False

[0mObtaining file:///content/ColBERT
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: colbert-ir
  Attempting uninstall: colbert-ir
    Found existing installation: colbert-ir 0.2.14
    Uninstalling colbert-ir-0.2.14:
      Successfully uninstalled colbert-ir-0.2.14
  Running setup.py develop for colbert-ir
Successfully installed colbert-ir-0.2.14
[0m

In [52]:
import colbert

In [53]:
from colbert import Indexer, Searcher
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert.data import Queries, Collection

In [54]:
import pandas as pd
from datasets import Dataset

# Dataset name; it is reused further down to build the index name.
dataset = 'test'

# Both files are tab-separated; the first column becomes the row index.
df1 = pd.read_csv(
    "Information-Retrieval/test/doc_col.tsv",
    sep='\t',
    index_col=0,
)
df2 = pd.read_csv(
    "Information-Retrieval/test/queries_20.tsv",
    sep='\t',
    index_col=0,
)

# Wrap the frames as `datasets.Dataset` objects so columns can be accessed
# by name (e.g. collection["doc"], query["query"]).
collection = Dataset.from_pandas(df1)
query = Dataset.from_pandas(df2)

# Sanity check: show one of the loaded queries.
print(query['query'][5])

WHAT IS THE EFFECT OF WATER OR OTHER THERAPEUTIC AGENTS ON THE PHYSICAL PROPERTIES VISCOSITY ELASTICITY OF SPUTUM OR BRONCHIAL SECRETIONS FROM CF PATIENTS


## Indexing

In [55]:
# Encode each dimension with 2 bits.
nbits = 2

# Truncate passages at 300 tokens.
doc_maxlen = 300

# The index is named after the dataset and the bit width, e.g. "test.2bits".
index_name = '{}.{}bits'.format(dataset, nbits)

Assuming the use of only one GPU, this cell should take about six minutes to finish running.

In [56]:
# Checkpoint identifier handed to the Indexer.
checkpoint = 'colbert-ir/colbertv2.0'

# nranks specifies the number of GPUs to use.
with Run().context(RunConfig(nranks=1, experiment='notebook')):
    # kmeans_niters specifies the number of iterations of k-means clustering;
    # 4 is a good and fast default. Consider larger numbers for small datasets.
    config = ColBERTConfig(
        doc_maxlen=doc_maxlen,
        nbits=nbits,
        kmeans_niters=4,
    )

    indexer = Indexer(checkpoint=checkpoint, config=config)

    # Index the "doc" column of the collection under `index_name`.
    indexer.index(
        name=index_name,
        collection=collection["doc"],
        overwrite=True,
    )



[Dec 10, 22:58:28] #> Note: Output directory /content/experiments/notebook/indexes/test.2bits already exists


[Dec 10, 22:58:28] #> Will delete 10 files already at /content/experiments/notebook/indexes/test.2bits in 20 seconds...
#> Starting...
#> Joined...


In [57]:
# You can get the absolute path of the index, if needed; the value is
# displayed as this cell's output.
indexer.get_index()

'/content/experiments/notebook/indexes/test.2bits'

## Search

Having built the index, we can now prepare a `searcher` and use it to search for individual query strings.

We can use the `queries_20` set we loaded earlier — or you can supply your own questions. Feel free to get creative! But keep in mind that the collection indexed here (passages about cystic fibrosis, as the sample queries and results show) can only answer a small, focused set of questions!

In [58]:
# Create the searcher over the index built earlier. Because the index is
# referenced by its relative name (i.e., not a full path), the RunConfig
# experiment must be the same value that was used for indexing ('notebook').
with Run().context(RunConfig(experiment='notebook')):
    # The same "doc" column that was indexed is passed as the collection; it is
    # later read back through `searcher.collection` to print passage text.
    searcher = Searcher(index=index_name, collection=collection["doc"])

[Dec 10, 23:00:50] #> Loading codec...
[Dec 10, 23:00:50] #> Loading IVF...
[Dec 10, 23:00:50] #> Loading doclens...


100%|██████████| 1/1 [00:00<00:00, 5127.51it/s]

[Dec 10, 23:00:50] #> Loading codes and residuals...



100%|██████████| 1/1 [00:00<00:00, 701.86it/s]


In [63]:
# Pick a query: try with an in-range query index or supply your own string.
question = query["query"][14]
print(f"#> {question}")

# Find the top-3 passages for this query.
results = searcher.search(question, k=3)

# Print out the top-k retrieved passages, one per line: rank, score, text.
for passage_id, passage_rank, passage_score in zip(*results):
    passage_text = searcher.collection[passage_id]
    print(f"\t [{passage_rank}] \t\t {passage_score:.1f} \t\t {passage_text}")

#> WHAT ARE THE HEPATIC COMPLICATIONS OR MANIFESTATIONS OF CF
	 [1] 		 15.0 		 COEXISTENT RESPIRATORY ALLERGY AND CYSTIC FIBROSIS THE PREVALENCE OF RESPIRATORY ALLERGIES AND THEIR EFFECTS ON THE COURSE OF CYSTIC FIBROSIS CF WERE STUDIED IN 63 PATIENTS FIFTEEN PATIENTS 24 HAD RESPIRATORY ALLERGIES AS DEFINED BY HISTORY PHYSICAL EXAMINATION POSITIVE ALLERGY SKIN TESTS NASAL AND PERIPHERAL EOSINOPHILIA AND REVERSIBLE AIRWAY OBSTRUCTION SERUM IMMUNOGLOBULINS ON THESE 15 ALLERGIC CF AND 22 NONALLERGIC CF PATIENTS WERE ELEVATED WHEN COMPARED WITH THOSE IN NORMAL AGEMATCHED CONTROLS THE ALLERGIC CF PATIENTS HAD SIGNIFICANTLY HIGHER IGE LEVELS MEAN 598 INTERNATIONAL UNITS IU COMPARED TO 281 IU IGG IGM AND IGA LEVELS WERE SIMILAR IN THE TWO CF GROUPS THE CLINICAL CONDITION OF THE ALLERGIC CF PATIENTS SHWACHMAN SCORE WAS BETTER THAN THE NONALLERGIC SUGGESTING SOME MODULATING EFFECT OF RESPIRATORY ALLERGY ON THE SEVERITY OF THE DISEASE
	 [2] 		 14.2 		 PULMONARY INVOLVEMENT IN ADULTS WITH CYSTIC 