In [1]:
from rich import print
from ragatouille import RAGTrainer

# Build the trainer: start from the "colbert-ir/colbertv2.0" checkpoint and
# register the model being trained under the name "JerryColBERT".
trainer = RAGTrainer(
    language_code="en",
    pretrained_model_name="colbert-ir/colbertv2.0",
    model_name="JerryColBERT",
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the term paper as a one-document corpus.
# - The context manager closes the file handle as soon as the text is read.
# - The text is wrapped in a list because `process_corpus` and
#   `prepare_training_data` take a list of document strings; a bare `str`
#   would be iterated character by character instead of as one document.
with open("data/term_paper.txt") as f:
    my_full_corpus = [f.read()]

In [3]:
from ragatouille.data import CorpusProcessor, llama_index_sentence_splitter

# Split the corpus into chunks using the llama-index sentence splitter;
# `documents` holds the resulting chunks that the training pairs are drawn from.
corpus_processor = CorpusProcessor(document_splitter_fn=llama_index_sentence_splitter)
documents = corpus_processor.process_corpus(
    my_full_corpus,
    chunk_size=256,
)

In [4]:
import random

# Questions about the term paper. The list is repeated three times so each
# question is paired with three separate random draws of chunks below.
queries = [
    "Why is there a growing need for machine learning in petroleum engineering?",
    "Why was early stopping used during training of the CONV-LSTM model for oil production rate prediction?",
    "What were the primary findings of the case study regarding the application of deep learning models for predicting oil production rates?",
    "In the context of the case study, what is the significance of the learning curve shown in Figure 12?",
] * 3

# A private, seeded generator makes the sampled pairs identical on every
# Restart & Run All without touching the global `random` state.
pair_rng = random.Random(42)

# `sample` raises ValueError when asked for more items than exist, so cap the
# draw at the number of available chunks.
docs_per_query = min(10, len(documents))

# Pair each query with randomly chosen chunks. These are placeholder
# "relevant" documents, not real relevance judgements.
pairs = [
    (query, doc)
    for query in queries
    for doc in pair_rng.sample(documents, docs_per_query)
]

In [5]:
# Turn the (query, document) pairs into training files under ./data/.
# Left as a bare expression so the returned output path is displayed.
trainer.prepare_training_data(
    raw_data=pairs,
    all_documents=my_full_corpus,
    data_out_path="./data/",
    mine_hard_negatives=True,  # Builds a hard-negative index over the documents (see the log below)
    num_new_negatives=10,  # Presumably the number of negatives added per query -- TODO confirm
)

Loading Hard Negative SimpleMiner dense embedding model BAAI/bge-small-en-v1.5...




Building hard negative index for 89 documents...
All documents embedded, now adding to index...
save_index set to False, skipping saving hard negative index
Hard negative index generated


'./data/'

In [6]:
# Fine-tune on the training data prepared in the previous cell.
trainer.train(
    # --- optimisation ---
    batch_size=32,
    learning_rate=5e-6,  # Small rates in [3e-6, 3e-5] work best for BERT-like base models; 5e-6 is often the sweet spot.
    warmup_steps="auto",  # "auto" defaults to 10%; the run logged below resolved it to 0 warmup steps.
    maxsteps=500000,  # Hard stop: upper bound on the number of training steps.
    use_ib_negatives=True,  # Also use in-batch negatives when calculating the loss.
    # --- model / encoding ---
    dim=128,  # Dimensions per embedding; 128 is the default and works well.
    doc_maxlen=256,  # Maximum document length; ColBERT works very well with smaller chunks (128-256).
    use_relu=False,  # Keep ReLU disabled -- it doesn't improve performance.
    nbits=4,  # Number of bits the trained model uses when compressing indexes.
)

#> Starting...
nranks = 1 	 num_gpus = 1 	 device=0
{
    "query_token_id": "[unused0]",
    "doc_token_id": "[unused1]",
    "query_token": "[Q]",
    "doc_token": "[D]",
    "ncells": null,
    "centroid_score_threshold": null,
    "ndocs": null,
    "load_index_with_mmap": false,
    "index_path": null,
    "index_bsize": 64,
    "nbits": 4,
    "kmeans_niters": 20,
    "resume": false,
    "similarity": "cosine",
    "bsize": 32,
    "accumsteps": 1,
    "lr": 5e-6,
    "maxsteps": 500000,
    "save_every": 0,
    "warmup": 0,
    "warmup_bert": null,
    "relu": false,
    "nway": 2,
    "use_ib_negatives": true,
    "reranker": false,
    "distillation_alpha": 1.0,
    "ignore_scores": false,
    "model_name": "JerryColBERT",
    "query_maxlen": 32,
    "attend_to_mask_tokens": false,
    "interaction": "colbert",
    "dim": 128,
    "doc_maxlen": 256,
    "mask_punctuation": true,
    "checkpoint": "colbert-ir\/colbertv2.0",
    "triples": "data\/triples.train.colbert.jsonl",
  



[May 11, 23:00:26] #> Loading the queries from data/queries.train.colbert.tsv ...
[May 11, 23:00:26] #> Got 4 queries. All QIDs are unique.

[May 11, 23:00:26] #> Loading collection...
0M 




#> LR will use 0 warmup steps and linear decay over 500000 steps.

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . What were the primary findings of the case study regarding the application of deep learning models for predicting oil production rates?, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  2054,  2020,  1996,  3078,  9556,  1997,  1996,  2553,
         2817,  4953,  1996,  4646,  1997,  2784,  4083,  4275,  2005, 29458,
         3514,  2537,  6165,  1029,   102,   103,   103,   103,   103,   103,
          103,   103], device='cuda:0')
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')

				 0.9447746872901917 4.579592227935791
#>>>    1.48 1.47 		|		 0.010000000000000009




[May 11, 23:00:38] 0 5.524366855621338
				 1.0428612232208252 4.827328205108643
#>>>    1.26 1.49 		|		 -0.22999999999999998
[May 11, 23:00:41] 1 5.524712678432464
[May 11, 23:00:41] #> Done with all triples!
#> Saving a checkpoint to .ragatouille/colbert/none/2024-05/11/23.00.08/checkpoints/colbert ..
#> Joined...
