In [1]:
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert import Trainer

In [2]:
config = ColBERTConfig(
        bsize=64,
        root=r"../retrain_colbert",
    )

In [3]:
with Run().context(RunConfig(nranks=1, experiment="msmarco")):
    trainer = Trainer(
        triples= r"../data/triples.train.small.id.json",
        queries= r"../data/queries.train.tsv",
        collection= r"../data/collection.tsv",
        config=config,
    )

    checkpoint_path = trainer.train()

    print(f"Saved checkpoint to {checkpoint_path}...")

#> Starting...
#> Joined...
Saved checkpoint to None...


In [None]:
checkpoint_path

In [2]:
from colbert.evaluation.loaders import *

In [3]:
from tqdm.auto import tqdm

In [15]:
param = {
    'triples': '../data/triples.train.small.tsv',
    'queries': '../data/queries.train.tsv',
    'collection': '../data/collection.tsv'
}

In [None]:
def load_queries(queries_path):
    queries = OrderedDict()

    print_message("#> Loading the queries from", queries_path, "...")

    with open(queries_path, encoding='utf-8') as f:
        for line in f:
            qid, query, *_ = line.replace("\xa0", " ").strip().split('\t')
            qid = int(qid)

            assert (qid not in queries), ("Query QID", qid, "is repeated!")
            queries[re.sub('[^ 0-9a-zA-Z_-]', '', query.strip(" "))] = qid

    print_message("#> Got", len(queries), "queries. All QIDs are unique.\n")

    return queries

In [None]:
def load_collection(collection_path):
    print_message("#> Loading collection...")

    collection = {}

    with open(collection_path, encoding="utf-8") as f:
        for line_idx, line in enumerate(f):
            if line_idx % (1000*1000) == 0:
                print(f'{line_idx // 1000 // 1000}M', end=' ', flush=True)

            pid, passage, *rest = line.strip('\n\r ').split('\t')
            assert pid == 'id' or int(pid) == line_idx

            if len(rest) >= 1:
                title = rest[0]
                passage = title + ' | ' + passage

            collection[passage] = line_idx

    print()

    return collection

In [None]:
import re

In [None]:
queries = load_queries(param['queries'])

[Sep 26, 18:41:38] #> Loading the queries from ../data/queries.train.tsv ...
[Sep 26, 18:41:39] #> Got 806349 queries. All QIDs are unique.



In [None]:
collection = load_collection(param['collection'])

[Sep 26, 18:41:39] #> Loading collection...
0M 1M 2M 3M 4M 5M 6M 7M 8M 


In [None]:
global err_text
err_text = ""
def get_id(text, data):
    err_text = text
    _id = data.get(exceptions.get(text, text), None)
    if _id is None: 
        text = text.replace("\xa0",' ')
        _id = data.get(exceptions.get(text, text), None)
    if _id is None: 
        n_text = text.strip(' ')
        _id = data.get(exceptions.get(n_text, n_text), None)
    if _id is None: 
        n_text = re.sub('[^ 0-9a-zA-Z_-]', '', n_text)
        _id = data.get(exceptions.get(n_text, n_text), None)
    if _id is None: 
        _id = data.get(text.strip(' '), None)
    
    if _id is None:
        print(text)
        raise Exception(text)
    return _id

In [None]:
exceptions = {'divorce et sÃ©paration': 'divorce et séparation',
 'what is intelÂ® vpro technology': 'what is intel® vpro technology',
 'what is aÂ\xa0shock wave': 'what is a shock wave',
 'Germanyâ\x80\x99s perspective, the Treaty of Versailles was a fair settlement for its national interests': 'Germany’s perspective, the Treaty of Versailles was a fair settlement for its national interests',
 'yesÃ¼n temÃ¼r khan emperor taiding of yuan': 'yesün temür khan emperor taiding of yuan',
 ' The vitamin that prevents beriberi is ': ' The vitamin that prevents beriberi is',
 ' phosphates as food ingredients ': ' phosphates as food ingredients',
 ' who invented the periodic table ': ' who invented the periodic table',
 'what does bokmÃ¥l mean': 'what does bokmål mean',
 'which action should youÂ\xa0never take when selecting quotations': 'which action should you never take when selecting quotations',
 'dermatitis, anemia, convulsions, depressions, and confusion are all signs of a vitamin _________Â\xa0deficiency.': 'dermatitis anemia convulsions depressions and confusion are all signs of a vitamin _________ deficiency',
 ' In humans, the normal set point for body temperature is ': 'In humans the normal set point for body temperature is',
 'what did you notice about the relationship between pressure and volume when the temperatureÂ\xa0 is held constant?': 'what did you notice about the relationship between pressure and volume when the temperature  is held constant',
 'the Â\xa0____________Â\xa0 that vibrates like a drum when sound waves hit.': 'the  ____________  that vibrates like a drum when sound waves hit',
 'what is composition?Â\xa0 why is composition important?': 'what is composition  why is composition important',
 'the lithosphere consists of Â\xa0____________': 'the lithosphere consists of  ____________',
 "what is a 'cost engineer ": 'what is a cost engineer',
 'A simple way to save with a competitive interest rate. Your Personal Savings account earns interest daily and is posted to your account monthly. You can easily set up recurring transfers from your current bank accounts to your Personal Savings account.â\x80\xa0. Just deposit your savings and watch it grow. Your Personal Savings account earns interest daily and is posted to your account monthly. You can easily set up recurring transfers from your current bank accounts to your Personal Savings account.â\x80\xa0. ': 'A simple way to save with a competitive interest rate. Your Personal Savings account earns interest daily and is posted to your account monthly. You can easily set up recurring transfers from your current bank accounts to your Personal Savings account.â\x80\xa0. Just deposit your savings and watch it grow. Your Personal Savings account earns interest daily and is posted to your account monthly. You can easily set up recurring transfers from your current bank accounts to your Personal Savings account.â\x80\xa0.',
 "Islamic Laws are made up of Shari'ah ('â\x80\x8eØ´Ø±Ù\x8aØ¹Ø© Å\xa0arÄ«Ê¿ah) and Islamic jurisprudence (Ù\x81Ù\x82Ù\x87â\x80\x8e Fiqh). Shari'ah is seen as sacred and constitutes the Qur'an and Prophet Muhammad 's Sunnah (way), which is found in the Hadith and Sira. Islamic jurisprudence is a complimentary expansion of the former by Islamic juris efinition [edit]. Islamic Laws are made up of Shari'ah ('â\x80\x8eØ´Ø±Ù\x8aØ¹Ø© Å\xa0arÄ«Ê¿ah) and Islamic jurisprudence (Ù\x81Ù\x82Ù\x87â\x80\x8e Fiqh). Shari'ah is seen as sacred and constitutes the Qur'an and Prophet Muhammad 's Sunnah (way), which is found in the Hadith and Sir ": "Islamic Laws are made up of Shari'ah ('â\x80\x8eØ´Ø±Ù\x8aØ¹Ø© Å\xa0arÄ«Ê¿ah) and Islamic jurisprudence (Ù\x81Ù\x82Ù\x87â\x80\x8e Fiqh). Shari'ah is seen as sacred and constitutes the Qur'an and Prophet Muhammad 's Sunnah (way), which is found in the Hadith and Sira. Islamic jurisprudence is a complimentary expansion of the former by Islamic juris efinition [edit]. Islamic Laws are made up of Shari'ah ('â\x80\x8eØ´Ø±Ù\x8aØ¹Ø© Å\xa0arÄ«Ê¿ah) and Islamic jurisprudence (Ù\x81Ù\x82Ù\x87â\x80\x8e Fiqh). Shari'ah is seen as sacred and constitutes the Qur'an and Prophet Muhammad 's Sunnah (way), which is found in the Hadith and Sir",
 "Definition [edit]. Islamic Laws are made up of Shari'ah ('â\x80\x8eØ´Ø±Ù\x8aØ¹Ø© Å\xa0arÄ«Ê¿ah) and Islamic jurisprudence (Ù\x81Ù\x82Ù\x87â\x80\x8e Fiqh). Shari'ah is seen as sacred and constitutes the Qur'an and Prophet Muhammad 's Sunnah (way), which is found in the Hadith and Sir efinition [edit]. Islamic Laws are made up of Shari'ah ('â\x80\x8eØ´Ø±Ù\x8aØ¹Ø© Å\xa0arÄ«Ê¿ah) and Islamic jurisprudence (Ù\x81Ù\x82Ù\x87â\x80\x8e Fiqh). Shari'ah is seen as sacred and constitutes the Qur'an and Prophet Muhammad 's Sunnah (way), which is found in the Hadith and Sir ": "Definition [edit]. Islamic Laws are made up of Shari'ah ('â\x80\x8eØ´Ø±Ù\x8aØ¹Ø© Å\xa0arÄ«Ê¿ah) and Islamic jurisprudence (Ù\x81Ù\x82Ù\x87â\x80\x8e Fiqh). Shari'ah is seen as sacred and constitutes the Qur'an and Prophet Muhammad 's Sunnah (way), which is found in the Hadith and Sir efinition [edit]. Islamic Laws are made up of Shari'ah ('â\x80\x8eØ´Ø±Ù\x8aØ¹Ø© Å\xa0arÄ«Ê¿ah) and Islamic jurisprudence (Ù\x81Ù\x82Ù\x87â\x80\x8e Fiqh). Shari'ah is seen as sacred and constitutes the Qur'an and Prophet Muhammad 's Sunnah (way), which is found in the Hadith and Sir"}

In [None]:
examples = []
with open(param['triples'], encoding='utf-8') as f:
    for line in tqdm(f):
        q_str, p_str_p, p_str_n = line.strip('\n').split('\t')
        qid = get_id(q_str, queries)
        pid_p = get_id(p_str_p, collection)
        pid_n = get_id(p_str_n, collection)
        example = [qid, pid_p, pid_n]
        examples.append(example)

0it [00:00, ?it/s]

In [None]:
id_file = '../data/triples.train.small.id.json'
    
with open(id_file, 'w') as f:
    for example in examples:
        ujson.dump(example, f)
        f.write('\n')

output_path = f.name
print_message(f"#> Saved examples with {len(self.data)} lines to {f.name}")

NameError: name 'self' is not defined

In [None]:
id_file = '../data/triples.train.small.id.tsv'
with open(id_file, 'r') as f:
    id_examples = ujson.load(f)

In [None]:
len(id_examples)
len(id_examples[0])

In [None]:
data = collection
# data = queries
qid = [k for k in data.keys() if "lamic Laws are made up of Shar" in k]
qid

["Islamic Laws are made up of Shari'ah ('â\x80\x8eØ´Ø±Ù\x8aØ¹Ø© Å\xa0arÄ«Ê¿ah) and Islamic jurisprudence (Ù\x81Ù\x82Ù\x87â\x80\x8e Fiqh). Shari'ah is seen as sacred and constitutes the Qur'an and Prophet Muhammad 's Sunnah (way), which is found in the Hadith and Sira. Islamic jurisprudence is a complimentary expansion of the former by Islamic juris efinition [edit]. Islamic Laws are made up of Shari'ah ('â\x80\x8eØ´Ø±Ù\x8aØ¹Ø© Å\xa0arÄ«Ê¿ah) and Islamic jurisprudence (Ù\x81Ù\x82Ù\x87â\x80\x8e Fiqh). Shari'ah is seen as sacred and constitutes the Qur'an and Prophet Muhammad 's Sunnah (way), which is found in the Hadith and Sir",
 "Definition [edit]. Islamic Laws are made up of Shari'ah ('â\x80\x8eØ´Ø±Ù\x8aØ¹Ø© Å\xa0arÄ«Ê¿ah) and Islamic jurisprudence (Ù\x81Ù\x82Ù\x87â\x80\x8e Fiqh). Shari'ah is seen as sacred and constitutes the Qur'an and Prophet Muhammad 's Sunnah (way), which is found in the Hadith and Sir efinition [edit]. Islamic Laws are made up of Shari'ah ('â\x80\x8eØ´Ø±Ù\x8aØ¹Ø

In [304]:
exceptions = {**exceptions, **{p_str_n: qid[1]}}
exceptions

{'divorce et sÃ©paration': 'divorce et séparation',
 'what is intelÂ® vpro technology': 'what is intel® vpro technology',
 'what is aÂ\xa0shock wave': 'what is a shock wave',
 'Germanyâ\x80\x99s perspective, the Treaty of Versailles was a fair settlement for its national interests': 'Germany’s perspective, the Treaty of Versailles was a fair settlement for its national interests',
 'yesÃ¼n temÃ¼r khan emperor taiding of yuan': 'yesün temür khan emperor taiding of yuan',
 ' The vitamin that prevents beriberi is ': ' The vitamin that prevents beriberi is',
 ' phosphates as food ingredients ': ' phosphates as food ingredients',
 ' who invented the periodic table ': ' who invented the periodic table',
 'what does bokmÃ¥l mean': 'what does bokmål mean',
 'which action should youÂ\xa0never take when selecting quotations': 'which action should you never take when selecting quotations',
 'dermatitis, anemia, convulsions, depressions, and confusion are all signs of a vitamin _________Â\xa0defic