# Experiment 1 - Training

## Collect negatives

In [3]:
import ir_datasets
import pandas as pd
# from tqdm import tqdm
from tqdm.auto import tqdm, trange

In [3]:
negatives = ir_datasets.load("msmarco-qna/train")
qrels = pd.DataFrame(negatives.qrels_iter())
qid_grouped = qrels.groupby('query_id')

In [16]:
negative_lookup = {}

for group in tqdm(qid_grouped):
    tmp = []
    zero_relevance = True
    
    for _, row in group[1].iterrows():
        if row.relevance == 1:
            zero_relevance = False
            break
        else:
            tmp.append(row.doc_id)
    if not zero_relevance and len(tmp) != 0:
        qid = group[0]
        negative_lookup[qid] = tmp

100%|█████████████████████████████████| 808731/808731 [04:12<00:00, 3208.65it/s]


### Save negatives to avoid collecting it again

In [4]:
import pickle
import os

STORAGE_DIR = r'Experiment_1'

In [5]:
def save_data(data, path, name):
    if not os.path.exists(path):
        os.mkdir(path)
    with open(f'{path}/{name}.pkl', 'wb') as p:
        pickle.dump(data, p)

In [None]:
save_data(negative_lookup, STORAGE_DIR, "negative_lookup")

### Load the saved negatives from local storage

In [6]:
def load_data(path, name):
    with open(f'{path}/{name}.pkl', 'rb') as p:
        return pickle.load(p)

negative_lookup = load_data(STORAGE_DIR, "negative_lookup")

## Intialise Dataset, Storage and Model

In [7]:
BATCH_SIZE = 16 # How many triples in a mini-batch
#EPOCHS = 10 # How many epochs
NUM_STEPS = 100000 # How many steps to train for
LR = 5e-5 # Learning Rate for T5 Training
CUDA = True # Optionally run on CPU (False)

DATASET = r'msmarco-passage/train/triples-small' # What dataset are we training on? (From https://ir-datasets.com/)

OUTPUTS = ['true', 'false'] # Tokens for relevant and non-relevant

In [8]:
dataset = ir_datasets.load(DATASET)

In [7]:
from datetime import datetime

start_time = datetime.now()
scoreddocs = pd.DataFrame(dataset.scoreddocs_iter())
end_time = datetime.now()

print("Loading time: %s" % (end_time - start_time))

KeyboardInterrupt: 

> It actually occupied 130GB of memory to load the 478M scoreddocs data. That is why 128GB caused dead kernel.

#### Save data

In [None]:
save_data(scoreddocs, STORAGE_DIR, "scoreddocs")

### Testing memory usages

In [17]:
from itertools import islice

#### 1% of the total scoreddocs

In [19]:
new_scoreddocs_iter_0_01 = itertools.islice(dataset.scoreddocs_iter(), 0, int(dataset.scoreddocs_count() * 0.01))

In [26]:
start_time = datetime.now()
scoreddocs_0_01 = pd.DataFrame(new_scoreddocs_iter)
end_time = datetime.now()
print(end_time - start_time)

0:00:08.143718


#### 10% of the total scoreddocs

In [28]:
new_scoreddocs_iter_0_1 = itertools.islice(dataset.scoreddocs_iter(), 0, int(dataset.scoreddocs_count() * 0.1))

In [29]:
start_time = datetime.now()
scoreddocs_0_1 = pd.DataFrame(new_scoreddocs_iter_0_1)
end_time = datetime.now()
print(end_time - start_time)

0:01:37.796508


> 10% data occupied around 16GB of memory to load the data, therefore, we increased the memory from 128GB to 208GB to see the difference.

### 100000 samples

In [7]:
new_scoreddocs = scoreddocs.sample(n=100000)

In [83]:
new_scoreddocs

Flushing oldest 2 entries.
  warn('Output cache limit (currently {sz} entries) hit.\n'


Unnamed: 0,query_id,doc_id,score
102203556,25768,334673,0.0
91920743,21122,3713224,0.0
281097745,1143367,1286117,0.0
477850218,997313,4964744,0.0
269847297,184259,2815866,0.0
...,...,...,...
97610795,292752,5987940,0.0
211994404,980282,7523899,0.0
365833954,438362,4098226,0.0
388936949,840279,8693181,0.0


In [8]:
len(new_scoreddocs)

100000

In [13]:
from collections import Counter

Counter(new_scoreddocs["score"])

Counter({0.0: 100000})

> All random sampled 100000 scored docs scored 0.0

#### Save data

In [184]:
save_data(new_scoreddocs, STORAGE_DIR, "new_scoreddocs")

#### Load data

In [9]:
new_scoreddocs = load_data(STORAGE_DIR, "new_scoreddocs")

### Build triples

In [77]:
truenegative_scoreddocs = new_scoreddocs.copy().set_index('query_id')['doc_id'].to_dict()

In [78]:
truenegative_scoreddocs['doc_id_b'] = truenegative_scoreddocs['query_id'].apply(lambda x : negative_lookup[x] if x in negative_lookup.keys() else None)

In [79]:
truenegative_scoreddocs

Unnamed: 0,query_id,doc_id,score,doc_id_b
102203556,25768,334673,0.0,"[1282122-0, 336754-0, 704874-1, 6654185-0, 757..."
91920743,21122,3713224,0.0,[7482002-0]
281097745,1143367,1286117,0.0,[696731-1]
477850218,997313,4964744,0.0,
269847297,184259,2815866,0.0,"[5098953-0, 5098954-0, 5098955-0, 5098956-0, 5..."
...,...,...,...,...
97610795,292752,5987940,0.0,"[5782733-0, 5782734-0, 5782735-0, 5782736-0, 5..."
211994404,980282,7523899,0.0,
365833954,438362,4098226,0.0,[348260-0]
388936949,840279,8693181,0.0,"[3822441-0, 961633-11, 1142567-1, 3822442-0, 3..."


#### Drop triples without negative documents

In [80]:
truenegative_scoreddocs = truenegative_scoreddocs[truenegative_scoreddocs.apply(lambda x : True if x.doc_id_b is not None else False, axis=1)]

In [81]:
truenegative_scoreddocs

Flushing oldest 2 entries.
  warn('Output cache limit (currently {sz} entries) hit.\n'


Unnamed: 0,query_id,doc_id,score,doc_id_b
102203556,25768,334673,0.0,"[1282122-0, 336754-0, 704874-1, 6654185-0, 757..."
91920743,21122,3713224,0.0,[7482002-0]
281097745,1143367,1286117,0.0,[696731-1]
269847297,184259,2815866,0.0,"[5098953-0, 5098954-0, 5098955-0, 5098956-0, 5..."
460782537,539951,2440243,0.0,"[3804779-0, 3804780-0]"
...,...,...,...,...
329442322,1170789,1561323,0.0,"[3215925-0, 3215926-0, 3215927-0, 3215928-0, 3..."
97610795,292752,5987940,0.0,"[5782733-0, 5782734-0, 5782735-0, 5782736-0, 5..."
365833954,438362,4098226,0.0,[348260-0]
388936949,840279,8693181,0.0,"[3822441-0, 961633-11, 1142567-1, 3822442-0, 3..."


#### Save data

In [185]:
save_data(truenegative_scoreddocs, STORAGE_DIR, "truenegative_scoreddocs")

#### Load data

In [10]:
truenegative_scoreddocs = load_data(STORAGE_DIR, "truenegative_scoreddocs")

In [11]:
def dataset_from_idx(dataset, scoreddocs, baseline=False):
    frame = scoreddocs
    docs = pd.DataFrame(dataset.docs_iter()).set_index('doc_id').text.to_dict()
    queries = pd.DataFrame(dataset.queries_iter()).set_index('query_id').text.to_dict()
    
    frame['query'] = frame['query_id'].apply(lambda x: queries[x])
    print("query done.")
    frame['pid'] = frame['doc_id'].apply(lambda x: docs[x])
    print("pid done.")
    
    if baseline:
        frame['nid'] = frame.apply(lambda x: "")
        print("nid done.")
        # docpairs = pd.DataFrame(dataset.docpairs_iter())
        # frame['query'] = docpairs['query_id'].apply(lambda x: queries[x])
        # frame['pid'] = docpairs.apply(lambda x : docs[x.doc_id_a] if x.query_id in frame["query_id"].values else None, axis=1)
        # frame['nid'] = docpairs.apply(lambda x : docs[x.doc_id_b] if x.query_id in frame["query_id"].values else None, axis=1)
        # frame = frame[frame.apply(lambda x : True if x.doc_id_a is not None and x.doc_id_b is not None else False, axis=1)]
    else:
        # frame['nid'] = frame.apply(lambda x : ' '.join([docs[i.split('-')[0]] for i in x.doc_id_b]), axis=1)
        frame['nid'] = frame.apply(lambda x : docs[[i for i in x.doc_id_b][0].split('-')[0]], axis=1)
        print("nid done.")

    return frame[['query', 'pid', 'nid']]

In [10]:
baseline_triples = dataset_from_idx(dataset, new_scoreddocs, True)

query done.
pid done.
nid done.


In [11]:
baseline_triples

Unnamed: 0,query,pid,nid
102203556,are there specific species of polar bears,A male polar bear is called a polar bear. Thei...,
91920743,are beanitos good for you,Use complementary good in a sentence. It was a...,
281097745,when did andrew jackson come president,The party later split. Some scholars say that ...,
477850218,where is the radar for the weather?,A: There are many things to consider when buyi...,
269847297,extending the olive branch meaning,The name Olivia is an American baby name. In A...,
...,...,...,...
97610795,how many people live in the vatican city,"It is also used in the Canary Islands, French ...",
211994404,where is cold creek nv,Lookout 80 Main South Fork 6/10/14 Cleared to ...,
365833954,leafy greens is food source of what,"For instance, certain foods can actually boost...",
388936949,what is the postal code for derry nh,"In English-speaking countries, the postal code...",


In [12]:
new_triples = dataset_from_idx(dataset, truenegative_scoreddocs)

query done.
pid done.
nid done.


In [13]:
new_triples

Unnamed: 0,query,pid,nid
102203556,are there specific species of polar bears,A male polar bear is called a polar bear. Thei...,Because of ongoing and potential loss of their...
91920743,are beanitos good for you,Use complementary good in a sentence. It was a...,Beanitos are an honestly delicious snack made ...
281097745,when did andrew jackson come president,The party later split. Some scholars say that ...,President Andrew Jackson firmly established th...
269847297,extending the olive branch meaning,The name Olivia is an American baby name. In A...,"An olive branch symbolizes peace, so to extend..."
460782537,was elvis offered bye bye birdie,Linda Henning (I) The daughter of veteran writ...,The film is credited with making Ann-Margret a...
...,...,...,...
329442322,sergio mendes net worth,Definition of net worth statement in the Finan...,Barry Mann Articles. 1 Barry Mann Net Worth B...
97610795,how many people live in the vatican city,"It is also used in the Canary Islands, French ...",best answer it is a city like all others in th...
365833954,leafy greens is food source of what,"For instance, certain foods can actually boost...",Dark green leafy vegetables are great sources ...
388936949,what is the postal code for derry nh,"In English-speaking countries, the postal code...",Derry NH ZIP Code. Derry city is located in Ne...


In [14]:
def iter_train_samples(dataset, triples=None):
    # queries = pd.DataFrame(dataset.queries_iter()).set_index('query_id').text.to_dict()
    # docs = pd.DataFrame(dataset.docs_iter()).set_index('doc_id').text.to_dict()
    triples = triples if triples is not None else pd.DataFrame(dataset.docpairs_iter())
    while True:
        for _, row in triples.iterrows():
            # yield 'Query: ' + queries[row.query_id] + ' Document: ' + docs[row.doc_id_a] + ' Relevant:', OUTPUTS[0]
            yield 'Query: ' + row.query + ' Document: ' + row.pid + ' Relevant:', OUTPUTS[0]
            # yield 'Query: ' + queries[row.query_id] + ' Document: ' + docs[row.doc_id_b] + ' Relevant:', OUTPUTS[1]
            yield 'Query: ' + row.query + ' Document: ' + str(row.nid) + ' Relevant:', OUTPUTS[1]

In [15]:
import time
import logging
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import AdamW

torch.manual_seed(0)
_logger = ir_datasets.log.easy()

### Baseline

In [17]:
train_iter_base = _logger.pbar(iter_train_samples(dataset, triples=baseline_triples), desc='total train samples')

In [19]:
model_base = T5ForConditionalGeneration.from_pretrained("t5-base")
if CUDA: model_base = model_base.cuda()

In [None]:
optimiser_base = AdamW(model_base.parameters(), lr=LR)

### True Negatives

In [16]:
train_iter_new = _logger.pbar(iter_train_samples(dataset, triples=new_triples), desc='total train samples')

In [17]:
model_new = T5ForConditionalGeneration.from_pretrained("t5-base")
if CUDA: model_new = model_new.cuda()

In [18]:
optimiser_new = AdamW(model_new.parameters(), lr=LR)



In [19]:
tokeniser = T5Tokenizer.from_pretrained("t5-base") #, model_max_length=1000)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


## Training

In [20]:
def train(model, triples, train_iter, NUM_STEPS, BATCH_SIZE, optimiser):

    start = time.time()
    step = 0

    while step < NUM_STEPS:
        with _logger.pbar_raw(desc=f'train', total=len(triples)//BATCH_SIZE) as pbar:
            model.train()
            total_loss = 0
            count = 0
            for _ in range(len(triples) // BATCH_SIZE):
                inp, out = [], []
                for i in range(BATCH_SIZE):
                    i, o = next(train_iter)
                    inp.append(i)
                    out.append(o)
                inp_ids = tokeniser(inp, return_tensors='pt', padding=True).input_ids
                out_ids = tokeniser(out, return_tensors='pt', padding=True).input_ids
                if CUDA:
                    inp_ids = inp_ids.cuda()
                    out_ids = out_ids.cuda()
                loss = model(input_ids=inp_ids, labels=out_ids).loss
                loss.backward()
                optimiser.step()
                optimiser.zero_grad()
                total_loss = loss.item()
                count += 1
                pbar.update(1)
                pbar.set_postfix({'loss': total_loss/count})
                step += BATCH_SIZE
    end = time.time() - start

### Train the baseline

In [None]:
train(model_base, baseline_triples, train_iter_base, EPOCHS, BATCH_SIZE, optimiser_base)

[INFO] [starting] train 0
[INFO] [starting] total train samples                                           
train 0:   0%|                                         | 0/6250 [00:00<?, ?it/s]
train 0:   0%|                    | 1/6250 [00:01<2:44:43,  1.58s/it, loss=14.7]
train 0:   0%|                    | 2/6250 [00:01<1:30:59,  1.14it/s, loss=6.19]
train 0:   0%|                     | 3/6250 [00:01<1:06:01,  1.58it/s, loss=3.6]
train 0:   0%|                      | 4/6250 [00:02<52:47,  1.97it/s, loss=2.17]
train 0:   0%|                      | 5/6250 [00:02<45:26,  2.29it/s, loss=1.49]
train 0:   0%|                      | 6/6250 [00:02<40:07,  2.59it/s, loss=1.06]
train 0:   0%|                     | 7/6250 [00:02<36:31,  2.85it/s, loss=0.712]
train 0:   0%|                     | 8/6250 [00:02<34:02,  3.06it/s, loss=0.541]
train 0:   0%|                     | 9/6250 [00:02<32:05,  3.24it/s, loss=0.399]
train 0:   0%|                    | 10/6250 [00:02<30:35,  3.40it/s, loss=0.302]
tr

In [None]:
model_base.save_pretrained(os.path.join(STORAGE_DIR, 'model_base'))

### Train with the hard negatives

In [None]:
train(model_new, new_triples, train_iter_new, EPOCHS, BATCH_SIZE, optimiser_new)

[INFO] [starting] train 0
[INFO] [starting] total train samples                                           
train 0:   0%|                                         | 0/5037 [00:00<?, ?it/s]
train 0:   0%|                    | 1/5037 [00:00<1:05:08,  1.29it/s, loss=17.4]
train 0:   0%|                      | 2/5037 [00:00<39:52,  2.10it/s, loss=7.19]
train 0:   0%|                      | 3/5037 [00:01<32:29,  2.58it/s, loss=4.02]
train 0:   0%|                      | 4/5037 [00:01<27:38,  3.03it/s, loss=2.25]
train 0:   0%|                      | 5/5037 [00:01<24:20,  3.45it/s, loss=1.53]
train 0:   0%|                     | 6/5037 [00:01<23:03,  3.64it/s, loss=0.898]
train 0:   0%|                     | 7/5037 [00:01<21:54,  3.83it/s, loss=0.668]
train 0:   0%|                     | 8/5037 [00:02<21:23,  3.92it/s, loss=0.536]
train 0:   0%|                     | 9/5037 [00:02<20:45,  4.04it/s, loss=0.376]
train 0:   0%|                    | 10/5037 [00:02<19:57,  4.20it/s, loss=0.336]
tr

In [None]:
model_new.save_pretrained(os.path.join(STORAGE_DIR, 'model_new'))