In [1]:
import sys
sys.path.append("/home/gerald/Documents/CPD/repository/LifelongInformationRetrieval") 

# Implementation of "Document Ranking with a Pretrained Sequence-to-Sequence Model" and adaptation to lifelong setting

The aims of the proposed methods is to adress ranking using a pretrained sequence to sequence model. Contrary to most of ranking paper considering a similarity matrix between words off query and documents, authors use a directly output of sequences model. Considering as output **relevant** sequence of words reduced here to one word for positive document/query relation and an other for negative relation.
To this end authors use the **t5** pretrained model and modelise the input as the concatenation of document and query.

## Loading the MSMarco corpus

In [4]:
from lire.data_tools.dataset import MSMarco
import pandas as pd

# where are or will be downloaded the corpus
data_folder = '/local/gerald/CPD/data'
# what split to use
split = 'train'
# laod the dataset with triplet output (query, positive, negative)

dataset = MSMarco.MSMarcoPassageRankingDataset(data_folder,
                                               download=True,
                                               split="dev",
                                               storage='full',
                                               getter='positive')

  mask |= (ar1 == a)


In [5]:
print('sample of the dataset : ',dataset[1])

sample of the dataset :  ('why did rachel carson write an obligation to endure', ['Carson believes that as man tries to eliminate unwanted insects and weeds, however he is actually causing more problems by polluting the environment with, for example, DDT and harming living things. Carson adds that the intensification of agriculture is causing other major problems, like newly developed or created insects and diseases.', "The Obligation to Endure by Rachel Carson Rachel Carson's essay on The Obligation to Endure, is a very convincing argument about the harmful uses of chemicals, pesticides, herbicides, and fertilizers on the environment."])


In [3]:
# explained later
identity_function = lambda x : x
dataset.set_output_transformation(identity_function)
dataset.set_query_transform(identity_function)
dataset.set_document_transform(identity_function)

local_index = 42
print('Local index of the query ', local_index)
q, dp, dn = dataset[local_index]
print('Query: \n\t"', q, '"\n')
print('Positive Document: \n\t"', dp, '"\n')
print('Negative Document: \n\t"', dn, '"\n')

Local index of the query  42
Query: 
	" what is the average cost of a work related back injury "

Positive Document: 
	" The average cost of a low-back associated workers compensation claim is nearly $8,500. This is double the cost of the average injury claim. The total estimate for the United States ranges between $50 and $100 billion per year. A large portion of this cost is directly workers compensation related. "

Negative Document: 
	" Provided with a workers compensation policy, protects you if you become liable for injury to an employee for a job-related injury that is not covered under the workers compensation law. Additional Coverages "



### Set tokenizer using HuggingFace

In [4]:
#set the size of the model we will use (you can replace it)
t5_size = 't5-small'

from transformers import T5Tokenizer

In [5]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")
q, dp, dn = dataset[local_index]
transformation = lambda x: tokenizer(x, return_tensors="pt").input_ids
dataset.set_document_transform(transformation)
dataset.set_query_transform(transformation)

In [6]:
local_index = 42
print('Local index of the query ', local_index)
q, dp, dn = dataset[local_index]
print('Query: \n\t"', q, '"\n')
print('Positive Document: \n\t"', dp, '"\n')
print('Negative Document: \n\t"', dn, '"\n')

Local index of the query  42
Query: 
	" tensor([[ 125,   19,    8, 1348,  583,   13,    3,    9,  161, 1341,  223, 2871,
            1]]) "

Positive Document: 
	" tensor([[   37,  1348,   583,    13,     3,     9,   731,    18,  1549,  1968,
          2765,  6107,  1988,    19,  2111, 13155,     6,  2560,     5,   100,
            19,  1486,     8,   583,    13,     8,  1348,  2871,  1988,     5,
            37,   792,  7037,    21,     8,   907,  1323,   620,     7,   344,
         13309,    11, 10417,  2108,   399,   215,     5,    71,   508,  4149,
            13,    48,   583,    19,  1461,  2765,  6107,  1341,     5,     1]]) "

Negative Document: 
	" tensor([[ 7740,    26,    28,     3,     9,  2765,  6107,  1291,     6,  1822,
             7,    25,     3,    99,    25,   582,     3,  8860,    21,  2871,
            12,    46,  3490,    21,     3,     9,   613,    18,  3897,  2871,
            24,    19,    59,  2303,   365,     8,  2765,  6107,   973,     5,
         11180,  5

We now have the query and documents appearing into tokenized format. However we wants a sequence with query and documents concatenated. We use a token to separte query from document, being *\</s>* (id 1) in the **t5** model. 

In [7]:
import torch
transformation_output = lambda x: (torch.cat((x[0],x[1]), -1),torch.cat((x[0],x[2]), -1))
dataset.set_output_transformation(transformation_output)

In [8]:
query_document_positive = dataset[0][0][0]
query_document_negative = dataset[0][1][0]
detokenized_sentence = tokenizer.convert_ids_to_tokens(query_document_positive.tolist())
print("\tDetokenized query sentence positive : \n\t", detokenized_sentence)
detokenized_sentence = tokenizer.convert_ids_to_tokens(query_document_negative.tolist())
print("\tDetokenized query sentence negative : \n\t", detokenized_sentence)

	Detokenized query sentence positive : 
	 ['▁what', '▁fruit', '▁is', '▁native', '▁to', '▁australia', '</s>', '▁Pass', 'if', 'lor', 'a', '▁her', 'bert', 'iana', '.', '▁A', '▁rare', '▁passion', '▁fruit', '▁native', '▁to', '▁Australia', '.', '▁Fruit', 's', '▁are', '▁green', '-', 's', 'k', 'inne', 'd', ',', '▁white', '▁flesh', 'e', 'd', ',', '▁with', '▁an', '▁unknown', '▁edible', '▁rating', '.', '▁Some', '▁sources', '▁list', '▁the', '▁fruit', '▁as', '▁edible', ',', '▁sweet', '▁and', '▁tasty', ',', '▁while', '▁others', '▁list', '▁the', '▁fruits', '▁as', '▁being', '▁bitter', '▁and', '▁in', 'e', 'd', 'ible', '.', 'assi', 'f', 'lor', 'a', '▁her', 'bert', 'iana', '.', '▁A', '▁rare', '▁passion', '▁fruit', '▁native', '▁to', '▁Australia', '.', '▁Fruit', 's', '▁are', '▁green', '-', 's', 'k', 'inne', 'd', ',', '▁white', '▁flesh', 'e', 'd', ',', '▁with', '▁an', '▁unknown', '▁edible', '▁rating', '.', '▁Some', '▁sources', '▁list', '▁the', '▁fruit', '▁as', '▁edible', ',', '▁sweet', '▁and', '▁tasty', ','

<h2>The complete input preprocessing</h2>

In [9]:
# reinit transformation
identity_function = lambda x : x
dataset.set_output_transformation(identity_function)
dataset.set_query_transform(identity_function)
dataset.set_document_transform(identity_function)

tokenizer_query = T5Tokenizer.from_pretrained("t5-small")
tokenizer_document = T5Tokenizer.from_pretrained("t5-small")

query_transform = lambda x: x+'</s>'

output_transformation = lambda x: (x[0] + x[1], x[0] + x[2])

dataset.set_query_transform(query_transform)
dataset.set_output_transformation(output_transformation)

import torch.utils.data as data_utils
training_dataloader = data_utils.DataLoader(dataset, batch_size=10)


In [10]:

next(iter(training_dataloader))

[('what fruit is native to australia</s>Passiflora herbertiana. A rare passion fruit native to Australia. Fruits are green-skinned, white fleshed, with an unknown edible rating. Some sources list the fruit as edible, sweet and tasty, while others list the fruits as being bitter and inedible.assiflora herbertiana. A rare passion fruit native to Australia. Fruits are green-skinned, white fleshed, with an unknown edible rating. Some sources list the fruit as edible, sweet and tasty, while others list the fruits as being bitter and inedible.',
  'types of fruit trees</s>Cherry. Cherry trees are found throughout the world. There are 40 or more varieties, ranging from bing cherry to black cherry. Along with the fruit, cherry trees produce light and delicate pinkish-white blossoms that are highly fragrant.omments. Submit. Planting fruit trees on your property not only provides you with a steady supply of organic fruit, it also allows you to beautify your yard and give oxygen back to the envir

## Getting output tokens

In [20]:
token_positive = "true"
token_negative = "false"
index_token_positive = tokenizer(token_positive,return_tensors="pt").input_ids.cuda()
index_token_negative = tokenizer(token_negative, return_tensors="pt").input_ids.cuda()
index_token_positive = index_token_positive.repeat(10,1).cuda()
index_token_negative = index_token_negative.repeat(10,1).cuda()


#print("sep token ", tokenizer("true", return_tensors="pt").input_ids)
#tokenizer.sep_token

## Instanciate the model

In [12]:
from transformers import T5ForConditionalGeneration
from torch import optim

model = T5ForConditionalGeneration.from_pretrained('t5-small')
model.cuda()




In [22]:
adam_optimizer = optim.Adam(model.parameters())

## Fine-tunning

In [23]:
n_epoch = 1



loss_accumulator = 0.
for epoch in range(n_epoch):
    for it, (positive, negative) in enumerate(training_dataloader):
        adam_optimizer.zero_grad()
        positive_index, negative_index =\
            tokenizer(positive, return_tensors="pt", padding=True, max_length=512).input_ids,\
            tokenizer(negative, return_tensors="pt", padding=True, max_length=512).input_ids

        outputs_positive = model(input_ids=positive_index.cuda(), labels=index_token_positive)
        outputs_negative = model(input_ids=negative_index.cuda(), labels=index_token_negative)
        loss_positive = outputs_positive.loss
        loss_negative = outputs_negative.loss
        
        loss = loss_positive + loss_negative
        loss_accumulator = loss.item()
        
        if(it%100 == 0):
            print("loss iter ",it,'/',len(training_dataloader)," -> ", loss_accumulator/100)
            loss_accumulator = 0.
        
        loss.backward()
        
        adam_optimizer.step()
        if(it == 10000):
            break
        
        

RuntimeError: CUDA out of memory. Tried to allocate 40.00 MiB (GPU 0; 7.92 GiB total capacity; 5.57 GiB already allocated; 90.50 MiB free; 6.31 GiB reserved in total by PyTorch)

In [14]:
print("saving the model")
model.cpu()
torch.save(model, '/media/gerald/00B1B02B44A76AB2/CPD/saved_models/t5_ranking_transformer_test_5000.pth')
model.cuda()
positive, negative = dataset[292929]
positive_index, negative_index =\
    tokenizer(positive, return_tensors="pt", max_length=512).input_ids,\
    tokenizer(negative, return_tensors="pt", max_length=512).input_ids
outputs_positive = model.generate(input_ids=positive_index.cuda())
outputs_negative = model.generate(input_ids=negative_index.cuda())

print("Positive -> ", positive)
print("Predicted ->", tokenizer.convert_ids_to_tokens(outputs_positive.squeeze().tolist())[1] ," \n")


print("Negative -> ", negative)
print("Predicted ->", tokenizer.convert_ids_to_tokens(outputs_negative.squeeze().tolist())[1] )

saving the model


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Positive ->  what is the cdl waveform</s>CDL TESTBED. The Common Data Link (CDL) Test Bed located at JITC is equipped with the appropriate CDL test tools and personnel to provide CDL Waveform Specification Compliance Testing and Certification. CDL vendors are charged a standard rate using a Standard Rate Schedule.
Predicted -> ▁false  

Negative ->  what is the cdl waveform</s>Indeed.com provides a good sample of salaries for all experience levels. 1  Student truck drivers earn $41,000 a year on average. 2  CDL truck drivers can expect to earn $66,000 a year on average.  OTR CDL truck drivers earn the most, with salaries averaging $82,000 a year.
Predicted -> ▁false


In [17]:
import torch

import torch.utils.data as data_utils

model = torch.load('/media/gerald/00B1B02B44A76AB2/CPD/saved_models/t5_ranking_transformer_test_5000.pth')
model.cuda()
batch_size = 100
training_dataloader = data_utils.DataLoader(dataset, batch_size=100, shuffle=True)
token_positive = "true"
token_negative = "false"
index_token_positive = tokenizer(token_positive,return_tensors="pt").input_ids
index_token_negative = tokenizer(token_negative, return_tensors="pt").input_ids
index_token_positive = index_token_positive.repeat(100,1).cuda()
index_token_negative = index_token_negative.repeat(100,1).cuda()

import tqdm
n_epoch = 1
positive_score, negative_score = 0., 0.
loss_accumulator = 0.

with torch.no_grad():

    for it, (positive, negative) in zip(tqdm.trange(len(training_dataloader)),(training_dataloader)):
        positive_index, negative_index =\
            tokenizer(positive, return_tensors="pt", padding=True, max_length=512).input_ids.cuda(),\
            tokenizer(negative, return_tensors="pt", padding=True, max_length=512).input_ids.cuda()

        outputs_positive = model(input_ids=positive_index.cuda(), labels=index_token_positive)
        outputs_negative = model(input_ids=negative_index.cuda(), labels=index_token_negative)
        positive_score += (outputs_positive["logits"][:,0,index_token_positive[0,0]] > outputs_positive["logits"][:,0,index_token_negative[0,0]]).sum()
        negative_score += (outputs_negative["logits"][:,0,index_token_negative[0,0]] > outputs_negative["logits"][:,0,index_token_positive[0,0]]).sum()

        if(it == 100):
            break





  0%|          | 100/2699191 [01:42<766:33:06,  1.02s/it]


In [19]:
print("Positive score obtained : ", ((positive_score.item()/ (100*100)) *1e3)//1/10, "%")
print("Negative score obtained : ", ((negative_score.item()/ (100*100)) *1e3)//1/10, "%")

Positive score obtained :  94.2 %
Negative score obtained :  72.6 %


## Ranking score on the Dev set