In [2]:
!pip install transformers

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
# Load the tokenizer and the seq2seq weights from the same checkpoint so that
# the vocabulary used for encoding matches the one the model expects.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

In [7]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# To force CPU execution, assign 'cpu' to `device` by hand before the move.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model = model.to(device)

## Check Flan-T5 in Target Ranking

In [26]:
# Instruction text placed in front of every prompt.
lm_instruction = 'Find <extra_id_0>.'

# Hand-written probes. Each sample holds:
#   id      - number shown in the printed report
#   query   - sentence containing the <extra_id_0> placeholder
#   facts   - supporting sentences that are placed before the query
#   targets - candidate fillers; the reporting loop treats targets[0] as the
#             expected answer (it marks a hit when the predicted index is 0)
samples = [
    {
        "id": 1,
        "query": "Scientists are studying the quality of the water in the <extra_id_0>.",
        "facts": [
            "Taking samples of water is used for studying the quality of water.",
            "Scientists go to a lake once a month to take samples of water.",
        ],
        "targets": [
            "<extra_id_0> lake",
            "<extra_id_0> water",
        ],
    },
    {
        "id": 2,
        "query": "Matter in solid phase has definite shape and <extra_id_0>.",
        "facts": [
            "Matter in the solid phase has definite shape.",
            "Matter in the solid phase has definite volume.",
        ],
        # NOTE(review): "<extra_id_0> volume" is listed twice, so indices 0 and
        # 2 score identically - confirm whether the duplicate is intentional.
        "targets": [
            "<extra_id_0> volume",
            "<extra_id_0> shape",
            "<extra_id_0> volume",
        ],
    },
    {
        "id": 3,
        "query": "The <extra_id_0> is opaque.",
        "facts": [
            "If an object is opaque , then light will not shine through that object.",
            "Opacity is a property of an object and includes ordered values of opaque / translucent / transparent.",
            "The light cannot shine through an object.",
        ],
        "targets": [
            "<extra_id_0> object",
            "<extra_id_0> opacity",
        ],
    },
]

In [51]:
def encode_ex(query, retrieved_passages, tokenizer, instruction=''):
    """Build a single prompt from instruction, passages and query, and tokenize it.

    The prompt is the instruction, a newline and a space, the passages joined
    by single spaces, one more space, and finally the query text.

    Args:
        query: The query text. Either a plain string, or a non-empty sequence
            whose first element is the query string (the form used by the
            existing callers). Only the first element of a sequence is used.
        retrieved_passages: Iterable of passage strings placed before the query.
        tokenizer: Object exposing ``batch_encode_plus`` that returns a mapping
            with ``'input_ids'`` and ``'attention_mask'`` tensors.
        instruction: Text placed at the very start of the prompt.

    Returns:
        A tuple ``(passage_ids, passage_masks, text_passages)`` where
        ``passage_ids`` is the ``'input_ids'`` tensor for the one-prompt batch,
        ``passage_masks`` is the attention mask cast to bool, and
        ``text_passages`` is the one-element list holding the prompt string.
    """
    # A bare string must not be indexed like a list: query[0] would silently
    # keep only its first character.
    if isinstance(query, str):
        query_text = query
    else:
        query_text = query[0]

    text_passages = ['{}\n {} {}'.format(instruction, " ".join(retrieved_passages), query_text)]

    # Pad/truncate to a fixed 512 positions so the input always has one width.
    encoded = tokenizer.batch_encode_plus(
        text_passages,
        max_length=512,
        padding='max_length',
        return_tensors='pt',
        truncation=True
    )

    # The batch holds exactly one prompt, so the encoder output already has the
    # (1, sequence_length) layout the callers consume; no reshaping is needed.
    passage_ids = encoded['input_ids']
    passage_masks = encoded['attention_mask'].bool()
    return passage_ids, passage_masks, text_passages

def encode_target(targets, tokenizer):
    """Tokenize candidate target strings into label ids for loss computation.

    Args:
        targets: List of candidate strings, encoded together as one batch and
            padded to the longest candidate (truncated at 200 tokens).
        tokenizer: Object exposing ``batch_encode_plus`` that returns a mapping
            with ``"input_ids"`` and ``"attention_mask"`` tensors.

    Returns:
        A tuple ``(target_ids, target_mask)``: the token ids with every padded
        position replaced by -100, and the attention mask cast to bool.
    """
    encoded = tokenizer.batch_encode_plus(
        targets,
        truncation=True,
        max_length=200,
        padding=True,
        return_tensors='pt',
    )
    target_mask = encoded["attention_mask"].bool()
    # -100 is the label value the loss skips, so padding never contributes.
    is_padding = target_mask.logical_not()
    target_ids = encoded["input_ids"].masked_fill(is_padding, -100)
    return target_ids, target_mask

In [42]:
def target_ranking(sample, flan, tokenizer):
    """Rank the candidate targets of one sample by the model's loss.

    The prompt is built from the module-level ``lm_instruction``, the sample's
    facts and its query. Each candidate is scored separately by passing it as
    the ``labels`` of a forward pass; the candidate with the lowest loss is the
    prediction.

    Args:
        sample: Mapping with the keys ``"query"`` (str), ``"facts"`` (list of
            str) and ``"targets"`` (non-empty list of candidate strings).
        flan: Model called as ``flan(input_ids=..., attention_mask=...,
            labels=...)`` whose output holds the loss at index 0.
        tokenizer: Tokenizer handed to ``encode_ex`` and ``encode_target``.

    Returns:
        A tuple ``(new_q, predicted_alt, target_losses)``: the one-element list
        with the prompt text, the index tensor of the lowest-loss candidate,
        and a 1-D tensor with one loss per candidate (in ``targets`` order).

    Raises:
        ValueError: If the sample has no candidate targets.
    """
    targets = sample["targets"]
    alt_num = len(targets)
    if alt_num == 0:
        raise ValueError("sample['targets'] must contain at least one candidate")
    target_losses = torch.zeros(alt_num)
    query = sample["query"]

    context_ids, context_mask, new_q = encode_ex([query], sample["facts"], tokenizer, instruction=lm_instruction)
    label_ids, _ = encode_target(targets, tokenizer)

    # The prompt is the same for every candidate: move everything to the
    # device once instead of once per loop iteration.
    context_ids = context_ids.to(device)
    context_mask = context_mask.to(device)
    label_ids = label_ids.to(device)

    # Scoring only: no gradients are needed, so do not record autograd history.
    with torch.no_grad():
        for alt_i in range(alt_num):
            labels_output = flan(input_ids=context_ids, attention_mask=context_mask, labels=label_ids[alt_i].unsqueeze(0))
            target_losses[alt_i] = labels_output[0]
    predicted_alt = torch.argmin(target_losses)
    return new_q, predicted_alt, target_losses

In [52]:
# Rank the candidates of every sample and print a short report for each one.
# A leading "+" means the lowest-loss candidate is targets[0] (the expected
# answer); "-" means another candidate scored lower.
for sample in samples:
    new_q, pred, loss = target_ranking(sample, model, tokenizer)
    candidates = sample["targets"]
    marker = "+" if pred == 0 else "-"
    report = (
        "{} {})\tquery: {}\n"
        "\ttargets: {}\n"
        "\t\tpred:\t{} (loss: {}),\n"
        "\t\tanswer:\t{} (loss: {})\n"
    ).format(
        marker, sample["id"], new_q,
        candidates,
        candidates[pred], loss[pred],
        candidates[0], loss[0],
    )
    print(report)

+ 1)	query: ['Find <extra_id_0>.\n Taking samples of water is used for studying the quality of water. Scientists go to a lake once a month to take samples of water. Scientists are studying the quality of the water in the <extra_id_0>.']
	targets: ['<extra_id_0> lake', '<extra_id_0> water']
		pred:	<extra_id_0> lake (loss: 16.30898094177246),
		answer:	<extra_id_0> lake (loss: 16.30898094177246)

- 2)	query: ['Find <extra_id_0>.\n Matter in the solid phase has definite shape. Matter in the solid phase has definite volume. Matter in solid phase has definite shape and <extra_id_0>.']
	targets: ['<extra_id_0> volume', '<extra_id_0> shape', '<extra_id_0> volume']
		pred:	<extra_id_0> shape (loss: 17.38692283630371),
		answer:	<extra_id_0> volume (loss: 18.05997657775879)

- 3)	query: ['Find <extra_id_0>.\n If an object is opaque , then light will not shine through that object. Opacity is a property of an object and includes ordered values of opaque / translucent / transparent. The light can

## Check Flan-T5's loss [old]

In [None]:
# Probe sentence with an <extra_id_0> placeholder; the cells below generate
# from it and score candidate fillers against it.
query = "Parishad is studying at <extra_id_0> in Canada."
# Tokenize to PyTorch tensors and move them to `device` (where the model lives).
inputs = tokenizer(query, return_tensors="pt").to(device)

In [None]:
# Few-shot prompt with numbered ([1]..[6]) context passages. The literal is
# split across lines for readability; adjacent string literals are joined by
# Python into one string.
query2 = (
    'Answer questions with short factoid answers.\n\n'
    '---\n\n'
    'Question: Which of these human activities in a forest has a positive effect on the ecosystem?\n'
    'Answer: Planting new trees where old ones were cut down\n\n'
    'Question: Which is the best plan to make the fossil fuel supply last longer?\n'
    'Answer: reduce electricity use\n\n'
    '---\n\n'
    'Follow the following format.\n\n'
    'Context:\n'
    '${sources that may contain relevant content}\n\n'
    'Question: ${the question to be answered}\n\n'
    'Rationale: Let\'s think step by step. ${a step-by-step deduction that identifies the correct response, which will be provided below}\n\n'
    'Answer: ${a short factoid answer, often between 1 and 5 words}\n\n'
    '---\n\n'
    'Context:\n'
    '[1] «Melinda learned that days in some seasons have more daylight hours than in other seasons. Which season receives the most hours of sunlight in the Northern Hemisphere?»\n'
    '[2] «As the distance of a location from the north pole becomes smaller / closer , the amount of daylight received by that location will increase during the summer.»\n'
    '[3] «When the season changes , the amount of daylight will change.»\n'
    '[4] «If a place is in summer, then it will have the most sunlight.»\n'
    '[5] «Daylight hours means time during which there is daylight.»\n'
    '[6] «If places are receiving the same amount of sunlight , then these places will have similar seasonal weather pattern.»\n\n'
    'Question: Melinda learned that days in some seasons have more daylight hours than in other seasons. Which season receives the most hours of sunlight in the Northern Hemisphere?\n\n'
    'Rationale: Let\'s think step by step. Melinda learned that days in some seasons have more daylight hours than in other seasons. Which season receives the most hours of sunlight in the Northern Hemisphere?\n\n'
    'Answer:'
)
inputs2 = tokenizer(query2, return_tensors="pt").to(device)
# Generate up to 150 new tokens, then show the decoded text and the prompt's
# token-id tensor shape.
outputs2 = model.generate(**inputs2, max_new_tokens=150)
print(tokenizer.batch_decode(outputs2, skip_special_tokens=True))
print(inputs2.input_ids.shape)

['[1] ---']
torch.Size([1, 401])


In [None]:
# Same few-shot prompt, but with "- " bullets instead of numbered context
# passages. The literal is split across lines for readability; adjacent string
# literals are joined by Python into one string.
query2 = (
    'Answer questions with short factoid answers.\n\n'
    '---\n\n'
    'Question: Which of these human activities in a forest has a positive effect on the ecosystem?\n'
    'Answer: Planting new trees where old ones were cut down\n\n'
    'Question: Which is the best plan to make the fossil fuel supply last longer?\n'
    'Answer: reduce electricity use\n\n'
    '---\n\n'
    'Follow the following format.\n\n'
    'Context:\n'
    '${sources that may contain relevant content}\n\n'
    'Question: ${the question to be answered}\n\n'
    'Rationale: Let\'s think step by step. ${a step-by-step deduction that identifies the correct response, which will be provided below}\n\n'
    'Answer: ${a short factoid answer, often between 1 and 5 words}\n\n'
    '---\n\n'
    'Context:\n'
    '- «Melinda learned that days in some seasons have more daylight hours than in other seasons. Which season receives the most hours of sunlight in the Northern Hemisphere?»\n'
    '- «As the distance of a location from the north pole becomes smaller / closer , the amount of daylight received by that location will increase during the summer.»\n'
    '- «When the season changes , the amount of daylight will change.»\n'
    '- «If a place is in summer, then it will have the most sunlight.»\n'
    '- «Daylight hours means time during which there is daylight.»\n'
    # NOTE(review): this last passage has a leading space but no "- " bullet,
    # unlike the five above - looks accidental; confirm before relying on it.
    ' «If places are receiving the same amount of sunlight , then these places will have similar seasonal weather pattern.»\n\n'
    'Question: Melinda learned that days in some seasons have more daylight hours than in other seasons. Which season receives the most hours of sunlight in the Northern Hemisphere?\n\n'
    'Rationale: Let\'s think step by step. Melinda learned that days in some seasons have more daylight hours than in other seasons. Which season receives the most hours of sunlight in the Northern Hemisphere?\n\n'
    'Answer:'
)
inputs2 = tokenizer(query2, return_tensors="pt").to(device)
# Generate up to 150 new tokens, then show the decoded text and the prompt's
# token-id tensor shape.
outputs2 = model.generate(**inputs2, max_new_tokens=150)
print(tokenizer.batch_decode(outputs2, skip_special_tokens=True))
print(inputs2.input_ids.shape)

['As the distance of a location from the north pole becomes smaller / closer, the amount of daylight received by that location will increase during the summer.» ---']
torch.Size([1, 395])


In [None]:
# next token prediction
# Generates from `inputs` (the tokenized `query` built in an earlier cell).
# NOTE(review): max_length=3 bounds the length of the generated sequence, so
# this may emit more than a single token - confirm that is the intent.
outputs = model.generate(**inputs, max_length=3)
# Decode the generated ids back to text, dropping special tokens.
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

['Parishad']


In [None]:
# target ranking
# Score every candidate filler by the loss the model reports when that
# candidate is passed as the label for `inputs`; the lowest loss wins.
options = [
    "<extra_id_0> cheese",
    "<extra_id_0> Harvard University",
    "<extra_id_0> University of Montreal",
]
options_loss = torch.zeros(len(options))
for i, option in enumerate(options):
    _input = tokenizer(option, return_tensors="pt").to(device)
    # Index 0 of the model output holds the loss; .item() stores it as a float.
    loss = model(**inputs, labels=_input.input_ids)[0].item()
    options_loss[i] = loss

print('Query:', query)
print('Best option:', options[options_loss.argmin()])
print('Losses:', list(zip(options, options_loss.tolist())))

Query: Parishad is studying at <extra_id_0> in Canada.
Best option: <extra_id_0> University of Montreal
Losses: [('<extra_id_0> cheese', 18.1092472076416), ('<extra_id_0> Harvard University', 12.804937362670898), ('<extra_id_0> University of Montreal', 10.592082977294922)]
