# Lost in the middle evaluation

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

# Move up one directory at a time until the working directory path no longer
# contains 'notebooks', so that the project imports below resolve.
# NOTE(review): this is a substring test on the full path, so an ancestor
# directory whose name contains "notebooks" would keep the climb going.
while 'notebooks' in os.getcwd():
    os.chdir('..')

In [3]:
from llmrankers.setwise import SetwiseLlmRanker
from llmrankers.rankers import SearchResult

import ir_datasets

In [4]:
# Synthetic sanity check: 100 passages that differ only by their number, and a
# query that names exactly one of them.
docs = [SearchResult(docid=i, text=f'this is passage {i}', score=None) for i in range(100)]
query = 'Give me passage 34'

# Setwise ranker backed by Flan-T5-large, using generation scoring and heapsort.
ranker = SetwiseLlmRanker(model_name_or_path='google/flan-t5-large',
                          tokenizer_name_or_path='google/flan-t5-large',
                          device='cuda',
                          num_child=10,
                          scoring='generation',
                          method='heapsort',
                          k=10)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
# Print the first element of the reranked list for the synthetic query.
print(ranker.rerank(query, docs)[0])

SearchResult(docid=34, score=-1, text=None)


In [6]:
class Doc:
    """Minimal document wrapper exposing a single ``text`` attribute."""

    def __init__(self, text: str) -> None:
        # Only the raw passage text is stored.
        self.text = text

In [7]:
# Two-document sanity check: the passage about a large fruit is listed first.
ranker.compare("what is the largest fruit in the world?",
               [Doc("watermelon is a very large fruit"),\
                Doc("otto van bismarck was a german leader in XIX century")])

'A'

In [8]:
# Same two documents, but the query now matches the passage listed second.
ranker.compare("Who was the german leader in XIX century?",
               [Doc("watermelon is a very large fruit"),\
                Doc("otto van bismarck was a german leader in XIX century")])

'B'

# Get documents and queries

In [9]:
# Load the "msmarco-passage/trec-dl-2020" dataset through ir_datasets.
dataset = ir_datasets.load("msmarco-passage/trec-dl-2020")

In [10]:
# Document store used later to look passages up by doc_id.
docstore = dataset.docs_store()

In [11]:
class QueryWithPossibleAnswers():
    """A dataset query together with the judged documents attached to it.

    The constructor stores only the identifier and the text. Judged documents
    are appended to ``documents_relevance`` afterwards, and ``relevant_docs`` /
    ``not_relevant_docs`` are meant to be derived from that list.
    """
    query_id: str
    query_text: str
    # (document, relevance grade) pairs; the notebook stores document objects
    # here rather than plain strings.
    documents_relevance: list[tuple[object, int]]
    # Documents split by relevance grade; filled in after judgements are attached.
    relevant_docs: list
    not_relevant_docs: list

    def __init__(self, query_id, query_text):
        self.query_id = query_id
        self.query_text = query_text
        self.documents_relevance = []
        # Start empty so that reading these before the split is computed yields
        # an empty list instead of raising AttributeError.
        self.relevant_docs = []
        self.not_relevant_docs = []

In [12]:
# Index every dataset query by its id.
# A comprehension keeps the iteration variable local, so the `query` string
# defined for the synthetic ranking check earlier in the notebook is not
# overwritten by a dataset query object.
queries = {
    dataset_query.query_id: QueryWithPossibleAnswers(dataset_query.query_id, dataset_query.text)
    for dataset_query in dataset.queries_iter()
}

[INFO] Please confirm you agree to the MSMARCO data usage agreement found at <http://www.msmarco.org/dataset.aspx>
[INFO] [starting] https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2020-queries.tsv.gz
[INFO] [finished] https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2020-queries.tsv.gz: [00:00] [4.13kB] [2.94MB/s]
                                                                                                                    

In [13]:
# Display the text of query 1030303.
queries['1030303'].query_text

'who is aziz hashim'

In [14]:
# Attach every relevance judgement to its query as a (document, grade) pair.
# The lists are cleared first so that re-running this cell does not append the
# same judgements a second time.
for query_entry in queries.values():
    query_entry.documents_relevance = []

for qrel in dataset.qrels_iter():
    documents_relevance_entry = (docstore.get(qrel.doc_id), qrel.relevance)
    queries[qrel.query_id].documents_relevance.append(documents_relevance_entry)

[INFO] [starting] https://trec.nist.gov/data/deep/2020qrels-pass.txt
[INFO] [finished] https://trec.nist.gov/data/deep/2020qrels-pass.txt: [00:00] [219kB] [479kB/s]
[INFO] [starting] building docstore                                          
[INFO] If you have a local copy of https://msmarco.z22.web.core.windows.net/msmarcoranking/collectionandqueries.tar.gz, you can symlink it here to avoid downloading it again: /scratch-local/scur2870.8665586/ir_datasets/downloads/31644046b18952c1386cd4564ba2ae69
[INFO] [starting] https://msmarco.z22.web.core.windows.net/msmarcoranking/collectionandqueries.tar.gz
docs_iter:   0%|                                   | 0/8841823 [00:00<?, ?doc/s]
https://msmarco.z22.web.core.windows.net/msmarcoranking/collectionandqueries.tar.gz: 0.0%| 0.00/1.06G [00:00<?, ?B/s][A
https://msmarco.z22.web.core.windows.net/msmarcoranking/collectionandqueries.tar.gz: 0.0%| 16.4k/1.06G [00:00<2:38:19, 111kB/s][A
https://msmarco.z22.web.core.windows.net/msmarcoranking/coll

In [15]:
# Display the text of query 1030303.
queries['1030303'].query_text

'who is aziz hashim'

In [16]:
# Display the (document, relevance) pairs attached to query 1030303.
queries['1030303'].documents_relevance

[(GenericDoc(doc_id='1038342', text="Best TV shows on Netflix: Our favorites. Netflix originals Making a Murderer, Master of None (starring Aziz Ansari) and Marvel's Jessica Jones, plus other TV series favorites we love that are available on the streaming service. (Credit: BET / Daniel McFadden)"),
  0),
 (GenericDoc(doc_id='1042969', text='Charles Keating Articles. 1  Charles Keating Net WorthCharles Keating net worth: Charles Keating was an English actor and narrator who had a net worth of $8 million. 2  Keating was born i...harles Keating Articles. 1  Charles Keating Net WorthCharles Keating net worth: Charles Keating was an English actor and narrator who had a net worth of $8 million. 2  Keating was born i...'),
  0),
 (GenericDoc(doc_id='1044043', text='organization in txts and in chat.... 0. 0. Hi guys. but i am sorry to say PLO doesnt Stands for Palestine Liberation. Organization in txts and in chat .. they means : P = please. L = leave. O = ok.'),
  0),
 (GenericDoc(doc_id='105

In [51]:
# Split each query's judged documents by grade: grades 2 and 3 count as
# relevant, grade 0 as not relevant; other grades go into neither list.
for query_id in queries:
    entry = queries[query_id]
    judged = entry.documents_relevance
    entry.relevant_docs = [doc for doc, grade in judged if grade in (2, 3)]
    entry.not_relevant_docs = [doc for doc, grade in judged if grade == 0]

In [52]:
def prepare_malicious_docs_list(query, top_doc_position: int = 3, total_docs = 5):
    """Build a candidate list with one relevant document at a chosen position.

    The result holds up to ``total_docs - 1`` non-relevant documents plus the
    query's first relevant document, inserted so that it is the
    ``top_doc_position``-th element (1-based).

    Args:
        query: Object exposing ``relevant_docs`` and ``not_relevant_docs`` lists.
        top_doc_position: 1-based slot in which the relevant document is placed.
        total_docs: Intended length of the returned list.

    Returns:
        A new list of documents. It is shorter than ``total_docs`` when the
        query has fewer than ``total_docs - 1`` non-relevant documents, but the
        relevant document is always at ``top_doc_position``.

    Raises:
        ValueError: If ``top_doc_position`` is not within ``1..total_docs``.
        IndexError: If the query has no relevant document, or too few
            non-relevant documents to fill the slots before ``top_doc_position``.
    """
    if not 1 <= top_doc_position <= total_docs:
        raise ValueError(
            f"top_doc_position must be between 1 and total_docs ({total_docs}), got {top_doc_position}"
        )
    if not query.relevant_docs:
        # Same exception type an empty-list lookup would raise, so callers that
        # skip such queries via `except IndexError` keep working.
        raise IndexError("query has no relevant documents")

    fillers = query.not_relevant_docs[:total_docs - 1]
    lead = top_doc_position - 1
    if len(fillers) < lead:
        # Without enough fillers the relevant document would silently end up
        # earlier than requested.
        raise IndexError(
            f"need {lead} non-relevant documents before position {top_doc_position}, "
            f"only {len(fillers)} available"
        )

    return fillers[:lead] + [query.relevant_docs[0]] + fillers[lead:]

In [53]:
# Display the text of query 1030303.
queries['1030303'].query_text

'who is aziz hashim'

In [54]:
# Display the candidate list built with the default arguments
# (relevant document in position 3 of 5).
prepare_malicious_docs_list(queries['1030303'])

[GenericDoc(doc_id='1038342', text="Best TV shows on Netflix: Our favorites. Netflix originals Making a Murderer, Master of None (starring Aziz Ansari) and Marvel's Jessica Jones, plus other TV series favorites we love that are available on the streaming service. (Credit: BET / Daniel McFadden)"),
 GenericDoc(doc_id='1042969', text='Charles Keating Articles. 1  Charles Keating Net WorthCharles Keating net worth: Charles Keating was an English actor and narrator who had a net worth of $8 million. 2  Keating was born i...harles Keating Articles. 1  Charles Keating Net WorthCharles Keating net worth: Charles Keating was an English actor and narrator who had a net worth of $8 million. 2  Keating was born i...'),
 GenericDoc(doc_id='7156982', text='Rounding out the IFA leadership team is Aziz Hashim, a multi-unit, multi-brand franchisee and president & CEO of NRD Holdings, LLC, as vice chair; Shelly Sun, CFE, CEO & co-founder of BrightStar Franchising, LLC, as treasurer; and Liam Brown, pre

In [55]:
# Ask the ranker to pick from the default candidate list
# (relevant document in position 3 of 5).
ranker.compare(queries['1030303'].query_text, prepare_malicious_docs_list(queries['1030303']))

'C'

In [56]:
# Same query with the relevant document moved to the last of five positions.
ranker.compare(queries['1030303'].query_text, prepare_malicious_docs_list(queries['1030303'], top_doc_position=5, total_docs=5))

'E'

# Accuracy when the top document is at the given position

In [57]:
# Labels compared against the ranker's prediction: index i is the label for
# document position i + 1. Covers at most 15 positions.
alphabet = 'ABCDEFGHIJKLMNO'

In [58]:
def get_accuracy_at_position(ranker, top_doc_position=3, total_docs=5):
    """Measure how often ``ranker`` picks the relevant document at a given position.

    For every query in the notebook-level ``queries`` dict, a candidate list is
    built with ``prepare_malicious_docs_list`` and passed to ``ranker.compare``.
    A prediction counts as correct when it equals the label for
    ``top_doc_position`` taken from the notebook-level ``alphabet``.

    Args:
        ranker: Object with a ``compare(query_text, docs)`` method that returns
            a label.
        top_doc_position: 1-based position of the relevant document.
        total_docs: Number of candidate documents requested per query.

    Returns:
        ``(correct, evaluated, accuracy)``. ``accuracy`` is ``0.0`` when no
        query could be evaluated.

    Raises:
        ValueError: If ``top_doc_position`` is smaller than 1.
        IndexError: If ``alphabet`` has no label for ``top_doc_position``.
    """
    if top_doc_position < 1:
        raise ValueError(f"top_doc_position must be >= 1, got {top_doc_position}")

    # Resolve the label up front: a position without a label must fail loudly
    # here instead of being mistaken for "query has no relevant docs" below.
    expected_label = alphabet[top_doc_position - 1]

    correct_counter = 0
    all_counter = 0
    for query_entry in queries.values():
        try:
            candidate_docs = prepare_malicious_docs_list(query_entry,
                                                         top_doc_position=top_doc_position,
                                                         total_docs=total_docs)
        except IndexError:
            # The candidate list cannot be built (e.g. no relevant docs): skip the query.
            continue

        # Kept outside the try block so errors raised by the ranker are not
        # silently swallowed.
        prediction = ranker.compare(query_entry.query_text, candidate_docs)
        if prediction == expected_label:
            correct_counter += 1
        all_counter += 1

    # Guard against division by zero when every query was skipped.
    accuracy = correct_counter / all_counter if all_counter else 0.0
    return correct_counter, all_counter, accuracy

In [68]:
# Re-create the Flan-T5-large setwise ranker with the same settings as the
# ranker built earlier in the notebook.
ranker = SetwiseLlmRanker(model_name_or_path='google/flan-t5-large',
                          tokenizer_name_or_path='google/flan-t5-large',
                          device='cuda',
                          num_child=10,
                          scoring='generation',
                          method='heapsort',
                          k=10)

# Sweep the relevant document over positions 1..3 of a three-document list and
# print (correct, evaluated, accuracy) for each position.
for i in range(3):
    print(i + 1)
    print(get_accuracy_at_position(ranker, i + 1, 3))

1


Token indices sequence length is longer than the specified maximum sequence length for this model (534 > 512). Running this sequence through the model will result in indexing errors


(45, 54, 0.8333333333333334)
2
(48, 54, 0.8888888888888888)
3
(46, 54, 0.8518518518518519)


In [70]:
# Same experiment with Llama-2-7b-chat as the underlying model.
ranker_llama_2 = SetwiseLlmRanker(model_name_or_path='meta-llama/Llama-2-7b-chat-hf',
                          tokenizer_name_or_path='meta-llama/Llama-2-7b-chat-hf',
                          device='cuda',
                          num_child=10,
                          scoring='generation',
                          method='heapsort',
                          k=10)

# Sweep the relevant document over positions 1..10 of a ten-document list and
# print (correct, evaluated, accuracy) for each position.
for i in range(10):
    print(i + 1)
    print(get_accuracy_at_position(ranker_llama_2, i + 1, 10))

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

0
Unexpected output: PASS
(13, 54, 0.24074074074074073)
1
(40, 54, 0.7407407407407407)
2


KeyboardInterrupt: 