In [1]:
%reload_ext autoreload
%autoreload 2
import sys
sys.path.insert(0, './analogies_mining')
from tqdm.auto import tqdm
import torch
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
from functools import partial

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
)



In [2]:
# def get_model_general(model_name):
#     from transformers import AutoTokenizer, pipeline
#     tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
#     pipe = pipeline(
#         "text-generation",
#         model=model_name,
#         tokenizer=tokenizer,
#         torch_dtype=torch.float16,
#         trust_remote_code=True,
#         device_map="cuda:0",
#         # max_new_tokens=8,
#         # do_sample=True,

        
#     )
#     tokenizer = pipe.tokenizer
#     model = pipe.model
#     model.eval()
#     return model, tokenizer

from transformers import AutoTokenizer, pipeline, BitsAndBytesConfig
import torch
from accelerate.utils import BnbQuantizationConfig

def get_model_general(model_name):
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="fp4",
    )
    # quantization_config = BitsAndBytesConfig(load_in_8bit=True)
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        padding_side="left",
        trust_remote_code=True
    )

    # pipe = pipeline(
    #     "text-generation",
    #     model=model_name,
    #     tokenizer=tokenizer,
    #     trust_remote_code=True,
    #     device_map="cuda:0",
    #     quantization_config=quantization_config,  # Pass the quantization config
    #     load_in_4bit=True,                     # Enable 4-bit quantization

    # )
    model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config, device_map="cuda:0")
    model.eval()
    # tokenizer = pipe.tokenizer
    # model = pipe.model
    return model, tokenizer



import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from peft import PeftModel, PeftConfig
import numpy as np

class Promptriever:
    def __init__(self, model_name_or_path):
        self.model, self.tokenizer = self.get_model(model_name_or_path)
        self.model.eval().cuda()

    def get_model(self, peft_model_name):
        # Load the PEFT configuration to get the base model name
        peft_config = PeftConfig.from_pretrained(peft_model_name)
        base_model_name = peft_config.base_model_name_or_path

        # Load the base model and tokenizer
        base_model = AutoModel.from_pretrained(base_model_name,
                                               device_map="auto",
                                              torch_dtype=torch.float16)
        tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id
        tokenizer.padding_side = "right"

        # Load and merge the PEFT model
        model = PeftModel.from_pretrained(base_model, peft_model_name)
        model = model.merge_and_unload()

        # can be much longer, but for the example 512 is enough
        model.config.max_length = 512
        tokenizer.model_max_length = 512

        return model, tokenizer

    def create_batch_dict(self, tokenizer, input_texts):
        max_length = self.model.config.max_length
        batch_dict = tokenizer(
            input_texts,
            max_length=max_length - 1,
            return_token_type_ids=False,
            return_attention_mask=False,
            padding=False,
            truncation=True,
        )
        batch_dict["input_ids"] = [
            input_ids + [tokenizer.eos_token_id]
            for input_ids in batch_dict["input_ids"]
        ]
        return tokenizer.pad(
            batch_dict,
            padding=True,
            pad_to_multiple_of=8,
            return_attention_mask=True,
            return_tensors="pt",
        )

    def encode(self, sentences, max_length: int = 2048, batch_size: int = 4):
        all_embeddings = []
        for i in tqdm(range(0, len(sentences), batch_size)):
            batch_texts = sentences[i : i + batch_size]

            batch_dict = self.create_batch_dict(self.tokenizer, batch_texts)
            batch_dict = {
                key: value.to(self.model.device) for key, value in batch_dict.items()
            }

            with torch.amp.autocast(device_type='cuda'):
                with torch.no_grad():
                    outputs = self.model(**batch_dict)
                    last_hidden_state = outputs.last_hidden_state
                    sequence_lengths = batch_dict["attention_mask"].sum(dim=1) - 1
                    batch_size = last_hidden_state.shape[0]
                    reps = last_hidden_state[
                        torch.arange(batch_size, device=last_hidden_state.device),
                        sequence_lengths,
                    ]
                    embeddings = F.normalize(reps, p=2, dim=-1)
                    all_embeddings.append(embeddings.cpu().numpy())

        return np.concatenate(all_embeddings, axis=0)



In [None]:
# model_name = 'multi-qa-mpnet-base-dot-v1'
# model_name = 'paraphrase-MiniLM-L12-v2'

# model = SentenceTransformer("paraphrase-MiniLM-L6-v2")




def preprocess_options(options_str):
    splits = ['A.', 'B.', 'C.', 'D.']
    final_options = []
    for sidx in range(len(splits)):
        if sidx == len(splits) - 1:
            x = options_str.split(splits[sidx])[1]
        else:
            x = options_str.split(splits[sidx])[1].split(splits[sidx+1])[0]
        final_options.append(x.strip())
    return final_options


import numpy as np

def predict_labels(questions, options, model, bs=256, questions_indices=None, options_indices=None):
    question_embeddings = model.encode(questions, show_progress_bar=True, batch_size=bs, convert_to_tensor=True)
    question_embeddings = question_embeddings.reshape(len(questions), -1)
    # question_embeddings = np.repeat(question_embeddings, 4, axis=0)
    if options_indices is None:
        question_embeddings = torch.repeat_interleave(question_embeddings, 4, dim=0)
    option_embeddings = model.encode(options, show_progress_bar=True, batch_size=bs, convert_to_tensor=True)
    print(question_embeddings.shape, option_embeddings.shape)


    if options_indices is None:
        similarities = []
        print('cossim')
        for qidx in tqdm(range(0, len(question_embeddings), bs)):
            sim = torch.nn.functional.cosine_similarity(question_embeddings[qidx:qidx+bs], option_embeddings[qidx:qidx+bs], dim=-1)
            similarities.append(sim)
        similarities = torch.hstack(similarities)
    else:
        matrix = question_embeddings @ option_embeddings.T / (question_embeddings.norm(dim=-1)[:, None] * option_embeddings.norm(dim=-1)[None, :])
        matrix = matrix.cpu()
        questions_indices = torch.tensor(questions_indices).repeat_interleave(4)
        similarities = matrix[questions_indices, options_indices]
        print(similarities.shape)

    
    similarities = similarities.reshape(-1, 4)
    print('sorting')
    # ranked_indices = torch.argsort(similarities, dim=-1, descending=True)
    predicted_labels = torch.argmax(similarities, dim=-1).cpu().numpy()
    print('sorting finished')
    return predicted_labels



def predict_labels_FollowIR(quesitons, options, model, tokenizer, bs=32):

    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"
    token_false_id = tokenizer.get_vocab()["false"]
    token_true_id = tokenizer.get_vocab()["true"]
    template = """<s> [INST] Consider the Document to be relevant only if it can be analogous to the Query. Answer in (true/false)

    Query: {query}
    Document: {text}
    Analogically Relevant (only output one word, either "true" or "false"):  [/INST] """

    # assert bs % 4 == 0, "Batch size must be a multiple of 4"
    scores = []
    for bsidx in tqdm(range(0, len(options), bs)):
        cur_options = options[bsidx:bsidx+bs]
        cur_questions = [quesitons[opidx//4] for opidx in range(bsidx, bsidx+bs)]

        prompts = [
            template.format(query=query, text=text) for (query, text) in zip(cur_questions, cur_options)
        ]
        tokens = tokenizer(
            prompts,
            padding=True,
            truncation=True,
            return_tensors="pt",
            pad_to_multiple_of=None,
        )

        # move to cuda if desired
        for key in tokens:
            tokens[key] = tokens[key].cuda()

        # calculate the scores by comparing true and false tokens
        batch_scores = model(**tokens).logits[:, -1, :]
        true_vector = batch_scores[:, token_true_id]
        false_vector = batch_scores[:, token_false_id]
        batch_scores = torch.stack([false_vector, true_vector], dim=1)
        batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
        cur_scores = batch_scores[:, 1].exp().tolist()
        scores.extend(cur_scores)

    print(len(cur_scores))
    scores = np.array(scores)
    scores = scores.reshape(len(quesitons), 4)
    predicted_labels = np.argmax(scores, axis=1)
    return predicted_labels


def predict_labels_Promptriever(questions, options, model, bs=4):

    # instruction = "" 
    # 6471, 5912
    
    # instruction = "A relevant document would be the most analogous to the query. I don't care about semantic similarity. Think carefully about these conditions when determining relevance."
    # 6588, 5824
    # instruction = "A relevant document would be the most analogous to the query. I don't care about semantic similarity."
    # 6735, 5912
    # instruction = "A relevant document would be the most analogous to the query. I don't care about semantic similarity. Ignore concrete details and focus on the relational structure and abstract meaning."
    # 6265, 5824
    # instruction = "A relevant document would be the most analogous to the query. I care about abstraction. I don't care about semantic similarity or concrete details."
    # 6353, 5824
    # instruction = "A relevant document would be the most analogous to the query. Think carefully about these conditions when determining relevance."
    # 6647, 5794
    # instruction = "Instruction: A relevant document would be the most analogous to the query."
    # 6500, 5676

    # instruction = "Summarize the key concepts behind this query."
    # 6176, 5676
    # instruction = "Consider what documents could be analogous to this query."
    # 6382, 5941
    # instruction = "I don't care about semantic similarity."
    # 6118, 5706

    # instruction = "Happy Thanksgiving."
    # 6000, 5324
    # instruction = "Analogical sense. Analogical similarity. Not semantic similarity."
    # 5706, 5647
    # instruction = "Focus on details."
    # 6029, 5676
    # instruction = "Focus on high-level concepts, abstraction, and key ideas."
    # 5618, 5235

    # instruction = "A relevant document would be the most analogous to the query and shares the same key idea as the query."
    # 6618, 5882

    # instruction = "A relevant document would be the most analogous to the query. I don't care about semantic similarity. I don't care about semantic similarity."
    # 6824, 5971
    # instruction = "A relevant document would be the most analogous to the query. " * 2 + "I don't care about semantic similarity. " * 2
    # 6912, 6000
    # instruction = "A relevant document would be the most analogous to the query. " * 3 + "I don't care about semantic similarity. " * 3
    # 6706, 6000
    # instruction = "A relevant document would be the most analogous to the query. " * 3 + "I don't care about semantic similarity. " * 2
    # 6735, 6118
    # instruction = "A relevant document would be the most analogous to the query. " * 3 + "I don't care about semantic similarity. " * 1
    # 6794, 5941

    # instruction = "A relevant document would be the most analogous to the query. " * 2 + "I don't care about semantic similarity. " * 2 + "Think carefully about these conditions when determining relevance."
    # 6647, 6118

    instruction = "A relevant document would be the most analogous to the query. " * 2 + "Don't care about semantic similarity. " * 2 
    # 6941, 6029
    # instruction = "relevant would be the most analogous. " * 2 + " Don't care about semantic similarity. " * 2 
    # 6500, 5971

    # instruction = "A relevant document would be the most analogous to the query."
    # 6735, 5941

    # instruction = "A relevant document would be the most analogous to the query. " * 2
    # 6824, 5941

    # instruction = "A relevant document would be the most analogous to the query. " * 10
    # 6706, 5912

    # instruction = "A relevant document would reflect the same relationship described in the query, prioritizing relational over semantic similarity."
    # 5853, 5588
    # instruction = "A relevant document would reflect the same relationship or analogy described in the query. Forget about semantic similarity. Think carefully about these conditions when determining relevance."
    # 6176, 5618
    # instruction = "Retrieve documents that exhibit the same relational analogy as the query. Avoid selecting documents that merely echo the query's keywords without addressing the relational structure. Focus on those that provide analogous reasoning or context."
    # 5853, 5676
    # instruction = "A relevant document demonstrates a relational analogy to the query, focusing on parallels in context, structure, or reasoning rather than direct semantic overlap. Ensure that the documents adhere to these criteria by avoiding those that diverge into tangential or overly literal interpretations. Additionally, exclude passages from [specific field/domain] unless they offer clear analogical insights."
    # 5265, 4824
    
    '''second round'''

    # instruction = "A relevant document captures the query's analogy in structure and meaning, not surface-level overlap."
    # 5912, 5912

    # instruction = "A relevant document should focus solely on providing a clear and accurate answer to the query, without distracting or unnecessary information"
    # # 6059, 5441

    # template mod: query ... question: which passage is the most analogous to the query? instruction: ...
    # instruction = "A relevant document should focus solely on providing a clear and accurate answer to the query, without distracting or unnecessary information"
    # 6147, 5441

    # instruction = "A relevant document should be the most analogous to the query. When in doubt, prioritize documents that are analogically similar to the query."
    # 6618, 5588

    # instruction = "A relevant document should also be the most analogous to the query."
    # 6735, 5559

    input_text_list = [
        f"query: {query.strip()} {instruction.strip()}".strip() for query in questions
        # f"query: {query.strip()} question: which passage is the most analogous to the query? {instruction.strip()}".strip() for query in questions

    ] 

    question_embeddings = model.encode(input_text_list, batch_size=bs)
    question_embeddings = question_embeddings.reshape(len(questions), -1)
    question_embeddings = np.repeat(question_embeddings, 4, axis=0)

    '''option instructions: '''

    # ====question: A relevant document would be the most analogous to the query.

    # instruction = "A relevant document would be the most analogous to the query." # same as question
    # 6853, 5706
    # instruction = "A relevant query would be the most analogous to the passage." 
    # 6824, 5824
    # instruction = "Instruction: A relevant document would be the most analogous to the query."
    # 6676, 5912
    # instruction = "A relevant document would be the most analogous to the query. A relevant query would be the most analogous to the passage." 
    # 6853, 5647
    # instruction = "A relevant document would be the most analogous to the query. I don't care about semantic similarity. Think carefully about these conditions when determining relevance."
    # 6559, 5618

    # ====question: "A relevant document demonstrates a relational analogy to the query, focusing on parallels in context, structure, or reasoning rather than direct semantic overlap. Ensure that the documents adhere to these criteria by avoiding those that diverge into tangential or overly literal interpretations. Additionally, exclude passages from [specific field/domain] unless they offer clear analogical insights."
    
    # instruction = "A relevant document demonstrates a relational analogy to the query, focusing on parallels in context, structure, or reasoning rather than direct semantic overlap. Ensure that the documents adhere to these criteria by avoiding those that diverge into tangential or overly literal interpretations. Additionally, exclude passages from [specific field/domain] unless they offer clear analogical insights."
    # 5118, 5000

    # ====question:"A relevant document would be the most analogous to the query. I don't care about semantic similarity. I don't care about semantic similarity."

    # instruction = "A relevant document would be the most analogous to the query. I don't care about semantic similarity. I don't care about semantic similarity." # same as question
    # 6824, 5971 (same)

    options = ["passage: " + option for option in options]
    options = [f"{option} {instruction}" for option in options]
    option_embeddings = model.encode(options, batch_size=bs)
    print(question_embeddings.shape, option_embeddings.shape)


    # similarities = (question_embeddings * option_embeddings).sum(axis=-1)
    
    similarities = []
    cur_bs = 32
    for qidx in tqdm(range(0, len(question_embeddings), cur_bs)):
        sim = (question_embeddings[qidx:qidx+cur_bs] * option_embeddings[qidx:qidx+cur_bs]).sum(axis=-1)
        similarities.append(sim)
    
    similarities = np.hstack(similarities)
    similarities = similarities.reshape(len(questions), 4)

    
    print('sorting')
    predicted_labels = np.argmax(similarities, axis=-1)
    print('sorting finished')
    return predicted_labels

def preprocess_data(samples):
    if 'Story' not in samples:
        questions_pool = samples['Sentence']
    else:
        questions_pool = samples['Sentence'] # story to story
    questions = [sample for sample in questions_pool]
    all_options = [sample for sample in samples['Options']]
    all_options = [preprocess_options(options) for options in all_options]
    flattened_options = [option for options in all_options for option in options]
    return questions, flattened_options




def evaluate_ranking(dataset, pred_func):

    questions, options = preprocess_data(dataset)
    print(len(options))
    unique_options = list(set(options))
    print(len(unique_options))
    unique_options_to_idx = {option: idx for idx, option in enumerate(unique_options)}
    options_indices = [unique_options_to_idx[option] for option in options]

    print(len(questions))

    unique_questions = list(set(questions))
    print(len(unique_questions))
    unique_questions_to_idx = {question: idx for idx, question in enumerate(unique_questions)}
    questions_indices = [unique_questions_to_idx[question] for question in questions]
    predicted_labels = pred_func(unique_questions, unique_options, questions_indices=questions_indices, options_indices=options_indices)
    
    labels = dataset['Label']  # The index of the correct option
    labels = np.array([ord(label) - ord('A') for label in labels])
    total_samples = len(labels)    
    correct = (predicted_labels == labels)
    precision_at_1 = sum(correct) / total_samples

    incorrect_sample_indices = []
    correct_sample_indices = []
    for i, crr in enumerate(correct):
        if not crr:
            incorrect_sample_indices.append(i)
        else:
            correct_sample_indices.append(i)

    return  precision_at_1, incorrect_sample_indices, correct_sample_indices

results = {}

model_name_list =[
    # 'all-mpnet-base-v2', #  0.5481, 0.2312, 0.2559 (1 mins) (2312, 2559) 
    # 'sentence-t5-xl', # 0.6953, 0.2556, 0.2564
    'sentence-t5-xxl', # 2530, 2490
    # 'gtr-t5-xl', # 0.5860,
	# 'multi-qa-mpnet-base-dot-v1', # 1999, 2593
    # "jhu-clsp/FollowIR-7B" #  0.6290
    # "promptriever"
]

for model_name in model_name_list:
    if model_name== "jhu-clsp/FollowIR-7B":
        # model = AutoModelForCausalLM.from_pretrained(
        #     model_name,
        #     torch_dtype=torch.float16  # Load in fp16 precision
        # ).to('cuda')
        # tokenizer = AutoTokenizer.from_pretrained(
        #     model_name, padding_side="left"
        # )
        model, tokenizer = get_model_general(model_name)
        bs = 2
        pred_func = partial(predict_labels_FollowIR, model=model, tokenizer=tokenizer, bs=bs)
    elif model_name == "promptriever":
        # model = Promptriever("samaya-ai/promptriever-llama2-7b-v1")
        # bs = 4
        bs = 4
        pred_func = partial(predict_labels_Promptriever, model=model, bs=bs)
    else:
        bs = 256 if not 'xl' in model_name else 64
        bs = bs if not 'xxl' in model_name else 16
        model = SentenceTransformer(model_name, device='cuda:0')
        pred_func = partial(predict_labels, model=model, bs=bs)
    print('finish loading', model_name)

    results[f'{model_name}'] = [] 
    # for datastr in ['1', '10', '30']:# '10', '30',
    # for datastr in ['1', '10', '30']:# '10', '30',
    for datastr in ['10', '30']:# '10', '30',

    
        dataset = load_dataset('jhu-clsp/AnaloBench', f'T1S{datastr}-Full')['train']
        precision_at_1, incorrect_sample_indices, correct_sample_indices = evaluate_ranking(dataset, pred_func)
        print(f'Precision@1: {precision_at_1:.4f}')
        results[f'{model_name}'].append(precision_at_1) 

  model.load_state_dict(torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu')))


finish loading sentence-t5-xxl
97728
18
24432
340


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

torch.Size([340, 768]) torch.Size([18, 768])
torch.Size([97728])
sorting
sorting finished
Precision@1: 0.2530
97728
19
24432
340


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

torch.Size([340, 768]) torch.Size([19, 768])
torch.Size([97728])
sorting
sorting finished
Precision@1: 0.2490


In [None]:
def preprocess_data(samples):
    if 'Story' not in samples:
        questions_pool = samples['Sentence']
    else:
        questions_pool = samples['Sentence'] # story to story
    questions = [sample for sample in questions_pool]
    all_options = [sample for sample in samples['Options']]

    all_options = [preprocess_options(options) for options in all_options]
    flattened_options = [option for options in all_options for option in options]
    return questions, flattened_options
dataset = load_dataset('jhu-clsp/AnaloBench', f'T1S{30}-Full')['train']
list(set(preprocess_data(dataset)[1]))

A. John had been born into a well-to-do family, inheriting a fortune so large that it was often incomprehensible even to him. He owned luxurious penthouses in cities renowned for their exuberance and affluent life. The company he inherited from his father was a multi-billion-dollar business empire, earning him the title of a billionaire. John, it seemed, was an epitome of prosperity and success. And while many looked at his life with a tinge of green in their eyes, everyone envied his wealth.

John's life was extravagant, filled with parties, fast cars, and the finest of wines. His dinners consisted of the rarest delicacies catered by celebrity chefs, and his holidays were in the most sought-after tourist destinations. His days were filled with business, while his nights were filled with entertaining high-profile guests. John was living the proverbial dream, or so it seemed to those looking from the outside.

Despite such a luxurious life and seemingly endless wealth, he yearned for so

[' Trisha gazed out from the window of her apartment, soaking in the vibrant nightlife of Tokyo. The street was buzzing with life, bright neon signs glittering on the facades of buildings, people bustling about, laughing and living life to the fullest. "It looks like fun," she murmured to herself.\n\nYes, the new environment was exciting and thrilling. Each corner boasted unique shops, each alleyway teemed with secrets waiting to be discovered. It was like being a part of a dynamic, pulsating organism, forever evolving and adapting.\n\nBut beneath this awe and wonder, she felt the pressure, the stress, and the undeniable struggle. She missed her home in London, the comfort of familiarity, the warmth of loved ones. She craved for a simple cup of tea at her favorite café, yearned to hear the chime of Big Ben, and felt a profound longing for her friends and family.\n\nTrisha moved to Japan in the hope of a better career opportunity. She had always dreamed of working for an anime studio, a

In [None]:
def visualize_example(example_idx):
    example = dataset[example_idx]
    lines = '='*10
    print(lines, f"Query: {example['Sentence']}")
    print(lines, f"Options:")
    options = preprocess_options(example['Options'])
    for idx, option in enumerate(options):
        print(f"  {chr(ord('A') + idx)}. {option}")
    answer_idx = ord(example['Label']) - ord('A')
    print(lines, f"Correct Answer: {example['Options'][answer_idx]}")
    # print(lines, f"Predicted Answer: {results[f'{model_name}'][example_idx]}")
    print()

from ipywidgets import interact

interact(visualize_example, example_idx=incorrect_sample_indices)

interactive(children=(Dropdown(description='example_idx', options=(1, 4, 5, 6, 7, 10, 13, 14, 15, 17, 18, 21, …

<function __main__.visualize_example(example_idx)>

## PromptRetriever


In [None]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from peft import PeftModel, PeftConfig
import numpy as np

class Promptriever:
    def __init__(self, model_name_or_path):
        self.model, self.tokenizer = self.get_model(model_name_or_path)
        self.model.eval().cuda()

    def get_model(self, peft_model_name):
        # Load the PEFT configuration to get the base model name
        peft_config = PeftConfig.from_pretrained(peft_model_name)
        base_model_name = peft_config.base_model_name_or_path

        # Load the base model and tokenizer
        base_model = AutoModel.from_pretrained(base_model_name,
                                               device_map="auto",
                                              torch_dtype=torch.float16)
        tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id
        tokenizer.padding_side = "right"

        # Load and merge the PEFT model
        model = PeftModel.from_pretrained(base_model, peft_model_name)
        model = model.merge_and_unload()

        # can be much longer, but for the example 512 is enough
        model.config.max_length = 512
        tokenizer.model_max_length = 512

        return model, tokenizer

    def create_batch_dict(self, tokenizer, input_texts):
        max_length = self.model.config.max_length
        batch_dict = tokenizer(
            input_texts,
            max_length=max_length - 1,
            return_token_type_ids=False,
            return_attention_mask=False,
            padding=False,
            truncation=True,
        )
        batch_dict["input_ids"] = [
            input_ids + [tokenizer.eos_token_id]
            for input_ids in batch_dict["input_ids"]
        ]
        return tokenizer.pad(
            batch_dict,
            padding=True,
            pad_to_multiple_of=8,
            return_attention_mask=True,
            return_tensors="pt",
        )

    def encode(self, sentences, max_length: int = 2048, batch_size: int = 4):
        all_embeddings = []
        for i in range(0, len(sentences), batch_size):
            batch_texts = sentences[i : i + batch_size]

            batch_dict = self.create_batch_dict(self.tokenizer, batch_texts)
            batch_dict = {
                key: value.to(self.model.device) for key, value in batch_dict.items()
            }

            with torch.amp.autocast(device_type='cuda'):
                with torch.no_grad():
                    outputs = self.model(**batch_dict)
                    last_hidden_state = outputs.last_hidden_state
                    sequence_lengths = batch_dict["attention_mask"].sum(dim=1) - 1
                    batch_size = last_hidden_state.shape[0]
                    reps = last_hidden_state[
                        torch.arange(batch_size, device=last_hidden_state.device),
                        sequence_lengths,
                    ]
                    embeddings = F.normalize(reps, p=2, dim=-1)
                    all_embeddings.append(embeddings.cpu().numpy())

        return np.concatenate(all_embeddings, axis=0)

# Initialize the model
model = Promptriever("samaya-ai/promptriever-llama2-7b-v1")

# Example query and instruction
query = "What universities are in Baltimore, Maryland?"

# add specific relevance conditions if desired (and/or/not) and any other prompts
instruction = "A relevant document would describe any university in Baltimore. I am not interested in any university that was the first American university. Think carefully about these conditions when determining relevance."

# Combine query and instruction with **two spaces** after "query: "
input_text = f"query:  {query.strip()} {instruction.strip()}".strip()

# Example documents
# NOTE: double space after `passage:`
doc1 = "passage:  Johns Hopkins University (often abbreviated as Johns Hopkins, Hopkins, or JHU) is a private research university in Baltimore, Maryland. Founded in 1876, Johns Hopkins was the first American university based on the European research institution model."
doc2 = "passage:  Johns Hopkins University (often abbreviated as Johns Hopkins, Hopkins, or JHU) is a private research university in Baltimore, Maryland. Founded in 1876, Johns Hopkins was the second American university based on the European research institution model."

# Encode query and documents
query_embedding = model.encode([input_text])
doc_embeddings = model.encode([doc1, doc2])

# Calculate similarities
similarities = np.dot(query_embedding, doc_embeddings.T)[0]
print(f"Similarities: {similarities}") # Similarities: [0.53341305 0.53451955]
assert similarities[1] > similarities[0]


# change up the instruction to the opposite, to see it works
instruction = "A relevant document would describe any university in Baltimore. I am interested in any university that was the first American university. Think carefully about these conditions when determining relevance."
input_text = f"query:  {query.strip()} {instruction.strip()}".strip()
query_embedding = model.encode([input_text])
similarities = np.dot(query_embedding, doc_embeddings.T)[0]
print(f"Similarities: {similarities}") # Similarities: [0.60182875 0.5874183 ]
assert similarities[0] > similarities[1]




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Similarities: [0.5335429 0.5346871]
Similarities: [0.60181165 0.58740616]


# Life is a Circus

In [None]:


def prepare_file_to_qasrl(src, dst):
    """
    Prepare the input for QA-SRL (adding line number, tab and the sentence for every sentence in the text)
    """
    input, output = open(src, 'r'), open(dst, 'w')
    for i, line in enumerate(input):
        new_line = str(i + 1) + '\t' + line
        output.write(new_line)
    input.close()
    output.close()

In [None]:
from analogies_mining.find_mappings import generate_mappings
sentence_pair = (["Fred built an immense fortune by swindling others, but he lost it all when someone swindled him."],
                 ["his shopkeeper always palms off old stock to the customers."])
generate_mappings(sentence_pair, 0.5)