In [1]:
%reload_ext autoreload
%autoreload 2
import sys
sys.path.insert(0, './analogies_mining')
from tqdm.auto import tqdm
import torch
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
from functools import partial

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
)



In [2]:
# def get_model_general(model_name):
#     from transformers import AutoTokenizer, pipeline
#     tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
#     pipe = pipeline(
#         "text-generation",
#         model=model_name,
#         tokenizer=tokenizer,
#         torch_dtype=torch.float16,
#         trust_remote_code=True,
#         device_map="cuda:0",
#         # max_new_tokens=8,
#         # do_sample=True,

        
#     )
#     tokenizer = pipe.tokenizer
#     model = pipe.model
#     model.eval()
#     return model, tokenizer

from transformers import AutoTokenizer, pipeline, BitsAndBytesConfig
import torch
from accelerate.utils import BnbQuantizationConfig

def get_model_general(model_name):
    """Load a causal language model in 4-bit precision together with its tokenizer.

    Args:
        model_name: Hugging Face model identifier or local path.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is quantized with
        bitsandbytes (4-bit, ``fp4``), placed on ``cuda:0`` and switched to
        eval mode; the tokenizer pads on the left.
    """
    # 4-bit fp4 weights keep large checkpoints within a single GPU's memory.
    bnb_kwargs = {"load_in_4bit": True, "bnb_4bit_quant_type": "fp4"}
    quantization_config = BitsAndBytesConfig(**bnb_kwargs)

    tokenizer_kwargs = {"padding_side": "left", "trust_remote_code": True}
    tokenizer = AutoTokenizer.from_pretrained(model_name, **tokenizer_kwargs)

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map="cuda:0",
    )
    model.eval()
    return model, tokenizer

In [3]:
# model_name = 'multi-qa-mpnet-base-dot-v1'
# model_name = 'paraphrase-MiniLM-L12-v2'

# model = SentenceTransformer("paraphrase-MiniLM-L6-v2")




def preprocess_options(options_str):
    """Split a multiple-choice string into its four option texts.

    The markers ``A.``, ``B.``, ``C.`` and ``D.`` are located left to right,
    each one searched for only after the previous marker. An option whose
    text happens to contain its own (or an earlier) marker again -- e.g.
    ``"... DNA."`` or ``"Dr. D. Smith"`` -- is therefore no longer cut short.

    Args:
        options_str: Raw string holding all four options.

    Returns:
        List of four strings: the text following each marker up to the next
        marker (or the end of the string). Surrounding whitespace is kept.

    Raises:
        IndexError: If a marker cannot be found (same exception type the
            previous ``split(...)[1]`` implementation raised).
    """
    markers = ['A.', 'B.', 'C.', 'D.']

    # Locate every marker in order, never looking behind the previous one.
    marker_positions = []
    search_from = 0
    for marker in markers:
        position = options_str.find(marker, search_from)
        if position == -1:
            raise IndexError(f"option marker {marker!r} not found in options string")
        marker_positions.append(position)
        search_from = position + len(marker)

    # Each option runs from the end of its marker to the start of the next.
    segment_ends = marker_positions[1:] + [len(options_str)]
    final_options = []
    for marker, start, end in zip(markers, marker_positions, segment_ends):
        final_options.append(options_str[start + len(marker):end])
    return final_options


import numpy as np

def predict_labels(questions, options, model, bs=256):
    """Pick, for every question, the option whose embedding is most similar.

    Args:
        questions: List of query texts.
        options: Flat list of candidate texts, four consecutive entries per
            question.
        model: Encoder exposing ``encode(texts, show_progress_bar=...,
            batch_size=..., convert_to_tensor=True)`` (a SentenceTransformer
            in this notebook).
        bs: Batch size used both for encoding and for the similarity chunks.

    Returns:
        NumPy array with one predicted option index (0-3) per question.
    """
    num_questions = len(questions)

    query_vectors = model.encode(questions, show_progress_bar=True, batch_size=bs, convert_to_tensor=True)
    # Repeat each query four times so row i lines up with option i.
    query_vectors = torch.repeat_interleave(query_vectors.reshape(num_questions, -1), 4, dim=0)
    option_vectors = model.encode(options, show_progress_bar=True, batch_size=bs, convert_to_tensor=True)
    print(query_vectors.shape, option_vectors.shape)

    print('cossim')
    chunked_similarities = [
        torch.nn.functional.cosine_similarity(
            query_vectors[start:start + bs], option_vectors[start:start + bs], dim=-1
        )
        for start in tqdm(range(0, len(query_vectors), bs))
    ]

    # One row per question, one column per option.
    similarity_table = torch.hstack(chunked_similarities).reshape(num_questions, 4)
    print('sorting')
    predicted_labels = torch.argmax(similarity_table, dim=-1).cpu().numpy()
    print('sorting finished')
    return predicted_labels



def predict_labels_FollowIR(quesitons, options, model, tokenizer, bs=32):
    """Rank the four options of each question with a true/false causal LM.

    Every (question, option) pair is rendered into an instruction prompt; the
    score of a pair is the probability of the ``true`` token versus the
    ``false`` token at the last position of the prompt.

    Args:
        quesitons: List of query texts (parameter name kept as-is for
            backward compatibility).
        options: Flat list of candidate texts, four consecutive entries per
            question.
        model: Causal LM whose output exposes ``.logits``.
        tokenizer: Matching tokenizer; its pad token and padding side are
            overwritten here (EOS as pad, left padding).
        bs: Number of (question, option) pairs per forward pass.

    Returns:
        NumPy array with one predicted option index (0-3) per question.
    """
    tokenizer.pad_token = tokenizer.eos_token
    # Left padding keeps the last position of every row on a real token.
    tokenizer.padding_side = "left"
    vocab = tokenizer.get_vocab()
    token_false_id = vocab["false"]
    token_true_id = vocab["true"]
    template = """<s> [INST] Consider the Document to be relevant only if it can be analogous to the Query. Answer in (true/false)

    Query: {query}
    Document: {text}
    Analogically Relevant (only output one word, either "true" or "false"):  [/INST] """

    scores = []
    for bsidx in tqdm(range(0, len(options), bs)):
        cur_options = options[bsidx:bsidx+bs]
        # Size the question list by the actual slice: the final batch may be
        # shorter than `bs`, and indexing up to bsidx+bs would run past the
        # end of `quesitons`.
        cur_questions = [
            quesitons[opidx // 4] for opidx in range(bsidx, bsidx + len(cur_options))
        ]

        prompts = [
            template.format(query=query, text=text) for (query, text) in zip(cur_questions, cur_options)
        ]
        tokens = tokenizer(
            prompts,
            padding=True,
            truncation=True,
            return_tensors="pt",
            pad_to_multiple_of=None,
        )

        # move to cuda if desired
        for key in tokens:
            tokens[key] = tokens[key].cuda()

        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            batch_scores = model(**tokens).logits[:, -1, :]

        # calculate the scores by comparing true and false tokens
        true_vector = batch_scores[:, token_true_id]
        false_vector = batch_scores[:, token_false_id]
        batch_scores = torch.stack([false_vector, true_vector], dim=1)
        batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
        scores.extend(batch_scores[:, 1].exp().tolist())

    # One row per question, one column per option.
    scores = np.array(scores)
    scores = scores.reshape(len(quesitons), 4)
    predicted_labels = np.argmax(scores, axis=1)
    return predicted_labels


def preprocess_data(samples):
    """Extract the query texts and the flattened option texts from a dataset.

    The ``'Story'`` column is used for the queries when it exists, otherwise
    ``'Sentence'``.

    Args:
        samples: Column-indexable container (``samples['Options']`` etc.),
            e.g. a Hugging Face dataset or a plain dict of lists.

    Returns:
        ``(questions, flattened_options)`` where ``flattened_options`` holds
        the parsed options of all samples back to back, in sample order.
    """
    # Check column names explicitly when the container exposes them.
    # NOTE(review): `'Story' in samples` on a `datasets.Dataset` appears to
    # iterate over rows instead of testing column names, which would make the
    # 'Story' branch unreachable -- confirm against the installed version.
    # Plain dicts fall back to ordinary key membership.
    available_columns = getattr(samples, 'column_names', samples)
    question_column = 'Story' if 'Story' in available_columns else 'Sentence'

    questions = list(samples[question_column])
    options_per_sample = [preprocess_options(options) for options in samples['Options']]
    flattened_options = [option for options in options_per_sample for option in options]
    return questions, flattened_options




def evaluate_ranking(dataset, pred_func):
    """Score a prediction function on a multiple-choice dataset.

    Args:
        dataset: Container with an ``'Options'`` column, a question column
            and a ``'Label'`` column of answer letters.
        pred_func: Callable ``(questions, options) -> predicted indices``.

    Returns:
        ``(precision_at_1, incorrect_sample_indices, correct_sample_indices)``:
        the fraction of correctly answered samples and the positions of the
        wrongly / correctly answered ones.
    """
    questions, options = preprocess_data(dataset)
    print(len(questions))
    predicted_labels = pred_func(questions, options)

    # Gold answers are letters; map 'A' -> 0, 'B' -> 1, ...
    first_letter = ord('A')
    labels = np.array([ord(letter) - first_letter for letter in dataset['Label']])

    correct = (predicted_labels == labels)
    precision_at_1 = sum(correct) / len(labels)

    incorrect_sample_indices = [pos for pos, is_hit in enumerate(correct) if not is_hit]
    correct_sample_indices = [pos for pos, is_hit in enumerate(correct) if is_hit]

    return precision_at_1, incorrect_sample_indices, correct_sample_indices

# Maps model name -> list of Precision@1 values, one per evaluated subset.
results = {}

# Models to evaluate. The trailing numbers are scores noted from earlier runs
# (presumably Precision@1 per subset -- TODO confirm).
model_name_list = [
    'all-mpnet-base-v2', #  0.5481, 0.2312, 0.2559 (1 mins)
    # 'sentence-t5-xl', # 0.6953, 0.2556, 0.2564
    # 'gtr-t5-xl', # 0.5860,
    # 'multi-qa-mpnet-base-dot-v1',
    # "jhu-clsp/FollowIR-7B" #  0.6290
]

for model_name in model_name_list:
    if model_name != "jhu-clsp/FollowIR-7B":
        # Sentence-embedding models: shrink the batch for the larger checkpoints.
        if 'xxl' in model_name:
            bs = 32
        elif 'xl' in model_name:
            bs = 64
        else:
            bs = 256
        model = SentenceTransformer(model_name, device='cuda:0')
        pred_func = partial(predict_labels, model=model, bs=bs)
    else:
        # FollowIR is a causal LM scored through true/false logits.
        model, tokenizer = get_model_general(model_name)
        bs = 2
        pred_func = partial(predict_labels_FollowIR, model=model, tokenizer=tokenizer, bs=bs)
    print('finish loading', model_name)

    results[model_name] = []
    for datastr in ('1', '10', '30'):
        dataset = load_dataset('jhu-clsp/AnaloBench', f'T1S{datastr}-Subset')['train']
        evaluation = evaluate_ranking(dataset, pred_func)
        precision_at_1, incorrect_sample_indices, correct_sample_indices = evaluation
        print(f'Precision@1: {precision_at_1:.4f}')
        results[model_name].append(precision_at_1)



finish loading all-mpnet-base-v2
340


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

torch.Size([1360, 768]) torch.Size([1360, 768])
cossim


  0%|          | 0/6 [00:00<?, ?it/s]

sorting
sorting finished
Precision@1: 0.5618
340


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

torch.Size([1360, 768]) torch.Size([1360, 768])
cossim


  0%|          | 0/6 [00:00<?, ?it/s]

sorting
sorting finished
Precision@1: 0.4676
340


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

torch.Size([1360, 768]) torch.Size([1360, 768])
cossim


  0%|          | 0/6 [00:00<?, ?it/s]

sorting
sorting finished
Precision@1: 0.4706


In [9]:
results

{'all-mpnet-base-v2': [0.5480517354289457, 0.231213163064833]}

In [11]:
def visualize_example(example_idx):
    """Print the query, the four options and the correct option of one sample.

    Reads the notebook-global ``dataset`` (the subset most recently loaded by
    the evaluation loop).

    Args:
        example_idx: Row index into ``dataset``.
    """
    example = dataset[example_idx]
    lines = '='*10
    print(lines, f"Query: {example['Sentence']}")
    print(lines, f"Options:")
    options = preprocess_options(example['Options'])
    for idx, option in enumerate(options):
        print(f"  {chr(ord('A') + idx)}. {option}")
    answer_idx = ord(example['Label']) - ord('A')
    # Index the parsed option list: `example['Options']` is the raw string,
    # so indexing it would print a single character.
    print(lines, f"Correct Answer: {options[answer_idx]}")
    print()

from ipywidgets import interact

# Dropdown over the misclassified samples. `incorrect_sample_indices` (like
# `dataset`, which `visualize_example` reads) is reassigned on every pass of the
# evaluation loop, so both refer to the last subset that was evaluated.
interact(visualize_example, example_idx=incorrect_sample_indices)

interactive(children=(Dropdown(description='example_idx', options=(19, 20, 22, 23, 24, 25, 26, 28, 29, 31, 32,…

<function __main__.visualize_example(example_idx)>

## Follow IR


In [4]:


# model loading and setup


import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from peft import PeftModel, PeftConfig
import numpy as np

class Promptriever:
    """Text encoder built from a Promptriever PEFT checkpoint.

    The adapter is merged into its base model at load time. ``encode`` returns
    L2-normalised embeddings taken from the hidden state at the last attended
    position of each input (an EOS token is appended to every sequence).
    """

    def __init__(self, model_name_or_path):
        """Load model and tokenizer, switch to eval mode and move to CUDA."""
        self.model, self.tokenizer = self.get_model(model_name_or_path)
        self.model.eval().cuda()

    def get_model(self, peft_model_name):
        """Load the base model named in the PEFT config and merge the adapter.

        Args:
            peft_model_name: Identifier or path of the PEFT adapter.

        Returns:
            ``(model, tokenizer)`` with the adapter merged, EOS used as pad
            token, right padding, and a maximum length of 512 tokens.
        """
        # Load the PEFT configuration to get the base model name
        peft_config = PeftConfig.from_pretrained(peft_model_name)
        base_model_name = peft_config.base_model_name_or_path

        # Load the base model and tokenizer
        base_model = AutoModel.from_pretrained(base_model_name)
        tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id
        tokenizer.padding_side = "right"

        # Load and merge the PEFT model
        model = PeftModel.from_pretrained(base_model, peft_model_name)
        model = model.merge_and_unload()

        # can be much longer, but for the example 512 is enough
        model.config.max_length = 512
        tokenizer.model_max_length = 512

        return model, tokenizer

    def create_batch_dict(self, tokenizer, input_texts):
        """Tokenize texts, append EOS to each, and pad them into one batch.

        Args:
            tokenizer: Tokenizer used for encoding and padding.
            input_texts: List of strings.

        Returns:
            The padded batch produced by ``tokenizer.pad`` (PyTorch tensors,
            attention mask included, length padded to a multiple of 8).
        """
        max_length = self.model.config.max_length
        # Truncate to max_length - 1 to leave room for the EOS appended below.
        batch_dict = tokenizer(
            input_texts,
            max_length=max_length - 1,
            return_token_type_ids=False,
            return_attention_mask=False,
            padding=False,
            truncation=True,
        )
        batch_dict["input_ids"] = [
            input_ids + [tokenizer.eos_token_id]
            for input_ids in batch_dict["input_ids"]
        ]
        return tokenizer.pad(
            batch_dict,
            padding=True,
            pad_to_multiple_of=8,
            return_attention_mask=True,
            return_tensors="pt",
        )

    def encode(self, sentences, max_length: int = 2048, batch_size: int = 4):
        """Embed a list of texts.

        Args:
            sentences: List of strings to embed.
            max_length: Accepted for interface compatibility but not used;
                truncation follows ``self.model.config.max_length``.
            batch_size: Number of texts per forward pass.

        Returns:
            NumPy array with one L2-normalised embedding per input text. For
            empty input an array with zero rows is returned.
        """
        all_embeddings = []
        for start in range(0, len(sentences), batch_size):
            batch_texts = sentences[start : start + batch_size]

            batch_dict = self.create_batch_dict(self.tokenizer, batch_texts)
            batch_dict = {
                key: value.to(self.model.device) for key, value in batch_dict.items()
            }

            # torch.autocast("cuda") replaces the deprecated torch.cuda.amp.autocast().
            with torch.autocast(device_type="cuda"), torch.no_grad():
                outputs = self.model(**batch_dict)
                last_hidden_state = outputs.last_hidden_state
                # Index of the last attended token in every (right-padded) row.
                sequence_lengths = batch_dict["attention_mask"].sum(dim=1) - 1
                # Separate local name: the `batch_size` argument must keep
                # its value for the slicing at the top of the loop.
                rows_in_batch = last_hidden_state.shape[0]
                reps = last_hidden_state[
                    torch.arange(rows_in_batch, device=last_hidden_state.device),
                    sequence_lengths,
                ]
                embeddings = F.normalize(reps, p=2, dim=-1)
            all_embeddings.append(embeddings.cpu().numpy())

        if not all_embeddings:
            # np.concatenate rejects an empty list; return an empty matrix.
            hidden_size = getattr(self.model.config, "hidden_size", 0)
            return np.zeros((0, hidden_size), dtype=np.float32)
        return np.concatenate(all_embeddings, axis=0)

# Initialize the model.
# NOTE: this rebinds the notebook-global name `model`, which the evaluation loop
# earlier in the notebook also assigns. The recorded output of this cell shows
# an OSError raised while loading, because the base model repository is gated.
model = Promptriever("samaya-ai/promptriever-llama3.1-8b-instruct-v1")

# Example query and instruction
query = "What universities are in Baltimore, Maryland?"

# add specific relevance conditions if desired (and/or/not) and any other prompts
instruction = "A relevant document would describe any university in Baltimore. I am not interested in any university that was the first American university. Think carefully about these conditions when determining relevance."

# Combine query and instruction with **two spaces** after "query: "
input_text = f"query:  {query.strip()} {instruction.strip()}".strip()

# Example documents
# NOTE: double space after `passage:`
# The two passages differ in a single word ("first" vs "second").
doc1 = "passage:  Johns Hopkins University (often abbreviated as Johns Hopkins, Hopkins, or JHU) is a private research university in Baltimore, Maryland. Founded in 1876, Johns Hopkins was the first American university based on the European research institution model."
doc2 = "passage:  Johns Hopkins University (often abbreviated as Johns Hopkins, Hopkins, or JHU) is a private research university in Baltimore, Maryland. Founded in 1876, Johns Hopkins was the second American university based on the European research institution model."

# Encode query and documents
query_embedding = model.encode([input_text])
doc_embeddings = model.encode([doc1, doc2])

# Calculate similarities. `encode` L2-normalises its output, so the dot product
# is the cosine similarity; `[0]` selects the row of the single query.
similarities = np.dot(query_embedding, doc_embeddings.T)[0]
print(f"Similarities: {similarities}") # Similarities: [0.53341305 0.53451955]
# The instruction excludes the "first American university", so doc2 must win.
assert similarities[1] > similarities[0]


# change up the instruction to the opposite, to see it works
instruction = "A relevant document would describe any university in Baltimore. I am interested in any university that was the first American university. Think carefully about these conditions when determining relevance."
input_text = f"query:  {query.strip()} {instruction.strip()}".strip()
query_embedding = model.encode([input_text])
# The document embeddings are reused; only the query changed.
similarities = np.dot(query_embedding, doc_embeddings.T)[0]
print(f"Similarities: {similarities}") # Similarities: [0.60182875 0.5874183 ]
# With the inverted instruction the ranking must flip in favour of doc1.
assert similarities[0] > similarities[1]


adapter_config.json:   0%|          | 0.00/748 [00:00<?, ?B/s]

OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct.
403 Client Error. (Request ID: Root=1-674e5b7a-5188c2ee6e8f499d0e7e5483;66c77e1c-b8bc-4c3c-bb89-93c31a0741cf)

Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.1-8B-Instruct is restricted and you are not in the authorized list. Visit https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct to ask for access.

In [None]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from peft import PeftModel, PeftConfig
import numpy as np

# Promptriever wrapper class (repeats the definition from the previous cell)
class Promptriever:
    """Embedding wrapper around a Promptriever PEFT adapter.

    The adapter is merged into the base model it names. ``encode`` yields
    L2-normalised vectors read from the hidden state at the last attended
    token of every input (each sequence gets an EOS token appended).
    """

    def __init__(self, model_name_or_path):
        """Build model and tokenizer, then put the model in eval mode on CUDA."""
        self.model, self.tokenizer = self.get_model(model_name_or_path)
        self.model.eval().cuda()

    def get_model(self, peft_model_name):
        """Resolve the base model from the PEFT config and merge the adapter.

        Args:
            peft_model_name: Identifier or path of the PEFT adapter.

        Returns:
            ``(model, tokenizer)``: merged model plus a tokenizer that pads on
            the right with the EOS token; both limited to 512 tokens.
        """
        # Load the PEFT configuration to get the base model name
        peft_config = PeftConfig.from_pretrained(peft_model_name)
        base_model_name = peft_config.base_model_name_or_path

        # Load the base model and tokenizer
        base_model = AutoModel.from_pretrained(base_model_name)
        tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id
        tokenizer.padding_side = "right"

        # Load and merge the PEFT model
        model = PeftModel.from_pretrained(base_model, peft_model_name)
        model = model.merge_and_unload()

        # Set maximum sequence length
        model.config.max_length = 512
        tokenizer.model_max_length = 512

        return model, tokenizer

    def create_batch_dict(self, tokenizer, input_texts):
        """Tokenize, append EOS to every sequence, and pad into a tensor batch.

        Args:
            tokenizer: Tokenizer used for encoding and padding.
            input_texts: List of strings.

        Returns:
            Output of ``tokenizer.pad``: PyTorch tensors including the
            attention mask, padded to a multiple of 8 tokens.
        """
        max_length = self.model.config.max_length
        # One token of the budget is reserved for the EOS appended below.
        batch_dict = tokenizer(
            input_texts,
            max_length=max_length - 1,
            return_token_type_ids=False,
            return_attention_mask=False,
            padding=False,
            truncation=True,
        )
        batch_dict["input_ids"] = [
            input_ids + [tokenizer.eos_token_id]
            for input_ids in batch_dict["input_ids"]
        ]
        return tokenizer.pad(
            batch_dict,
            padding=True,
            pad_to_multiple_of=8,
            return_attention_mask=True,
            return_tensors="pt",
        )

    def encode(self, sentences, max_length: int = 2048, batch_size: int = 4):
        """Embed texts in mini-batches.

        Args:
            sentences: List of strings to embed.
            max_length: Present in the signature but unused; the truncation
                length comes from ``self.model.config.max_length``.
            batch_size: Number of texts per forward pass.

        Returns:
            NumPy array holding one L2-normalised embedding per input text;
            an array with zero rows when ``sentences`` is empty.
        """
        all_embeddings = []
        for i in range(0, len(sentences), batch_size):
            batch_texts = sentences[i : i + batch_size]

            batch_dict = self.create_batch_dict(self.tokenizer, batch_texts)
            batch_dict = {
                key: value.to(self.model.device) for key, value in batch_dict.items()
            }

            # torch.cuda.amp.autocast() is deprecated; torch.autocast("cuda")
            # is the equivalent replacement.
            with torch.autocast(device_type="cuda"), torch.no_grad():
                outputs = self.model(**batch_dict)
                last_hidden_state = outputs.last_hidden_state
                # Position of the last attended token per (right-padded) row.
                sequence_lengths = batch_dict["attention_mask"].sum(dim=1) - 1
                batch_size_local = last_hidden_state.shape[0]
                reps = last_hidden_state[
                    torch.arange(batch_size_local, device=last_hidden_state.device),
                    sequence_lengths,
                ]
                embeddings = F.normalize(reps, p=2, dim=-1)
            all_embeddings.append(embeddings.cpu().numpy())

        if not all_embeddings:
            # Nothing to concatenate for empty input; hand back an empty matrix.
            hidden_size = getattr(self.model.config, "hidden_size", 0)
            return np.zeros((0, hidden_size), dtype=np.float32)
        return np.concatenate(all_embeddings, axis=0)

# Initialize the Promptriever model.
# NOTE: the base model is looked up from the adapter's PEFT config; the recorded
# output of the earlier Promptriever cell shows that this lookup hit a gated
# meta-llama repository and failed with an OSError.
promptriever_model = Promptriever("samaya-ai/promptriever-llama3.1-8b-instruct-v1")

# Promptriever-based variant of predict_labels_FollowIR (same name as the earlier function, which it shadows once this cell runs)
def predict_labels_FollowIR(questions, options, model, bs=32):
    """Score every (question, option) pair with an instruction-aware encoder.

    Each question is combined with the analogy instruction into a
    ``"query:  <question> <instruction>"`` prompt and each option becomes a
    ``"passage:  <option>"`` document -- the same input format the
    Promptriever demo in this notebook uses. The score of a pair is the dot
    product of the two embeddings.

    Args:
        questions: List of query texts.
        options: Flat list of candidate texts, grouped per question (the
            same number of consecutive options for every question).
        model: Encoder exposing ``encode(texts, batch_size=...)`` that
            returns one embedding row per text.
        bs: Number of (question, option) pairs per ``encode`` call.

    Returns:
        Flat list with one similarity score per entry of ``options``, in the
        same order. Unlike the causal-LM function of the same name this
        returns scores, not predicted labels; reshape to
        ``(len(questions), options_per_question)`` and take the argmax to
        obtain labels.

    Raises:
        ValueError: If ``options`` cannot be split evenly across
            ``questions``.
    """
    template = """ Consider the document to be relevant if it shares analogous structure, relationships, or concepts with the query, even if the specific details differ."""

    if not questions:
        return []
    if len(options) % len(questions) != 0:
        raise ValueError(
            f"cannot distribute {len(options)} options evenly over {len(questions)} questions"
        )
    options_per_question = len(options) // len(questions)
    instruction = template.strip()

    scores = []
    # Iterate over the options: they are the flattened (question, option)
    # pairs, so there are more of them than questions.
    for bsidx in range(0, len(options), bs):
        cur_options = options[bsidx:bsidx+bs]
        # Option k belongs to question k // options_per_question.
        cur_questions = [
            questions[opidx // options_per_question]
            for opidx in range(bsidx, bsidx + len(cur_options))
        ]

        # The template has no placeholders, so the texts are inserted
        # explicitly (two spaces after "query:" / "passage:").
        prompts = [f"query:  {query.strip()} {instruction}" for query in cur_questions]
        passages = [f"passage:  {text.strip()}" for text in cur_options]

        prompt_embeddings = model.encode(prompts, batch_size=bs)
        doc_embeddings = model.encode(passages, batch_size=bs)

        # Row-wise dot product of each prompt with its own document.
        similarities = np.sum(prompt_embeddings * doc_embeddings, axis=1)

        scores.extend(similarities.tolist())

    return scores

# Life is a Circus

In [4]:


def prepare_file_to_qasrl(src, dst):
    """
    Prepare the input for QA-SRL (adding line number, tab and the sentence for every sentence in the text)

    Args:
        src: Path of the text file to read, one sentence per line.
        dst: Path of the file to write; it is overwritten if it exists.

    Each output line is ``"<1-based line number>\\t<original line>"``.
    """
    # Context managers close both files even if reading or writing fails.
    with open(src, 'r') as source_file, open(dst, 'w') as target_file:
        for line_number, line in enumerate(source_file, start=1):
            target_file.write(str(line_number) + '\t' + line)

In [None]:
from analogies_mining.find_mappings import generate_mappings
# A pair of single-sentence inputs, each wrapped in its own list.
# NOTE(review): the second argument (0.5) is presumably a similarity threshold --
# confirm against `generate_mappings` in analogies_mining/find_mappings.
sentence_pair = (["Fred built an immense fortune by swindling others, but he lost it all when someone swindled him."],
                 ["his shopkeeper always palms off old stock to the customers."])
generate_mappings(sentence_pair, 0.5)