In [2]:
import torch
import numpy as np
from tqdm.notebook import tqdm
import os
import json

In [3]:
# metrics
def mean_recall_at_k(true_labels, predicted_labels, k=10):
    """
    Calculate the mean Recall@k over a collection of recommendation lists.

    For each (true, pred) pair, Recall@k is the number of distinct relevant
    items found among the first k predictions divided by the number of
    distinct relevant items. A pair with no relevant items contributes 0.

    Args:
        true_labels: Iterable of iterables holding the relevant items per query.
        predicted_labels: Iterable of ranked prediction sequences, aligned
            with true_labels (pairs are formed with zip).
        k: Cut-off applied to every prediction list. Lists shorter than k
            are simply used in full.

    Returns:
        The mean of the per-query Recall@k values, or 0 when there are no pairs.
    """
    recalls_at_k = []

    for true, pred in zip(true_labels, predicted_labels):
        true_set = set(true)
        if not true_set:
            # No relevant items: recall is undefined, counted as 0.
            recalls_at_k.append(0)
            continue

        # Slicing already clamps to len(pred); k itself must NOT be reassigned
        # here, otherwise one short list would shrink the cut-off for every
        # query that follows it.
        # Intersecting sets counts each relevant item at most once, so a
        # duplicated prediction cannot push recall above 1.
        relevant_count = len(true_set.intersection(pred[:k]))
        recalls_at_k.append(relevant_count / len(true_set))

    # Mean over all queries
    return sum(recalls_at_k) / len(recalls_at_k) if recalls_at_k else 0

def mean_inv_ranking(true_labels, predicted_labels):
    """
    Average, over all queries, of the mean reciprocal rank of the relevant items.

    For each (true, pred) pair every relevant item scores 1 / rank, where rank
    is its 1-based position in pred, or 0 if it does not appear in pred. The
    scores are averaged per query (0 for a query with no relevant items) and
    the per-query averages are averaged again. Returns 0 when there are no pairs.
    """
    def _reciprocal_rank(item, ranked):
        # 1-based reciprocal position; items absent from the ranking score 0.
        try:
            return 1 / (ranked.index(item) + 1)
        except ValueError:
            return 0

    per_query_means = []
    for relevant, ranked in zip(true_labels, predicted_labels):
        scores = [_reciprocal_rank(item, ranked) for item in relevant]
        per_query_means.append(sum(scores) / len(scores) if scores else 0)

    if not per_query_means:
        return 0
    return sum(per_query_means) / len(per_query_means)

def mean_ranking(true_labels, predicted_labels):
    """
    Average, over all queries, of the mean rank of the relevant items.

    For each (true, pred) pair every relevant item is given its 1-based
    position in pred; an item that does not appear in pred is given
    len(pred). Ranks are averaged per query (0 for a query with no relevant
    items) and the per-query averages are averaged again. Returns 0 when
    there are no pairs.
    """
    def _rank_of(item, ranked):
        # 1-based position; a missing item falls back to the list length.
        try:
            return ranked.index(item) + 1
        except ValueError:
            return len(ranked)

    per_query_means = []
    for relevant, ranked in zip(true_labels, predicted_labels):
        positions = [_rank_of(item, ranked) for item in relevant]
        per_query_means.append(sum(positions) / len(positions) if positions else 0)

    if not per_query_means:
        return 0
    return sum(per_query_means) / len(per_query_means)

# for getting true labels and our predictions
def get_true_and_predicted(citing_to_cited_dict, recommendations_dict):
    """
    Build aligned lists of true and predicted labels for metric computation.

    Walks the keys of recommendations_dict in order. When a key also exists
    in citing_to_cited_dict, its cited list and its recommendation list are
    appended to the two output lists; otherwise a message is printed and the
    key is counted as missing.

    Returns:
        (true_labels, predicted_labels, not_in_citation_mapping)
    """
    true_labels, predicted_labels = [], []
    not_in_citation_mapping = 0

    for citing_id in recommendations_dict:
        # Guard clause: report and skip ids with no ground truth.
        if citing_id not in citing_to_cited_dict:
            print(citing_id, "not in citation mapping")
            not_in_citation_mapping += 1
            continue

        true_labels.append(citing_to_cited_dict[citing_id])
        predicted_labels.append(recommendations_dict[citing_id])

    return true_labels, predicted_labels, not_in_citation_mapping

# load embeddings
def load_embeddings_and_ids(embedding_file, app_ids_file):
    """
    Read an embedding array and its application IDs from disk.

    Args:
        embedding_file: Path to a file readable by np.load.
        app_ids_file: Path to a JSON file holding the application IDs.

    Returns:
        (embeddings, app_ids): the array wrapped as a torch tensor via
        torch.from_numpy, and the decoded JSON content.
    """
    print(f"Loading embeddings from {embedding_file}")
    embedding_array = np.load(embedding_file)
    embeddings = torch.from_numpy(embedding_array)

    print(f"Loading app_ids from {app_ids_file}")
    with open(app_ids_file, 'r') as ids_handle:
        app_ids = json.load(ids_handle)

    n_embeddings, n_ids = len(embeddings), len(app_ids)
    print(f"Loaded {n_embeddings} embeddings and {n_ids} app_ids")
    return embeddings, app_ids

# calculating cosine similarity:
def cos_sim(a, b):
    """
    Pairwise cosine similarity between the rows of a and the rows of b.

    Non-tensor inputs are converted with torch.tensor, and 1-D inputs are
    treated as a single row.
    :return: Matrix with res[i][j] = cos_sim(a[i], b[j])
    """
    # Convert both operands first, then promote vectors to one-row matrices.
    lhs = a if isinstance(a, torch.Tensor) else torch.tensor(a)
    rhs = b if isinstance(b, torch.Tensor) else torch.tensor(b)

    if lhs.dim() == 1:
        lhs = lhs.unsqueeze(0)
    if rhs.dim() == 1:
        rhs = rhs.unsqueeze(0)

    # L2-normalise each row so the matrix product yields cosine similarities.
    lhs_unit = torch.nn.functional.normalize(lhs, p=2, dim=1)
    rhs_unit = torch.nn.functional.normalize(rhs, p=2, dim=1)
    return torch.mm(lhs_unit, rhs_unit.transpose(0, 1))

def pytorch_cos_sim(a, b):
    """
    Alias for cos_sim: forwards both arguments unchanged.
    :return: Matrix with res[i][j] = cos_sim(a[i], b[j])
    """
    similarity_matrix = cos_sim(a, b)
    return similarity_matrix

# for getting train alignments
def citation_to_citing_to_cited_dict(citations):
    """
    Group a flat citation list into a {citing id: [cited ids]} dictionary.

    Each citation is indexed positionally: element 0 is used as the citing id
    and element 2 as the cited id. Cited ids are kept in input order and
    duplicates are not removed.
    """
    grouped = {}

    for record in citations:
        citing_id, cited_id = record[0], record[2]
        # setdefault creates the list on first sight of a citing id.
        grouped.setdefault(citing_id, []).append(cited_id)

    return grouped

In [4]:
# --- Configuration -----------------------------------------------------------
TOP_N = 100            # number of documents kept per query in each ranking
K_VALUE = 10
POOLING = "mean"
QUERY_SET = "test"     # "train" selects TRAIN_EMBEDDING_DIR, anything else TEST_EMBEDDING_DIR
# NOTE(review): the paths below are absolute and machine-specific; the notebook
# will not run elsewhere unless they are edited.
BASE_DIR = "/Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/CodaBench/IR2025"
DOC_EMBEDDING_DIR = os.path.join(BASE_DIR, "embeddings/embeddings_precalculated_docs")
TRAIN_EMBEDDING_DIR = os.path.join(BASE_DIR, "embeddings/embeddings_precalculated_train")
TEST_EMBEDDING_DIR = os.path.join(BASE_DIR, "embeddings/embeddings_precalculated_test")
OUTPUT_DIR = "/Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/github/not_git/best_pipeline_search_results"
CITATION_FILE = os.path.join(BASE_DIR, "Citation_JSONs/Citation_Train.json")
MODEL_NAME = "PatentSBERTa"

# Each entry is "<query content type>_<document content type>".
combinations = [
    "claims_claims",
    "TAC_TAC",
    "claims_TAC",
    "TAC_claims"
]

# The query directory does not depend on the combination, so resolve it once.
QUERY_EMBEDDING_DIR = TRAIN_EMBEDDING_DIR if QUERY_SET == "train" else TEST_EMBEDDING_DIR

# --- Build one ranking dict ({query_id: [top-N doc ids]}) per combination ----
rankings = []

# Distinct loop variable names: the original reused `i` for the combination,
# the query index and the document index, each overwriting the previous one.
for combination in combinations:
    CONTENT_TYPE_coming, CONTENT_TYPE_existing = combination.split("_")
    DOC_EMBEDDING_FILE = os.path.join(DOC_EMBEDDING_DIR, f"embeddings_{MODEL_NAME}_{POOLING}_{CONTENT_TYPE_existing}.npy")
    DOC_APP_IDS_FILE = os.path.join(DOC_EMBEDDING_DIR, f"app_ids_{MODEL_NAME}_{POOLING}_{CONTENT_TYPE_existing}.json")

    QUERY_EMBEDDING_FILE = os.path.join(QUERY_EMBEDDING_DIR, f"embeddings_{MODEL_NAME}_{POOLING}_{CONTENT_TYPE_coming}.npy")
    QUERY_APP_IDS_FILE = os.path.join(QUERY_EMBEDDING_DIR, f"app_ids_{MODEL_NAME}_{POOLING}_{CONTENT_TYPE_coming}.json")

    # Load existing (document) embeddings and app_ids
    doc_embeddings, doc_app_ids = load_embeddings_and_ids(DOC_EMBEDDING_FILE, DOC_APP_IDS_FILE)

    # Load incoming (query) embeddings and app_ids
    query_embeddings, query_app_ids = load_embeddings_and_ids(QUERY_EMBEDDING_FILE, QUERY_APP_IDS_FILE)

    # Rows and ids are matched purely by position, so a length mismatch would
    # silently truncate (zip) or mislabel results; fail loudly instead.
    assert len(doc_embeddings) == len(doc_app_ids), "document embeddings and app_ids are misaligned"
    assert len(query_embeddings) == len(query_app_ids), "query embeddings and app_ids are misaligned"

    only_query_results = {}
    for query_embedding, query_id in tqdm(zip(query_embeddings, query_app_ids), total=len(query_embeddings), desc="cosine scores"):
        # Cosine similarity of this query against every document
        cos_scores = pytorch_cos_sim(query_embedding.unsqueeze(0), doc_embeddings)[0].cpu()

        # Indices of the TOP_N most similar documents, best first
        top_n_index = torch.argsort(cos_scores, descending=True)[:TOP_N].tolist()

        # Map document indices back to application IDs
        only_query_results[query_id] = [doc_app_ids[doc_idx] for doc_idx in top_n_index]
    rankings.append(only_query_results)



Loading embeddings from /Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/CodaBench/IR2025/embeddings/embeddings_precalculated_docs/embeddings_PatentSBERTa_mean_claims.npy
Loading app_ids from /Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/CodaBench/IR2025/embeddings/embeddings_precalculated_docs/app_ids_PatentSBERTa_mean_claims.json
Loaded 16834 embeddings and 16834 app_ids
Loading embeddings from /Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/CodaBench/IR2025/embeddings/embeddings_precalculated_test/embeddings_PatentSBERTa_mean_claims.npy
Loading app_ids from /Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/CodaBench/IR2025/embeddings/embeddings_precalculated_test/app_ids_PatentSBERTa_mean_claims.json
Loaded 1000 embeddings and 1000 app_ids


cosine scores:   0%|          | 0/1000 [00:00<?, ?it/s]

Loading embeddings from /Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/CodaBench/IR2025/embeddings/embeddings_precalculated_docs/embeddings_PatentSBERTa_mean_TAC.npy
Loading app_ids from /Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/CodaBench/IR2025/embeddings/embeddings_precalculated_docs/app_ids_PatentSBERTa_mean_TAC.json
Loaded 16837 embeddings and 16837 app_ids
Loading embeddings from /Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/CodaBench/IR2025/embeddings/embeddings_precalculated_test/embeddings_PatentSBERTa_mean_TAC.npy
Loading app_ids from /Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/CodaBench/IR2025/embeddings/embeddings_precalculated_test/app_ids_PatentSBERTa_mean_TAC.json
Loaded 1000 embeddings and 1000 app_ids


cosine scores:   0%|          | 0/1000 [00:00<?, ?it/s]

Loading embeddings from /Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/CodaBench/IR2025/embeddings/embeddings_precalculated_docs/embeddings_PatentSBERTa_mean_TAC.npy
Loading app_ids from /Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/CodaBench/IR2025/embeddings/embeddings_precalculated_docs/app_ids_PatentSBERTa_mean_TAC.json
Loaded 16837 embeddings and 16837 app_ids
Loading embeddings from /Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/CodaBench/IR2025/embeddings/embeddings_precalculated_test/embeddings_PatentSBERTa_mean_claims.npy
Loading app_ids from /Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/CodaBench/IR2025/embeddings/embeddings_precalculated_test/app_ids_PatentSBERTa_mean_claims.json
Loaded 1000 embeddings and 1000 app_ids


cosine scores:   0%|          | 0/1000 [00:00<?, ?it/s]

Loading embeddings from /Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/CodaBench/IR2025/embeddings/embeddings_precalculated_docs/embeddings_PatentSBERTa_mean_claims.npy
Loading app_ids from /Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/CodaBench/IR2025/embeddings/embeddings_precalculated_docs/app_ids_PatentSBERTa_mean_claims.json
Loaded 16834 embeddings and 16834 app_ids
Loading embeddings from /Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/CodaBench/IR2025/embeddings/embeddings_precalculated_test/embeddings_PatentSBERTa_mean_TAC.npy
Loading app_ids from /Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/CodaBench/IR2025/embeddings/embeddings_precalculated_test/app_ids_PatentSBERTa_mean_TAC.json
Loaded 1000 embeddings and 1000 app_ids


cosine scores:   0%|          | 0/1000 [00:00<?, ?it/s]

In [5]:
import json
from collections import defaultdict

def reciprocal_rank_fusion(ranked_lists, k=20):
    """
    Implements Reciprocal Rank Fusion to combine multiple ranked lists.

    Every document receives, for each ranked list it appears in, a
    contribution of 1 / (k + rank), where rank is its 1-based position in
    that list. Contributions are summed and documents are returned sorted by
    descending total. Documents with equal totals keep the order in which
    they were first encountered (the sort is stable).

    Args:
        ranked_lists: List of dictionaries, where each dictionary maps queries
            to ranked lists of document IDs.
        k: Constant to mitigate the impact of high rankings (default: 20).

    Returns:
        Dictionary mapping queries to fused ranked lists. Queries appear in
        the order in which they are first seen across ranked_lists, so the
        result is reproducible from run to run.
    """
    # A dict (not a set) keeps first-seen order; iterating a set of strings
    # can yield a different order on every kernel start.
    all_queries = dict.fromkeys(
        query for result_dict in ranked_lists for query in result_dict
    )

    fused_results = {}
    for query in all_queries:
        # Accumulated RRF score per document for this query
        rrf_scores = defaultdict(float)

        for result_dict in ranked_lists:
            if query not in result_dict:
                continue
            # Rank positions are 1-indexed
            for rank, doc_id in enumerate(result_dict[query], start=1):
                rrf_scores[doc_id] += 1.0 / (k + rank)

        # Highest fused score first
        sorted_docs = sorted(rrf_scores.items(), key=lambda pair: pair[1], reverse=True)
        fused_results[query] = [doc_id for doc_id, _ in sorted_docs]

    return fused_results


# Apply RRF to the per-combination rankings
fused_rankings = reciprocal_rank_fusion(rankings)

# Save the fused results as JSON.
# NOTE(review): absolute, machine-specific output location.
RRF_OUTPUT_DIR = "/Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/github/not_git/rrf_rankings/"
os.makedirs(RRF_OUTPUT_DIR, exist_ok=True)  # avoid FileNotFoundError on a fresh checkout
with open(os.path.join(RRF_OUTPUT_DIR, "prediction1.json"), "w") as f:
    json.dump(fused_rankings, f, indent=2)

# Example: print the top 5 documents for the first query.
# Guarded so that an empty result does not raise while indexing.
if fused_rankings:
    sample_query = next(iter(fused_rankings))
    print(f"Sample query: {sample_query}")
    print(f"Top 5 documents after fusion: {fused_rankings[sample_query][:5]}")
else:
    print("No fused rankings to display.")


Sample query: 3692876A1
Top 5 documents after fusion: ['1707101B1', '2013126B1', '1925895B1', '3139394A1', '1785083B1']
