In [1]:
import torch
import numpy as np
from tqdm.notebook import tqdm
import os
import json

In [2]:
# metrics

def mean_average_precision(true_labels, predicted_labels, k=10):
    """
    Calculate the mean Average Precision for a list of recommendations.

    Parameters:
    true_labels : list of list
        True relevant items for each recommendation list.
    predicted_labels : list of list
        Predicted recommended items for each recommendation list.
    k : int
        Number of recommendations to consider.

    Returns:
    float
        Mean Average Precision value.
    """
    average_precisions = []

    for true, pred in zip(true_labels, predicted_labels):
        # Calculate Average Precision for each recommendation list
        true_set = set(true)
        precision_at_k = []
        relevant_count = 0
        for i, item in enumerate(pred[:k]):
            if item in true_set:
                relevant_count += 1
                precision_at_k.append(relevant_count / (i + 1))
        average_precision = sum(precision_at_k) / len(true_set)
        average_precisions.append(average_precision)

    # Calculate the mean Average Precision
    mean_average_precision = sum(average_precisions) / len(average_precisions)

    return mean_average_precision


def mean_recall_at_k(true_labels, predicted_labels, k=10):
    """
    Calculate the mean Recall@k for a list of recommendations.
    """
    recalls_at_k = []

    for true, pred in zip(true_labels, predicted_labels):
        # Calculate Recall@k for each recommendation list
        true_set = set(true)
        k = min(k, len(pred))
        relevant_count = sum(1 for item in pred[:k] if item in true_set)
        recalls_at_k.append(relevant_count / len(true_set) if len(true_set) > 0 else 0)

    # Calculate the mean Recall@k
    mean_recall = sum(recalls_at_k) / len(recalls_at_k) if recalls_at_k else 0

    return mean_recall

def mean_inv_ranking(true_labels, predicted_labels):
    """
    Calculate the mean of lists of the mean inverse rank of true relevant items
    in the lists of sorted recommended items.
    """
    mean_ranks = []

    for true, pred in zip(true_labels, predicted_labels):
        # Calculate the inverse rank of true relevant items
        # in the recommendation list
        ranks = []
        for item in true:
            try:
                rank = 1 / (pred.index(item) + 1)
            except ValueError:
                rank = 0  # If item not found, assign 0
            ranks.append(rank)

        # Calculate the mean inverse rank of true relevant items
        # in the recommendation list
        mean_rank = sum(ranks) / len(ranks) if ranks else 0
        mean_ranks.append(mean_rank)

    # Calculate the mean of the mean inverse ranks across all recommendation lists
    mean_of_mean_ranks = sum(mean_ranks) / len(mean_ranks) if mean_ranks else 0

    return mean_of_mean_ranks

def mean_ranking(true_labels, predicted_labels):
    """
    Calculate the mean of lists of the mean rank of true relevant items
    in the lists of sorted recommended items.
    """
    mean_ranks = []

    for true, pred in zip(true_labels, predicted_labels):
        # Calculate the rank of true relevant items
        # in the recommendation list
        ranks = []
        for item in true:
            try:
                rank = pred.index(item) + 1
            except ValueError:
                rank = len(pred)  # If item not found, assign the length of the list
            ranks.append(rank)

        # Calculate the mean rank of true relevant items
        # in the recommendation list
        mean_rank = sum(ranks) / len(ranks) if ranks else 0
        mean_ranks.append(mean_rank)

    # Calculate the mean of the mean ranks across all recommendation lists
    mean_of_mean_ranks = sum(mean_ranks) / len(mean_ranks) if mean_ranks else 0

    return mean_of_mean_ranks

# for getting true labels and our predictions
def get_true_and_predicted(citing_to_cited_dict, recommendations_dict):
    """
    Get the true and predicted labels for the metrics calculation.
    """
    # for i in recommendations_dict:
    #     print(i, recommendations_dict[i])
    #     break
    
    # Initialize lists to store true labels and predicted labels
    true_labels = []
    predicted_labels = []
    not_in_citation_mapping = 0

    # Iterate over the items in both dictionaries
    for citing_id in recommendations_dict.keys():
        # Check if the citing_id is present in both dictionaries
        if citing_id in citing_to_cited_dict:
            # If yes, append the recommended items from both dictionaries to the respective lists
            true_labels.append(citing_to_cited_dict[citing_id])
            predicted_labels.append(recommendations_dict[citing_id])
        else:
            print(citing_id, "not in citation mapping")
            not_in_citation_mapping += 1

    return true_labels, predicted_labels, not_in_citation_mapping

# load embeddings
def load_embeddings_and_ids(embedding_file, app_ids_file):
    """
    Load the embeddings and application IDs from saved files
    """
    print(f"Loading embeddings from {embedding_file}")
    embeddings = torch.from_numpy(np.load(embedding_file))

    print(f"Loading app_ids from {app_ids_file}")
    with open(app_ids_file, 'r') as f:
        app_ids = json.load(f)

    print(f"Loaded {len(embeddings)} embeddings and {len(app_ids)} app_ids")
    return embeddings, app_ids

# calculating cosine similarity:
def cos_sim(a, b, normalize):
    """
    Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j.
    :return: Matrix with res[i][j] = cos_sim(a[i], b[j])
    """
    if not isinstance(a, torch.Tensor):
        a = torch.tensor(a)

    if not isinstance(b, torch.Tensor):
        b = torch.tensor(b)

    if len(a.shape) == 1:
        a = a.unsqueeze(0)

    if len(b.shape) == 1:
        b = b.unsqueeze(0)

    if normalize:
        a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
        b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
        return torch.mm(a_norm, b_norm.transpose(0, 1))
    else:
        return torch.mm(a, b.transpose(0, 1))
    

def pytorch_cos_sim(a, b, normalize=True):
    """
    Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j.
    :return: Matrix with res[i][j] = cos_sim(a[i], b[j])
    """
    return cos_sim(a, b, normalize)

# for getting train alignments
def citation_to_citing_to_cited_dict(citations):
    """
    Put a citation mapping in a dict format
    """
    # Initialize an empty dictionary to store the results
    citing_to_cited_dict = {}

    # Iterate over the items in the JSON list
    for citation in citations:
        # Check if the citing id already exists in the resulting dictionary
        if citation[0] in citing_to_cited_dict:
            # If the citing id exists, append the cited id to the existing list
            citing_to_cited_dict[citation[0]].append(citation[2])
        else:
            # If the citing id doesn't exist, create a new list with the cited id for that citing id
            citing_to_cited_dict[citation[0]] = [citation[2]]

    return citing_to_cited_dict

In [3]:
best_pipelines = [
    "claims_claims",
    "TAC_TAC",
    "TAC_claims",
    "claims_TAC",
]
## why these were chosen? look at the plot in printing_metrics.ipynb
incoming_dtype = ["claims","TAC","TAC","claims","TA","TA"]#, "TAC", "TA", "claims"] ## what part of incoming patent to consider
existing_dtype = ["claims","TAC","claims","TAC", "TA", "TAC"]#, "TA", "claims", "TA"] ## with what part of existing patent
# weigths_patentsbert = [0.81, 0.8, 0.79, 0.79, 0.71, 0.71] ##medians of true scores
# weigths_patentsbert = [0.5284088850021362, 0.558128297328949, 0.5326939821243286, 0.5411014556884766, 0.3544950485229492, 0.3110102713108063] ##minimum of true scores

TOP_N = 100
K_VALUE = 10
POOLING = "mean"                
QUERY_SET = "train"                  # for getting the scores, set this to train
BASE_DIR = "/Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/CodaBench/IR2025"
DOC_EMBEDDING_DIR = os.path.join(BASE_DIR, "embeddings/embeddings_precalculated_docs")
TRAIN_EMBEDDING_DIR = os.path.join(BASE_DIR, "embeddings/embeddings_precalculated_train")
TEST_EMBEDDING_DIR = os.path.join(BASE_DIR, "embeddings/embeddings_precalculated_test")
OUTPUT_DIR = "/Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/github/not_git/best_pipeline_search_results"
CITATION_FILE = os.path.join(BASE_DIR, "Citation_JSONs/Citation_Train.json")
MODEL_NAME = "PatentSBERTa"

with open(CITATION_FILE, 'r') as f:
    citations = json.load(f)
citing_to_cited_dict = citation_to_citing_to_cited_dict(citations)

In [4]:
## first get the embeddings from patentBERT for queries
incoming_id2emb = {}
for CONTENT_TYPE_coming in incoming_dtype:
    print()
    QUERY_EMBEDDING_DIR = TRAIN_EMBEDDING_DIR if QUERY_SET == "train" else TEST_EMBEDDING_DIR
    QUERY_EMBEDDING_FILE = os.path.join(QUERY_EMBEDDING_DIR, f"embeddings_{MODEL_NAME}_{POOLING}_{CONTENT_TYPE_coming}.npy")
    QUERY_APP_IDS_FILE = os.path.join(QUERY_EMBEDDING_DIR, f"app_ids_{MODEL_NAME}_{POOLING}_{CONTENT_TYPE_coming}.json")
    query_embeddings, query_app_ids = load_embeddings_and_ids(QUERY_EMBEDDING_FILE, QUERY_APP_IDS_FILE)
    for i in query_app_ids:
        try:
            incoming_id2emb[i].append(query_embeddings[query_app_ids.index(i)])
        except:
            incoming_id2emb[i] = [query_embeddings[query_app_ids.index(i)]]


Loading embeddings from /Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/CodaBench/IR2025/embeddings/embeddings_precalculated_train/embeddings_PatentSBERTa_mean_claims.npy
Loading app_ids from /Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/CodaBench/IR2025/embeddings/embeddings_precalculated_train/app_ids_PatentSBERTa_mean_claims.json
Loaded 6831 embeddings and 6831 app_ids

Loading embeddings from /Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/CodaBench/IR2025/embeddings/embeddings_precalculated_train/embeddings_PatentSBERTa_mean_TAC.npy
Loading app_ids from /Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/CodaBench/IR2025/embeddings/embeddings_precalculated_train/app_ids_PatentSBERTa_mean_TAC.json
Loaded 6831 embeddings and 6831 app_ids

Loading embeddings from /Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/CodaBench/IR2025/embeddings/embeddings_precalculated_train/embeddings_PatentSBERTa_mean_TAC.npy
Loading app_ids from /Users/kshitij/Documents/UPSaclay/T4/InfoRetr

In [5]:
## first get the embeddings from patentBERT for exisiting patents

existing_id2emb = {}
for CONTENT_TYPE_existing in existing_dtype:
    print()
    DOC_EMBEDDING_FILE = os.path.join(DOC_EMBEDDING_DIR, f"embeddings_{MODEL_NAME}_{POOLING}_{CONTENT_TYPE_existing}.npy")
    DOC_APP_IDS_FILE = os.path.join(DOC_EMBEDDING_DIR, f"app_ids_{MODEL_NAME}_{POOLING}_{CONTENT_TYPE_existing}.json")

    doc_embeddings, doc_app_ids = load_embeddings_and_ids(DOC_EMBEDDING_FILE, DOC_APP_IDS_FILE)
    for i in doc_app_ids:
        try:
            existing_id2emb[i].append(doc_embeddings[doc_app_ids.index(i)])
        except:
            existing_id2emb[i] = [doc_embeddings[doc_app_ids.index(i)]]


Loading embeddings from /Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/CodaBench/IR2025/embeddings/embeddings_precalculated_docs/embeddings_PatentSBERTa_mean_claims.npy
Loading app_ids from /Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/CodaBench/IR2025/embeddings/embeddings_precalculated_docs/app_ids_PatentSBERTa_mean_claims.json
Loaded 16834 embeddings and 16834 app_ids

Loading embeddings from /Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/CodaBench/IR2025/embeddings/embeddings_precalculated_docs/embeddings_PatentSBERTa_mean_TAC.npy
Loading app_ids from /Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/CodaBench/IR2025/embeddings/embeddings_precalculated_docs/app_ids_PatentSBERTa_mean_TAC.json
Loaded 16837 embeddings and 16837 app_ids

Loading embeddings from /Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/CodaBench/IR2025/embeddings/embeddings_precalculated_docs/embeddings_PatentSBERTa_mean_claims.npy
Loading app_ids from /Users/kshitij/Documents/UPSaclay/T4/InfoRe

In [6]:
# ## do the same thing for the best combination of incoming patent and existing patent for all-MiniLM...

# incoming_dtype = ["TAC", "claims", "claims", "TAC"] ## what part of incoming patent to comapre with
# existing_dtype = ["TAC", "claims", "TAC", "claims"] ## what part of existing patent

# ## weights according to min true_segments
# # weights_minilm = [0.28662535548210144, 0.2773967683315277, 0.29081717133522034]

# # weights = weigths_patentsbert + weigths_patentsbert
# # weights = np.array(weights)/np.sum()
# MODEL_NAME = "all-MiniLM-L6-v2"

# for CONTENT_TYPE_coming in incoming_dtype:
#     QUERY_EMBEDDING_DIR = TRAIN_EMBEDDING_DIR if QUERY_SET == "train" else TEST_EMBEDDING_DIR
#     QUERY_EMBEDDING_FILE = os.path.join(QUERY_EMBEDDING_DIR, f"embeddings_{MODEL_NAME}_{POOLING}_{CONTENT_TYPE_coming}.npy")
#     QUERY_APP_IDS_FILE = os.path.join(QUERY_EMBEDDING_DIR, f"app_ids_{MODEL_NAME}_{POOLING}_{CONTENT_TYPE_coming}.json")
#     query_embeddings, query_app_ids = load_embeddings_and_ids(QUERY_EMBEDDING_FILE, QUERY_APP_IDS_FILE)
#     for i in query_app_ids:
#         try:
#             incoming_id2emb[i].append(query_embeddings[query_app_ids.index(i)])
#         except:
#             incoming_id2emb[i] = [query_embeddings[query_app_ids.index(i)]]

# for CONTENT_TYPE_existing in existing_dtype:
#     DOC_EMBEDDING_FILE = os.path.join(DOC_EMBEDDING_DIR, f"embeddings_{MODEL_NAME}_{POOLING}_{CONTENT_TYPE_existing}.npy")
#     DOC_APP_IDS_FILE = os.path.join(DOC_EMBEDDING_DIR, f"app_ids_{MODEL_NAME}_{POOLING}_{CONTENT_TYPE_existing}.json")

#     doc_embeddings, doc_app_ids = load_embeddings_and_ids(DOC_EMBEDDING_FILE, DOC_APP_IDS_FILE)
#     for i in doc_app_ids:
#         try:
#             existing_id2emb[i].append(doc_embeddings[doc_app_ids.index(i)])
#         except:
#             existing_id2emb[i] = [doc_embeddings[doc_app_ids.index(i)]]

In [7]:
# import os
# os.isfile("/Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/CodaBench/IR2025/embeddings/embeddings_precalculated_test/embeddings_all-MiniLM-L6-v2_mean_claim.npy")

In [8]:
def attention_pool(embeddings):

    query = torch.mean(embeddings, dim=0, keepdim=True)  # Context vector -- mean
    scores = torch.matmul(embeddings, query.T).squeeze()
    weights = torch.softmax(scores, dim=0)
    return torch.sum(weights.unsqueeze(-1) * embeddings, dim=0)

def pooling(data_dict, operation, num_embs): #num_embs = total number of embeddings for incoming/exisiting patent

    print(f"length on input dict: {len(data_dict)}")

    if operation=="concatenation":
        keys_to_delete=[]
        for i in data_dict:
            if len(data_dict[i])==num_embs:
                data_dict[i] = torch.cat(data_dict[i])
                # print(data_dict[i].shape)
            else:
                keys_to_delete.append(i)
        for key in keys_to_delete:
            del data_dict[key]
            
    if operation=="average":
        keys_to_delete=[]
        for i in data_dict:
            if len(data_dict[i])==num_embs:
                data_dict[i] = torch.mean(torch.stack(data_dict[i], dim=0), dim=0)
            else:
                keys_to_delete.append(i)
        for key in keys_to_delete:
            del data_dict[key]
    
    if operation=="addition":
        keys_to_delete=[]
        for i in data_dict:
            if len(data_dict[i])==num_embs:
                data_dict[i] = torch.mean(torch.stack(data_dict[i], dim=0), dim=0)*num_embs
            else:
                keys_to_delete.append(i)
        for key in keys_to_delete:
            del data_dict[key]
    
    if operation == "attention":
        keys_to_delete=[]
        for i in data_dict:
            if len(data_dict[i])==num_embs:
                data_dict[i] = attention_pool(torch.stack(data_dict[i]))
            else:
                keys_to_delete.append(i)
        for key in keys_to_delete:
            del data_dict[key]

    print(f"length on output dict: {len(data_dict)}")
    return data_dict
    
    


In [9]:
incoming_id2emb = pooling(incoming_id2emb, "attention", num_embs=6)
print()
existing_id2emb = pooling(existing_id2emb, "attention", num_embs=6)
# incoming_id2emb_minilm = pooling(incoming_id2emb_minilm, "multi_strategy", num_embs=4)
# existing_id2emb_minilm = pooling(existing_id2emb_minilm, "multi_strategy", num_embs=4)

length on input dict: 6831
length on output dict: 6831

length on input dict: 16834
length on output dict: 16828


In [10]:
query_embeddings, query_ids = list(incoming_id2emb.values()), list(incoming_id2emb.keys())
# print(query_embeddings[0])
query_embeddings = torch.stack(query_embeddings)
query_embeddings.shape

torch.Size([6831, 768])

In [11]:
doc_embeddings, doc_ids = list(existing_id2emb.values()), list(existing_id2emb.keys())
doc_embeddings = torch.stack(doc_embeddings)
doc_embeddings.shape

torch.Size([16828, 768])

In [12]:
only_query_results = {}
for i, (query_embedding, query_id) in enumerate(tqdm(zip(query_embeddings, query_app_ids), total=len(query_embeddings), desc="cosine scores")):
    # Compute cosine similarity
    query_embedding = query_embedding.unsqueeze(0)
    cos_scores = pytorch_cos_sim(query_embedding, doc_embeddings, normalize=True)[0].cpu()

    # Sort results and get top N
    top_n_index = torch.argsort(cos_scores, descending=True)[:TOP_N].numpy()

    # Get application IDs of top N documents
    top_n_app_ids = [doc_app_ids[i] for i in top_n_index]
    top_n_scores = cos_scores[top_n_index].tolist()
    
    #results[query_id][0] = IDS
    #results[query_id][0] = scores 
    only_query_results[query_id] = top_n_app_ids


cosine scores:   0%|          | 0/6831 [00:00<?, ?it/s]

In [13]:
# with open ("/Users/kshitij/Documents/UPSaclay/T4/InfoRetrieval/github/not_git/concatenate_13/prediction1.json","w") as f:
#     json.dump(only_query_results, f)

In [14]:
true_labels, predicted_labels, not_in_citation_mapping = get_true_and_predicted(citing_to_cited_dict, only_query_results)

recall_at_k = mean_recall_at_k(true_labels, predicted_labels, k=K_VALUE)
print(recall_at_k)
mean_rank = mean_ranking(true_labels, predicted_labels)
print(mean_rank)

mean_inv_rank = mean_inv_ranking(true_labels, predicted_labels)
print(mean_inv_rank)

map = []
for i in range(10,100,10):
    map.append(mean_average_precision(true_labels, predicted_labels, k=i))
print(np.mean(map))

0.5659078709803346
28.82070463084957
0.3556313922052502
0.36852854188996453


In [15]:
'''
addition | norm = True | first 4 docs
0.5443639291465378
30.718111062313977
0.3435209450449923
0.35557668046845436
'''

'''
addition | norm = True | first 6 docs
0.5685990338164252
28.041257990533353
0.3607615251149155
0.3739892150148598
'''

'''
average | norm = True | first 6 docs
0.5685990338164252
28.041209193383107
0.36076161351555003
0.373989421283007
'''

'''
concatenate | norm = True | first 6 docs + 1 lm doc
0.5971941638608307
25.415602888791298
0.3824726140500106
0.39687537978272747
'''

'''
concatenate | norm = True | first 6 docs + 3 lm doc
0.6046357292734102
24.858563899868248
0.38852029919424486
0.40331076676419736

0.37 vs 0.38 normalize false vs true for avg of patentsberta

attention pooling on patentsberta -- 0.3567296318474875

mixed -- 0.39623929734429403 -- avg concatenated with max pool


'''

'\nconcatenate | norm = True | first 6 docs + 3 lm doc\n0.6046357292734102\n24.858563899868248\n0.38852029919424486\n0.40331076676419736\n\n0.37 vs 0.38 normalize false vs true for avg of patentsberta\n\nattention pooling on patentsberta -- 0.3567296318474875\n\nmixed -- 0.39623929734429403 -- avg concatenated with max pool\n\n\n'