In [51]:
from importlib import reload
from semantic_search import store
reload(store)
from semantic_search.store import LocalEmbeddingModel, FAISSDocumentStore

In [52]:
# Directory handed to FAISSDocumentStore as `db_dir`
# (presumably where load_index() reads the persisted index from -- confirm in semantic_search.store).
REFERENCES_DB_DIR = '/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/db/references-1'

model = LocalEmbeddingModel(chunk_size=256)
# NOTE: this rebinds `store` from the `semantic_search.store` module (imported in the cell above)
# to the document-store instance; later cells rely on `store` being the instance.
store = FAISSDocumentStore(model, db_dir=REFERENCES_DB_DIR)

# Call load_index() outside the assert so the load still happens when Python runs with -O
# (which strips assert statements), and fail with a message that names the directory.
index_loaded = store.load_index()
assert index_loaded, f"Could not load index from {REFERENCES_DB_DIR}"

Loaded index with 36379 vectors


In [53]:
# Smoke test: query the loaded store with a trivial string, requesting top_k=5 hits,
# and let the notebook display the returned list.
store.search('Test', top_k=5)

[{'rank': 1,
  'score': 0.8346070789658119,
  'document_id': 'W3203625766',
  'chunk_text': 'on the test set.'},
 {'rank': 2,
  'score': 0.8334892017230405,
  'document_id': 'W2963644257',
  'chunk_text': 'is shown to considerably improve performance on the benchmark tests.'},
 {'rank': 3,
  'score': 0.8208612219059436,
  'document_id': 'W4304084089',
  'chunk_text': 'of test - time adaptation of the proposed method.'},
 {'rank': 4,
  'score': 0.7901869436629443,
  'document_id': 'W4320013936',
  'chunk_text': 'research.'},
 {'rank': 5,
  'score': 0.7901869436629443,
  'document_id': 'W4312233703',
  'chunk_text': 'research.'}]

## Benchmarking

In [42]:
import pandas as pd
from typing import List

from importlib import reload
from semantic_search import utils
reload(utils)
from semantic_search.utils import parse_referenced_works, extract_abstract_from_md

In [57]:
# Both metadata CSVs live in the same directory; keep that location in one place.
METADATA_DIR = '/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/metadata'

df = pd.read_csv(f'{METADATA_DIR}/openalex-ids+refs.csv')
ref_df = pd.read_csv(f'{METADATA_DIR}/openalex-refs-abstracts.csv')

# Parse the raw `referenced_works` column and flag rows that end up with at least one entry.
# (len() is applied to the parsed value, so parse_referenced_works must return a sized object.)
df['referenced_works'] = df['referenced_works'].apply(parse_referenced_works)
df['has_refs'] = df['referenced_works'].apply(len) > 0

# Build the abstract from each row's `fpath` and flag rows where the result is non-empty.
# (Same len() requirement applies to whatever extract_abstract_from_md returns.)
df['abstract'] = df['fpath'].apply(extract_abstract_from_md)
df['has_abstract'] = df['abstract'].apply(len) > 0

In [66]:
# Only benchmark on papers with an available abstract and ground-truth (GT) references.
# .copy() makes the filtered frame independent of the frame it was sliced from, so the
# column assignments that follow do not trigger pandas' SettingWithCopyWarning.
df = df[df.has_refs & df.has_abstract].copy()

def predict_refs_from_abstract(abstract: str, max_n_refs: int = 20, document_store=None) -> List[str]:
    """Predict referenced documents for a paper by searching with its abstract.

    Args:
        abstract: Query text passed to the store's ``search`` method.
        max_n_refs: Forwarded to ``search`` as ``top_k``.
        document_store: Object exposing ``search(query, top_k=...)`` whose hits are
            dicts with a ``'document_id'`` key. Defaults to the notebook-level ``store``.

    Returns:
        The unique document ids of the hits, in the order the store returned them.
        The list can be shorter than the number of hits when several hits share
        a document id.
    """
    searcher = store if document_store is None else document_store
    hits = searcher.search(abstract, top_k=max_n_refs)
    # dict.fromkeys de-duplicates while keeping first-seen order; list(set(...)) would
    # return the ids in an arbitrary order that can change between kernel sessions.
    return list(dict.fromkeys(hit['document_id'] for hit in hits))

# Predicted references: one search per abstract.
df['predicted_refs'] = df['abstract'].apply(predict_refs_from_abstract)

# Ground-truth references: keep only the text after the last '/' of each entry,
# which is the form the predicted document ids are in.
df['GT_refs'] = df['referenced_works'].apply(
    lambda works: [work.rsplit('/', 1)[-1] for work in works]
)
df.head()

Unnamed: 0,fpath,title,doi,oaid,referenced_works,has_refs,abstract,has_abstract,predicted_refs,GT_refs
0,/cluster/home/lcarretero/workspace/dsl/dsl-res...,Fixed Point Diffusion Models,https://doi.org/10.1063/1.2121687,https://openalex.org/W2000456051,"[https://openalex.org/W1504980292, https://ope...",True,We introduce the Fixed Point Diffusion Model (...,True,"[W3153469116, W4318348481, W4317539811, W43061...","[W1504980292, W1646044445, W1966745391, W19764..."
1,/cluster/home/lcarretero/workspace/dsl/dsl-res...,BEVNeXt: Reviving Dense BEV Frameworks for 3D ...,https://doi.org/10.1109/cvpr52733.2024.01901,https://openalex.org/W4402727763,"[https://openalex.org/W1861492603, https://ope...",True,"Recently, the rise of query-based Transformer ...",True,"[W3035308182, W4386076222, W4385970268, W43860...","[W1861492603, W2083047701, W2108598243, W21245..."
4,/cluster/home/lcarretero/workspace/dsl/dsl-res...,Efficient local correlation volume for unsuper...,https://doi.org/10.1109/cvprw63382.2024.00049,https://openalex.org/W4402904316,"[https://openalex.org/W1513100184, https://ope...",True,"With the advent of deep learning methods, perf...",True,"[W2604233003, W3034921716, W4390873135, W31662...","[W1513100184, W1578285471, W1867429401, W19931..."
7,/cluster/home/lcarretero/workspace/dsl/dsl-res...,Towards Understanding and Improving Adversaria...,https://doi.org/10.1109/cvpr52733.2024.02336,https://openalex.org/W4402753640,"[https://openalex.org/W2517229335, https://ope...",True,Recent literature has demonstrated that vision...,True,"[W3131500599, W2963163009, W2015461918, W42255...","[W2517229335, W2525579820, W2896457183, W29138..."
8,/cluster/home/lcarretero/workspace/dsl/dsl-res...,Adaptive Softassign via Hadamard-Equipped Sink...,https://doi.org/10.1109/cvpr52733.2024.01670,https://openalex.org/W4402753930,"[https://openalex.org/W1587878450, https://ope...",True,Softassign is a pivotal method in graph matchi...,True,"[W4312361099, W48175873, W2412782625, W1926090...","[W1587878450, W1592360969, W1982143037, W19902..."


In [72]:
# Calculate precision, recall, and F1 scores
def calculate_metrics(gt_refs, pred_refs):
    """Score one prediction against its ground truth using set overlap.

    Args:
        gt_refs: Non-empty collection of ground-truth reference ids.
        pred_refs: Collection of predicted reference ids (may be empty).

    Returns:
        A ``(precision, recall, f1)`` tuple. Duplicates in either input are
        ignored. Precision is 0 when nothing was predicted, and F1 is 0 when
        precision and recall are both zero.

    Raises:
        AssertionError: If ``gt_refs`` is empty.
    """
    assert len(gt_refs) > 0
    expected, predicted = set(gt_refs), set(pred_refs)
    n_hits = len(expected & predicted)

    if predicted:
        precision = n_hits / len(predicted)
    else:
        precision = 0
    recall = n_hits / len(expected)

    # Harmonic mean of precision and recall, guarded against a zero denominator.
    denominator = precision + recall
    if denominator > 0:
        f1 = 2 * precision * recall / denominator
    else:
        f1 = 0

    return precision, recall, f1

# Score every paper: each row yields a (precision, recall, f1) tuple.
metrics = df.apply(lambda row: calculate_metrics(row['GT_refs'], row['predicted_refs']), axis=1)

# Reuse the row labels of `metrics` (the same as df's) so metrics_df stays aligned with df;
# df was filtered without resetting its index, so a fresh 0..n-1 index would not match.
metrics_df = pd.DataFrame(
    metrics.tolist(),
    columns=['precision', 'recall', 'f1'],
    index=metrics.index,
)

# Unweighted average over papers.
mean_precision, mean_recall, mean_f1 = metrics_df[['precision', 'recall', 'f1']].mean()

print(f"Mean Precision: {mean_precision:.4f}")
print(f"Mean Recall: {mean_recall:.4f}")
print(f"Mean F1 Score: {mean_f1:.4f}")

Mean Precision: 0.2445
Mean Recall: 0.0966
Mean F1 Score: 0.1349


In [None]:
## Correct for missing abstracts