In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm

## Create vector DB

### Testing HF embedding model

In [27]:
from importlib import reload
from semantic_search import store
reload(store)
from semantic_search.store import LocalEmbeddingModel, FAISSDocumentStore



### Using scraped abstracts

In [29]:
model = LocalEmbeddingModel(chunk_size=256)
store = FAISSDocumentStore(model, db_dir='/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/db/references-1')

if not store.load_index():
    docs = pd.read_csv('/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/metadata/openalex-refs-abstracts.csv')
    docs.loc[docs.abstract.isna(), 'abstract'] = ''
    docs['has_abstract'] = docs.abstract.apply(len) > 0

    docs['ref_work'] = docs['ref_work'].str.split('/').str[-1]
    docs.rename(columns={'ref_work': 'id', 'abstract': 'text'}, inplace=True)
    docs = docs[docs.has_abstract]

    store.create_index(docs)

2025-04-14 12:41:26.180904: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-14 12:41:30.523603: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Index or document store not found


Token indices sequence length is longer than the specified maximum sequence length for this model (706 > 512). Running this sequence through the model will result in indexing errors
Chunking and encoding: 100%|██████████| 14956/14956 [00:45<00:00, 328.00it/s]


Generating embeddings for 3 chunks...


Generating embeddings: 100%|██████████| 2685/2685 [00:20<00:00, 133.05it/s]


In [31]:
store.search("CLIP related works", top_k=5)

Chunking and encoding: 100%|██████████| 1/1 [00:00<00:00, 536.29it/s]


[{'rank': 1,
  'score': 0.8255381213288242,
  'document_id': 'W4307106676',
  'chunk_text': 'faster, and lighter. our code is available in https : / / github. com / rmokady / clip _ prefix _ caption.'},
 {'rank': 2,
  'score': 0.793164941719414,
  'document_id': 'W4387323008',
  'chunk_text': 'available at https : / / github. com / wusize / clipself.'},
 {'rank': 3,
  'score': 0.7750313673011238,
  'document_id': 'W3190434222',
  'chunk_text': 'recently, there have been breakthroughs in computer vision ( " cv " ) models that are more generalizable with the advent of models such as clip and align. in this paper, we analyze clip and highlight some of the challenges such models pose. clip reduces the need for task specific training data, potentially opening up many niche tasks to automation. clip also allows its users to flexibly specify image classification classes in natural language, which we find can shift how biases manifest. additionally, through some preliminary probes we find that

### Using Docling papers

In [58]:
from importlib import reload
from semantic_search import store
reload(store)
from semantic_search.store import LocalEmbeddingModel, FAISSDocumentStore

In [27]:
embedding_model = LocalEmbeddingModel(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    device='cuda',
    batch_size=8
)
document_store = FAISSDocumentStore(
    embedding_model=embedding_model,
    db_dir='/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/db/chunk1-txt-all',
)

if not document_store.load_index():
    document_store.create_index('/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt')

Loaded index with 73809 vectors


In [26]:
print(document_store.search("visual", top_k=5)[0]['chunk_text'])

er-efficient visual instruction model. arXiv 2304.15010 , 2023. 1, 3, 4
- [9] Yash Goyal, Tejas Khot, Douglas Summers-Stay, Dhruv Batra, and Devi Parikh. Making the V in VQA matter: Elevating the role...


## Create metadata for CVPR dataset

In [4]:
from importlib import reload
from semantic_search import utils
reload(utils)

from semantic_search.utils import get_title_from_fpath, get_orig_metadata, multithread_apply, parse_list_string, count_references, get_ref_metadata

# Set pandas display options to show wider dataframes
pd.set_option('display.max_colwidth', None)  # Show full text in columns
pd.set_option('display.width', 1000)         # Set the display width
pd.set_option('display.max_columns', 20)     # Show more columns

In [49]:
raw_dir = '/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt'
df = pd.DataFrame([(str(fpath), fpath.name) for fpath in Path(raw_dir).glob("*.txt")], columns=['fpath', 'fname'])
df['title'] = df['fpath'].apply(get_title_from_fpath)
df['doi', 'oaid', 'refs_oaid'] = multithread_apply(df['title'].values, get_orig_metadata, n_workers=5)
df['refs_doi'] = ''
# df.to_csv('/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/metadata/openalex-ids+refs.csv', index=False)

  0%|          | 0/1142 [00:00<?, ?it/s]

Unnamed: 0,fpath,fname,title,doi,oaid
0,/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt/Bai_Fixed_Point_Diffusion_Models_CVPR_2024_paper.txt,Bai_Fixed_Point_Diffusion_Models_CVPR_2024_paper.txt,Fixed Point Diffusion Models,https://doi.org/10.1063/1.2121687,https://openalex.org/W2000456051
1,/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt/Li_BEVNeXt_Reviving_Dense_BEV_Frameworks_for_3D_Object_Detection_CVPR_2024_paper.txt,Li_BEVNeXt_Reviving_Dense_BEV_Frameworks_for_3D_Object_Detection_CVPR_2024_paper.txt,BEVNeXt: Reviving Dense BEV Frameworks for 3D Object Detection,https://doi.org/10.1109/cvpr52733.2024.01901,https://openalex.org/W4402727763
2,/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt/Savchenko_Leveraging_Pre-trained_Multi-task_Deep_Models_for_Trustworthy_Facial_Analysis_in_CVPRW_2024_paper.txt,Savchenko_Leveraging_Pre-trained_Multi-task_Deep_Models_for_Trustworthy_Facial_Analysis_in_CVPRW_2024_paper.txt,Leveraging Pre-trained Multi-task Deep Models for Trustworthy Facial Analysis in Affective Behaviour Analysis in-the-Wild,https://doi.org/10.1109/cvprw63382.2024.00473,https://openalex.org/W4402916217


### Retrieving abstracts of cited works
(only on subset of papers whose references we found using OpenAlex)

In [33]:
# Load data
df = pd.read_csv('/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/metadata/openalex-ids+refs.csv')
df['referenced_works'] = df['referenced_works'].apply(parse_list_string)
df[['total_references', 'references_in_dataset']] = df.apply(lambda x: count_references(x, df), axis=1, result_type='expand')

# Retrieve references via OpenAlex API
all_refs = pd.Series(np.concatenate(df.referenced_works.values)).unique()
all_refs_batched = [all_refs[i:i+100] for i in range(0, len(all_refs), 100)]
results = multithread_apply(all_refs_batched, get_ref_metadata, n_workers=5)

ref_df = pd.DataFrame(np.concatenate(results), columns=['oaid', 'title', 'abstract', 'type', 'topic', 'domain', 'field', 'subfield'])
ref_df.to_csv('/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/metadata/openalex-refs-abstracts.csv', index=False)
ref_df.head()

100%|██████████| 187/187 [00:44<00:00,  4.21it/s]


Unnamed: 0,oaid,title,abstract,type,topic,domain,field,subfield
0,https://openalex.org/W2194775991,Deep Residual Learning for Image Recognition,"Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, instead of learning unreferenced functions. We provide comprehensive empirical evidence showing that these residual networks are easier to optimize, and can gain accuracy from considerably increased depth. On the ImageNet dataset we evaluate residual nets with a depth of up to 152 layers - 8× deeper than VGG nets [40] but still having lower complexity. An ensemble of these residual nets achieves 3.57% error on the ImageNet test set. This result won the 1st place on the ILSVRC 2015 classification task. We also present analysis on CIFAR-10 with 100 and 1000 layers. The depth of representations is of central importance for many visual recognition tasks. Solely due to our extremely deep representations, we obtain a 28% relative improvement on the COCO object detection dataset. Deep residual nets are foundations of our submissions to ILSVRC & COCO 2015 competitions1, where we also won the 1st places on the tasks of ImageNet detection, ImageNet localization, COCO detection, and COCO segmentation.",article,Advanced Neural Network Applications,Physical Sciences,Computer Science,Computer Vision and Pattern Recognition
1,https://openalex.org/W2108598243,ImageNet: A large-scale hierarchical image database,"The explosion of image data on the Internet has the potential to foster more sophisticated and robust models and algorithms to index, retrieve, organize and interact with images and multimedia data. But exactly how such data can be harnessed and organized remains a critical problem. We introduce here a new database called ""ImageNet"", a large-scale ontology of images built upon the backbone of the WordNet structure. ImageNet aims to populate the majority of the 80,000 synsets of WordNet with an average of 500–1000 clean and full resolution images. This will result in tens of millions of annotated images organized by the semantic hierarchy of WordNet. This paper offers a detailed analysis of ImageNet in its current state: 12 subtrees with 5247 synsets and 3.2 million images in total. We show that ImageNet is much larger in scale and diversity and much more accurate than the current image datasets. Constructing such a large-scale database is a challenging task. We describe the data collection scheme with Amazon Mechanical Turk. Lastly, we illustrate the usefulness of ImageNet through three simple applications in object recognition, image classification and automatic object clustering. We hope that the scale, accuracy, diversity and hierarchical structure of ImageNet can offer unparalleled opportunities to researchers in the computer vision community and beyond.",article,Advanced Image and Video Retrieval Techniques,Physical Sciences,Computer Science,Computer Vision and Pattern Recognition
2,https://openalex.org/W1976499671,Comparison of simple potential functions for simulating liquid water,"Classical Monte Carlo simulations have been carried out for liquid water in the NPT ensemble at 25 °C and 1 atm using six of the simpler intermolecular potential functions for the water dimer: Bernal–Fowler (BF), SPC, ST2, TIPS2, TIP3P, and TIP4P. Comparisons are made with experimental thermodynamic and structural data including the recent neutron diffraction results of Thiessen and Narten. The computed densities and potential energies are in reasonable accord with experiment except for the original BF model, which yields an 18% overestimate of the density and poor structural results. The TIPS2 and TIP4P potentials yield oxygen–oxygen partial structure functions in good agreement with the neutron diffraction results. The accord with the experimental OH and HH partial structure functions is poorer; however, the computed results for these functions are similar for all the potential functions. Consequently, the discrepancy may be due to the correction terms needed in processing the neutron data or to an effect uniformly neglected in the computations. Comparisons are also made for self-diffusion coefficients obtained from molecular dynamics simulations. Overall, the SPC, ST2, TIPS2, and TIP4P models give reasonable structural and thermodynamic descriptions of liquid water and they should be useful in simulations of aqueous solutions. The simplicity of the SPC, TIPS2, and TIP4P functions is also attractive from a computational standpoint.",article,Chemical and Physical Properties in Aqueous Solutions,Physical Sciences,Chemical Engineering,Filtration and Separation
3,https://openalex.org/W1861492603,Microsoft COCO: Common Objects in Context,,book-chapter,Advanced Neural Network Applications,Physical Sciences,Computer Science,Computer Vision and Pattern Recognition
4,https://openalex.org/W2565639579,Feature Pyramid Networks for Object Detection,"Feature pyramids are a basic component in recognition systems for detecting objects at different scales. But pyramid representations have been avoided in recent object detectors that are based on deep convolutional networks, partially because they are slow to compute and memory intensive. In this paper, we exploit the inherent multi-scale, pyramidal hierarchy of deep convolutional networks to construct feature pyramids with marginal extra cost. A top-down architecture with lateral connections is developed for building high-level semantic feature maps at all scales. This architecture, called a Feature Pyramid Network (FPN), shows significant improvement as a generic feature extractor in several applications. Using a basic Faster R-CNN system, our method achieves state-of-the-art single-model results on the COCO detection benchmark without bells and whistles, surpassing all existing single-model entries including those from the COCO 2016 challenge winners. In addition, our method can run at 5 FPS on a GPU and thus is a practical and accurate solution to multi-scale object detection. Code will be made publicly available.",preprint,Advanced Neural Network Applications,Physical Sciences,Computer Science,Computer Vision and Pattern Recognition


## Misc

### Check papers with missing abstract

In [5]:
import re

raw_dir = '/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt'

def extract_abstract(fpath: str):
    doc_text = fpath.read_text(encoding="utf-8")
    abstract_match = re.search(r'## Abstract\n\n(.*?)(?=\n\n## \d+\.)', doc_text, re.DOTALL)
    return abstract_match.group(1) if abstract_match else ''

for fpath in Path(raw_dir).glob("*.txt"):
    if extract_abstract(fpath) == '':
        print(fpath)

# abstracts =[check_single_doc(fpath) for fpath in Path(raw_dir).glob("*.txt")]

/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt/Korycki_Class-Incremental_Mixture_of_Gaussians_for_Deep_Continual_Learning_CVPRW_2024_paper.txt
/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt/Hoque_IrrNet_Spatio-Temporal_Segmentation_Guided_Classification_for_Irrigation_Mapping_CVPRW_2024_paper.txt
/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt/Zhang_MOHO_Learning_Single-view_Hand-held_Object_Reconstruction_with_Multi-view_Occlusion-Aware_Supervision_CVPR_2024_paper.txt
/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt/Paissan_Structured_Sparse_Back-propagation_for_Lightweight_On-Device_Continual_Learning_on_Microcontroller_CVPRW_2024_paper.txt
/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt/Halawa_Multi-Task_Multi-Modal_Self-Super

### Using SemanticScholar API to retrieve references not available via OpenAlex

In [68]:
from semantic_search.utils import parse_list_string

df = pd.read_csv('/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/metadata/openalex-ids+refs.csv')
df['refs_oaid'] = df['refs_oaid'].apply(parse_list_string)
oa_df = df[['fpath', 'title', 'doi', 'oaid', 'refs_oaid']]

metadata_fpath = '/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/metadata/semschol-ids.csv'
df = pd.read_csv(metadata_fpath)
semschol_df = df[['fpath', 'paperId']]

combined_df = pd.merge(oa_df, semschol_df, on='fpath', how='left')
combined_df = combined_df[combined_df.refs_oaid.apply(len) == 0].drop(columns=['refs_oaid'])

combined_df.head(1)

Unnamed: 0,fpath,title,doi,oaid,paperId
2,/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt/Savchenko_Leveraging_Pre-trained_Multi-task_Deep_Models_for_Trustworthy_Facial_Analysis_in_CVPRW_2024_paper.txt,Leveraging Pre-trained Multi-task Deep Models for Trustworthy Facial Analysis in Affective Behaviour Analysis in-the-Wild,https://doi.org/10.1109/cvprw63382.2024.00473,https://openalex.org/W4402916217,9c50996ba35eb605cb9bcd5835103b441cf38e07


In [None]:
from semanticscholar import SemanticScholar
from semanticscholar.SemanticScholarException import ObjectNotFoundException
from functools import partial

paper_id = combined_df.iloc[0].paperId
print(f'Title: {combined_df.iloc[0].title}')
sch = SemanticScholar()

def get_referenced_dois(sch: SemanticScholar, paper_id: str):
    
    try:
          # 'paperId', 'title' TODO: Do title check with OA data?
        raw = sch.get_paper_references(paper_id=paper_id, fields=['externalIds'], limit=1000)
    except ObjectNotFoundException:
        return []

    dois = []
    for item in raw.items:
        external_ids = item['citedPaper'].get('externalIds')
        if external_ids is None: continue
        doi = external_ids.get('DOI', None)
        if doi is None: continue
        dois.append(doi)
    return dois

res = multithread_apply(combined_df.paperId.values[:5], partial(get_referenced_dois, sch), n_workers=5)

Title: Leveraging Pre-trained Multi-task Deep Models for Trustworthy Facial Analysis in Affective Behaviour Analysis in-the-Wild


100%|██████████| 5/5 [00:00<00:00,  8.54it/s]


In [52]:
res

[['10.48550/arXiv.2401.11605',
  '10.48550/arXiv.2401.08639',
  '10.48550/arXiv.2310.18605',
  '10.1109/WACV57701.2024.00532',
  '10.48550/arXiv.2304.14108',
  '10.1109/ICCV51070.2023.00387',
  '10.1109/CVPR52729.2023.00043',
  '10.48550/arXiv.2210.12867',
  '10.48550/arXiv.2210.08402',
  '10.48550/arXiv.2206.00927',
  '10.48550/arXiv.2206.00364',
  '10.48550/arXiv.2205.15019',
  '10.48550/arXiv.2204.13902',
  '10.1109/CVPR52688.2022.00070',
  '10.1109/CVPR52688.2022.01042',
  '10.1609/aaai.v36i6.20619',
  '10.1109/tci.2021.3118944',
  '10.1142/11590',
  '10.18653/v1/2021.findings-emnlp.344',
  '10.1007/978-3-319-24574-4_28',
  '10.1109/CVPR.2009.5206848',
  '10.1007/978-94-009-8177-5',
  '10.1145/321296.321305',
  '10.1090/S0025-5718-1965-0198670-6',
  '10.1007/978-3-642-01492-5_2',
  '10.1109/5.726791',
  '10.1007/13663.1687-1812',
  '10.2307/3608793',
  '10.1098/rstl.1685.0053'],
 ['10.48550/arXiv.2310.15670',
  '10.48550/arXiv.2308.09616',
  '10.1109/ICCV51070.2023.01703',
  '10.11

In [45]:
import pyalex
pyalex.Works().filter_or(doi=dois).select(['id', 'doi']).get(per_page=len(dois))

[{'id': 'https://openalex.org/W2963839617',
  'doi': 'https://doi.org/10.1109/fg.2018.00020'},
 {'id': 'https://openalex.org/W2051297709',
  'doi': 'https://doi.org/10.1016/j.imavis.2014.06.002'},
 {'id': 'https://openalex.org/W2745497104',
  'doi': 'https://doi.org/10.1109/taffc.2017.2740923'},
 {'id': 'https://openalex.org/W2713788831',
  'doi': 'https://doi.org/10.1109/cvprw.2017.248'},
 {'id': 'https://openalex.org/W3155551469',
  'doi': 'https://doi.org/10.1007/s00521-021-06012-8'},
 {'id': 'https://openalex.org/W4285250231',
  'doi': 'https://doi.org/10.1109/taffc.2022.3188390'},
 {'id': 'https://openalex.org/W2798536775',
  'doi': 'https://doi.org/10.1007/s11263-019-01158-4'},
 {'id': 'https://openalex.org/W3126750668',
  'doi': 'https://doi.org/10.1109/fg47880.2020.00126'},
 {'id': 'https://openalex.org/W4292794012',
  'doi': 'https://doi.org/10.1109/cvprw56347.2022.00259'},
 {'id': 'https://openalex.org/W2548529926',
  'doi': 'https://doi.org/10.1145/2993148.2997627'},
 {'id':

In [None]:
pyalex.Works().filter_or(openalex_id=['doi:10.7717/peerj.4375']).get(per_page=1)

QueryError: 'doi:10.7717/peerj.4375' is not a valid OpenAlex ID.

: 

## Testing SemanticScholar API

### What to retrieve:
Database papers:
- paperId, externalIds, abstract, referenceCount

Then get references for each of those papers:
- paperId, externalIds, abstract

In [59]:
from semanticscholar import SemanticScholar
from semanticscholar.SemanticScholarException import ObjectNotFoundException

def get_referenced_papers(sch: SemanticScholar, paper_id: str):
    try:
        raw = sch.get_paper_references(paper_id=paper_id, fields=['paperId', 'title', 'externalIds', 'abstract'], limit=1000)
    except ObjectNotFoundException:
        return [{}]
    
    res = []
    for item in raw.items:
        tmp = {
            'originalPaperId': paper_id, 
            'paperId': item['citedPaper']['paperId'], 
            'title': item['citedPaper']['title'],
            'abstract': item['citedPaper']['abstract']
        }
        if 'externalIds' in item['citedPaper'] and item['citedPaper']['externalIds'] is not None:
            tmp.update({f'externalIds.{k}': v for k, v in item['citedPaper']['externalIds'].items()})
        res.append(tmp)
    return res

sch = SemanticScholar()

paper_id = df.iloc[7].paperId
res = get_referenced_papers(sch, paper_id)
df2 = pd.DataFrame(res)
print(f'{(~df2.abstract.isna()).sum()}/{len(df2)} papers have abstract ({(1-df2.abstract.isna().sum()/len(df2))*100:.0f}%)')

1/43 papers have abstract (2%)


In [72]:
df2[['originalPaperId', 'paperId', 'title', 'abstract']].head()

Unnamed: 0,originalPaperId,paperId,title,abstract
0,7433da608f60204cf0845fbd26cb83982e891875,f09e3845b9857b0c1a251bdf0f572eaa1519cc2f,Efficient Loss Function by Minimizing the Detrimental Effect of Floating-Point Errors on Gradient-Based Attacks,
1,7433da608f60204cf0845fbd26cb83982e891875,163b4d6a79a5b19af88b8585456363340d9efd04,GPT-4 Technical Report,
2,7433da608f60204cf0845fbd26cb83982e891875,a1e7b7a560b493c235eed2429cfbb9c12324ff4d,Scaling Adversarial Training to Large Perturbation Bounds,
3,7433da608f60204cf0845fbd26cb83982e891875,426b0ee8c723aa6086402f743d5cbb447622d9b6,When Adversarial Training Meets Vision Transformers: Recipes from Training to Architecture,
4,7433da608f60204cf0845fbd26cb83982e891875,c570475cab4c8d0662144c4d414c17e776d39409,A Light Recipe to Train Robust Vision Transformers,


In [70]:
res = sch.get_paper(paper_id='f09e3845b9857b0c1a251bdf0f572eaa1519cc2f')
res.abstract

"Attackers can deceive neural networks by adding human imperceptive perturbations to their input data; this reveals the vulnerability and weak robustness of current deep-learning networks. Many attack techniques have been proposed to evaluate the model's robustness. Gradient-based attacks suffer from severely overestimating the robustness. This paper identifies that the relative error in calculated gradients caused by floating-point errors, including floating-point underflow and rounding errors, is a fundamental reason why gradient-based attacks fail to accurately assess the model's robustness. Although it is hard to eliminate the relative error in the gradients, we can control its effect on the gradient-based attacks. Correspondingly, we propose an efficient loss function by minimizing the detrimental impact of the floating-point errors on the attacks. Experimental results show that it is more efficient and reliable than other loss functions when examined across a wide range of defenc

In [82]:
res = sch.get_paper(paper_id='7433da608f60204cf0845fbd26cb83982e891875', fields=['title', 'paperId', 'externalIds', 'abstract', 'references.title', 'references.paperId', 'references.abstract'])
res

{'paperId': '7433da608f60204cf0845fbd26cb83982e891875', 'externalIds': {'DBLP': 'conf/cvpr/JainD24', 'DOI': '10.1109/CVPR52733.2024.02336', 'CorpusId': 272722903}, 'title': 'Towards Understanding and Improving Adversarial Robustness of Vision Transformers', 'abstract': 'Recent literature has demonstrated that vision transformers (VITs) exhibit superior performance compared to convolutional neural networks (CNNs). The majority of recent research on adversarial robustness, however, has predomi-nantly focused on CNNs. In this work, we bridge this gap by analyzing the effectiveness of existing attacks on VITs. We demonstrate that due to the softmax computations in every attention block in VITs, they are inherently vulnerable to floating point underflow errors. This can lead to a gradient masking effect resulting in suboptimal attack strength of well-known attacks, like PGD, Carlini and Wagner (CW) and GAMA. Motivated by this, we propose Adaptive Attention Scaling (AAS) attack that can auto

In [4]:
import requests
import json

r = requests.post(
    'https://api.semanticscholar.org/graph/v1/paper/batch',
    params={'fields': 'references.abstract'},  # 'title,paperId,externalIds,abstract,references.title,references.paperId,
    json={'ids': ['7433da608f60204cf0845fbd26cb83982e891875']}
)
res = r.json()[0]

In [5]:
res['references']

[{'paperId': 'f09e3845b9857b0c1a251bdf0f572eaa1519cc2f',
  'title': 'Efficient Loss Function by Minimizing the Detrimental Effect of Floating-Point Errors on Gradient-Based Attacks',
  'abstract': None,
  'openAccessPdf': {'url': '',
   'status': 'CLOSED',
   'license': None,
   'disclaimer': "Notice: This paper's abstract has been elided by the publisher. Paper or abstract available at https://api.unpaywall.org/v2/10.1109/CVPR52729.2023.00395?email=<INSERT_YOUR_EMAIL> or https://doi.org/10.1109/CVPR52729.2023.00395, which is subject to the license by the author or copyright owner provided with this content. Please go to the source to verify the license and copyright information for your use."},
  'authors': [{'authorId': '2117164874', 'name': 'Yunrui Yu'},
   {'authorId': '2153074991', 'name': 'Chengjie Xu'}]},
 {'paperId': '163b4d6a79a5b19af88b8585456363340d9efd04',
  'title': 'GPT-4 Technical Report',
  'abstract': None,
  'openAccessPdf': {'url': '',
   'status': None,
   'license'

In [8]:
r = requests.post(
    'https://api.semanticscholar.org/graph/v1/paper/batch',
    params={'fields': 'abstract'},  # 'title,paperId,externalIds,abstract,references.title,references.paperId,
    json={'ids': ['f09e3845b9857b0c1a251bdf0f572eaa1519cc2f']}
)
res = r.json()[0]
res

{'paperId': 'f09e3845b9857b0c1a251bdf0f572eaa1519cc2f',
 'title': 'Efficient Loss Function by Minimizing the Detrimental Effect of Floating-Point Errors on Gradient-Based Attacks',
 'abstract': "Attackers can deceive neural networks by adding human imperceptive perturbations to their input data; this reveals the vulnerability and weak robustness of current deep-learning networks. Many attack techniques have been proposed to evaluate the model's robustness. Gradient-based attacks suffer from severely overestimating the robustness. This paper identifies that the relative error in calculated gradients caused by floating-point errors, including floating-point underflow and rounding errors, is a fundamental reason why gradient-based attacks fail to accurately assess the model's robustness. Although it is hard to eliminate the relative error in the gradients, we can control its effect on the gradient-based attacks. Correspondingly, we propose an efficient loss function by minimizing the detr

In [77]:
sch.get_paper_references(paper_id='7433da608f60204cf0845fbd26cb83982e891875', fields=['title', 'paperId', 'externalIds', 'abstract'], limit=1).items[0]['citedPaper']

BadQueryParametersException: Unrecognized or unsupported fields: [s, r, t, b, a, c]

In [51]:
tot = []
for paper_id in tqdm(df.paperId.iloc[:50]):
    res = get_referenced_papers(sch, paper_id)
    tot.extend(res)

df2 = pd.DataFrame(tot)

  0%|          | 0/50 [00:00<?, ?it/s]

In [56]:
df2[df2.abstract.isna()].iloc[:10]

Unnamed: 0,originalPaperId,paperId,abstract,externalIds.ArXiv,externalIds.DBLP,externalIds.DOI,externalIds.CorpusId,externalIds.MAG,externalIds.PubMed,externalIds.PubMedCentral,externalIds.ACL
0,fdb679246a2125dad1628081e45efb7a1c80f2c7,9b91b3031ea159e4964d18b2ce703168660ecf46,,2401.11605,journals/corr/abs-2401-11605,10.48550/arXiv.2401.11605,267069338.0,,,,
1,fdb679246a2125dad1628081e45efb7a1c80f2c7,0e8f1bb91bb4502966fa5e91e0610832dfe4240e,,2401.08639,journals/corr/abs-2401-08639,10.48550/arXiv.2401.08639,267028569.0,,,,
2,fdb679246a2125dad1628081e45efb7a1c80f2c7,b994cf51a8c7cf5c13358a6110d7304d6d04c881,,2310.18605,journals/corr/abs-2310-18605,10.48550/arXiv.2310.18605,264590159.0,,,,
3,fdb679246a2125dad1628081e45efb7a1c80f2c7,5003fdf35af631d4cb17fd3c1ce2469f665064f1,,2305.08891,journals/corr/abs-2305-08891,10.1109/WACV57701.2024.00532,258714883.0,,,,
4,fdb679246a2125dad1628081e45efb7a1c80f2c7,f9570989919338079088270a9cf1a7afc8db8093,,2304.14108,journals/corr/abs-2304-14108,10.48550/arXiv.2304.14108,258352812.0,,,,
5,fdb679246a2125dad1628081e45efb7a1c80f2c7,736973165f98105fec3729b7db414ae4d80fcbeb,,2212.09748,journals/corr/abs-2212-09748,10.1109/ICCV51070.2023.00387,254854389.0,,,,
6,fdb679246a2125dad1628081e45efb7a1c80f2c7,e2e34dc10482795a94e401c343a78cb333960996,,2211.13874,journals/corr/abs-2211-13874,10.1109/CVPR52729.2023.00043,254018271.0,,,,
7,fdb679246a2125dad1628081e45efb7a1c80f2c7,7c8979a99c1a9b214fc6762ae8e73ee4b39749c0,,2210.12867,conf/nips/PokleGK22,10.48550/arXiv.2210.12867,253098319.0,,,,
8,fdb679246a2125dad1628081e45efb7a1c80f2c7,e5c8960eb2ec034ffbd353ef39fd1cb541d3c7c9,,2210.08402,conf/nips/SchuhmannBVGWCC22,10.48550/arXiv.2210.08402,252917726.0,,,,
9,fdb679246a2125dad1628081e45efb7a1c80f2c7,4530c25da949bb2185c50663158ef19d52e3c6b5,,2206.00927,conf/nips/0011ZB0L022,10.48550/arXiv.2206.00927,249282317.0,,,,


In [54]:
print(f'{(~df2.abstract.isna()).sum()}/{len(df2)} papers have abstract ({(1-df2.abstract.isna().sum()/len(df2))*100:.0f}%)')

172/2576 papers have abstract (7%)


In [58]:

sch.get_paper_references(paper_id=paper_id, fields=['title', 'paperId', 'externalIds', 'abstract'], limit=1).items[0]['citedPaper']
df2 = pd.DataFrame(res)
df2.abstract.isna().sum()/len(df2)

0.85

## Testing crossref API

In [2]:
import pandas as pd

df = pd.read_csv('/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/metadata/openalex-ids+refs.csv')
df.head()

Unnamed: 0,fpath,title,doi,oaid,referenced_works
0,/cluster/home/lcarretero/workspace/dsl/dsl-res...,Fixed Point Diffusion Models,https://doi.org/10.1063/1.2121687,https://openalex.org/W2000456051,"['https://openalex.org/W1504980292', 'https://..."
1,/cluster/home/lcarretero/workspace/dsl/dsl-res...,BEVNeXt: Reviving Dense BEV Frameworks for 3D ...,https://doi.org/10.1109/cvpr52733.2024.01901,https://openalex.org/W4402727763,"['https://openalex.org/W1861492603', 'https://..."
2,/cluster/home/lcarretero/workspace/dsl/dsl-res...,Leveraging Pre-trained Multi-task Deep Models ...,https://doi.org/10.1109/cvprw63382.2024.00473,https://openalex.org/W4402916217,[]
3,/cluster/home/lcarretero/workspace/dsl/dsl-res...,Insights from the Use of Previously Unseen Neu...,https://doi.org/10.48550/arxiv.2404.02189,https://openalex.org/W4393967825,[]
4,/cluster/home/lcarretero/workspace/dsl/dsl-res...,Efficient local correlation volume for unsuper...,https://doi.org/10.1109/cvprw63382.2024.00049,https://openalex.org/W4402904316,"['https://openalex.org/W1513100184', 'https://..."


In [None]:
doi = '10.1109/TASC.2010.2088091'
crossref_url = f'https://api.crossref.org/works/{doi}'
response = requests.get(crossref_url)
data = response.json()
data['message']['abstract']


{'indexed': {'date-parts': [[2025, 2, 21]],
  'date-time': '2025-02-21T10:19:07Z',
  'timestamp': 1740133147234,
  'version': '3.37.3'},
 'reference-count': 8,
 'publisher': 'Institute of Electrical and Electronics Engineers (IEEE)',
 'issue': '3',
 'license': [{'start': {'date-parts': [[2011, 6, 1]],
    'date-time': '2011-06-01T00:00:00Z',
    'timestamp': 1306886400000},
   'content-version': 'vor',
   'delay-in-days': 0,
   'URL': 'https://ieeexplore.ieee.org/Xplorehelp/downloads/license-information/IEEE.html'}],
 'funder': [{'DOI': '10.13039/100000015',
   'name': 'U.S. Department of Energy',
   'doi-asserted-by': 'publisher',
   'award': ['DEFG0210ER41650'],
   'id': [{'id': '10.13039/100000015',
     'id-type': 'DOI',
     'asserted-by': 'publisher'}]}],
 'content-domain': {'domain': [], 'crossmark-restriction': False},
 'short-container-title': ['IEEE Trans. Appl. Supercond.'],
 'published-print': {'date-parts': [[2011, 6]]},
 'abstract': "<jats:p>A 1.3 GHz test cavity has been

In [18]:
name_of_paper = df.iloc[1].title
crossref_url = f'https://api.crossref.org/works?query.bibliographic={name_of_paper}&rows=1'
response = requests.get(crossref_url)
data = response.json()
data.get('message', {}).get('items', [None])[0]
data

{'status': 'ok',
 'message-type': 'work-list',
 'message-version': '1.0.0',
 'message': {'facets': {},
  'total-results': 2004128,
  'items': [{'indexed': {'date-parts': [[2025, 4, 16]],
     'date-time': '2025-04-16T08:30:45Z',
     'timestamp': 1744792245500,
     'version': '3.28.0'},
    'reference-count': 73,
    'publisher': 'IEEE',
    'license': [{'start': {'date-parts': [[2024, 6, 16]],
       'date-time': '2024-06-16T00:00:00Z',
       'timestamp': 1718496000000},
      'content-version': 'stm-asf',
      'delay-in-days': 0,
      'URL': 'https://doi.org/10.15223/policy-029'},
     {'start': {'date-parts': [[2024, 6, 16]],
       'date-time': '2024-06-16T00:00:00Z',
       'timestamp': 1718496000000},
      'content-version': 'stm-asf',
      'delay-in-days': 0,
      'URL': 'https://doi.org/10.15223/policy-037'}],
    'content-domain': {'domain': [], 'crossmark-restriction': False},
    'published-print': {'date-parts': [[2024, 6, 16]]},
    'DOI': '10.1109/cvpr52733.2024.01

In [35]:
ref_df = pd.read_csv('/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/metadata/openalex-refs-abstracts.csv')
ref_df.head(1)

Unnamed: 0,oaid,title,abstract,type,topic,domain,field,subfield
0,https://openalex.org/W2194775991,Deep Residual Learning for Image Recognition,"Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, instead of learning unreferenced functions. We provide comprehensive empirical evidence showing that these residual networks are easier to optimize, and can gain accuracy from considerably increased depth. On the ImageNet dataset we evaluate residual nets with a depth of up to 152 layers - 8× deeper than VGG nets [40] but still having lower complexity. An ensemble of these residual nets achieves 3.57% error on the ImageNet test set. This result won the 1st place on the ILSVRC 2015 classification task. We also present analysis on CIFAR-10 with 100 and 1000 layers. The depth of representations is of central importance for many visual recognition tasks. Solely due to our extremely deep representations, we obtain a 28% relative improvement on the COCO object detection dataset. Deep residual nets are foundations of our submissions to ILSVRC & COCO 2015 competitions1, where we also won the 1st places on the tasks of ImageNet detection, ImageNet localization, COCO detection, and COCO segmentation.",article,Advanced Neural Network Applications,Physical Sciences,Computer Science,Computer Vision and Pattern Recognition


In [36]:
import requests
from tqdm import trange

fields_of_interest = ','.join(['DOI', 'title', 'type','references-count', 'reference', 'abstract'])

all_raw = []
for i in trange(200):
    name_of_paper = ref_df.iloc[i].title if i > 0 else 'Ultra-Gradient Test Cavity for Testing SRF Wafer Samples'
    crossref_url = f'https://api.crossref.org/works?query.bibliographic={name_of_paper}&rows=2&select={fields_of_interest}'
    response = requests.get(crossref_url)
    data = response.json()
    raw = data.get('message', {}).get('items', [None])[0]  # FIXME: Do proper relevance comparison as advised
    all_raw.append(raw)

  0%|          | 0/200 [00:00<?, ?it/s]

100%|██████████| 200/200 [01:37<00:00,  2.05it/s]


In [37]:
ref_cnts = pd.Series([raw.get('references-count', 0) for raw in all_raw])
ref_cnts.values

array([  8,   0,  29,   0,  30,  14,   0,   0,  24,  10,   0,  91,  92,
        68,  23,  92,  28,  66, 100,  36,   0,   0,   0,  52,  97,   0,
         0,  33, 236,  24,   0,  86,  24,  55,  41,  39,  70,  31,  35,
        41, 125,  87,  13,  26,   0,  17,  58,  43,  73,  67,  62,  51,
        27,   0,  62,  48,  35,  28,  89,  57,  71,  81,  83,   0, 152,
        53,   0, 100, 136,  61,   0,   0, 121,  42, 117,  16,  55,  65,
        54,  22,  37,  76,   8,  28,  24,  56,  50,  25,   0,   0,   0,
        16,  26,  21, 106,  38,  29,  39,   0,  58,   0,  76,   0,   0,
        63,  48,   0,   0,  38,  33,  18,  44,   9,  60,   0,  53,  19,
         0,  47,  95,   0,  39,   0,   0,  64,   0,  21,  60,  79,   1,
         0, 287,   0,   0,  55, 168,  12,   0,  33,   0,   0,  61,  53,
        54,  36,  38,  62,  43,  56,  61,  30,  81,  60,  62,   0,  40,
        47,  21, 210,   0,   0,  43,  36,  51,  44,  48,   0,  60,  24,
         0,  47,  77,  56,  57,  19,  45,  48,  27,  59,  64,  3

In [39]:
all_raw[3]

{'DOI': '10.21123/bsj.2024.11089',
 'title': ['Improved Deep Perceptual Hashing Algorithm (IDP-HA), Information Retrieval System, Microsoft Common Objects In Context (MS COCO), Remote Cloud Computing, Computer science, information systems',
  'نظام محسّن لاسترجاع المعلومات قائم على خوارزمية التجزئة الإدراكية العميقة للحوسبة السحابية عن بعد'],
 'type': 'journal-article',
 'references-count': 0,
 'abstract': '<jats:p>يمكن أن يعزى نمو استرجاع المعلومات والخدمات المرتبطة بها إلى التقدم التقني. وفي الوقت نفسه، تتأثر الطرق التقليدية لاسترجاع المعلومات بقيود الأداء والدقة وقابلية التوسع. يعد نظام استرجاع المعلومات للحوسبة السحابية البعيدة الذي يعتمد على خوارزمية التجزئة الإدراكية العميقة المحسنة (IDP-HA) أحد الحلول التي تم تطويرها لحل هذه القيود. تُستخدم الأنظمة على نطاق واسع نظرًا لقدرتها على التعرف على الأنماط المعقدة في البيانات. لا تزال دقة قياس تشابه المعلومات غير متوفرة بسبب التعقيد الكامن في البيانات وطرق القياس. يستخدم أسلوب التجزئة الإدراكي العميق أطر عمل الشبكة العصبية العميقة (DNN)

In [38]:
abstracts = pd.Series([raw.get('abstract', '') for raw in all_raw])
abstracts.values

array(["<jats:p>A 1.3 GHz test cavity has been designed to test wafer samples of superconducting materials. This mushroom shaped cavity, operating in TE<jats:sub>01</jats:sub>mode, creates a unique distribution of surface fields. The surface magnetic field on the sample wafer is 3.75 times greater than elsewhere on the Niobium cavity surface. This field design is made possible through dielectrically loading the cavity by locating a hemisphere of ultra-pure sapphire just above the sample wafer. The sapphire pulls the fields away from the walls so the maximum field the Nb surface sees is 25% of the surface field on the sample. In this manner, it should be possible to drive the sample wafer well beyond the BCS limit for Niobium while still maintaining a respectable Q. The sapphire's purity must be tested for its loss tangent and dielectric constant to finalize the design of the mushroom test cavity. A sapphire loaded CEBAF cavity has been constructed and tested. The results on the dielect