In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm

## Create vector DB

### Testing HF embedding model

In [27]:
from importlib import reload
from semantic_search import store
reload(store)
from semantic_search.store import LocalEmbeddingModel, FAISSDocumentStore



In [29]:
# Embedding model and document store for the scraped reference abstracts.
model = LocalEmbeddingModel(chunk_size=256)
store = FAISSDocumentStore(model, db_dir='/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/db/references-1')

# Only (re)build the index when load_index() returns a falsy value.
if not store.load_index():
    docs = (
        pd.read_csv('/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/metadata/openalex-refs-abstracts.csv')
        .assign(
            # Missing abstracts become empty strings so len() is always defined.
            abstract=lambda frame: frame.abstract.fillna(''),
            has_abstract=lambda frame: frame.abstract.apply(len) > 0,
            # Keep only the last path segment of the work reference (text after the final '/').
            ref_work=lambda frame: frame['ref_work'].str.split('/').str[-1],
        )
        # The store is fed 'id' / 'text' columns.
        .rename(columns={'ref_work': 'id', 'abstract': 'text'})
        # Drop rows whose abstract is empty.
        .loc[lambda frame: frame.has_abstract]
    )

    store.create_index(docs)

2025-04-14 12:41:26.180904: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-14 12:41:30.523603: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Index or document store not found


Token indices sequence length is longer than the specified maximum sequence length for this model (706 > 512). Running this sequence through the model will result in indexing errors
Chunking and encoding: 100%|██████████| 14956/14956 [00:45<00:00, 328.00it/s]


Generating embeddings for 3 chunks...


Generating embeddings: 100%|██████████| 2685/2685 [00:20<00:00, 133.05it/s]


In [31]:
# Smoke-test retrieval: request the top_k=5 hits for a free-text query and display them.
store.search("CLIP related works", top_k=5)

Chunking and encoding: 100%|██████████| 1/1 [00:00<00:00, 536.29it/s]


[{'rank': 1,
  'score': 0.8255381213288242,
  'document_id': 'W4307106676',
  'chunk_text': 'faster, and lighter. our code is available in https : / / github. com / rmokady / clip _ prefix _ caption.'},
 {'rank': 2,
  'score': 0.793164941719414,
  'document_id': 'W4387323008',
  'chunk_text': 'available at https : / / github. com / wusize / clipself.'},
 {'rank': 3,
  'score': 0.7750313673011238,
  'document_id': 'W3190434222',
  'chunk_text': 'recently, there have been breakthroughs in computer vision ( " cv " ) models that are more generalizable with the advent of models such as clip and align. in this paper, we analyze clip and highlight some of the challenges such models pose. clip reduces the need for task specific training data, potentially opening up many niche tasks to automation. clip also allows its users to flexibly specify image classification classes in natural language, which we find can shift how biases manifest. additionally, through some preliminary probes we find that

### Using Docling papers

In [58]:
from importlib import reload
from semantic_search import store
reload(store)
from semantic_search.store import LocalEmbeddingModel, FAISSDocumentStore

In [27]:
# Embedding model configured explicitly (model name, device, batch size) for the Docling-converted papers.
embedding_model = LocalEmbeddingModel(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    device='cuda',
    batch_size=8
)
# Separate db_dir from the reference-abstract index so the two stores do not share files.
document_store = FAISSDocumentStore(
    embedding_model=embedding_model,
    db_dir='/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/db/chunk1-txt-all',
)

# Build the index from the txt directory only when load_index() returns a falsy value.
# NOTE(review): here create_index receives a directory path, whereas the reference index passes a DataFrame —
# presumably the store accepts both; confirm against semantic_search.store.
if not document_store.load_index():
    document_store.create_index('/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt')

Loaded index with 73809 vectors


In [26]:
# Print only the chunk text of the first hit.
# NOTE(review): assumes search() returns a non-empty list; `[0]` fails otherwise.
print(document_store.search("visual", top_k=5)[0]['chunk_text'])

er-efficient visual instruction model. arXiv 2304.15010 , 2023. 1, 3, 4
- [9] Yash Goyal, Tejas Khot, Douglas Summers-Stay, Dhruv Batra, and Devi Parikh. Making the V in VQA matter: Elevating the role...


### Using scraped abstracts

In [3]:
# Load the scraped reference abstracts and preview the first rows.
# NOTE: this rebinds the notebook-global `df`, which other cells also reuse for different tables.
fpath = '/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/metadata/openalex-refs-abstracts.csv'
df = pd.read_csv(fpath)
df.head()

Unnamed: 0,ref_work,abstract,type
0,W1504980292,Classical Monte Carlo simulations have been ca...,article
1,W1646044445,Views Icon Views Article contents Figures & ta...,article
2,W1966745391,ADVERTISEMENT RETURN TO ISSUEPREVArticleNEXTTh...,article
3,W1976499671,"In 1995, the International Association for the...",article
4,W1976755117,The ability of simple potential functions to r...,article


## Create metadata for CVPR dataset

In [14]:
from importlib import reload
from semantic_search import utils
reload(utils)

from semantic_search.utils import get_title_from_fpath, get_metadata, multithread_apply, parse_list_string, count_references, get_abstracts

In [10]:
# Set pandas display options to show wider dataframes
pd.set_option('display.max_colwidth', None)  # Show full text in columns
pd.set_option('display.width', 1000)         # Set the display width
pd.set_option('display.max_columns', 20)     # Show more columns

In [49]:
# Build one row per converted paper (.txt file), derive its title, then look up metadata for each title.
raw_dir = '/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt'
df = pd.DataFrame([(str(fpath), fpath.name) for fpath in Path(raw_dir).glob("*.txt")], columns=['fpath', 'fname'])
df['title'] = df['fpath'].apply(get_title_from_fpath)
# `df['doi', 'oaid', 'referenced_works'] = ...` addresses ONE column keyed by a tuple, so use a list of
# column names and a frame aligned on df.index to fill three separate columns.
# Assumes multithread_apply yields one (doi, oaid, referenced_works) tuple per title, in input order — TODO confirm.
df[['doi', 'oaid', 'referenced_works']] = pd.DataFrame(
    list(multithread_apply(df['title'].values, get_metadata, n_workers=5)),
    columns=['doi', 'oaid', 'referenced_works'],
    index=df.index,
)
# df.to_csv('/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/src/semantic_search/dev/paper-metadata.csv', index=False)

  0%|          | 0/1142 [00:00<?, ?it/s]

Unnamed: 0,fpath,fname,title,doi,oaid
0,/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt/Bai_Fixed_Point_Diffusion_Models_CVPR_2024_paper.txt,Bai_Fixed_Point_Diffusion_Models_CVPR_2024_paper.txt,Fixed Point Diffusion Models,https://doi.org/10.1063/1.2121687,https://openalex.org/W2000456051
1,/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt/Li_BEVNeXt_Reviving_Dense_BEV_Frameworks_for_3D_Object_Detection_CVPR_2024_paper.txt,Li_BEVNeXt_Reviving_Dense_BEV_Frameworks_for_3D_Object_Detection_CVPR_2024_paper.txt,BEVNeXt: Reviving Dense BEV Frameworks for 3D Object Detection,https://doi.org/10.1109/cvpr52733.2024.01901,https://openalex.org/W4402727763
2,/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt/Savchenko_Leveraging_Pre-trained_Multi-task_Deep_Models_for_Trustworthy_Facial_Analysis_in_CVPRW_2024_paper.txt,Savchenko_Leveraging_Pre-trained_Multi-task_Deep_Models_for_Trustworthy_Facial_Analysis_in_CVPRW_2024_paper.txt,Leveraging Pre-trained Multi-task Deep Models for Trustworthy Facial Analysis in Affective Behaviour Analysis in-the-Wild,https://doi.org/10.1109/cvprw63382.2024.00473,https://openalex.org/W4402916217


### Retrieving abstracts of cited works
(only for the subset of papers whose references were found via OpenAlex)

In [None]:
# Load data
df = pd.read_csv('/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/metadata/openalex-ids+refs.csv')
df['referenced_works'] = df['referenced_works'].apply(parse_list_string)
df[['total_references', 'references_in_dataset']] = df.apply(lambda x: count_references(x, df), axis=1, result_type='expand')

# Retrieve references via OpenAlex API
all_refs = pd.Series(np.concatenate(df.referenced_works.values)).unique()
all_refs_batched = [all_refs[i:i+100] for i in range(0, len(all_refs), 100)]
results = multithread_apply(all_refs_batched, get_abstracts, n_workers=5)

ref_df = pd.DataFrame(np.concatenate(results), columns=['ref_work', 'abstract', 'type'])
ref_df.to_csv('/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/metadata/openalex-refs-abstracts.csv', index=False)
ref_df.head()

  0%|          | 0/187 [00:00<?, ?it/s]

100%|██████████| 187/187 [00:41<00:00,  4.55it/s]


Unnamed: 0,ref_work,abstract,type
0,https://openalex.org/W2194775991,Deeper neural networks are more difficult to t...,article
1,https://openalex.org/W2108598243,The explosion of image data on the Internet ha...,article
2,https://openalex.org/W1976499671,Classical Monte Carlo simulations have been ca...,article
3,https://openalex.org/W1861492603,,book-chapter
4,https://openalex.org/W2565639579,Feature pyramids are a basic component in reco...,preprint


## Misc

### Check papers with missing abstract

In [5]:
import re

raw_dir = '/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt'

def extract_abstract(fpath: "str | Path"):
    """Return the abstract section of a converted paper, or '' if none is found.

    Args:
        fpath: Location of the UTF-8 text file, as a ``str`` or ``pathlib.Path``.
            Strings are wrapped in ``Path`` so both forms work; previously a
            plain ``str`` (as the annotation advertised) raised AttributeError.

    Returns:
        The text between the ``## Abstract`` heading and the next numbered
        heading (``## <digits>.``), or an empty string when that pattern does
        not occur in the file.
    """
    doc_text = Path(fpath).read_text(encoding="utf-8")
    # Non-greedy capture up to (not including) the blank line before the first "## N." heading.
    abstract_match = re.search(r'## Abstract\n\n(.*?)(?=\n\n## \d+\.)', doc_text, re.DOTALL)
    return abstract_match.group(1) if abstract_match else ''

# Report every converted paper for which extract_abstract() comes back empty.
for fpath in Path(raw_dir).glob("*.txt"):
    if not extract_abstract(fpath):
        print(fpath)

# abstracts =[check_single_doc(fpath) for fpath in Path(raw_dir).glob("*.txt")]

/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt/Korycki_Class-Incremental_Mixture_of_Gaussians_for_Deep_Continual_Learning_CVPRW_2024_paper.txt
/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt/Hoque_IrrNet_Spatio-Temporal_Segmentation_Guided_Classification_for_Irrigation_Mapping_CVPRW_2024_paper.txt
/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt/Zhang_MOHO_Learning_Single-view_Hand-held_Object_Reconstruction_with_Multi-view_Occlusion-Aware_Supervision_CVPR_2024_paper.txt
/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt/Paissan_Structured_Sparse_Back-propagation_for_Lightweight_On-Device_Continual_Learning_on_Microcontroller_CVPRW_2024_paper.txt
/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt/Halawa_Multi-Task_Multi-Modal_Self-Super

## Testing SemanticScholar API

In [3]:
# Load the per-paper metadata table (includes the `paperId` and `abstract` columns used below) and preview it.
metadata_fpath = '/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/metadata/paper_metadata.csv'
df = pd.read_csv(metadata_fpath)
df.head()

Unnamed: 0,fpath,fname,title,paperId,abstract,externalIds.ArXiv,externalIds.DBLP,externalIds.DOI,externalIds.CorpusId,externalIds.PubMed
0,/cluster/home/lcarretero/workspace/dsl/dsl-res...,Bai_Fixed_Point_Diffusion_Models_CVPR_2024_pap...,Fixed Point Diffusion Models,fdb679246a2125dad1628081e45efb7a1c80f2c7,We introduce the Fixed Point Diffusion Model (...,2401.08741,journals/corr/abs-2401-08741,10.1109/CVPR52733.2024.00901,267027739.0,
1,/cluster/home/lcarretero/workspace/dsl/dsl-res...,Li_BEVNeXt_Reviving_Dense_BEV_Frameworks_for_3...,BEVNeXt: Reviving Dense BEV Frameworks for 3D ...,a23f6180d6908499a8238c06f4fb57bf431a2b43,"Recently, the rise of query-based Transformer ...",2312.01696,journals/corr/abs-2312-01696,10.1109/CVPR52733.2024.01901,265609098.0,
2,/cluster/home/lcarretero/workspace/dsl/dsl-res...,Savchenko_Leveraging_Pre-trained_Multi-task_De...,Leveraging Pre-trained Multi-task Deep Models ...,9c50996ba35eb605cb9bcd5835103b441cf38e07,This article presents our results for the sixt...,,conf/cvpr/Savchenko22a,10.1109/CVPRW63382.2024.00473,272915313.0,
3,/cluster/home/lcarretero/workspace/dsl/dsl-res...,Geada_Insights_from_the_Use_of_Previously_Unse...,Insights from the Use of Previously Unseen Neu...,a494cfa653ae449019ffe8c8908bfec8042218d2,The boundless possibility of neural networks w...,2404.02189,journals/corr/abs-2404-02189,10.1109/CVPR52733.2024.02127,268876045.0,
4,/cluster/home/lcarretero/workspace/dsl/dsl-res...,Khairi_Efficient_Local_Correlation_Volume_for_...,Efficient local correlation volume for unsuper...,1bede9c7617eda6d635114c0ebf9c621f96e6485,"With the advent of deep learning methods, perf...",,conf/cvpr/KhairiMFB22,10.1109/CVPRW63382.2024.00049,272915365.0,


In [11]:
# Count abstracts by Python type; non-str entries are presumably NaN from empty CSV cells (i.e. missing abstracts).
df.abstract.apply(lambda x: str(type(x))).value_counts()

abstract
<class 'str'>      1110
<class 'float'>      32
Name: count, dtype: int64

### What to retrieve:
Database papers:
- paperId, externalIds, abstract, referenceCount

Then get references for each of those papers:
- paperId, externalIds, abstract

In [59]:
from semanticscholar import SemanticScholar
from semanticscholar.SemanticScholarException import ObjectNotFoundException

def get_referenced_papers(sch: "SemanticScholar", paper_id: str) -> list:
    """Fetch the papers cited by ``paper_id`` as a list of flat dicts.

    Args:
        sch: Client exposing ``get_paper_references(paper_id=..., fields=..., limit=...)``.
        paper_id: ID of the citing paper.

    Returns:
        One dict per cited paper with keys ``originalPaperId``, ``paperId``,
        ``title`` and ``abstract``, plus one ``externalIds.<name>`` key per
        external identifier when the cited paper has any. Returns an empty
        list when the client raises ``ObjectNotFoundException``; the previous
        ``[{}]`` sentinel added an all-NaN row to any DataFrame built from the
        aggregated results and inflated the "papers without abstract" counts.
    """
    try:
        raw = sch.get_paper_references(
            paper_id=paper_id,
            fields=['paperId', 'title', 'externalIds', 'abstract'],
            limit=1000,
        )
    except ObjectNotFoundException:
        return []

    res = []
    for item in raw.items:
        cited = item['citedPaper']
        row = {
            'originalPaperId': paper_id,
            'paperId': cited['paperId'],
            'title': cited['title'],
            'abstract': cited['abstract'],
        }
        # externalIds may be absent or None; only flatten it when it is a real mapping.
        if 'externalIds' in cited and cited['externalIds'] is not None:
            row.update({f'externalIds.{k}': v for k, v in cited['externalIds'].items()})
        res.append(row)
    return res

# Client constructed with no arguments.
sch = SemanticScholar()

# Probe a single paper (row 7 of the metadata table) and report how many of its references carry an abstract.
# NOTE: `sch`, `paper_id`, `res` and `df2` stay in the notebook namespace and are reused by later cells.
paper_id = df.iloc[7].paperId
res = get_referenced_papers(sch, paper_id)
df2 = pd.DataFrame(res)
# NOTE(review): divides by len(df2); an empty result would raise here.
print(f'{(~df2.abstract.isna()).sum()}/{len(df2)} papers have abstract ({(1-df2.abstract.isna().sum()/len(df2))*100:.0f}%)')

1/43 papers have abstract (2%)


In [72]:
df2[['originalPaperId', 'paperId', 'title', 'abstract']].head()

Unnamed: 0,originalPaperId,paperId,title,abstract
0,7433da608f60204cf0845fbd26cb83982e891875,f09e3845b9857b0c1a251bdf0f572eaa1519cc2f,Efficient Loss Function by Minimizing the Detrimental Effect of Floating-Point Errors on Gradient-Based Attacks,
1,7433da608f60204cf0845fbd26cb83982e891875,163b4d6a79a5b19af88b8585456363340d9efd04,GPT-4 Technical Report,
2,7433da608f60204cf0845fbd26cb83982e891875,a1e7b7a560b493c235eed2429cfbb9c12324ff4d,Scaling Adversarial Training to Large Perturbation Bounds,
3,7433da608f60204cf0845fbd26cb83982e891875,426b0ee8c723aa6086402f743d5cbb447622d9b6,When Adversarial Training Meets Vision Transformers: Recipes from Training to Architecture,
4,7433da608f60204cf0845fbd26cb83982e891875,c570475cab4c8d0662144c4d414c17e776d39409,A Light Recipe to Train Robust Vision Transformers,


In [70]:
# Fetch one cited paper directly by its ID to check whether the single-paper lookup returns an abstract.
# NOTE: rebinds the global `res` to a single-paper result.
res = sch.get_paper(paper_id='f09e3845b9857b0c1a251bdf0f572eaa1519cc2f')
res.abstract

"Attackers can deceive neural networks by adding human imperceptive perturbations to their input data; this reveals the vulnerability and weak robustness of current deep-learning networks. Many attack techniques have been proposed to evaluate the model's robustness. Gradient-based attacks suffer from severely overestimating the robustness. This paper identifies that the relative error in calculated gradients caused by floating-point errors, including floating-point underflow and rounding errors, is a fundamental reason why gradient-based attacks fail to accurately assess the model's robustness. Although it is hard to eliminate the relative error in the gradients, we can control its effect on the gradient-based attacks. Correspondingly, we propose an efficient loss function by minimizing the detrimental impact of the floating-point errors on the attacks. Experimental results show that it is more efficient and reliable than other loss functions when examined across a wide range of defenc

In [82]:
# Request the citing paper together with nested `references.*` fields in a single call.
res = sch.get_paper(paper_id='7433da608f60204cf0845fbd26cb83982e891875', fields=['title', 'paperId', 'externalIds', 'abstract', 'references.title', 'references.paperId', 'references.abstract'])
res

{'paperId': '7433da608f60204cf0845fbd26cb83982e891875', 'externalIds': {'DBLP': 'conf/cvpr/JainD24', 'DOI': '10.1109/CVPR52733.2024.02336', 'CorpusId': 272722903}, 'title': 'Towards Understanding and Improving Adversarial Robustness of Vision Transformers', 'abstract': 'Recent literature has demonstrated that vision transformers (VITs) exhibit superior performance compared to convolutional neural networks (CNNs). The majority of recent research on adversarial robustness, however, has predomi-nantly focused on CNNs. In this work, we bridge this gap by analyzing the effectiveness of existing attacks on VITs. We demonstrate that due to the softmax computations in every attention block in VITs, they are inherently vulnerable to floating point underflow errors. This can lead to a gradient masking effect resulting in suboptimal attack strength of well-known attacks, like PGD, Carlini and Wagner (CW) and GAMA. Motivated by this, we propose Adaptive Attention Scaling (AAS) attack that can auto

In [4]:
import requests
import json

# Query the batch endpoint directly (bypassing the client library) for the abstracts of one paper's references.
r = requests.post(
    'https://api.semanticscholar.org/graph/v1/paper/batch',
    params={'fields': 'references.abstract'},  # 'title,paperId,externalIds,abstract,references.title,references.paperId,
    json={'ids': ['7433da608f60204cf0845fbd26cb83982e891875']},
    timeout=30,  # requests has no default timeout; without one a stalled connection blocks the kernel
)
# Fail with a clear HTTP error (e.g. rate limiting) instead of an opaque error from indexing an error payload.
r.raise_for_status()
res = r.json()[0]

In [5]:
res['references']

[{'paperId': 'f09e3845b9857b0c1a251bdf0f572eaa1519cc2f',
  'title': 'Efficient Loss Function by Minimizing the Detrimental Effect of Floating-Point Errors on Gradient-Based Attacks',
  'abstract': None,
  'openAccessPdf': {'url': '',
   'status': 'CLOSED',
   'license': None,
   'disclaimer': "Notice: This paper's abstract has been elided by the publisher. Paper or abstract available at https://api.unpaywall.org/v2/10.1109/CVPR52729.2023.00395?email=<INSERT_YOUR_EMAIL> or https://doi.org/10.1109/CVPR52729.2023.00395, which is subject to the license by the author or copyright owner provided with this content. Please go to the source to verify the license and copyright information for your use."},
  'authors': [{'authorId': '2117164874', 'name': 'Yunrui Yu'},
   {'authorId': '2153074991', 'name': 'Chengjie Xu'}]},
 {'paperId': '163b4d6a79a5b19af88b8585456363340d9efd04',
  'title': 'GPT-4 Technical Report',
  'abstract': None,
  'openAccessPdf': {'url': '',
   'status': None,
   'license'

In [8]:
# Same batch endpoint, but asking for the cited paper itself to compare its `abstract` with the nested lookup.
r = requests.post(
    'https://api.semanticscholar.org/graph/v1/paper/batch',
    params={'fields': 'abstract'},  # 'title,paperId,externalIds,abstract,references.title,references.paperId,
    json={'ids': ['f09e3845b9857b0c1a251bdf0f572eaa1519cc2f']},
    timeout=30,  # requests has no default timeout; without one a stalled connection blocks the kernel
)
# Fail with a clear HTTP error (e.g. rate limiting) instead of an opaque error from indexing an error payload.
r.raise_for_status()
res = r.json()[0]
res

{'paperId': 'f09e3845b9857b0c1a251bdf0f572eaa1519cc2f',
 'title': 'Efficient Loss Function by Minimizing the Detrimental Effect of Floating-Point Errors on Gradient-Based Attacks',
 'abstract': "Attackers can deceive neural networks by adding human imperceptive perturbations to their input data; this reveals the vulnerability and weak robustness of current deep-learning networks. Many attack techniques have been proposed to evaluate the model's robustness. Gradient-based attacks suffer from severely overestimating the robustness. This paper identifies that the relative error in calculated gradients caused by floating-point errors, including floating-point underflow and rounding errors, is a fundamental reason why gradient-based attacks fail to accurately assess the model's robustness. Although it is hard to eliminate the relative error in the gradients, we can control its effect on the gradient-based attacks. Correspondingly, we propose an efficient loss function by minimizing the detr

In [77]:
# Inspect the first cited paper returned by the references endpoint (limit=1).
# NOTE(review): the recorded output for this cell is a BadQueryParametersException, and `.items[0]` assumes at
# least one reference is returned — re-run before relying on this cell.
sch.get_paper_references(paper_id='7433da608f60204cf0845fbd26cb83982e891875', fields=['title', 'paperId', 'externalIds', 'abstract'], limit=1).items[0]['citedPaper']

BadQueryParametersException: Unrecognized or unsupported fields: [s, r, t, b, a, c]

In [51]:
# Collect reference rows for the first 50 papers into one flat list, then build a single DataFrame from it.
# NOTE: `paper_id` and `res` are plain notebook globals and keep their last-iteration values after the loop;
# a later cell reads both, so execution order matters.
tot = []
for paper_id in tqdm(df.paperId.iloc[:50]):
    res = get_referenced_papers(sch, paper_id)
    tot.extend(res)

df2 = pd.DataFrame(tot)

  0%|          | 0/50 [00:00<?, ?it/s]

In [56]:
df2[df2.abstract.isna()].iloc[:10]

Unnamed: 0,originalPaperId,paperId,abstract,externalIds.ArXiv,externalIds.DBLP,externalIds.DOI,externalIds.CorpusId,externalIds.MAG,externalIds.PubMed,externalIds.PubMedCentral,externalIds.ACL
0,fdb679246a2125dad1628081e45efb7a1c80f2c7,9b91b3031ea159e4964d18b2ce703168660ecf46,,2401.11605,journals/corr/abs-2401-11605,10.48550/arXiv.2401.11605,267069338.0,,,,
1,fdb679246a2125dad1628081e45efb7a1c80f2c7,0e8f1bb91bb4502966fa5e91e0610832dfe4240e,,2401.08639,journals/corr/abs-2401-08639,10.48550/arXiv.2401.08639,267028569.0,,,,
2,fdb679246a2125dad1628081e45efb7a1c80f2c7,b994cf51a8c7cf5c13358a6110d7304d6d04c881,,2310.18605,journals/corr/abs-2310-18605,10.48550/arXiv.2310.18605,264590159.0,,,,
3,fdb679246a2125dad1628081e45efb7a1c80f2c7,5003fdf35af631d4cb17fd3c1ce2469f665064f1,,2305.08891,journals/corr/abs-2305-08891,10.1109/WACV57701.2024.00532,258714883.0,,,,
4,fdb679246a2125dad1628081e45efb7a1c80f2c7,f9570989919338079088270a9cf1a7afc8db8093,,2304.14108,journals/corr/abs-2304-14108,10.48550/arXiv.2304.14108,258352812.0,,,,
5,fdb679246a2125dad1628081e45efb7a1c80f2c7,736973165f98105fec3729b7db414ae4d80fcbeb,,2212.09748,journals/corr/abs-2212-09748,10.1109/ICCV51070.2023.00387,254854389.0,,,,
6,fdb679246a2125dad1628081e45efb7a1c80f2c7,e2e34dc10482795a94e401c343a78cb333960996,,2211.13874,journals/corr/abs-2211-13874,10.1109/CVPR52729.2023.00043,254018271.0,,,,
7,fdb679246a2125dad1628081e45efb7a1c80f2c7,7c8979a99c1a9b214fc6762ae8e73ee4b39749c0,,2210.12867,conf/nips/PokleGK22,10.48550/arXiv.2210.12867,253098319.0,,,,
8,fdb679246a2125dad1628081e45efb7a1c80f2c7,e5c8960eb2ec034ffbd353ef39fd1cb541d3c7c9,,2210.08402,conf/nips/SchuhmannBVGWCC22,10.48550/arXiv.2210.08402,252917726.0,,,,
9,fdb679246a2125dad1628081e45efb7a1c80f2c7,4530c25da949bb2185c50663158ef19d52e3c6b5,,2206.00927,conf/nips/0011ZB0L022,10.48550/arXiv.2206.00927,249282317.0,,,,


In [54]:
# Share of rows in df2 that have a non-null abstract (same report as the single-paper check, on whatever df2 currently holds).
print(f'{(~df2.abstract.isna()).sum()}/{len(df2)} papers have abstract ({(1-df2.abstract.isna().sum()/len(df2))*100:.0f}%)')

172/2576 papers have abstract (7%)


In [58]:

# NOTE(review): the value of this first expression is discarded (it is neither assigned nor the cell's last line).
sch.get_paper_references(paper_id=paper_id, fields=['title', 'paperId', 'externalIds', 'abstract'], limit=1).items[0]['citedPaper']
# `res` and `paper_id` are not assigned in this cell; they hold whatever an earlier-executed cell left behind.
df2 = pd.DataFrame(res)
# Fraction of rows without an abstract.
df2.abstract.isna().sum()/len(df2)

0.85

In [None]:
def get_metadata(title: str):
    """Search works by title via pyalex and return metadata of the first hit.

    Note: defining this here shadows the ``get_metadata`` imported from
    ``semantic_search.utils`` earlier in the notebook.

    Args:
        title: Paper title used as the search query.

    Returns:
        A ``(doi, id, referenced_works)`` tuple taken from the first search
        result, or ``(None, None, None)`` when the search returns nothing.
    """
    hits = (
        pyalex.Works()
        .search(title)
        .select(['id', 'doi', 'referenced_works'])
        .get(page=1, per_page=1)
    )
    if not hits:
        return (None, None, None)
    top_hit = hits[0]
    return (top_hit['doi'], top_hit['id'], top_hit['referenced_works'])

# Imported here because the notebook's import cells do not bring ThreadPoolExecutor into scope.
from concurrent.futures import ThreadPoolExecutor

df['title'] = df['fpath'].apply(get_title_from_fpath)

# Look up metadata for every title with 5 worker threads; executor.map preserves input order.
with ThreadPoolExecutor(max_workers=5) as executor:
    results = list(tqdm(
        executor.map(get_metadata, df['title'].values),
        total=len(df)
    ))
# `df['doi', 'oaid', 'referenced_works'] = results` addresses ONE column keyed by a tuple, so use a list of
# column names and a frame aligned on df.index to fill three separate columns.
df[['doi', 'oaid', 'referenced_works']] = pd.DataFrame(
    results,
    columns=['doi', 'oaid', 'referenced_works'],
    index=df.index,
)