In [4]:
import os
import uuid
import time
from urllib.parse import quote

import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


BASE_URL = "https://api.elsevier.com/content/abstract/doi/{doi}"


def _build_session(
    total_retries: int = 5,
    backoff_factor: float = 0.8,
    status_forcelist=(429, 500, 502, 503, 504),
) -> requests.Session:
    """
    requests Session with sane retries for Elsevier APIs (handles 429 and transient 5xx).
    """
    retry = Retry(
        total=total_retries,
        read=total_retries,
        connect=total_retries,
        status=total_retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
        allowed_methods=frozenset(["GET"]),
        raise_on_status=False,
    )
    adapter = HTTPAdapter(max_retries=retry)
    s = requests.Session()
    s.mount("https://", adapter)
    s.mount("http://", adapter)
    return s


def _extract_abstract(payload: dict) -> str | None:
    """
    Best-effort extraction of abstract text from the Abstract Retrieval API JSON response.

    Elsevier payloads can vary across content types and entitlements, so we try a few
    common locations safely.
    """
    if not isinstance(payload, dict):
        return None

    root = payload.get("abstracts-retrieval-response", payload)

    # Commonly, description appears here
    coredata = root.get("coredata", {})
    for k in ("dc:description", "description", "prism:teaser"):
        v = coredata.get(k)
        if isinstance(v, str) and v.strip():
            return v.strip()

    # Sometimes the abstract is nested under "item" or "originalText" structures
    item = root.get("item", {})
    if isinstance(item, dict):
        bib = item.get("bibrecord", {})
        if isinstance(bib, dict):
            head = bib.get("head", {})
            if isinstance(head, dict):
                abstract = head.get("abstracts", head.get("abstract"))
                if isinstance(abstract, dict):
                    # Many variants exist; try typical keys
                    for k in ("abstract", "ce:abstract", "ce:para", "para"):
                        v = abstract.get(k)
                        if isinstance(v, str) and v.strip():
                            return v.strip()
                        if isinstance(v, list):
                            parts = [x.strip() for x in v if isinstance(x, str) and x.strip()]
                            if parts:
                                return "\n".join(parts)

    # Fallback: try to locate any string field called "abstract" or "description"
    def dfs(obj):
        if isinstance(obj, dict):
            for key, val in obj.items():
                if key.lower() in ("abstract", "dc:description", "description") and isinstance(val, str):
                    if val.strip():
                        return val.strip()
                found = dfs(val)
                if found:
                    return found
        elif isinstance(obj, list):
            for it in obj:
                found = dfs(it)
                if found:
                    return found
        return None

    return dfs(root)


def fetch_scopus_abstract_by_doi(
    doi: str,
    api_key: str,
    *,
    bearer_token: str | None = None,     # "Bearer <token>" (optional)
    inst_token: str | None = None,       # Elsevier institution token (optional)
    view: str = "META_ABS",              # META, META_ABS, FULL, REF, ENTITLED
    timeout: tuple[float, float] = (10.0, 30.0),
    session: requests.Session | None = None,
) -> tuple[str | None, int, str | None]:
    """
    Returns (abstract_text_or_None, http_status, error_message_or_None)
    """
    if doi is None or (isinstance(doi, float) and pd.isna(doi)):
        return None, 0, "Missing DOI"

    doi_clean = str(doi).strip()
    if not doi_clean:
        return None, 0, "Empty DOI"

    s = session or _build_session()

    url = BASE_URL.format(doi=quote(doi_clean, safe=""))  # DOI can contain slashes etc.

    headers = {
        "Accept": "application/json",
        "X-ELS-APIKey": api_key,
        "X-ELS-ReqId": str(uuid.uuid4()),
    }
    if inst_token:
        headers["X-ELS-Insttoken"] = inst_token
    if bearer_token:
        # bearer_token can be either raw token or already prefixed
        headers["Authorization"] = bearer_token if bearer_token.startswith("Bearer ") else f"Bearer {bearer_token}"

    params = {"view": view}

    try:
        resp = s.get(url, headers=headers, params=params, timeout=timeout)
    except requests.RequestException as e:
        return None, 0, f"Request error: {e}"

    status = resp.status_code

    # Graceful handling of common errors
    if status == 404:
        return None, status, "Not found (DOI not resolvable via this endpoint)"
    if status in (401, 403):
        return None, status, "Auth/entitlements error (check API key and tokens)"
    if status == 429:
        # If you want, you can respect Retry-After explicitly; retries are already configured.
        return None, status, "Quota exceeded / rate limited (429)"
    if status >= 400:
        msg = None
        try:
            msg = resp.text[:500]
        except Exception:
            pass
        return None, status, f"HTTP {status}. Body (truncated): {msg}"

    # Parse JSON and extract abstract
    try:
        payload = resp.json()
    except ValueError:
        return None, status, "Non-JSON response despite Accept=application/json"

    abstract = _extract_abstract(payload)
    if abstract:
        return abstract, status, None

    return None, status, "No abstract field found in response payload"


def add_abstracts_to_df(
    df: pd.DataFrame,
    doi_col: str,
    *,
    api_key: str,
    out_col: str = "abstract",
    bearer_token: str | None = None,
    inst_token: str | None = None,
    view: str = "META_ABS",
    sleep_seconds: float = 0.1,          # small pacing to be polite; increase if you hit 429
) -> pd.DataFrame:
    """
    Adds `out_col` with retrieved abstracts (or None). Also adds optional diagnostics columns.
    """
    df = df.copy()
    s = _build_session()

    abstracts = []
    statuses = []
    errors = []

    # Basic in-memory cache to avoid repeating calls for duplicate DOIs
    cache: dict[str, tuple[str | None, int, str | None]] = {}

    for raw_doi in df[doi_col].tolist():
        doi_str = None if pd.isna(raw_doi) else str(raw_doi).strip()

        if doi_str and doi_str in cache:
            abstract, status, err = cache[doi_str]
        else:
            abstract, status, err = fetch_scopus_abstract_by_doi(
                doi=doi_str,
                api_key=api_key,
                bearer_token=bearer_token,
                inst_token=inst_token,
                view=view,
                session=s,
            )
            if doi_str:
                cache[doi_str] = (abstract, status, err)

        abstracts.append(abstract)
        statuses.append(status)
        errors.append(err)

        if sleep_seconds:
            time.sleep(sleep_seconds)

    df[out_col] = abstracts
    df[f"{out_col}_http_status"] = statuses
    df[f"{out_col}_error"] = errors
    return df


In [5]:
df = pd.read_csv('/Users/navid/Documents/1_Projects/0_Age-It/Our Tasks/Mario_report/data/2026-01-09T11-54_papers.csv')

In [6]:
df.columns

Index(['Unnamed: 0', 'title', 'year_scholar', 'authors_abbrev', 'venue_abbrev',
       'scholar_link', 'cited_by', 'doi', 'url', 'authors_full',
       'container_title', 'published_year', 'publisher', 'type', 'abstract',
       'Name', 'CF'],
      dtype='object')

In [10]:
no_abs = df[df['abstract'].isna()]

In [7]:
import os

# 1) Set it (this lasts for the current Python session only)
os.environ["ELSEVIER_API_KEY"] = "013a679c87ca69c176986fe33de3fc47"

# 2) Read it
api_key = os.environ["ELSEVIER_API_KEY"]

In [11]:

api_key = "013a679c87ca69c176986fe33de3fc47"
inst_token = os.environ.get("ELSEVIER_INST_TOKEN")      # optional
bearer = os.environ.get("ELSEVIER_BEARER_TOKEN")        # optional

df2 = add_abstracts_to_df(
    no_abs,
    doi_col="doi",
    api_key=api_key,
    inst_token=inst_token,
    bearer_token=bearer,
    view="META_ABS",
    sleep_seconds=0.2,
)
df2.to_csv("with_abstracts.csv", index=False)

In [14]:
df2['abstract_http_status'].value_counts()

abstract_http_status
0      216
200    203
404     64
Name: count, dtype: int64

In [19]:
len(df2.doi.isna())

483

In [22]:
df2[~df2['doi'].isna()]

Unnamed: 0.1,Unnamed: 0,title,year_scholar,authors_abbrev,venue_abbrev,scholar_link,cited_by,doi,url,authors_full,container_title,published_year,publisher,type,abstract,Name,CF,abstract_http_status,abstract_error
2,2,eXplainable AI for word embeddings: A survey,2025,"R Boselli, S D’Amico, N Nobani","Cognitive Computation 17 (1), 19, 2025",https://scholar.google.com/citations?view_op=v...,10.0,10.1007/s12559-024-10373-2,https://doi.org/10.1007/s12559-024-10373-2,"Roberto Boselli, Simone D’Amico, Navid Nobani",Cognitive Computation,2025.0,Springer Science and Business Media LLC,journal-article,"In recent years, word embeddings have become i...",Navid Nobani,NBNNVD87P21Z224J,200,
3,3,for Spatio-temporal Modelling of Repeated Cros...,2025,"L Schiavon, M Stival",Methodological and Applied Statistics and Demo...,https://scholar.google.com/citations?view_op=v...,,10.1007/978-3-031-64350-7_8,https://doi.org/10.1007/978-3-031-64350-7_8,"Lorenzo Schiavon, Mattia Stival",Italian Statistical Society Series on Advances...,2025.0,Springer Nature Switzerland,book-chapter,,Lorenzo Schiavon,SCHLNZ94M18G224Q,404,Not found (DOI not resolvable via this endpoint)
8,8,Efficient Posterior Inference for Spatio-tempo...,2024,"L Schiavon, M Stival",Scientific Meeting of the Italian Statistical ...,https://scholar.google.com/citations?view_op=v...,,10.1007/978-3-031-64350-7_8,https://doi.org/10.1007/978-3-031-64350-7_8,"Lorenzo Schiavon, Mattia Stival",Italian Statistical Society Series on Advances...,2025.0,Springer Nature Switzerland,book-chapter,,Lorenzo Schiavon,SCHLNZ94M18G224Q,404,Not found (DOI not resolvable via this endpoint)
9,9,Exploring Disease Prevalence in Italy: A Web A...,2024,"M Stival, L Schiavon, G Bertarelli, S Campostrini",Scientific Meeting of the Italian Statistical ...,https://scholar.google.com/citations?view_op=v...,1.0,10.1007/978-3-031-64447-4_96,https://doi.org/10.1007/978-3-031-64447-4_96,"Mattia Stival, Lorenzo Schiavon, Gaia Bertarel...",Italian Statistical Society Series on Advances...,2025.0,Springer Nature Switzerland,book-chapter,,Lorenzo Schiavon,SCHLNZ94M18G224Q,404,Not found (DOI not resolvable via this endpoint)
12,12,The impact of Down syndrome on patients’ careg...,2025,"R Gnasso, A Tavakkolifar, G Esposito, A Palomb...","Neurological Sciences, 1-5, 2025",https://scholar.google.com/citations?view_op=v...,,10.1007/s10072-025-08266-9,https://doi.org/10.1007/s10072-025-08266-9,"Rossana Gnasso, Ayda Tavakkolifar, Giuseppe Es...",Neurological Sciences,2025.0,Springer Science and Business Media LLC,journal-article,Down syndrome is a chronic multisystem conditi...,ANTONIO PICONE,PCNNTN84M09F839T,200,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1114,1114,Evolution of Aesthetic Breast Surgery,2024,"MB Nava, A Rancati, P Mallucci, N Rocco",Landmark Papers in Plastic Surgery: Commented ...,https://scholar.google.com/citations?view_op=v...,,10.1007/978-3-031-57132-9_18,https://doi.org/10.1007/978-3-031-57132-9_18,"Maurizio Bruno Nava, Alberto Rancati, Patrick ...",Landmark Papers in Plastic Surgery,2024.0,Springer International Publishing,book-chapter,This chapter delves into five pivotal mileston...,Nicola Rocco,RCCNCL81D11F839Q,200,
1115,1115,New perspectives in breast reconstruction,2024,"N Rocco, G Catanuto, G Montagna","Annals of Breast Surgery 8, 2024",https://scholar.google.com/citations?view_op=v...,,10.21037/abs-23-39,https://doi.org/10.21037/abs-23-39,"Nicola Rocco, Giuseppe Catanuto, Giacomo Montagna",Annals of Breast Surgery,2024.0,AME Publishing Company,journal-article,,Nicola Rocco,RCCNCL81D11F839Q,200,No abstract field found in response payload
1120,1120,Thoraco-dorsal artery perforator flap for tota...,2023,"D Virzì, F Caruso, G Castiglione, M Marino, M ...","European Journal of Surgical Oncology 49 (11),...",https://scholar.google.com/citations?view_op=v...,3.0,10.1016/j.ejso.2023.106988,https://doi.org/10.1016/j.ejso.2023.106988,"Dario Virzì, Francesco Caruso, Gaetano Castigl...",European Journal of Surgical Oncology,2023.0,Elsevier BV,journal-article,This study reports on feasibility and applicab...,Nicola Rocco,RCCNCL81D11F839Q,200,
1125,1125,Text mining and word embedding for classificat...,2022,"G Catanuto, N Rocco, A Maglia, P Barry, A Kara...","European Journal of Surgical Oncology 48 (7), ...",https://scholar.google.com/citations?view_op=v...,10.0,10.1016/j.ejso.2022.03.002,https://doi.org/10.1016/j.ejso.2022.03.002,"G. Catanuto, N. Rocco, A. Maglia, P. Barry, A....",European Journal of Surgical Oncology,2022.0,Elsevier BV,journal-article,Introduction: Decision making in surgical onco...,Nicola Rocco,RCCNCL81D11F839Q,200,
