# 1.Setup and Imports

In [93]:
# Imports for data handling, API requests, and environment variables
import os
import json
import requests
import time
import re
import math
from typing import Optional, Dict, Any, List, Tuple
from datetime import date
import unicodedata
import xml.etree.ElementTree as ET
from dotenv import load_dotenv
from tqdm import tqdm
import pandas as pd

load_dotenv()

print("Libraries imported and environment loaded.")

Libraries imported and environment loaded.


# 2. Defining Search Queries

In [2]:

semantic_scholar_search_queries = {
    "crl_core": 'Causal Reinforcement Learning | Causal RL',
    "rl_for_ev": 'Reinforcement Learning + (EV Charging | Smart Charging)',
    "xai_for_grid": '(Explainable AI | XAI) + (Smart Grid | Grid Stability)',
    "causal_inference_energy": 'Causal Inference + (Energy Systems | Power Grid)',
    "broader_rl_ev": '(Reinforcement Learning | Deep Reinforcement Learning) + (Vehicle-to-Grid | V2G | Demand Response)',
    "interpretable_rl_grid": '(Interpretable Reinforcement Learning | Explainable Reinforcement Learning") + (Power Grid)',
    "causal_v2g": 'Causal Inference + (Vehicle-to-Grid | EV Charging)',
    "safety_rl_energy": '(Safe Reinforcement Learning | Robust Reinforcement Learning) + (Power Grid | EV Charging | Smart Charging)'
}

# 3. API Definition (Semantic Scholar)

In [41]:
def fetch_paper_details(paper_ids):
    """
    Fetches detailed paper metadata, including abstracts, for a list of paper IDs
    using the /paper/batch endpoint.
    
    This version adds a minimal per-chunk retry mechanism:
    - Retries each chunk up to 3 times on transient failures (network errors, 429, 5xx).
    - Only after the third failure does it skip the chunk and continue.
    - A request timeout is set to avoid hanging connections.
    """
    # The endpoint for fetching details of multiple papers
    details_url = "https://api.semanticscholar.org/graph/v1/paper/batch?fields=paperId,externalIds,url,title,abstract,venue,publicationVenue,year,referenceCount,citationCount,influentialCitationCount,openAccessPdf,fieldsOfStudy,s2FieldsOfStudy,publicationDate,authors,tldr,references"
    
    # The API can handle up to 500 IDs per request
    chunk_size = 400  # Use a slightly smaller chunk size for safety
    
    detailed_papers = []
    n_chunks = (len(paper_ids) + chunk_size - 1) // chunk_size

    # Process the paper IDs in chunks to respect API limits
    for i in tqdm(range(0, len(paper_ids), chunk_size), desc="Fetching paper details", total=n_chunks):
        chunk = paper_ids[i:i + chunk_size]

        # --- Retry this chunk up to 3 times before skipping ---
        max_retries = 3
        for attempt in range(1, max_retries + 1):
            try:
                # This is a POST request, with the IDs sent in the JSON body
                # A timeout is added to avoid hanging; this is a minimal safety improvement.
                response = requests.post(details_url, json={'ids': chunk}, timeout=30)

                # Treat 429 (rate limit) and common 5xx as transient and retryable
                if response.status_code in (429, 500, 502, 503, 504):
                    raise requests.exceptions.HTTPError(
                        f"Retryable status {response.status_code}", response=response
                    )

                response.raise_for_status()
                data = response.json()

                # Filter out any null responses which can occur if an ID is not found
                valid_papers = [paper for paper in data if paper is not None]
                detailed_papers.extend(valid_papers)

                # On success, leave the retry loop
                break

            except requests.exceptions.RequestException as e:
                # Logs the failure; after 3 attempts the chunk is skipped.
                print(f"[warn] An API error occurred during detail fetching for chunk {i//chunk_size + 1}, attempt {attempt}: {e}")
                if attempt == max_retries:
                    print("[warn] Skipping this chunk after 3 failed attempts.")
                    # Proceed to next chunk after exhausting retries
                    break
                # Exponential backoff: 1s, 2s
                time.sleep(2 ** (attempt - 1))

        # Pause between chunks (kept from original code)
        time.sleep(1)

    return detailed_papers

def fetch_semantic_scholar_papers_bulk(query, max_total_records=500):
    """
    Performs a two-step fetch from Semantic Scholar:
    1. Uses the bulk search to efficiently get a list of relevant paper IDs.
    2. Uses the batch details endpoint to retrieve full metadata, including abstracts.
    """
    base_query = f"https://api.semanticscholar.org/graph/v1/paper/search/bulk?query='{query}'"
    
    # In the first step, we only need the paperId
    query_params = {"fields": "title,year,citationCount"}
    
    paper_ids = []
    
    print(f"Starting bulk search for query: {query}")
    try:
        response = requests.get(base_query, params=query_params)
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.RequestException as e:
        print(f"An API error occurred during bulk search: {e}")

    
    articles = data.get('data', [])
    # Collect the paper IDs from the search results
    for article in articles:
        paper_ids.append(article['paperId'])
        
    time.sleep(0.5)

    # --- Step 2: Fetch full details for the collected IDs ---
    if not paper_ids:
        return []

    print(f"Found {len(paper_ids)} paper IDs. Now fetching details...")
    detailed_results = fetch_paper_details(paper_ids[:max_total_records])
    
    # Final processing to match our desired data structure
    final_papers = []
    for paper in detailed_results:
        final_papers.append({
            'id': paper.get('paperId'),
            'title': paper.get('title'),
            'abstract': (
                paper.get('abstract')
                or (paper.get('tldr') or {}).get('text')
                or ""
            ),
            'year': paper.get('year'),
            'publicationDate': paper.get('publicationDate'),
            'doi': (paper.get('externalIds') or {}).get('DOI') or "",
            'venue': paper.get('venue') or "",
            'venue_type': (paper.get('publicationVenue') or {}).get('type') or "",
            'venue_url': (paper.get('publicationVenue') or {}).get('url') or "",
            # Fields may be absent or None → normalize to list
            'fields_of_study': paper.get('fieldsOfStudy') or [],
            's2_fields_of_study': paper.get('fieldsOfStudy') or [],
            # Numeric counts: default to 0 to simplify downstream math
            'citation_count': paper.get('citationCount') or 0,
            'influential_citation_count': paper.get('influentialCitationCount') or 0,
            'reference_count': paper.get('referenceCount') or 0,
            'references': [ref.get('paperId') for ref in (paper.get('references') or [])],
            # Authors may be missing; normalize to lists of strings/ids
            'author_id': [a.get('authorId') for a in (paper.get('authors') or [])],
            'authors':   [a.get('name')     for a in (paper.get('authors') or [])],
            's2_url': paper.get('url') or "",
            'open_access_pdf': (paper.get('openAccessPdf') or {}).get('url') or "",
            'query': query
        })
        
    return final_papers

# 4. Fetch Data

In [42]:
print(f"\nFetching from Semantic Scholar...")
s1_papers_list = []
for name, query in tqdm(semantic_scholar_search_queries.items(), desc="Semantic Scholar Queries"):
    papers = fetch_semantic_scholar_papers_bulk(query, max_total_records=1000)
    s1_papers_list.extend(papers)
    time.sleep(1) 

# De-duplicate within the Semantic Scholar results and save
s1_df = pd.DataFrame(s1_papers_list)
if not s1_df.empty:
    s1_df.drop_duplicates(subset='id', keep='first', inplace=True)
    s1_df.to_csv('./data/raw/raw_semantic_scholar.csv', index=False)
    print(f"Saved {len(s1_df)} unique papers from Semantic Scholar to data/raw/raw_semantic_scholar.csv")
else:
    print("No papers were found from Semantic Scholar.")


Fetching from Semantic Scholar...


Semantic Scholar Queries:   0%|          | 0/8 [00:00<?, ?it/s]

Starting bulk search for query: Causal Reinforcement Learning | Causal RL
Found 403 paper IDs. Now fetching details...


Fetching paper details: 100%|██████████| 2/2 [00:19<00:00,  9.61s/it]
Semantic Scholar Queries:  12%|█▎        | 1/8 [00:21<02:32, 21.75s/it]

Starting bulk search for query: Reinforcement Learning + (EV Charging | Smart Charging)
Found 575 paper IDs. Now fetching details...


Fetching paper details: 100%|██████████| 2/2 [00:13<00:00,  6.74s/it]
Semantic Scholar Queries:  25%|██▌       | 2/8 [00:37<01:50, 18.41s/it]

Starting bulk search for query: (Explainable AI | XAI) + (Smart Grid | Grid Stability)
Found 38 paper IDs. Now fetching details...


Fetching paper details: 100%|██████████| 1/1 [00:07<00:00,  7.60s/it]
Semantic Scholar Queries:  38%|███▊      | 3/8 [00:47<01:11, 14.32s/it]

Starting bulk search for query: Causal Inference + (Energy Systems | Power Grid)
Found 27 paper IDs. Now fetching details...


Fetching paper details: 100%|██████████| 1/1 [00:02<00:00,  2.17s/it]
Semantic Scholar Queries:  50%|█████     | 4/8 [00:51<00:41, 10.40s/it]

Starting bulk search for query: (Reinforcement Learning | Deep Reinforcement Learning) + (Vehicle-to-Grid | V2G | Demand Response)
Found 1000 paper IDs. Now fetching details...


Fetching paper details: 100%|██████████| 3/3 [00:43<00:00, 14.64s/it]
Semantic Scholar Queries:  62%|██████▎   | 5/8 [01:38<01:10, 23.58s/it]

Starting bulk search for query: (Interpretable Reinforcement Learning | Explainable Reinforcement Learning") + (Power Grid)
Found 55 paper IDs. Now fetching details...


Fetching paper details: 100%|██████████| 1/1 [00:02<00:00,  2.71s/it]
Semantic Scholar Queries:  75%|███████▌  | 6/8 [01:43<00:34, 17.21s/it]

Starting bulk search for query: Causal Inference + (Vehicle-to-Grid | EV Charging)
Found 2 paper IDs. Now fetching details...


Fetching paper details: 100%|██████████| 1/1 [00:01<00:00,  1.73s/it]
Semantic Scholar Queries:  88%|████████▊ | 7/8 [01:47<00:12, 12.85s/it]

Starting bulk search for query: (Safe Reinforcement Learning | Robust Reinforcement Learning) + (Power Grid | EV Charging | Smart Charging)
Found 82 paper IDs. Now fetching details...


Fetching paper details: 100%|██████████| 1/1 [00:03<00:00,  3.59s/it]
Semantic Scholar Queries: 100%|██████████| 8/8 [01:52<00:00, 14.10s/it]

Saved 2055 unique papers from Semantic Scholar to data/raw/raw_semantic_scholar.csv





# 6. Analyze CSV Result

In [76]:
df = pd.read_csv('./data/raw/raw_semantic_scholar.csv')
print(f"Loaded {len(df)} papers from CSV.")

df.head()

Loaded 2055 papers from CSV.


Unnamed: 0,id,title,abstract,year,publicationDate,doi,venue,venue_type,venue_url,fields_of_study,s2_fields_of_study,citation_count,influential_citation_count,reference_count,references,author_id,authors,s2_url,open_access_pdf,query
0,00b75f61f8bd3246fff75f84d852ba3e80d5338e,Applications of information Nonanticipative Ra...,The objective of this paper is to further inve...,2014.0,2014-01-22,10.1109/ISIT.2014.6875397,2014 IEEE International Symposium on Informati...,,,"['Mathematics', 'Computer Science']","['Mathematics', 'Computer Science']",4,0,18,"['5761c61368f672c4516a6914674a39e6aa5983b8', '...","['145657810', '2081852', '1745427']","['Photios A. Stavrou', 'C. Kourtellaris', 'C. ...",https://www.semanticscholar.org/paper/00b75f61...,http://arxiv.org/pdf/1401.5828,Causal Reinforcement Learning | Causal RL
1,01befcd360d36d520f595b34d5d26e37e0ac16f3,Explainable Agency in Reinforcement Learning A...,This thesis explores how reinforcement learnin...,2020.0,2020-04-03,10.1609/aaai.v34i10.7134,AAAI Conference on Artificial Intelligence,conference,http://www.aaai.org/,['Computer Science'],['Computer Science'],1,0,14,"['cfb68baa23048e3e0f8845c099fa013797bd623f', '...",['9303604'],['Prashan Madumal'],https://www.semanticscholar.org/paper/01befcd3...,https://doi.org/10.1609/aaai.v34i10.7134,Causal Reinforcement Learning | Causal RL
2,01e9241dbb9eaca99b86468bb079f4b631b71671,Causal prompting model-based offline reinforce...,Model-based offline Reinforcement Learning (RL...,2024.0,2024-06-03,10.48550/arXiv.2406.01065,arXiv.org,,https://arxiv.org,['Computer Science'],['Computer Science'],0,0,0,[],"['2116329956', '2153979217', '2303466987', '22...","['Xuehui Yu', 'Yi Guan', 'Rujia Shen', 'Xin Li...",https://www.semanticscholar.org/paper/01e9241d...,,Causal Reinforcement Learning | Causal RL
3,026dc8d3cbb360bdd12d19c924bc633221c9b423,Learning Causal Overhypotheses through Explora...,Despite recent progress in reinforcement learn...,2022.0,2022-02-21,,CLEaR,conference,http://www.jolace.com/publications/clear/,['Computer Science'],['Computer Science'],9,1,50,"['3be84e24b144541a8cd9030526ef2b8ef2cbfe54', '...","['8519553', '2150491107', '39229748', '1525028...","['Eliza Kosoy', 'Adrian Liu', 'Jasmine Collins...",https://www.semanticscholar.org/paper/026dc8d3...,,Causal Reinforcement Learning | Causal RL
4,0348b36927f740b82f51afcd1c35cae8386bc336,Segmented Encoding for Sim2Real of RL-based En...,Among the challenges in the recent research of...,2022.0,2022-06-05,10.1109/iv51971.2022.9827374,2022 IEEE Intelligent Vehicles Symposium (IV),,,['Computer Science'],['Computer Science'],3,0,20,[],"['47238664', '39530824', '2179287901', '673452...","['Seung H. Chung', 'S. Kong', 'S. Cho', 'I. M....",https://www.semanticscholar.org/paper/0348b369...,,Causal Reinforcement Learning | Causal RL


In [77]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2055 entries, 0 to 2054
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          2055 non-null   object 
 1   title                       2055 non-null   object 
 2   abstract                    1902 non-null   object 
 3   year                        2015 non-null   float64
 4   publicationDate             1734 non-null   object 
 5   doi                         1780 non-null   object 
 6   venue                       1792 non-null   object 
 7   venue_type                  1139 non-null   object 
 8   venue_url                   1191 non-null   object 
 9   fields_of_study             2055 non-null   object 
 10  s2_fields_of_study          2055 non-null   object 
 11  citation_count              2055 non-null   int64  
 12  influential_citation_count  2055 non-null   int64  
 13  reference_count             2055 

# 7. Try openAlex to fill in missing information

In [70]:
OPENALEX_BASE = "https://api.openalex.org/works"
OPENALEX_EMAIL = "maxistahl92@gmail.com"  # <-- set a real contact email for the polite pool

# Reuse one session; identify the client per OpenAlex guidance.
_OA = requests.Session()
_OA.headers.update({
    "User-Agent": f"Verdantis-Research/1.0 (mailto:{OPENALEX_EMAIL})",
    "Accept": "application/json",
})

# ----------------------------- small utilities -----------------------------

def _normalize_name(s: str) -> str:
    """Normalizes names/titles for robust equality (third-person commentary)."""
    if not isinstance(s, str): 
        return ""
    t = unicodedata.normalize("NFKD", s)
    t = "".join(ch for ch in t if not unicodedata.combining(ch))
    t = re.sub(r"[^\w\s]", " ", t.lower())
    return re.sub(r"\s+", " ", t).strip()

def _to_bare_doi(doi: Optional[str]) -> str:
    """Converts any DOI form to bare '10.xxxx/...', matching your CSV style."""
    if not doi:
        return ""
    d = doi.strip().lower()
    if d.startswith("https://doi.org/"):
        return d[len("https://doi.org/"):]
    if d.startswith("http://doi.org/"):
        return d[len("http://doi.org/"):]
    return d

def _request_with_retries(url: str, params: Dict[str, Any] | None = None, max_retries: int = 3) -> Optional[dict]:
    """Performs a GET with basic retries on 429/5xx; returns parsed JSON or None."""
    for attempt in range(1, max_retries + 1):
        try:
            resp = _OA.get(url, params=params, timeout=30)
            if resp.status_code in (429, 500, 502, 503, 504):
                ra = resp.headers.get("Retry-After")
                sleep_s = float(ra) if ra and ra.isdigit() else (2 ** (attempt - 1))
                time.sleep(sleep_s)
                continue
            resp.raise_for_status()
            return resp.json()
        except requests.exceptions.RequestException as e:
            print(f"[warn] OpenAlex GET attempt {attempt} failed: {e}")
            if attempt == max_retries:
                return None
            time.sleep(2 ** (attempt - 1))
    return None

# ----------------------------- API-facing primitives -----------------------------

def openalex_get_by_doi(doi_bare: str, select: str = "id,doi,display_name,publication_year,publication_date,authorships,primary_location,locations,abstract_inverted_index") -> Optional[dict]:
    """
    Fetches a single Work by DOI via `/works/https://doi.org/<doi>`. 
    This follows the 'get a single work' contract for external IDs. 
    """
    # Per docs, DOIs can be used in the path: /works/https://doi.org/<doi>
    # Add select to minimize payload. 
    url = f"{OPENALEX_BASE}/https://doi.org/{doi_bare}"
    data = _request_with_retries(url, params={"select": select})
    return data

def openalex_get_by_openalex_id(openalex_id: str, select: str = "id,doi,display_name,publication_year,publication_date,authorships,primary_location,locations,abstract_inverted_index") -> Optional[dict]:
    """Fetches a single Work by OpenAlex ID (`/works/W...`)."""
    url = f"{OPENALEX_BASE}/{openalex_id}"
    data = _request_with_retries(url, params={"select": select})
    return data

def _search_candidates_by_title_year(title: str, year_center: Optional[int] = None, year_window: int = 1, per_page: int = 25) -> List[dict]:
    """
    Searches for candidate Works by title and optional year window using official search+filter.
    - Uses `search=<title>` (recommended) and combines with `filter` on publication_year or date range.
    - Returns a small list of candidate Work dicts with minimal fields.
    """
    params = {
        "search": title,  # per docs: best way to search works (titles, abstracts, some fulltext)
        "per_page": per_page,
        "select": "id,doi,display_name,publication_year,publication_date,authorships,cited_by_count",
        "mailto": OPENALEX_EMAIL,
        "sort": "relevance_score:desc"
    }

    # Build a publication_year OR list if a center is known (e.g., y-1|y|y+1). Docs allow OR with pipes.
    # Alternatively, from/to_publication_date could be used; year OR is simpler here.
    if isinstance(year_center, int):
        ys = [str(y) for y in range(year_center - year_window, year_center + year_window + 1)]
        params["filter"] = f"publication_year:{'|'.join(ys)}"

    data = _request_with_retries(OPENALEX_BASE, params=params)
    if not data or "results" not in data:
        return []
    return data["results"] or []

def _pick_best_candidate(title: str, authors: List[str] | None, candidates: List[dict]) -> Optional[dict]:
    """
    Picks the best candidate:
    1) Exact-ish normalized title match AND any author display_name matching (if authors provided).
    2) Exact-ish title match.
    3) Otherwise most-cited with DOI.
    4) Else top result.
    """
    t_norm = _normalize_name(title)
    auth_norm = {_normalize_name(a) for a in (authors or []) if a}

    # Pass 1: title + author name match (client-side, because API discourages related-entity name search)
    for c in candidates:
        if _normalize_name(c.get("display_name", "")) == t_norm and c.get("authorships"):
            if not auth_norm:
                return c
            c_names = {_normalize_name(a.get("author", {}).get("display_name", "")) for a in c["authorships"]}
            if auth_norm & c_names:
                return c

    # Pass 2: title-only match
    for c in candidates:
        if _normalize_name(c.get("display_name", "")) == t_norm:
            return c

    # Pass 3: best cited with DOI
    with_doi = [c for c in candidates if c.get("doi")]
    if with_doi:
        return max(with_doi, key=lambda x: x.get("cited_by_count") or 0)

    # Pass 4: first by relevance
    return candidates[0] if candidates else None

# ----------------------------- field extraction helpers -----------------------------

def _extract_venue(work: dict) -> str:
    """
    Extracts venue name from OpenAlex Work. 
    Prefer primary_location.source.display_name; else first locations[].source.display_name.
    """
    # primary_location and locations are the new fields per docs (host_venue deprecated).
    # https://docs.openalex.org/api-entities/works/work-object
    src = ((work.get("primary_location") or {}).get("source") or {}).get("display_name")
    if src:
        return src
    for loc in (work.get("locations") or []):
        dn = ((loc or {}).get("source") or {}).get("display_name")
        if dn:
            return dn
    return ""

def _extract_plain_abstract(work: dict) -> str:
    """
    Converts abstract_inverted_index (if present) to a plaintext abstract.
    OpenAlex provides abstracts as an inverted index; this reconstructs the text.
    """
    inv = work.get("abstract_inverted_index") or {}
    if not isinstance(inv, dict) or not inv:
        return ""
    # Build position->token map, then join in order.
    pos_to_token: Dict[int, str] = {}
    for token, positions in inv.items():
        for p in positions:
            pos_to_token[p] = token
    return " ".join(pos_to_token[i] for i in range(0, max(pos_to_token.keys()) + 1) if i in pos_to_token)

# ----------------------------- high-level cases -----------------------------

def resolve_by_title_author_year_to_openalex(title: str,
                                             authors: List[str] | None = None,
                                             year_center: Optional[int] = None,
                                             year_window: int = 1) -> Optional[dict]:
    """
    Case 1: DOI missing. Finds the best Work by title (+ optional authors, year±window), then returns the Work dict.
    - Uses search + publication_year filter (OR list).
    - Post-filters by author names locally (per docs: author name search is not supported at /works).
    """
    cands = _search_candidates_by_title_year(title, year_center, year_window)
    best = _pick_best_candidate(title, authors, cands)
    if not best:
        return None
    # If only minimal fields were selected, fetch full single entity for completeness (cheap, 1 call).
    return openalex_get_by_openalex_id(best["id"].split("/")[-1], 
        select="id,doi,display_name,publication_year,publication_date,authorships,primary_location,locations,abstract_inverted_index")

def resolve_venue_abstract_by_doi_or_title(doi_bare: Optional[str],
                                           title: Optional[str],
                                           authors: List[str] | None = None,
                                           year_center: Optional[int] = None) -> Optional[dict]:
    """
    Case 2: Venue/Abstract missing. 
    - If DOI present: get single work by DOI.
    - Else: fall back to Case 1 (title+author+/-year).
    """
    if isinstance(doi_bare, str) and doi_bare.strip():
        w = openalex_get_by_doi(_to_bare_doi(doi_bare))
        if w:
            return w
    if isinstance(title, str) and title.strip():
        return resolve_by_title_author_year_to_openalex(title, authors, year_center)
    return None

def resolve_year_by_any(doi_bare: Optional[str],
                        title: Optional[str],
                        authors: List[str] | None = None) -> Optional[int]:
    """
    Case 3: Year missing. 
    - Prefer DOI → single work → publication_year.
    - Else title+author → best candidate → publication_year.
    """
    if isinstance(doi_bare, str) and doi_bare.strip():
        w = openalex_get_by_doi(_to_bare_doi(doi_bare), select="id,publication_year")
        if w and w.get("publication_year") is not None:
            return int(w["publication_year"])
    if isinstance(title, str) and title.strip():
        best = resolve_by_title_author_year_to_openalex(title, authors, year_center=None)
        if best and best.get("publication_year") is not None:
            return int(best["publication_year"])
    return None

# ----------------------------- DataFrame integrator -----------------------------

def enrich_df_with_openalex(df,
                            title_col: str = "title",
                            authors_col: str = "authors",   # expects list of author names or a stringified list
                            doi_col: str = "doi",
                            venue_col: str = "venue",
                            abstract_col: str = "abstract",
                            year_col: str = "year",
                            mark_cols: Dict[str, str] = None,
                            sleep_between_requests: float = 0.25):
    """
    Enriches rows using OpenAlex, handling your three cases:
      1) DOI missing  -> resolve by title+authors(+/-year) to get DOI.
      2) Venue/Abstract missing -> fetch by DOI; if no DOI, reuse case 1.
      3) Year missing -> fetch via DOI or title+authors.
    The function updates df in-place and returns it for convenience.
    """
    if mark_cols is None:
        mark_cols = {
            "doi_from_openalex": "doi_from_openalex",
            "venue_from_openalex": "venue_from_openalex",
            "abstract_from_openalex": "abstract_from_openalex",
            "year_from_openalex": "year_from_openalex",
            "openalex_id": "openalex_id"
        }
    # Ensure marker columns exist
    for k in mark_cols.values():
        if k not in df.columns:
            df[k] = False if "from_openalex" in k else ""

    # Iterate once; try to minimize calls while covering cases
    for idx in tqdm(df.index, desc="OpenAlex enrichment"):
        title = df.at[idx, title_col] if title_col in df.columns else None
        authors_val = df.at[idx, authors_col] if authors_col in df.columns else None
        # Accept both list and stringified list; normalize to list[str]
        if isinstance(authors_val, str):
            # naive split for your CSV examples
            if authors_val.startswith("[") and authors_val.endswith("]"):
                # attempt to split names from stringified list like "['A', 'B']"
                names = re.findall(r"'([^']+)'|\"([^\"]+)\"", authors_val)
                authors = [a or b for (a, b) in names if (a or b)]
            else:
                authors = [a.strip() for a in authors_val.split(",") if a.strip()]
        elif isinstance(authors_val, list):
            authors = authors_val
        else:
            authors = None

        doi_bare = df.at[idx, doi_col] if doi_col in df.columns else None
        venue = df.at[idx, venue_col] if venue_col in df.columns else None
        abstract_txt = df.at[idx, abstract_col] if abstract_col in df.columns else None
        year_val = df.at[idx, year_col] if year_col in df.columns else None

        # Parse year as int if possible
        y = None
        if isinstance(year_val, (int, float)) and not (isinstance(year_val, float) and math.isnan(year_val)):
            y = int(year_val)

        # 1) DOI missing
        if not (isinstance(doi_bare, str) and doi_bare.strip()):
            work = resolve_by_title_author_year_to_openalex(title or "", authors, year_center=y, year_window=1)
            if work:
                doi_found = _to_bare_doi(work.get("doi") or "")
                if doi_found:
                    df.at[idx, doi_col] = doi_found
                    df.at[idx, mark_cols["doi_from_openalex"]] = True
                # keep openalex id for traceability
                df.at[idx, mark_cols["openalex_id"]] = work.get("id", "")
        else:
            # Normalize DOI in-place
            df.at[idx, doi_col] = _to_bare_doi(doi_bare)

        # 2) Venue / Abstract missing
        need_venue = not (isinstance(venue, str) and venue.strip())
        need_abs = not (isinstance(abstract_txt, str) and abstract_txt.strip())

        if need_venue or need_abs:
            # prefer DOI path
            doi_now = df.at[idx, doi_col]
            if isinstance(doi_now, str) and doi_now.strip():
                work = openalex_get_by_doi(doi_now)
            else:
                work = resolve_by_title_author_year_to_openalex(title or "", authors, year_center=y, year_window=1)

            if work:
                if need_venue:
                    v = _extract_venue(work)
                    if v:
                        df.at[idx, venue_col] = v
                        df.at[idx, mark_cols["venue_from_openalex"]] = True
                if need_abs:
                    abs_plain = _extract_plain_abstract(work)
                    if abs_plain:
                        df.at[idx, abstract_col] = abs_plain
                        df.at[idx, mark_cols["abstract_from_openalex"]] = True
                # record id if not present
                if not df.at[idx, mark_cols["openalex_id"]]:
                    df.at[idx, mark_cols["openalex_id"]] = work.get("id", "")

        # 3) Year missing
        if y is None or (isinstance(year_val, float) and math.isnan(year_val)):
            doi_now = df.at[idx, doi_col]
            year_guess = resolve_year_by_any(doi_now, title, authors)
            if year_guess is not None:
                df.at[idx, year_col] = int(year_guess)
                df.at[idx, mark_cols["year_from_openalex"]] = True

        time.sleep(sleep_between_requests)

    return df

In [79]:
# Enrich in place
df_alex = enrich_df_with_openalex(
    df,
    title_col="title",
    authors_col="authors",
    doi_col="doi",
    venue_col="venue",
    abstract_col="abstract",
    year_col="year",
    mark_cols={
        "doi_from_openalex": "doi_from_openalex",
        "venue_from_openalex": "venue_from_openalex",
        "abstract_from_openalex": "abstract_from_openalex",
        "year_from_openalex": "year_from_openalex",
        "openalex_id": "openalex_id"
    },
    sleep_between_requests=0.2
)

OpenAlex enrichment: 100%|██████████| 2055/2055 [10:25<00:00,  3.28it/s]


In [80]:
df_alex.head()

Unnamed: 0,id,title,abstract,year,publicationDate,doi,venue,venue_type,venue_url,fields_of_study,...,author_id,authors,s2_url,open_access_pdf,query,doi_from_openalex,venue_from_openalex,abstract_from_openalex,year_from_openalex,openalex_id
0,00b75f61f8bd3246fff75f84d852ba3e80d5338e,Applications of information Nonanticipative Ra...,The objective of this paper is to further inve...,2014.0,2014-01-22,10.1109/isit.2014.6875397,2014 IEEE International Symposium on Informati...,,,"['Mathematics', 'Computer Science']",...,"['145657810', '2081852', '1745427']","['Photios A. Stavrou', 'C. Kourtellaris', 'C. ...",https://www.semanticscholar.org/paper/00b75f61...,http://arxiv.org/pdf/1401.5828,Causal Reinforcement Learning | Causal RL,False,False,False,False,
1,01befcd360d36d520f595b34d5d26e37e0ac16f3,Explainable Agency in Reinforcement Learning A...,This thesis explores how reinforcement learnin...,2020.0,2020-04-03,10.1609/aaai.v34i10.7134,AAAI Conference on Artificial Intelligence,conference,http://www.aaai.org/,['Computer Science'],...,['9303604'],['Prashan Madumal'],https://www.semanticscholar.org/paper/01befcd3...,https://doi.org/10.1609/aaai.v34i10.7134,Causal Reinforcement Learning | Causal RL,False,False,False,False,
2,01e9241dbb9eaca99b86468bb079f4b631b71671,Causal prompting model-based offline reinforce...,Model-based offline Reinforcement Learning (RL...,2024.0,2024-06-03,10.48550/arxiv.2406.01065,arXiv.org,,https://arxiv.org,['Computer Science'],...,"['2116329956', '2153979217', '2303466987', '22...","['Xuehui Yu', 'Yi Guan', 'Rujia Shen', 'Xin Li...",https://www.semanticscholar.org/paper/01e9241d...,,Causal Reinforcement Learning | Causal RL,False,False,False,False,
3,026dc8d3cbb360bdd12d19c924bc633221c9b423,Learning Causal Overhypotheses through Explora...,Despite recent progress in reinforcement learn...,2022.0,2022-02-21,10.48550/arxiv.2202.10430,CLEaR,conference,http://www.jolace.com/publications/clear/,['Computer Science'],...,"['8519553', '2150491107', '39229748', '1525028...","['Eliza Kosoy', 'Adrian Liu', 'Jasmine Collins...",https://www.semanticscholar.org/paper/026dc8d3...,,Causal Reinforcement Learning | Causal RL,True,False,False,False,https://openalex.org/W4221153082
4,0348b36927f740b82f51afcd1c35cae8386bc336,Segmented Encoding for Sim2Real of RL-based En...,Among the challenges in the recent research of...,2022.0,2022-06-05,10.1109/iv51971.2022.9827374,2022 IEEE Intelligent Vehicles Symposium (IV),,,['Computer Science'],...,"['47238664', '39530824', '2179287901', '673452...","['Seung H. Chung', 'S. Kong', 'S. Cho', 'I. M....",https://www.semanticscholar.org/paper/0348b369...,,Causal Reinforcement Learning | Causal RL,False,False,False,False,


In [81]:
df_alex.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2055 entries, 0 to 2054
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          2055 non-null   object 
 1   title                       2055 non-null   object 
 2   abstract                    1972 non-null   object 
 3   year                        2048 non-null   float64
 4   publicationDate             1734 non-null   object 
 5   doi                         1930 non-null   object 
 6   venue                       1947 non-null   object 
 7   venue_type                  1139 non-null   object 
 8   venue_url                   1191 non-null   object 
 9   fields_of_study             2055 non-null   object 
 10  s2_fields_of_study          2055 non-null   object 
 11  citation_count              2055 non-null   int64  
 12  influential_citation_count  2055 non-null   int64  
 13  reference_count             2055 

In [86]:
df_normalized.to_csv('./data/raw/openAlex_enriched_papers.csv', index=False)

# 8. Normalize the Data

In [94]:
def _clean_str(s):
    """Trim + collapse spaces; preserve case for display."""
    if not isinstance(s, str):
        return ""
    s = unicodedata.normalize("NFKC", s).strip()
    return re.sub(r"\s+", " ", s)

def _normalize_pubdate(s):
    """Preserve YYYY-MM-DD; map YYYY-MM -> first of month; YYYY -> Jan 1; else blank."""
    if not isinstance(s, str) or not s.strip():
        return ""
    s = s.strip()
    if re.fullmatch(r"\d{4}-\d{2}-\d{2}", s):
        try:
            return date.fromisoformat(s).isoformat()
        except ValueError:
            return ""
    if re.fullmatch(r"\d{4}-\d{2}", s):
        y, m = s.split("-")
        return f"{y}-{m}-01"
    if re.fullmatch(r"\d{4}", s):
        return f"{s}-01-01"
    return ""

def normalize_df(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    # DOIs: bare lowercase, trimmed (assumes you’ve already stripped https://doi.org/)
    df["doi"] = df["doi"].astype(str).str.strip().str.lower()

    # Titles/Venues: trim + collapse whitespace (preserve case for display)
    df["title"] = df["title"].apply(_clean_str)
    df["venue"] = df["venue"].apply(_clean_str)

    # Publication date: preserve day when present
    df["publicationDate"] = df["publicationDate"].apply(_normalize_pubdate)

    # Year as nullable Int64
    if "year" in df.columns and pd.api.types.is_numeric_dtype(df["year"]):
        df["year"] = df["year"].round().astype("Int64")

    return df

In [95]:
df_normalized = normalize_df(df_alex)
df_normalized.head()

Unnamed: 0,id,title,abstract,year,publicationDate,doi,venue,venue_type,venue_url,fields_of_study,...,author_id,authors,s2_url,open_access_pdf,query,doi_from_openalex,venue_from_openalex,abstract_from_openalex,year_from_openalex,openalex_id
0,00b75f61f8bd3246fff75f84d852ba3e80d5338e,Applications of information Nonanticipative Ra...,The objective of this paper is to further inve...,2014,2014-01-22,10.1109/isit.2014.6875397,2014 IEEE International Symposium on Informati...,,,"['Mathematics', 'Computer Science']",...,"['145657810', '2081852', '1745427']","['Photios A. Stavrou', 'C. Kourtellaris', 'C. ...",https://www.semanticscholar.org/paper/00b75f61...,http://arxiv.org/pdf/1401.5828,Causal Reinforcement Learning | Causal RL,False,False,False,False,
1,01befcd360d36d520f595b34d5d26e37e0ac16f3,Explainable Agency in Reinforcement Learning A...,This thesis explores how reinforcement learnin...,2020,2020-04-03,10.1609/aaai.v34i10.7134,AAAI Conference on Artificial Intelligence,conference,http://www.aaai.org/,['Computer Science'],...,['9303604'],['Prashan Madumal'],https://www.semanticscholar.org/paper/01befcd3...,https://doi.org/10.1609/aaai.v34i10.7134,Causal Reinforcement Learning | Causal RL,False,False,False,False,
2,01e9241dbb9eaca99b86468bb079f4b631b71671,Causal prompting model-based offline reinforce...,Model-based offline Reinforcement Learning (RL...,2024,2024-06-03,10.48550/arxiv.2406.01065,arXiv.org,,https://arxiv.org,['Computer Science'],...,"['2116329956', '2153979217', '2303466987', '22...","['Xuehui Yu', 'Yi Guan', 'Rujia Shen', 'Xin Li...",https://www.semanticscholar.org/paper/01e9241d...,,Causal Reinforcement Learning | Causal RL,False,False,False,False,
3,026dc8d3cbb360bdd12d19c924bc633221c9b423,Learning Causal Overhypotheses through Explora...,Despite recent progress in reinforcement learn...,2022,2022-02-21,10.48550/arxiv.2202.10430,CLEaR,conference,http://www.jolace.com/publications/clear/,['Computer Science'],...,"['8519553', '2150491107', '39229748', '1525028...","['Eliza Kosoy', 'Adrian Liu', 'Jasmine Collins...",https://www.semanticscholar.org/paper/026dc8d3...,,Causal Reinforcement Learning | Causal RL,True,False,False,False,https://openalex.org/W4221153082
4,0348b36927f740b82f51afcd1c35cae8386bc336,Segmented Encoding for Sim2Real of RL-based En...,Among the challenges in the recent research of...,2022,2022-06-05,10.1109/iv51971.2022.9827374,2022 IEEE Intelligent Vehicles Symposium (IV),,,['Computer Science'],...,"['47238664', '39530824', '2179287901', '673452...","['Seung H. Chung', 'S. Kong', 'S. Cho', 'I. M....",https://www.semanticscholar.org/paper/0348b369...,,Causal Reinforcement Learning | Causal RL,False,False,False,False,


In [98]:
df_normalized.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2055 entries, 0 to 2054
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   id                          2055 non-null   object
 1   title                       2055 non-null   object
 2   abstract                    1972 non-null   object
 3   year                        2048 non-null   Int64 
 4   publicationDate             2055 non-null   object
 5   doi                         2055 non-null   object
 6   venue                       2055 non-null   object
 7   venue_type                  1139 non-null   object
 8   venue_url                   1191 non-null   object
 9   fields_of_study             2055 non-null   object
 10  s2_fields_of_study          2055 non-null   object
 11  citation_count              2055 non-null   int64 
 12  influential_citation_count  2055 non-null   int64 
 13  reference_count             2055 non-null   int6

In [97]:
df_normalized.to_csv('./data/processed/normalized_papers.csv', index=False)