# 1. Imports, paths and logging

In [2]:
from __future__ import annotations

import json
import re
from pathlib import Path
from datetime import datetime
from typing import List, Optional, Tuple

import numpy as np
import pandas as pd

import logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")
logger = logging.getLogger("create_node_csvs")

# Input and output locations
IN_PATH = Path("./data/processed/normalized_papers.csv")
OUT_DIR = Path("./data/processed")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Node CSV outputs
PAPERS_CSV         = OUT_DIR / "Papers.csv"
QUERIES_CSV        = OUT_DIR / "Queries.csv"
FIELDS_CSV         = OUT_DIR / "FieldsOfStudy.csv"
AUTHORS_CSV        = OUT_DIR / "Authors.csv"
VENUES_CSV         = OUT_DIR / "Venues.csv"

ADDED_AT = datetime.utcnow().replace(microsecond=0).isoformat() + "Z"
print("Input:", IN_PATH)
print("Output dir:", OUT_DIR)

Input: data/processed/normalized_papers.csv
Output dir: data/processed


# 2. Helpers

In [3]:
def parse_list_field(val) -> List[str]:
    """
    Parse a list-like field that may be:
    - a true Python list,
    - a JSON-encoded list of strings or dicts,
    - a delimited string (| ; ,),
    - a bare scalar.
    Returns a list of stripped strings (empty if nothing usable).
    """
    if val is None:
        return []
    if isinstance(val, list):
        out = []
        for x in val:
            if isinstance(x, dict):
                # Author/references dicts may occur elsewhere; here we just stringify non-empty values.
                out.append(json.dumps(x, ensure_ascii=False))
            else:
                s = str(x).strip()
                if s:
                    out.append(s)
        return out
    s = str(val).strip()
    if not s:
        return []
    # Try JSON list first
    try:
        obj = json.loads(s)
        if isinstance(obj, list):
            out = []
            for x in obj:
                if isinstance(x, dict):
                    out.append(json.dumps(x, ensure_ascii=False))
                else:
                    xs = str(x).strip()
                    if xs:
                        out.append(xs)
            return out
    except Exception:
        pass
    # Fallback delimiters
    for sep in ("|", "; ", ";", ", ", ","):
        if sep in s:
            return [t.strip() for t in s.split(sep) if t.strip()]
    return [s]

def slugify(text: str) -> str:
    """Make a deterministic slug: lowercase, spaces→hyphens, keep [a-z0-9-]."""
    if text is None:
        return ""
    s = str(text).strip().lower()
    s = re.sub(r"\s+", "-", s)
    s = re.sub(r"[^a-z0-9\-]+", "", s)
    s = re.sub(r"-{2,}", "-", s).strip("-")
    return s

def norm_doi(doi: Optional[str]) -> str:
    """
    Normalize DOI to lowercase without URL prefixes. Returns empty string if missing.
    Assumes upstream normalization mostly done; this adds extra safety.
    """
    if doi is None:
        return ""
    s = str(doi).strip().lower()
    if not s:
        return ""
    s = re.sub(r"^https?://(dx\.)?doi\.org/", "", s)
    return s

# 3. Load Normalized Data

In [4]:
# Load the user's normalized CSV and standardize minimal types without re-normalizing content.
df = pd.read_csv(IN_PATH)

# Critical identifiers as strings; year as nullable Int64
df["id"] = df["id"].astype(str)
if "year" in df.columns:
    df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64")
if "doi" in df.columns:
    df["doi"] = df["doi"].apply(norm_doi)

# Quick schema check print
print("Columns:", df.columns.tolist())
logger.info("Rows: %d | Unique paper ids: %d", len(df), df["id"].nunique())

# Ensure expected columns exist; fill missing non-critical columns with empty strings if absent
for col in ["title","abstract","publicationDate","venue","venue_type","venue_url","fields_of_study",
            "s2_fields_of_study","citation_count","influential_citation_count","reference_count",
            "s2_url","open_access_pdf","query","openalex_id"]:
    if col not in df.columns:
        df[col] = ""

df.head(2)

2025-08-29 14:57:13,405 | INFO | Rows: 2055 | Unique paper ids: 2055


Columns: ['id', 'title', 'abstract', 'year', 'publicationDate', 'doi', 'venue', 'venue_type', 'venue_url', 'fields_of_study', 's2_fields_of_study', 'citation_count', 'influential_citation_count', 'reference_count', 'references', 'author_id', 'authors', 's2_url', 'open_access_pdf', 'query', 'doi_from_openalex', 'venue_from_openalex', 'abstract_from_openalex', 'year_from_openalex', 'openalex_id']


Unnamed: 0,id,title,abstract,year,publicationDate,doi,venue,venue_type,venue_url,fields_of_study,...,author_id,authors,s2_url,open_access_pdf,query,doi_from_openalex,venue_from_openalex,abstract_from_openalex,year_from_openalex,openalex_id
0,00b75f61f8bd3246fff75f84d852ba3e80d5338e,Applications of information Nonanticipative Ra...,The objective of this paper is to further inve...,2014,2014-01-22,10.1109/isit.2014.6875397,2014 IEEE International Symposium on Informati...,,,"['Mathematics', 'Computer Science']",...,"['145657810', '2081852', '1745427']","['Photios A. Stavrou', 'C. Kourtellaris', 'C. ...",https://www.semanticscholar.org/paper/00b75f61...,http://arxiv.org/pdf/1401.5828,Causal Reinforcement Learning | Causal RL,False,False,False,False,
1,01befcd360d36d520f595b34d5d26e37e0ac16f3,Explainable Agency in Reinforcement Learning A...,This thesis explores how reinforcement learnin...,2020,2020-04-03,10.1609/aaai.v34i10.7134,AAAI Conference on Artificial Intelligence,conference,http://www.aaai.org/,['Computer Science'],...,['9303604'],['Prashan Madumal'],https://www.semanticscholar.org/paper/01befcd3...,https://doi.org/10.1609/aaai.v34i10.7134,Causal Reinforcement Learning | Causal RL,False,False,False,False,


In [5]:
df

Unnamed: 0,id,title,abstract,year,publicationDate,doi,venue,venue_type,venue_url,fields_of_study,...,author_id,authors,s2_url,open_access_pdf,query,doi_from_openalex,venue_from_openalex,abstract_from_openalex,year_from_openalex,openalex_id
0,00b75f61f8bd3246fff75f84d852ba3e80d5338e,Applications of information Nonanticipative Ra...,The objective of this paper is to further inve...,2014,2014-01-22,10.1109/isit.2014.6875397,2014 IEEE International Symposium on Informati...,,,"['Mathematics', 'Computer Science']",...,"['145657810', '2081852', '1745427']","['Photios A. Stavrou', 'C. Kourtellaris', 'C. ...",https://www.semanticscholar.org/paper/00b75f61...,http://arxiv.org/pdf/1401.5828,Causal Reinforcement Learning | Causal RL,False,False,False,False,
1,01befcd360d36d520f595b34d5d26e37e0ac16f3,Explainable Agency in Reinforcement Learning A...,This thesis explores how reinforcement learnin...,2020,2020-04-03,10.1609/aaai.v34i10.7134,AAAI Conference on Artificial Intelligence,conference,http://www.aaai.org/,['Computer Science'],...,['9303604'],['Prashan Madumal'],https://www.semanticscholar.org/paper/01befcd3...,https://doi.org/10.1609/aaai.v34i10.7134,Causal Reinforcement Learning | Causal RL,False,False,False,False,
2,01e9241dbb9eaca99b86468bb079f4b631b71671,Causal prompting model-based offline reinforce...,Model-based offline Reinforcement Learning (RL...,2024,2024-06-03,10.48550/arxiv.2406.01065,arXiv.org,,https://arxiv.org,['Computer Science'],...,"['2116329956', '2153979217', '2303466987', '22...","['Xuehui Yu', 'Yi Guan', 'Rujia Shen', 'Xin Li...",https://www.semanticscholar.org/paper/01e9241d...,,Causal Reinforcement Learning | Causal RL,False,False,False,False,
3,026dc8d3cbb360bdd12d19c924bc633221c9b423,Learning Causal Overhypotheses through Explora...,Despite recent progress in reinforcement learn...,2022,2022-02-21,10.48550/arxiv.2202.10430,CLEaR,conference,http://www.jolace.com/publications/clear/,['Computer Science'],...,"['8519553', '2150491107', '39229748', '1525028...","['Eliza Kosoy', 'Adrian Liu', 'Jasmine Collins...",https://www.semanticscholar.org/paper/026dc8d3...,,Causal Reinforcement Learning | Causal RL,True,False,False,False,https://openalex.org/W4221153082
4,0348b36927f740b82f51afcd1c35cae8386bc336,Segmented Encoding for Sim2Real of RL-based En...,Among the challenges in the recent research of...,2022,2022-06-05,10.1109/iv51971.2022.9827374,2022 IEEE Intelligent Vehicles Symposium (IV),,,['Computer Science'],...,"['47238664', '39530824', '2179287901', '673452...","['Seung H. Chung', 'S. Kong', 'S. Cho', 'I. M....",https://www.semanticscholar.org/paper/0348b369...,,Causal Reinforcement Learning | Causal RL,False,False,False,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2050,97372d4647c90b9ead00e26c249c86e5bb435614,Intelligent Algorithms for Coordinated Control...,\n In an effort to lessen the impact of cities...,2025,2025-01-01,10.1115/1.4068494,ASME Open Journal of Engineering,journal,,[],...,"['2361199578', '2343294431', '2293586726', '23...","['Yanfang Liu', 'Songling Pang', 'Ruien Zhang'...",https://www.semanticscholar.org/paper/97372d46...,,(Safe Reinforcement Learning | Robust Reinforc...,False,False,False,False,
2051,b25c560cbf5e997c4be5909877243ba9662b71c4,Research on Optimization Configuration of Urba...,With the deepening of the “dual carbon” policy...,2024,2024-10-25,10.1109/icemce64157.2024.10862876,2024 8th International Conference on Electrica...,,,[],...,"['2345474608', '2345286608', '2345016098', '23...","['Jun Jia', 'Dong Liu', 'Ligang Ge', 'Fanyi Lu']",https://www.semanticscholar.org/paper/b25c560c...,,(Safe Reinforcement Learning | Robust Reinforc...,False,False,False,False,
2052,bb0c2063aa407119dbd8d128806f98fffdab015c,Deep Reinforcement Learning for Adaptive Optim...,This work focused on improving the control sch...,2024,2024-06-04,10.1109/gpecom61896.2024.10582714,"Global Power, Energy and Communication Conference",conference,,[],...,"['2223164613', '152709525']","['Richard Wiencek', 'Sagnika Ghosh']",https://www.semanticscholar.org/paper/bb0c2063...,,(Safe Reinforcement Learning | Robust Reinforc...,False,False,False,False,
2053,c29f7df6d625d5c1caf277e55182dfc426603cdc,State Predictive Control of Modular SMES Magne...,Modular superconducting magnetic energy storag...,2022,,10.1109/tasc.2022.3148682,IEEE transactions on applied superconductivity,journal,http://ieeexplore.ieee.org/servlet/opac?punumb...,[],...,"['2116459274', '2109351453', '2119110951', '12...","['Zitong Zhang', 'Jing Shi', 'Shuqiang Guo', '...",https://www.semanticscholar.org/paper/c29f7df6...,,(Safe Reinforcement Learning | Robust Reinforc...,False,False,False,False,


# 4. Build Papers.csv

In [21]:
# Construct Papers.csv as the core node file.
# Columns chosen to balance graph normalization with convenience for analytics.

papers = pd.DataFrame({
    "paper_id": df["id"].astype(str),
    "title": df["title"].fillna(""),
    "abstract": df["abstract"].fillna(""),
    "year": df["year"].astype("Int64"),
    "publication_date": df["publicationDate"].fillna(""),
    "doi": df["doi"].fillna(""),
    "s2_url": df["s2_url"].fillna(""),
    "open_access_pdf": df["open_access_pdf"].astype(str).replace("nan",""),
    "venue": df["venue"].fillna(""),
    "added_at": ADDED_AT,
    "source": "SemanticScholar"
})

# Deduplicate strictly on paper_id
before = len(papers)
papers = papers.drop_duplicates(subset=["paper_id"])
after = len(papers)

papers.to_csv(PAPERS_CSV, index=False)
logger.info("Papers.csv written: %d rows (dropped %d duplicates)", after, before - after)
papers.head(2)

2025-08-29 11:22:44,999 | INFO | Papers.csv written: 2055 rows (dropped 0 duplicates)


Unnamed: 0,paper_id,title,abstract,year,publication_date,doi,s2_url,open_access_pdf,venue,added_at,source
0,00b75f61f8bd3246fff75f84d852ba3e80d5338e,Applications of information Nonanticipative Ra...,The objective of this paper is to further inve...,2014,2014-01-22,10.1109/isit.2014.6875397,https://www.semanticscholar.org/paper/00b75f61...,http://arxiv.org/pdf/1401.5828,2014 IEEE International Symposium on Informati...,2025-08-29T09:21:57Z,SemanticScholar
1,01befcd360d36d520f595b34d5d26e37e0ac16f3,Explainable Agency in Reinforcement Learning A...,This thesis explores how reinforcement learnin...,2020,2020-04-03,10.1609/aaai.v34i10.7134,https://www.semanticscholar.org/paper/01befcd3...,https://doi.org/10.1609/aaai.v34i10.7134,AAAI Conference on Artificial Intelligence,2025-08-29T09:21:57Z,SemanticScholar


# 5. Build Queries.csv

In [22]:
# Create a node file for queries so they can be connected later (Paper)-[:FROM_QUERY]->(Query).
queries = df["query"].fillna("").astype(str)
queries = queries[queries.str.len() > 0].drop_duplicates().sort_values()

queries_df = pd.DataFrame({
    "query_id": queries.apply(lambda s: "Q:" + slugify(s)),
    "name": queries.values,
    "added_at": ADDED_AT
})

queries_df.to_csv(QUERIES_CSV, index=False)
logger.info("Queries.csv written: %d rows", len(queries_df))
queries_df.head(10)

2025-08-29 11:23:25,942 | INFO | Queries.csv written: 8 rows


Unnamed: 0,query_id,name,added_at
977,Q:explainable-ai-xai-smart-grid-grid-stability,(Explainable AI | XAI) + (Smart Grid | Grid St...,2025-08-29T09:21:57Z
1996,Q:interpretable-reinforcement-learning-explain...,(Interpretable Reinforcement Learning | Explai...,2025-08-29T09:21:57Z
1041,Q:reinforcement-learning-deep-reinforcement-le...,(Reinforcement Learning | Deep Reinforcement L...,2025-08-29T09:21:57Z
2045,Q:safe-reinforcement-learning-robust-reinforce...,(Safe Reinforcement Learning | Robust Reinforc...,2025-08-29T09:21:57Z
1014,Q:causal-inference-energy-systems-power-grid,Causal Inference + (Energy Systems | Power Grid),2025-08-29T09:21:57Z
2043,Q:causal-inference-vehicle-to-grid-ev-charging,Causal Inference + (Vehicle-to-Grid | EV Charg...,2025-08-29T09:21:57Z
0,Q:causal-reinforcement-learning-causal-rl,Causal Reinforcement Learning | Causal RL,2025-08-29T09:21:57Z
403,Q:reinforcement-learning-ev-charging-smart-cha...,Reinforcement Learning + (EV Charging | Smart ...,2025-08-29T09:21:57Z


# 6. Build FieldsOfStudy.csv

In [27]:
# -- strip wrappers like quotes/brackets and tidy whitespace --
def _clean_label(s: str) -> str:
    if not isinstance(s, str):
        return ""
    s = s.strip()
    # remove a single pair of wrapping quotes if present
    if (s.startswith("'") and s.endswith("'")) or (s.startswith('"') and s.endswith('"')):
        s = s[1:-1].strip()
    # remove any leading/trailing bracket/paren/brace/quotes noise
    s = re.sub(r"^[\s\[\](){}'\"`]+|[\s$begin:math:display$$end:math:display$(){}'\"`]+$", "", s)
    # collapse inner whitespace
    s = re.sub(r"\s+", " ", s).strip()
    return s

fos_rows = []

def collect_fos(series, source_name):
    for raw in series.fillna("").astype(str):
        items = parse_list_field(raw) if raw else []
        for it in items:
            try:
                obj = json.loads(it)
                label = obj["name"].strip() if isinstance(obj, dict) and "name" in obj else str(it).strip()
            except Exception:
                label = str(it).strip()
            # Clean trailing punctuation
            label = label.strip("[]'\" ").strip()
            # Collapse inner whitespace
            label = re.sub(r"\s+", " ", label)
            if label:
                fos_rows.append((label, source_name))

collect_fos(df["fields_of_study"], "fields_of_study")
collect_fos(df["s2_fields_of_study"], "s2_fields_of_study")

tmp = pd.DataFrame(fos_rows, columns=["name", "source"])

# Deduplicate by name, preferring fields_of_study
def pick_source(sources):
    return "fields_of_study" if "fields_of_study" in sources.values else "s2_fields_of_study"

fos_df = (
    tmp.groupby("name", as_index=False)
       .agg(source=("source", pick_source))
)

# Make concept_id
fos_df["concept_id"] = fos_df["name"].apply(lambda n: f"F:{slugify(n)}")

# Final DataFrame
fos_df = fos_df[["concept_id", "name", "source"]].sort_values("name")

fos_df.to_csv(FIELDS_CSV, index=False)
logger.info("FieldsOfStudy.csv written: %d unique concepts", len(fos_df))
fos_df.head(10)

2025-08-29 13:29:05,244 | INFO | FieldsOfStudy.csv written: 14 unique concepts


Unnamed: 0,concept_id,name,source
0,F:art,Art,fields_of_study
1,F:biology,Biology,fields_of_study
2,F:business,Business,fields_of_study
3,F:computer-science,Computer Science,fields_of_study
4,F:economics,Economics,fields_of_study
5,F:engineering,Engineering,fields_of_study
6,F:environmental-science,Environmental Science,fields_of_study
7,F:geography,Geography,fields_of_study
8,F:mathematics,Mathematics,fields_of_study
9,F:medicine,Medicine,fields_of_study


# 7. Build Authors.csv

In [29]:
# Build Authors.csv by aligning 'author_id' and 'authors' list-like columns.
# If an 'author_id' is missing but a name exists, synthesize a stable id from the name (prefixed with 'name:').

author_rows: List[Tuple[str, str]] = []

def parse_authors(row: pd.Series) -> List[Tuple[str, str]]:
    ids = parse_list_field(row.get("author_id"))
    names = parse_list_field(row.get("authors"))
    n = max(len(ids), len(names))
    out = []
    for i in range(n):
        aid = ids[i] if i < len(ids) else ""
        nm  = names[i] if i < len(names) else ""
        aid = str(aid).strip("[]'\" ").strip()
        nm  = str(nm).strip("[]'\" ").strip()
        if not aid and nm:
            aid = f"name:{slugify(nm)}"
        if aid:
            out.append((aid, nm))
    return out

for _, r in df.iterrows():
    for aid, nm in parse_authors(r):
        author_rows.append((aid, nm))

authors_df = pd.DataFrame(author_rows, columns=["author_id","name"]).drop_duplicates().sort_values("author_id")
authors_df.to_csv(AUTHORS_CSV, index=False)
logger.info("Authors.csv written: %d authors", len(authors_df))
authors_df.head(10)

2025-08-29 13:48:57,676 | INFO | Authors.csv written: 7004 authors


Unnamed: 0,author_id,name
803,100508183,Harry Kitsikopoulos
2353,100560409,M. Tuka
5270,100638425,N. Madonsela
195,100667849,S. S. Eshkevari
2503,100702353,S. Nengroo
4067,100704713,Y. Villarroel
8025,100731317,Mehdi Jabbari Zideh
685,100847016,Bruce Nagy
962,10115554,Jeremy Nixon
1003,101181945,Tridib Mukherjee


# 8. Build Venues.csv

In [42]:
# Build Venues.csv with a deterministic venue_id derived from the venue name.
# Keep venue 'type' and 'url' as attributes (if provided).

tmp = df[["venue","venue_type","venue_url"]].copy()
tmp["venue"] = tmp["venue"].fillna("").astype(str).str.strip()
tmp["venue_type"] = tmp["venue_type"].fillna("").astype(str).str.strip()
tmp["venue_url"] = tmp["venue_url"].fillna("").astype(str).str.strip()

# Only keep non-empty venue names
tmp = tmp[tmp["venue"] != ""]

venues_df = tmp.drop_duplicates().copy()
venues_df["venue_id"] = venues_df["venue"].apply(lambda s: "V:" + slugify(s))

# Reorder and save
venues_df = venues_df[["venue_id","venue","venue_type","venue_url"]].rename(columns={
    "venue": "name",
    "venue_type": "type",
    "venue_url": "url"
})
venues_df.to_csv(VENUES_CSV, index=False)
logger.info("Venues.csv written: %d venues", len(venues_df))
venues_df.head(10)

2025-08-29 13:58:16,260 | INFO | Venues.csv written: 932 venues


Unnamed: 0,venue_id,name,type,url
0,V:2014-ieee-international-symposium-on-informa...,2014 IEEE International Symposium on Informati...,,
1,V:aaai-conference-on-artificial-intelligence,AAAI Conference on Artificial Intelligence,conference,http://www.aaai.org/
2,V:arxivorg,arXiv.org,,https://arxiv.org
3,V:clear,CLEaR,conference,http://www.jolace.com/publications/clear/
4,V:2022-ieee-intelligent-vehicles-symposium-iv,2022 IEEE Intelligent Vehicles Symposium (IV),,
6,V:annals-of-statistics,Annals of Statistics,journal,https://www.jstor.org/journal/annalsstatistics
7,V:international-conference-on-learning-represe...,International Conference on Learning Represent...,conference,https://iclr.cc/
8,V:north-american-chapter-of-the-association-fo...,North American Chapter of the Association for ...,conference,https://www.aclweb.org/portal/naacl
9,V:the-european-physical-journal-c,The European Physical Journal C,,
10,V:neural-information-processing-systems,Neural Information Processing Systems,conference,http://neurips.cc/
