In [1]:
import os
from pathlib import Path
from datetime import datetime
from typing import Iterable, Dict, Any, List

import pandas as pd
from neo4j import GraphDatabase, basic_auth
from neo4j.exceptions import ServiceUnavailable, TransientError
from dotenv import load_dotenv

In [13]:
project_dir = Path().cwd()
dotenv_path = project_dir / '.env'

# Checks if the .env file exists at the constructed path before loading.
if dotenv_path.exists():
    # Loads the environment variables from the found .env file.
    load_dotenv(dotenv_path=dotenv_path)
    print(f"Loaded .env file from: {dotenv_path}")
else:
    print(f".env file not found at: {dotenv_path}")

load_dotenv(dotenv_path=dotenv_path)
URI  = os.getenv("NEO4J_URI") 
URI = os.getenv("NEO4J_URI")
AUTH = (os.getenv("NEO4J_USERNAME"), os.getenv("NEO4J_PASSWORD"))


# Create a helper function to display results nicely
def query_to_dataframe(driver, query, **params):
    """Runs a query and returns the results as a pandas DataFrame."""
    with driver.session() as session:
        result = session.run(query, **params)
        return pd.DataFrame([r.data() for r in result])

# Establish the connection to the database
try:
    driver = GraphDatabase.driver(URI, auth=AUTH)
    driver.verify_connectivity()
    print("✅ Connection to Neo4j successful!")
except Exception as e:
    print(f"❌ Failed to connect to Neo4j: {e}")

Loaded .env file from: /Users/stahlma/Desktop/01_Studium/10_Seminar/causal-rl-ev-review/.env
✅ Connection to Neo4j successful!


# 1. Most Prolific Authors

In [4]:
# --- Query 1: Most Prolific Authors ---
prolific_authors_query = """
MATCH (a:Author)-[:AUTHORED]->(p:Paper)
RETURN a.name AS author, count(p) AS papers_authored
ORDER BY papers_authored DESC
LIMIT 15
"""
print("Finding the 15 most prolific authors...")
prolific_authors_df = query_to_dataframe(driver, prolific_authors_query)
display(prolific_authors_df)

Finding the 15 most prolific authors...


Unnamed: 0,author,papers_authored
0,Yang Li,12
1,Qingyu Yang,11
2,Biwei Huang,10
3,C. Spanos,10
4,Lucas Spangher,10
5,B. Claessens,10
6,Kun Zhang,9
7,F. Ruelens,9
8,Hao Wang,9
9,Donghe Li,9


# 2. Most Influential Papers

In [11]:
# --- Query 2: Most Cited Papers ---
# Load the CSV file into a pandas DataFrame
normalized_papers_df = pd.read_csv("./data/processed/normalized_papers.csv")

# Sort by citation_count (descending), then by influential_citation (descending) for tie-breaking
top_25_cited_df = normalized_papers_df.sort_values(
    by=["citation_count", "influential_citation_count"], 
    ascending=[False, False]
).head(25)

display(top_25_cited_df[["title", "year", "citation_count", "influential_citation_count"]])

Unnamed: 0,title,year,citation_count,influential_citation_count
315,Decision Transformer: Reinforcement Learning v...,2021.0,1747,301
1450,Reinforcement learning for demand response: A ...,2019.0,604,15
1317,Perceptual Learning Directs Auditory Cortical ...,2006.0,568,36
244,Reinforcement Knowledge Graph Reasoning for Ex...,2019.0,484,47
359,Social Influence as Intrinsic Motivation for M...,2018.0,462,50
1531,Working-memory capacity protects model-based l...,2013.0,432,34
628,Model-Free Real-Time EV Charging Scheduling Ba...,2019.0,393,23
1695,A Dynamic pricing demand response algorithm fo...,2018.0,360,20
1601,Deep Reinforcement Learning for Power System A...,2020.0,348,12
1846,Incentive-based demand response for smart grid...,2019.0,320,7


# 3. Key Collaborators

In [6]:
# --- Query 3: Top Co-Author Pairs ---
co_authors_query = """
// Find a paper that has at least two authors
MATCH (a1:Author)-[:AUTHORED]->(p:Paper)<-[:AUTHORED]-(a2:Author)
// Ensure we don't count the same pair twice (e.g., A-B and B-A)
WHERE id(a1) < id(a2)
RETURN a1.name AS author1, a2.name AS author2, count(p) AS collaborations
ORDER BY collaborations DESC
LIMIT 15
"""
print("\nFinding the top 15 collaborating author pairs...")
co_authors_df = query_to_dataframe(driver, co_authors_query)
display(co_authors_df)


Finding the top 15 collaborating author pairs...




Unnamed: 0,author1,author2,collaborations
0,Lucas Spangher,C. Spanos,10
1,Xiaoying Tang,Jie Liu,8
2,Qingyu Yang,Donghe Li,8
3,Sangyoon Lee,Dae-Hyun Choi,7
4,F. Ruelens,B. Claessens,7
5,C. Spanos,Utkarsha Agwan,6
6,Lucas Spangher,Utkarsha Agwan,6
7,Lucas Spangher,Doseok Jang,6
8,Zhiqiang Wan,Haibo He,6
9,Hepeng Li,Haibo He,6


# 4. Foundational Papers

In [7]:
# --- Query 4: Most Cited Papers within our Corpus ---
internal_citations_query = """
MATCH (p1:Paper)-[:CITES]->(p2:Paper)
RETURN p2.title AS foundational_paper, count(p1) AS citations_within_corpus
ORDER BY citations_within_corpus DESC
LIMIT 15
"""
print("\nFinding the 15 most cited papers by other papers in our corpus...")
internal_citations_df = query_to_dataframe(driver, internal_citations_query)
display(internal_citations_df)


Finding the 15 most cited papers by other papers in our corpus...


Unnamed: 0,foundational_paper,citations_within_corpus
0,Model-Free Real-Time EV Charging Scheduling Ba...,39
1,Reinforcement learning for demand response: A ...,31
2,Reinforcement Learning-Based Plug-in Electric ...,24
3,Incentive-based demand response for smart grid...,20
4,A Dynamic pricing demand response algorithm fo...,18
5,Optimal Demand Response Using Device-Based Rei...,18
6,CDDPG: A Deep-Reinforcement-Learning-Based App...,17
7,Reinforcement Learning for Real-Time Pricing a...,17
8,Definition and Evaluation of Model-Free Coordi...,16
9,Demand Response for Home Energy Management Usi...,16


# 5. Original Query Analysis

In [14]:
# --- Query 5: Paper count by original query ---
papers_by_query = """
MATCH (p:Paper)-[:FOUND_BY]->(q:Query)
RETURN q.name AS query, count(p) AS number_of_papers
ORDER BY number_of_papers DESC
"""
print("\nCounting papers found by each original query...")
papers_by_query_df = query_to_dataframe(driver, papers_by_query)
display(papers_by_query_df)


Counting papers found by each original query...


Unnamed: 0,query,number_of_papers
0,(Reinforcement Learning | Deep Reinforcement L...,955
1,Reinforcement Learning + (EV Charging | Smart ...,574
2,Causal Reinforcement Learning | Causal RL,403
3,(Interpretable Reinforcement Learning | Explai...,47
4,(Explainable AI | XAI) + (Smart Grid | Grid St...,37
5,Causal Inference + (Energy Systems | Power Grid),27
6,(Safe Reinforcement Learning | Robust Reinforc...,10
7,Causal Inference + (Vehicle-to-Grid | EV Charg...,2


In [15]:
# --- Query 6: Top papers for a specific query ---
top_papers_from_query = """
MATCH (p:Paper)-[:FOUND_BY]->(q:Query)
WHERE q.queryId = 'Q:causal-reinforcement-learning-causal-rl'
RETURN p.title AS title, p.citationCount AS citations
ORDER BY citations DESC
LIMIT 5
"""
print("\nFinding top 5 cited papers from the 'Causal RL' query...")
top_papers_from_query_df = query_to_dataframe(driver, top_papers_from_query)
display(top_papers_from_query_df)




Finding top 5 cited papers from the 'Causal RL' query...


Unnamed: 0,title,citations
0,Explainable Agency in Reinforcement Learning A...,
1,Causal prompting model-based offline reinforce...,
2,Learning Causal Overhypotheses through Explora...,
3,Segmented Encoding for Sim2Real of RL-based En...,
4,Applications of information Nonanticipative Ra...,


# 6. Analysis by Field of Study

In [16]:
# --- Query 7: Most common Fields of Study ---
papers_by_fos = """
MATCH (p:Paper)-[:HAS_FIELD]->(f:FieldOfStudy)
RETURN f.name AS field_of_study, count(p) AS number_of_papers
ORDER BY number_of_papers DESC
LIMIT 10
"""
print("\nFinding the top 10 most common fields of study...")
papers_by_fos_df = query_to_dataframe(driver, papers_by_fos)
display(papers_by_fos_df)


Finding the top 10 most common fields of study...


Unnamed: 0,field_of_study,number_of_papers
0,Computer Science,1083
1,Engineering,145
2,Medicine,127
3,Mathematics,83
4,Psychology,65
5,Physics,23
6,Biology,23
7,Economics,16
8,Business,11
9,Environmental Science,3


In [17]:
# --- Query 8: Top Fields of Study for a specific query ---
fos_by_query = """
MATCH (q:Query {queryId: 'Q:causal-reinforcement-learning-causal-rl'})<-[:FOUND_BY]-(p:Paper)-[:HAS_FIELD]->(f:FieldOfStudy)
RETURN f.name AS field_of_study, count(p) AS papers_in_field
ORDER BY papers_in_field DESC
"""
print("\nFinding the top fields of study for papers from the 'Causal RL' query...")
fos_by_query_df = query_to_dataframe(driver, fos_by_query)
display(fos_by_query_df)


Finding the top fields of study for papers from the 'Causal RL' query...


Unnamed: 0,field_of_study,papers_in_field
0,Computer Science,268
1,Mathematics,63
2,Medicine,45
3,Biology,16
4,Psychology,14
5,Engineering,12
6,Economics,5
7,Business,5
8,Physics,3
9,Art,1
