# 1. Setup and Imports

In [None]:
# Imports for data handling, API requests, and environment variables
import json
import os
import time
import xml.etree.ElementTree as ET

import pandas as pd
import requests
from dotenv import load_dotenv
from tqdm import tqdm

# Load variables from a local .env file into the process environment
# (the IEEE client below reads API_KEY_IEEE through os.getenv).
load_dotenv()

print("Libraries imported and environment loaded.")

Libraries imported and environment loaded.


# 2. Methodology and Objective
Traditional Systematic Literature Reviews (SLRs) are often labor-intensive, time-consuming, and prone to error, particularly in rapidly evolving research fields. Recent studies highlight significant gaps in the automation of critical phases such as data extraction, quality assessment, and data synthesis. To address these challenges, this study adopts a semi-automated pipeline that leverages Large Language Models (LLMs) and advanced Natural Language Processing (NLP) to enhance the efficiency and rigor of the review process.
The approach is directly informed by recently proposed frameworks like PROMPTHEUS, which demonstrate the feasibility of an end-to-end automated SLR pipeline. The methodology consists of three main stages.
- First, a **systematic search and screening** phase will use an LLM to generate a robust search query and Sentence-BERT embeddings to filter for the most relevant papers based on semantic similarity. 
- Second, a **data extraction and topic modeling** phase will employ the BERTopic algorithm to cluster the selected literature into coherent themes. This technique leverages UMAP for dimensionality reduction and HDBSCAN for clustering, providing a robust method for thematic analysis. 
- Finally, a **knowledge graph** will be constructed and analyzed to map the relationships between concepts, papers, and authors, providing a structural overview of the research landscape. This comprehensive, AI-assisted methodology aims to produce a systematic and data-driven analysis while significantly reducing the manual workload associated with traditional review methods.

# 3. Defining Search Queries

In [38]:
# Define the core search queries for our literature review on CRL for EV charging.
# Keys are short labels for each query; values are boolean search expressions
# (quoted phrases combined with AND / OR and parentheses).
# The fetch cell passes each value to fetch_arxiv_papers().
search_queries = {
    "crl_core": '"Causal Reinforcement Learning" OR "Causal RL"',
    "rl_for_ev": '"Reinforcement Learning" AND ("EV Charging" OR "Smart Charging")',
    "xai_for_grid": '("Explainable AI" OR "XAI") AND ("Smart Grid" OR "Grid Stability")',
    "causal_inference_energy": '"Causal Inference" AND ("Energy Systems" OR "Power Grid")',
    "broader_rl_ev": '("Reinforcement Learning" OR "Deep Reinforcement Learning") AND ("Vehicle-to-Grid" OR "V2G" OR "Demand Response")',
    "interpretable_rl_grid": '("Interpretable Reinforcement Learning" OR "Explainable Reinforcement Learning") AND ("Power Grid")',
    "causal_v2g": '("Causal Inference") AND ("Vehicle-to-Grid" OR "EV Charging")'
}

# The same seven queries, rewritten for the Semantic Scholar bulk-search syntax,
# where `+` means AND, `|` means OR and parentheses group terms.
# NOTE(review): multi-word terms are not wrapped in double quotes here, so they
# are presumably matched as individual words rather than exact phrases - confirm
# this is the intended behaviour.
semantic_scholar_search_queries = {
    "crl_core": 'Causal Reinforcement Learning | Causal RL',
    "rl_for_ev": 'Reinforcement Learning + (EV Charging | Smart Charging)',
    "xai_for_grid": '(Explainable AI | XAI) + (Smart Grid | Grid Stability)',
    "causal_inference_energy": 'Causal Inference + (Energy Systems | Power Grid)',
    "broader_rl_ev": '(Reinforcement Learning | Deep Reinforcement Learning) + (Vehicle-to-Grid | V2G | Demand Response)',
    # A stray double quote after "Explainable Reinforcement Learning" was removed:
    # it left an unbalanced phrase delimiter inside the query.
    "interpretable_rl_grid": '(Interpretable Reinforcement Learning | Explainable Reinforcement Learning) + (Power Grid)',
    "causal_v2g": 'Causal Inference + (Vehicle-to-Grid | EV Charging)'
}

# 4. API Clients

### ArXiv

In [88]:
def fetch_arxiv_papers(query, max_results=200):
    """
    Fetches paper metadata from the ArXiv API based on a search query.

    The query is sent as ``all:<query>`` and the Atom XML response is parsed
    into one dictionary per ``<entry>`` element.

    Args:
        query: Search expression, e.g. '"Causal RL" OR "Causal Inference"'.
        max_results: Maximum number of entries requested from the API.

    Returns:
        A list of dicts with the keys 'id', 'title', 'summary', 'authors',
        'year' and 'source'. An empty list is returned when the request fails
        or the response body is not well-formed XML.
    """
    # The base URL for the ArXiv API
    base_url = 'http://export.arxiv.org/api/query'

    # Let requests build the query string so that spaces, quotes and reserved
    # characters such as '&' or '+' in the search expression are URL-encoded.
    params = {
        'search_query': f'all:{query}',
        'start': 0,
        'max_results': max_results,
    }

    # The Atom XML namespace used by ArXiv
    namespace = {'atom': 'http://www.w3.org/2005/Atom'}

    def _text(element, tag):
        # findtext returns '' for a missing or empty element, so an entry
        # without e.g. a <summary> no longer raises AttributeError.
        return (element.findtext(tag, default='', namespaces=namespace) or '').strip()

    try:
        # A timeout keeps the notebook from hanging on a stalled connection
        response = requests.get(base_url, params=params, timeout=30)
        # Raises an exception for bad status codes (like 404 or 500)
        response.raise_for_status()
        # Parses the XML response from the request
        root = ET.fromstring(response.content)
    except requests.exceptions.RequestException as e:
        # Handles potential network errors
        print(f"An error occurred: {e}")
        return []
    except ET.ParseError as e:
        # Handles a body that is not valid XML (e.g. an HTML error page)
        print(f"Could not parse the ArXiv response as XML: {e}")
        return []

    papers = []
    # Each 'entry' tag in the feed corresponds to one paper
    for entry in root.findall('atom:entry', namespace):
        authors = [_text(author, 'atom:name') for author in entry.findall('atom:author', namespace)]
        papers.append({
            'id': _text(entry, 'atom:id'),
            'title': _text(entry, 'atom:title'),
            'summary': _text(entry, 'atom:summary'),
            'authors': authors,
            # NOTE(review): despite the key name this is the full timestamp
            # text of <updated> (falling back to <published>), not a bare
            # year; kept unchanged so existing consumers of the CSV still work.
            'year': _text(entry, 'atom:updated') or _text(entry, 'atom:published') or None,
            'source': 'arxiv'
        })

    return papers

### IEEE

In [17]:
def fetch_ieee_papers(query, max_total_records=500):
    """
    Fetches paper metadata from the IEEE Xplore API, handling pagination.
    It parses the JSON response and returns a list of dictionaries.

    Args:
        query: Search expression, sent as the 'meta_data' parameter.
        max_total_records: Upper bound on the number of papers returned.

    Returns:
        A list of dicts with the keys 'id', 'title', 'summary', 'authors',
        'year' and 'source'. The list is empty when no API key is configured
        or nothing was found, and holds the records collected so far when a
        request fails part-way through.
    """
    # Retrieves the API key from the environment variables
    api_key = os.getenv("API_KEY_IEEE")
    if not api_key:
        print("Error: IEEE API key not found in .env file.")
        return []
    if max_total_records <= 0:
        return []

    base_url = "https://ieeexploreapi.ieee.org/api/v1/search/articles"

    # Sets the headers to request JSON data
    headers = {'Accept': 'application/json'}

    # Pagination is controlled with 'max_records' (page size, at most 200 per
    # call) and 'start_record' (1-based offset of the first record to return).
    params = {
        'apikey': api_key,
        'meta_data': query,
        'max_records': min(200, max_total_records),
        'start_record': 1
    }

    papers = []
    total_records = None  # Unknown until the first successful response

    while len(papers) < max_total_records:
        try:
            response = requests.get(base_url, headers=headers, params=params, timeout=30)
            response.raise_for_status()
            data = response.json()
        except requests.exceptions.RequestException as e:
            print(f"An API error occurred: {e}")
            break
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass
            print(f"Error decoding JSON. Response content: {response.text}")
            break

        # On the first successful request, read the total number of matches
        if total_records is None:
            total_records = data.get('total_records', 0)
            if total_records == 0:
                break  # The query returned no results

        # The API returns a list of articles under the 'articles' key
        articles = data.get('articles') or []
        if not articles:
            break  # No more articles are returned

        for article in articles:
            # 'or' fallbacks guard against keys that are present but null
            author_entries = (article.get('authors') or {}).get('authors') or []
            papers.append({
                'id': article.get('doi') or article.get('article_number'),
                'title': (article.get('title') or '').strip(),
                'summary': (article.get('abstract') or '').strip(),
                'authors': [author.get('full_name') for author in author_entries],
                'year': article.get('publication_year'),
                'source': 'ieee'
            })

        # Stop once every available record has been fetched
        if len(papers) >= total_records:
            break

        # Advance the offset by the number of records actually received
        params['start_record'] += len(articles)

        # Respects the API rate limit (10 calls/sec)
        time.sleep(0.2)

    # Returns only the number of records requested by the user
    return papers[:max_total_records]

### Semantic Scholar

In [84]:
def fetch_paper_details(paper_ids):
    """
    Fetches detailed paper metadata, including abstracts, for a list of paper IDs
    using the /paper/batch endpoint.

    Args:
        paper_ids: List of Semantic Scholar paper IDs.

    Returns:
        A list with one metadata dict per paper the API returned. Null entries
        are dropped, and chunks whose request failed or whose response was not
        a JSON list are skipped, so the result may be shorter than the input.
    """
    # The endpoint for fetching details of multiple papers
    details_url = "https://api.semanticscholar.org/graph/v1/paper/batch"
    fields = "title,abstract,year,authors,citationCount,externalIds,paperId,tldr"

    # The API can handle up to 500 IDs per request
    chunk_size = 400  # Use a slightly smaller chunk size for safety

    detailed_papers = []

    # Process the paper IDs in chunks to respect API limits
    for start in range(0, len(paper_ids), chunk_size):
        if start:
            # Pause before every chunk after the first, including after a
            # failed one, so errors do not cause back-to-back requests.
            time.sleep(1)

        chunk = paper_ids[start:start + chunk_size]

        try:
            # This is a POST request, with the IDs sent in the JSON body
            response = requests.post(
                details_url,
                params={'fields': fields},
                json={'ids': chunk},
                timeout=60,
            )
            response.raise_for_status()
            data = response.json()
        except requests.exceptions.RequestException as e:
            print(f"An API error occurred during detail fetching: {e}")
            continue  # Continue to the next chunk
        except ValueError as e:
            print(f"Could not decode the detail response as JSON: {e}")
            continue

        # A successful batch call yields a JSON list; anything else (such as
        # an error object) must not be iterated as if it were papers.
        if not isinstance(data, list):
            print("Unexpected response shape during detail fetching; skipping chunk.")
            continue

        # Filter out any null responses which can occur if an ID is not found
        detailed_papers.extend(paper for paper in data if paper is not None)

    return detailed_papers

def fetch_semantic_scholar_papers_bulk(query, max_total_records=500):
    """
    Performs a two-step fetch from Semantic Scholar:
    1. Uses the bulk search to efficiently get a list of relevant paper IDs.
    2. Uses the batch details endpoint to retrieve full metadata, including abstracts.

    Args:
        query: Search expression in the bulk-search syntax (`+`, `|`, parentheses).
        max_total_records: Upper bound on the number of papers returned.

    Returns:
        A list of dicts with the keys 'id', 'doi', 'title', 'summary',
        'authors', 'citation_count', 'year' and 'source'. The list is empty
        when the search fails or matches nothing.
    """
    search_url = "https://api.semanticscholar.org/graph/v1/paper/search/bulk"

    # The query goes through `params` so it is URL-encoded. Building the URL by
    # hand leaves '+' (the AND operator) to be decoded as a space, and the
    # former literal single quotes around the query were sent as query text.
    # In the first step, we only need the paperId.
    query_params = {"query": query, "fields": "paperId"}

    # --- Step 1: Collect paper IDs, following the continuation token ---
    paper_ids = []
    while len(paper_ids) < max_total_records:
        try:
            response = requests.get(search_url, params=query_params, timeout=60)
            response.raise_for_status()
            data = response.json()
        except requests.exceptions.RequestException as e:
            print(f"An API error occurred during bulk search: {e}")
            break
        except ValueError as e:
            print(f"Could not decode the bulk search response as JSON: {e}")
            break

        articles = data.get('data') or []
        paper_ids.extend(article['paperId'] for article in articles if article.get('paperId'))

        # A 'token' in the response means more results are available
        token = data.get('token')
        if not articles or not token:
            break
        query_params['token'] = token
        time.sleep(0.5)

    # --- Step 2: Fetch full details for the collected IDs ---
    if not paper_ids:
        return []

    time.sleep(0.5)
    print(f"Found {len(paper_ids)} paper IDs. Now fetching details...")
    detailed_results = fetch_paper_details(paper_ids[:max_total_records])

    # Final processing to match our desired data structure
    final_papers = []
    for paper in detailed_results:
        final_papers.append({
            'id': paper.get('paperId'),
            # 'externalIds' and 'authors' may be null, hence the fallbacks
            'doi': (paper.get('externalIds') or {}).get('DOI'),
            'title': paper.get('title'),
            # Prefer the abstract, fall back to the TLDR text, then to ""
            'summary': (
                paper.get('abstract')
                or (paper.get('tldr') or {}).get('text')
                or ""
            ),
            'authors': [author.get('name') for author in (paper.get('authors') or [])],
            'citation_count': paper.get('citationCount'),
            'year': paper.get('year'),
            'source': 'semantic_scholar'
        })

    return final_papers

# 5. Fetch Data

In [None]:
# ===================================================================
# Section 1: Fetch and Save ArXiv Data
# ===================================================================
print("Fetching papers from ArXiv...")
arxiv_papers_list = []
for name, query in tqdm(search_queries.items()):
    arxiv_papers_list.extend(fetch_arxiv_papers(query, max_results=500))

# De-duplicate within the ArXiv results and save.
# The emptiness guard matters: a frame built from an empty list has no 'id'
# column, so drop_duplicates(subset='id') would raise a KeyError.
arxiv_df = pd.DataFrame(arxiv_papers_list)
if not arxiv_df.empty:
    # Papers matched by several queries share an id; keep the first occurrence
    arxiv_df = arxiv_df.drop_duplicates(subset='id', keep='first')
    # Create the output folder on first run so to_csv does not fail
    os.makedirs('./data/raw', exist_ok=True)
    arxiv_df.to_csv('./data/raw/raw_arxiv.csv', index=False)
    print(f"Saved {len(arxiv_df)} unique papers from ArXiv to data/raw/raw_arxiv.csv")
else:
    print("No papers were found from ArXiv.")


# ===================================================================
# Section 2: Fetch and Save IEEE Xplore Data
# ===================================================================
##### NO API KEY COULD BE ACQUIRED - LEFT OUT OF FINAL RESULTS ######
# NOTE(review): the disabled code below iterates `expanded_search_queries`,
# which is not defined in the cells shown here, and writes to '../data/raw/'
# while the ArXiv section writes to './data/raw/' - reconcile both before
# re-enabling this section.
# print("\nFetching papers from IEEE Xplore...")
# ieee_papers_list = []
# for name, query in tqdm(expanded_search_queries.items()):
#     papers = fetch_ieee_papers(query, max_total_records=500)
#     ieee_papers_list.extend(papers)
#
# ieee_df = pd.DataFrame(ieee_papers_list)
# ieee_df.drop_duplicates(subset='id', keep='first', inplace=True)
# ieee_df.to_csv('../data/raw/raw_ieee.csv', index=False)
# print(f"Saved {len(ieee_df)} unique papers from IEEE to data/raw/raw_ieee.csv")


# ===================================================================
# Section 3: Fetch and Save Semantic Scholar Data (Bulk Version)
# ===================================================================
# print(f"\nFetching from Semantic Scholar...")
# ss_papers_list = []
# for name, query in tqdm(semantic_scholar_search_queries.items()):
#     papers = fetch_semantic_scholar_papers_bulk(query, max_total_records=1000)
#     ss_papers_list.extend(papers)
#     time.sleep(1) 

# # De-duplicate within the Semantic Scholar results and save
# ss_df = pd.DataFrame(ss_papers_list)
# if not ss_df.empty:
#     ss_df.drop_duplicates(subset='id', keep='first', inplace=True)
#     ss_df.to_csv('./data/raw/raw_semantic_scholar.csv', index=False)
#     print(f"Saved {len(ss_df)} unique papers from Semantic Scholar to data/raw/raw_semantic_scholar.csv")
# else:
#     print("No papers were found from Semantic Scholar.")

Fetching papers from ArXiv...


100%|██████████| 7/7 [00:01<00:00,  5.07it/s]


Saved 85 unique papers from ArXiv to data/raw/raw_arxiv.csv
