## Final Project: Query-Driven Retrieval-Augmented Graph Exploration Tool
By Karl Simon

### Step 1: Load the dataset into PyG (PyTorch Geometric)

In [1]:
# Necessary imports for entire notebook
import json
import torch
import re
from IPython.display import display, HTML
from torch_geometric.data import HeteroData
from collections import defaultdict
from torch_geometric.utils import k_hop_subgraph
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import requests
import os
# from openai import OpenAI
import google.generativeai as genai
import random
from scholarly import scholarly
from bs4 import BeautifulSoup


MAX_VAL_LEN = 1000 # max text length for input to LLM from graph_results for each node


# Load JSON data from file
file_path = "/home/karlsimon/CSCI6365/final/graph.json"
graph_data = []

# Load data line by line to prevent memory overload
with open(file_path, "r") as f:
    for line in f:
        try:
            graph_data.append(json.loads(line))
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON line: {e}")
            continue

# Initialize HeteroData object
data = HeteroData()

# Mapping for node indices per node type
node_mappings = defaultdict(dict)

# Temporary storage for properties
node_properties = defaultdict(lambda: defaultdict(list))
edge_indices = defaultdict(lambda: defaultdict(list))

# # Define limits for node subsets based on type
# node_limits = {
#     'Publication': 1000,
#     'Dataset': 500,
#     'ScienceKeyword': 300,
#     'Instrument': 200,
#     'Platform': 150,
#     'Project': 100,
#     'DataCenter': 50
# }

# Track the number of nodes added per type
node_counts = defaultdict(int)

# Process nodes with limits
for item in graph_data:
    if item['type'] == 'node':
        node_type = item['labels'][0]
        # if node_counts[node_type] >= node_limits.get(node_type, 50):
        #     continue  # Skip nodes once the limit is reached

        node_id = item['id']
        properties = item['properties']

        # Store the node index mapping
        node_index = len(node_mappings[node_type])
        node_mappings[node_type][node_id] = node_index
        node_counts[node_type] += 1

        # Store properties temporarily by type
        for key, value in properties.items():
            if isinstance(value, list) and all(isinstance(v, (int, float)) for v in value):
                node_properties[node_type][key].append(torch.tensor(value, dtype=torch.float))
            elif isinstance(value, (int, float)):
                node_properties[node_type][key].append(torch.tensor([value], dtype=torch.float))
            else:
                node_properties[node_type][key].append(value)  # non-numeric properties as lists

# # Define limits for relationships based on type
# relationship_limits = {
#     'CITES': 2000,
#     'HAS_APPLIED_RESEARCH_AREA': 1000,
#     'HAS_SCIENCEKEYWORD': 500,
#     'HAS_PLATFORM': 500,
#     'HAS_DATASET': 500,
#     'OF_PROJECT': 300,
#     'HAS_INSTRUMENT': 200
# }

# Track the number of relationships added per type
relationship_counts = defaultdict(int)

# Filter relationships to only include sampled nodes
for item in graph_data:
    if item['type'] == 'relationship':
        start_type = item['start']['labels'][0]
        end_type = item['end']['labels'][0]
        start_id = item['start']['id']
        end_id = item['end']['id']
        edge_type = item['label']

        # # Skip if relationship limit reached
        # if relationship_counts[edge_type] >= relationship_limits.get(edge_type, 100):
        #     continue

        # Check if start and end nodes exist in the sampled nodes
        if start_id in node_mappings[start_type] and end_id in node_mappings[end_type]:
            start_idx = node_mappings[start_type][start_id]
            end_idx = node_mappings[end_type][end_id]

            # Append to edge list
            edge_indices[(start_type, edge_type, end_type)]['start'].append(start_idx)
            edge_indices[(start_type, edge_type, end_type)]['end'].append(end_idx)
            relationship_counts[edge_type] += 1

# Finalize node properties by batch processing
for node_type, properties in node_properties.items():
    data[node_type].num_nodes = len(node_mappings[node_type])
    for key, values in properties.items():
        if isinstance(values[0], torch.Tensor):
            data[node_type][key] = torch.stack(values)
        else:
            data[node_type][key] = values  # Keep non-tensor properties as lists

# Finalize edge indices in bulk
for (start_type, edge_type, end_type), indices in edge_indices.items():
    edge_index = torch.tensor([indices['start'], indices['end']], dtype=torch.long)
    data[start_type, edge_type, end_type].edge_index = edge_index

# Display statistics for verification
print("Nodes and Properties:")
for node_type in data.node_types:
    print(f"\nNode Type: {node_type}")
    print(f"Number of Nodes: {data[node_type].num_nodes}")
    for key, value in data[node_type].items():
        if key != 'num_nodes':
            if isinstance(value, torch.Tensor):
                print(f"  - {key}: {value.shape}")
            else:
                print(f"  - {key}: {len(value)} items (non-numeric)")

print("\nEdges and Types:")
for edge_type in data.edge_types:
    edge_index = data[edge_type].edge_index
    print(f"Edge Type: {edge_type} - Number of Edges: {edge_index.size(1)} - Shape: {edge_index.shape}")


Nodes and Properties:

Node Type: Dataset
Number of Nodes: 6390
  - temporalExtentStart: 6375 items (non-numeric)
  - seCorner: 5330 items (non-numeric)
  - cmrId: 6390 items (non-numeric)
  - globalId: 6390 items (non-numeric)
  - fastrp_embedding_with_labels: torch.Size([6390, 512])
  - abstract: 6390 items (non-numeric)
  - daac: 6131 items (non-numeric)
  - nwCorner: 5330 items (non-numeric)
  - temporalFrequency: 6390 items (non-numeric)
  - pagerank_global: torch.Size([6390, 1])
  - temporalExtentEnd: 3765 items (non-numeric)
  - shortName: 6390 items (non-numeric)
  - landingPageUrl: 3037 items (non-numeric)
  - doi: 6390 items (non-numeric)
  - longName: 6390 items (non-numeric)

Node Type: DataCenter
Number of Nodes: 184
  - pagerank_global: torch.Size([184, 1])
  - globalId: 184 items (non-numeric)
  - fastrp_embedding_with_labels: torch.Size([184, 512])
  - shortName: 184 items (non-numeric)
  - url: 184 items (non-numeric)
  - longName: 184 items (non-numeric)

Node Type: P

### Step 2.1 : Search Graph for nodes based on user query

In [2]:
# Next Steps:
# 1. improve graph search and rank results.
# 2. improve subgraph exploration.
# 3. improve external context retrieval (NASA API).

# Functions definitions for keywords, search and display used in next cell
def extract_keywords(query):
    keywords = re.findall(r'\b\w+\b', query)
    return [kw.lower() for kw in keywords]

# Updated search_graph function with TF-IDF scoring
# TODO: make max_per_type specific to each node type
def search_graph(data, keywords, node_types=['Dataset', 'Project', 'ScienceKeyword', 'Instrument', 'Platform', 'Publication'], max_results=50, max_per_type=10):
    results = []
    texts = []  # Collect text data for TF-IDF processing
    metadata = []  # To store corresponding metadata (node type, index, key, value)

    # Step 1: Collect all matching nodes and their text data
    for node_type in node_types:
        for key in data[node_type]:
            if key == 'num_nodes':
                continue
            
            values = data[node_type][key]
            if isinstance(values, list):
                for idx, value in enumerate(values):
                    value_str = str(value).lower()
                    if any(kw in value_str for kw in keywords):
                        texts.append(value_str)
                        metadata.append((node_type, idx, key, value))

    if not texts:
        return []

    # Step 2: Compute TF-IDF scores for the collected texts
    # NOTE: texts stores the properties of the nodes which contain the keywords
    vectorizer = TfidfVectorizer(vocabulary=keywords)
    tfidf_matrix = vectorizer.fit_transform(texts)
    scores = tfidf_matrix.sum(axis=1).A1  # Sum the TF-IDF scores for each text

    # Step 3: Sort the results by TF-IDF score in descending order
    sorted_indices = np.argsort(scores)[::-1]
    sorted_results = [metadata[i] for i in sorted_indices]
    # with open("sorted_results.txt", "w") as file:
    #     for result in sorted_results:
    #         file.write(f"{result}\n")

    # Step 4: Limit the number of results overall and per node type
    final_results = []
    counts_per_type = {node_type: 0 for node_type in node_types}

    for result in sorted_results:
        node_type = result[0]
        if len(final_results) >= max_results:
            break
        if counts_per_type[node_type] < max_per_type:
            final_results.append(result)
            counts_per_type[node_type] += 1

    # write the 50 final_results to a file
    print("Writing 50 final_results to file")
    with open("final_results.txt", "w") as file:
        for result in final_results:
            file.write(f"{result}\n")

    return final_results

# Updated display_results function to trim long values
def display_results(results, max_value_length=MAX_VAL_LEN):
    if not results:
        print("No relevant nodes found.")
        return

    with open("query_results.txt", "w") as file:
        print(f"\nFound {len(results)} relevant nodes:\n")
        for node_type, idx, key, value in results:
            value_str = str(value)
            if len(value_str) > max_value_length:
                value_str = value_str[:max_value_length] + "..."
            output_line = f"Node Type: {node_type} | Index: {idx} | Property: {key} | Value: {value_str}\n"
            file.write(output_line)


In [3]:
# Given query, extract keywords, search the graph for relevant nodes, and display the results
# NOTE: currently only searches for exact keyword matches in node properties

def get_subgraph(data, node_type, node_indices, num_hops=2):
    # Find all edge types where the node_type is either the source or target
    relevant_edges = [
        (src, rel, dst) for (src, rel, dst) in data.edge_types if src == node_type or dst == node_type
    ]
    
    print("relevant_edges = ", relevant_edges)

    if not relevant_edges:
        print(f"No edges found for node type '{node_type}'")
        return None, None, None

    # Combine edge indices from all relevant edge types
    combined_edge_index = []
    combined_edge_types = []

    for edge_type in relevant_edges:
        edge_index = data[edge_type].edge_index
        combined_edge_index.append(edge_index)
        combined_edge_types.append(edge_type)

    # Stack all edge indices into a single tensor
    combined_edge_index = torch.cat(combined_edge_index, dim=1)

    # Extract the subgraph using the combined edge index
    subset, edge_index, _, _ = k_hop_subgraph(node_idx=node_indices, num_hops=num_hops, edge_index=combined_edge_index)
    return subset, edge_index, combined_edge_types


# Explore subgraphs based on the search results.
def explore_subgraphs(data, results, num_hops=2):
    if not results:
        print("No nodes to explore for subgraphs.")
        return

    # Group the results by node type
    nodes_by_type = defaultdict(list)
    for node_type, idx, _, _ in results:
        nodes_by_type[node_type].append(idx)

    # Extract and display subgraphs for each node type
    for node_type, indices in nodes_by_type.items():
        print(f"\nExploring subgraph for node type: {node_type}")
        # print(f"Number of nodes: {len(indices)}") #10 nodes
        # Get the valid range for node indices
        num_nodes = data[node_type].num_nodes
        valid_indices = [idx for idx in indices if idx < num_nodes]

        if not valid_indices:
            print(f"No valid indices for node type '{node_type}'.")
            continue

        node_indices = torch.tensor(valid_indices[:10])  # Limit to 10 nodes (only using 10 per node_type anyways for now)
        print(f"Exploring subgraph for node indices: {node_indices}") # may not be sequential due to search results ordering 
        subset, edge_index, edge_type = get_subgraph(data, node_type, node_indices, num_hops=num_hops)

        if subset is not None and edge_index is not None:
            print(f"Extracted subgraph with {len(subset)} nodes and {edge_index.size(1)} edges.")
            print(f"Edge Type: {edge_type}")
        else:
            print(f"Could not extract subgraph for node type: {node_type}")

### Step 2.2 : Use APIs Wikipedia for external information based on user query
- Question: should API be queries based on keywords or the extracted graph nodes from keywords?

In [4]:
# Get external context from Wikipedia using the REST API
def fetch_wikipedia_context(keywords):
    search_term = " ".join(keywords)
    
    # Step 1: Use the Action API to get the top 5 search results
    search_url = "https://en.wikipedia.org/w/api.php"
    search_params = {
        "action": "query",
        "list": "search",
        "srsearch": search_term,
        "srlimit": 5,
        "format": "json"
    }
    
    headers = {
        "User-Agent": "GraphExplorationTool/1.0 (ksimon24@gwu.edu)"
    }
    
    try:
        search_response = requests.get(search_url, params=search_params, headers=headers)
        search_response.raise_for_status()
        search_data = search_response.json()
        
        search_results = search_data.get("query", {}).get("search", [])
        
        if not search_results:
            return None

        # Step 2: Fetch summaries using the REST API for each search result
        context_list = []
        for result in search_results:
            print("result = ", result)
            page_title = result.get("title")
            rest_url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{page_title.replace(' ', '_')}"
            
            rest_response = requests.get(rest_url, headers=headers)
            rest_response.raise_for_status()
            rest_data = rest_response.json()
            
            # Extract relevant information
            title = rest_data.get("title", "No Title")
            description = rest_data.get("description", "No Description Available.")
            summary = rest_data.get("extract", "No Summary Available.")
            link = rest_data.get("content_urls", {}).get("desktop", {}).get("page", "No Link Available.")
            thumbnail = rest_data.get("thumbnail", {}).get("source", None)
            
            context_entry = {
                "title": title,
                "description": description,
                "summary": summary,
                "link": link,
                "thumbnail": thumbnail
            }
            context_list.append(context_entry)
        
        return context_list

    except requests.RequestException as e:
        print(f"Error fetching Wikipedia context: {e}")
        return None
    
def display_wikipedia_context(context_list):
    if not context_list:
        print("\nNo external context available from Wikipedia.")
        return

    print("\nWikipedia Context:")
    for i, context in enumerate(context_list, start=1):
        print(f"\nResult {i}:")
        print(f"Title: {context['title']}")
        print(f"Description: {context['description']}")
        print(f"Summary: {context['summary']}")
        print(f"Link: {context['link']}")
        if context['thumbnail']:
            print(f"Thumbnail: {context['thumbnail']}")


In [5]:
# ################ Pre-LLM Steps ################
# query = input("Enter your query (e.g., 'Find datasets related to climate change projects'): ")
query = "climate change" #TODO: remove hardcoded query
keywords = extract_keywords(query)
print(f"\nExtracted Keywords: {keywords}")

# Search the graph with TF-IDF ranking
graph_results = search_graph(data, keywords)
display_results(graph_results) # 50 results

# Fetch Wikipedia context
wikipedia_context = fetch_wikipedia_context(keywords)
display_wikipedia_context(wikipedia_context)

# Explore subgraphs based on the results
# TODO: save these subgraphs
explore_subgraphs(data, graph_results)



Extracted Keywords: ['climate', 'change']
Writing 50 final_results to file

Found 50 relevant nodes:

result =  {'ns': 0, 'title': 'Climate change', 'pageid': 5042951, 'size': 317341, 'wordcount': 27919, 'snippet': 'Present-day <span class="searchmatch">climate</span> <span class="searchmatch">change</span> includes both global warming—the ongoing increase in global average temperature—and its wider effects on Earth\'s <span class="searchmatch">climate</span>. <span class="searchmatch">Climate</span> change', 'timestamp': '2024-12-10T03:05:32Z'}
result =  {'ns': 0, 'title': 'Climate change denial', 'pageid': 12474403, 'size': 237127, 'wordcount': 22133, 'snippet': '<span class="searchmatch">Climate</span> <span class="searchmatch">change</span> denial (also global warming denial) is a form of science denial characterized by rejecting, refusing to acknowledge, disputing, or fighting', 'timestamp': '2024-12-04T00:54:58Z'}
result =  {'ns': 0, 'title': 'Climate change mitigation', 'pageid

### Begin the RAG Pipeline with LLM Summary

In [6]:
############ Gemini API ############
# Define the path to the text file containing the API key
file_path = "/home/karlsimon/CSCI6365/final/gemini_api_key.txt"
with open(file_path, "r") as file:
    api_key = file.read().strip()
# print(api_key)
genai.configure(api_key=api_key)

# Create a model instance (using Gemini 1.5 Flash in this case)
model = genai.GenerativeModel('gemini-1.5-flash-latest')


In [7]:
# Function to summarize combined results using the LLM
def summarize_results_with_llm(graph_results, wikipedia_context):
    prompt = "Summarize the following search results and Wikipedia context:\n\n"

    # Add graph results to the prompt
    prompt += "Graph Search Results:\n"
    for node_type, idx, key, value in graph_results[:10]:  # Limit to top 10 results
        prompt += f"- Node Type: {node_type}, Property: {key}, Value: {str(value)[:MAX_VAL_LEN]}...\n"

    # Add Wikipedia context to the prompt
    prompt += "\nWikipedia Context:\n"
    for i, context in enumerate(wikipedia_context, start=1):
        prompt += f"{i}. Title: {context['title']}\n"
        prompt += f"   Summary: {context['summary'][:MAX_VAL_LEN]}...\n"
    
    with open("prompt_file.txt", "w") as file:
        file.write(f"{prompt}\n")

    # Call the Gemini model to generate the summary
    response = model.generate_content(prompt)
    return response.text

# Generate an LLM summary of the combined results
summary = summarize_results_with_llm(graph_results, wikipedia_context)
print("\nLLM-Generated Summary:")
print(summary)


LLM-Generated Summary:
The provided text comprises abstracts from several research papers investigating diverse aspects of climate change and its impacts.  The studies cover a wide range of topics including:

* **Impact on Evapotranspiration (ET):** One study assesses the spatial and temporal variations in ET in the Narmada river basin (India) using SEBAL and predicts future changes based on land use and climate change models (ACCESS1-0 and Markov Chain).

* **Climate Change and Forest Fires:** Another study examines the relationship between climate change (using GRACE data) and forest fires in Yunnan province, China, analyzing the spatiotemporal distribution of fires and their correlation with hydrological and climatic factors.

* **Climate Change Impacts on Marine Ecosystems:**  Research investigates the effect of climate change-induced alterations in prey quality (fatty acid composition) on juvenile Chinook salmon, focusing on their nutritional condition and growth.

* **Stratosphe

### Step 3: Use subgraph for additional information and display

In [8]:

# Explore subgraphs based on the results (NOT LLM dependent, just for output to terminal)
explore_subgraphs(data, graph_results)

# TODO: change metrics to evaluate the LLM generated explanations
# Generate LLM explanations for each explored subgraph

# print("length of graph_results[:3]: ", len(graph_results)) # len(graph_results) = 50, and len(graph_results[:3]) = 3
# print("graph_results[:3]: ", graph_results[:3])

for node_type, idx, key, value in graph_results[11:12]:  # Limit to 3 nodes for brevity
    # Get subgraph for the current node type and indices
    subset, edge_index, edge_types = get_subgraph(data, node_type, torch.tensor([idx]))
    print(f"subset = ", subset, ", edge_index = ", edge_index, ", edge_types = ", edge_types, "and node_type = ", node_type)



Exploring subgraph for node type: Publication
Exploring subgraph for node indices: tensor([76606, 94077, 50763, 55702, 88548, 42689, 82757, 12919, 60848, 35890])
relevant_edges =  [('Publication', 'CITES', 'Publication'), ('Publication', 'HAS_APPLIED_RESEARCH_AREA', 'ScienceKeyword')]
Extracted subgraph with 18 nodes and 8 edges.
Edge Type: [('Publication', 'CITES', 'Publication'), ('Publication', 'HAS_APPLIED_RESEARCH_AREA', 'ScienceKeyword')]

Exploring subgraph for node type: Dataset
Exploring subgraph for node indices: tensor([4329, 2323,   97,   61,   65, 2131, 4180, 3752,  293, 4253])
relevant_edges =  [('DataCenter', 'HAS_DATASET', 'Dataset'), ('Dataset', 'OF_PROJECT', 'Project'), ('Dataset', 'HAS_PLATFORM', 'Platform'), ('Dataset', 'HAS_SCIENCEKEYWORD', 'ScienceKeyword')]
Extracted subgraph with 2263 nodes and 16366 edges.
Edge Type: [('DataCenter', 'HAS_DATASET', 'Dataset'), ('Dataset', 'OF_PROJECT', 'Project'), ('Dataset', 'HAS_PLATFORM', 'Platform'), ('Dataset', 'HAS_SCIENC

In [9]:
import random

# specify the priorities to use in value selection
def get_priority_properties():
    priority_properties = {
        'Dataset': ['longName', 'abstract', 'shortName'],
        'Publication': ['title', 'abstract'],
        'ScienceKeyword': ['name'],
        'Instrument': ['longName', 'shortName'],
        'Platform': ['longName', 'shortName'],
        'Project': ['longName', 'shortName'],
        'DataCenter': ['longName', 'shortName']
    }
    return priority_properties

def create_nodes_of_interest(graph_results, max_per_type=3):
    nodes_by_type = defaultdict(list)
    for node_type, idx, key, value in graph_results:
        nodes_by_type[node_type].append((idx, node_type, key, value))

    # Select up to max_per_type nodes for each type
    nodes_of_interest = []
    for node_type, nodes in nodes_by_type.items():
        nodes_of_interest.extend(random.sample(nodes, min(max_per_type, len(nodes))))

    return nodes_of_interest

def explore_subgraph_nodes(data, node_type, node_id, num_hops=2, max_per_type=3):
    priority_properties = get_priority_properties()
    subset, edge_index, edge_types = get_subgraph(data, node_type, torch.tensor([node_id]), num_hops=num_hops)

    if subset is None:
        return []

    # Map node indices to their types and values
    subgraph_nodes = []
    print(f"Number of nodes in subgraph for node_id: {node_id} = {len(subset)} | subset = {subset}")

    for sub_id in subset.tolist():
        for sub_node_type in data.node_types:
            num_nodes = data[sub_node_type].num_nodes
            if sub_id < num_nodes:
                # Attempt to find a meaningful property
                value = None
                for prop in priority_properties.get(sub_node_type, []):
                    if prop in data[sub_node_type] and len(data[sub_node_type][prop]) > sub_id:
                        value = data[sub_node_type][prop][sub_id]
                        break
                if value is None:  # Fallback to globalId or indicate no value
                    value = data[sub_node_type].get('globalId', ['No value'])[sub_id] if 'globalId' in data[sub_node_type] else 'No value'
                
                subgraph_nodes.append((sub_id, sub_node_type, value))

    # Group by node type and select a random subset of up to max_per_type nodes
    nodes_by_type = defaultdict(list)
    for node_id, node_type, value in subgraph_nodes:
        nodes_by_type[node_type].append((node_id, node_type, value))

    exploration_list = []
    for node_type, nodes in nodes_by_type.items():
        exploration_list.extend(random.sample(nodes, min(max_per_type, len(nodes))))

    return exploration_list

def write_exploration_to_file(data, graph_results, filename="graph_exploration.txt"):
    nodes_of_interest = create_nodes_of_interest(graph_results)

    with open(filename, "w") as file:
        file.write("=== Nodes of Interest ===\n")
        for idx, node_type, key, value in nodes_of_interest:
            value_str = str(value)
            display_value = value_str[:MAX_VAL_LEN] + ("..." if len(value_str) > MAX_VAL_LEN else "")
            file.write(f"ID: {idx}, Type: {node_type}, Key: {key}, Value: {display_value}\n")

        file.write("\n=== Subgraph Exploration ===\n")
        for idx, node_type, key, value in nodes_of_interest:
            file.write(f"\nExploring Subgraph for Node ID: {idx} (Type: {node_type})\n")
            subgraph_nodes = explore_subgraph_nodes(data, node_type, idx)
            for sub_id, sub_node_type, sub_value in subgraph_nodes:
                sub_value_str = str(sub_value)
                display_sub_value = sub_value_str[:MAX_VAL_LEN] + ("..." if len(sub_value_str) > MAX_VAL_LEN else "")
                file.write(f"  - ID: {sub_id}, Type: {sub_node_type}, Value: {display_sub_value}\n")

    print(f"\nExploration results written to '{filename}'.")


def interactive_exploration(data):
    priority_properties = get_priority_properties()
    while True:
        choice = input("\nEnter a Node ID to explore further (or 'q' to quit): ")
        if choice.lower() == 'q':
            break

        try:
            node_id = int(choice)
            node_type = input("Enter the Node Type (e.g., Dataset, ScienceKeyword, Instrument): ").strip()

            # Validate the node type
            if node_type not in data.node_types:
                print(f"Invalid node type '{node_type}'. Available types: {data.node_types}")
                continue

            num_nodes = data[node_type].num_nodes
            if node_id >= num_nodes:
                print(f"No node with ID: {node_id} in type '{node_type}'.")
                continue

            print(f"\nSelected Node ID: {node_id} (Type: {node_type})")
            action = input("Enter 'wiki' to fetch Wikipedia context or 'subgraph' to explore subgraph of node: ").lower()

            if action == 'wiki':
                # Select a meaningful property using priority_properties
                value = None
                for prop in priority_properties.get(node_type, []):
                    if prop in data[node_type] and len(data[node_type][prop]) > node_id:
                        value = data[node_type][prop][node_id]
                        break

                if value is None:
                    value = 'No value'

                wikipedia_context = fetch_wikipedia_context([str(value)])
                print("The prompt used for the Wikipedia context =", str(value))
                display_wikipedia_context(wikipedia_context)

            elif action == 'subgraph':
                subgraph_nodes = explore_subgraph_nodes(data, node_type, node_id)
                print(f"\nSubgraph for Node ID: {node_id} (Type: {node_type})")
                for sub_id, sub_node_type, sub_value in subgraph_nodes:
                    sub_value_str = str(sub_value)
                    display_sub_value = sub_value_str[:MAX_VAL_LEN] + ("..." if len(sub_value_str) > MAX_VAL_LEN else "")
                    print(f"  - ID: {sub_id}, Type: {sub_node_type}, Value: {display_sub_value}")

            else:
                print("Invalid action. Please enter 'wiki' or 'subgraph'.")

        except ValueError:
            print("Invalid Node ID. Please enter a valid number.")

def run_exploration_tool(data, graph_results):
    # Write initial exploration to file
    write_exploration_to_file(data, graph_results)

    # Start interactive exploration
    interactive_exploration(data)


In [10]:
# Assumes `graph_results` contains the 50 search results
run_exploration_tool(data, graph_results)

relevant_edges =  [('Publication', 'CITES', 'Publication'), ('Publication', 'HAS_APPLIED_RESEARCH_AREA', 'ScienceKeyword')]
Number of nodes in subgraph for node_id: 60848 = 1 | subset = tensor([60848])
relevant_edges =  [('Publication', 'CITES', 'Publication'), ('Publication', 'HAS_APPLIED_RESEARCH_AREA', 'ScienceKeyword')]
Number of nodes in subgraph for node_id: 76606 = 1 | subset = tensor([76606])
relevant_edges =  [('Publication', 'CITES', 'Publication'), ('Publication', 'HAS_APPLIED_RESEARCH_AREA', 'ScienceKeyword')]
Number of nodes in subgraph for node_id: 42689 = 1 | subset = tensor([42689])
relevant_edges =  [('DataCenter', 'HAS_DATASET', 'Dataset'), ('Dataset', 'OF_PROJECT', 'Project'), ('Dataset', 'HAS_PLATFORM', 'Platform'), ('Dataset', 'HAS_SCIENCEKEYWORD', 'ScienceKeyword')]
Number of nodes in subgraph for node_id: 3752 = 10 | subset = tensor([   6,   54,  492,  497,  500,  778,  940,  967, 1107, 3752])
relevant_edges =  [('DataCenter', 'HAS_DATASET', 'Dataset'), ('Dataset

In [11]:
# See what kinds of edge types exist for a given node
def check_node_connections(data, node_type, node_idx):
    connections = []
    for edge_type in data.edge_types:
        edge_index = data[edge_type].edge_index
        if node_idx in edge_index[0] or node_idx in edge_index[1]:
            connections.append(edge_type)
    return connections

node_idx = 76606
node_type = 'Publication'
connections = check_node_connections(data, node_type, node_idx)
print(f"Connections for node {node_idx} of type '{node_type}': {connections}")


Connections for node 76606 of type 'Publication': [('Publication', 'CITES', 'Publication'), ('Publication', 'HAS_APPLIED_RESEARCH_AREA', 'ScienceKeyword')]


In [13]:
# Test scholarly PiPy package
from scholarly import scholarly

def fetch_scholar_info(query):
    try:
        search_query = scholarly.search_pubs(query)
        for i in range(5):  # Get top 3 results
            paper = next(search_query)
            print(f"Title: {paper['bib']['title']}")
            print(f"Abstract: {paper['bib'].get('abstract', 'No abstract available')}")
            print(f"URL: {paper.get('pub_url', 'No URL available')}\n")
    except StopIteration:
        print("No results found on Google Scholar.")
    except Exception as e:
        print(f"Error fetching data from Google Scholar: {e}")

# Example usage
query = "AIRS Aqua CO2 free troposphere"
fetch_scholar_info(query)


Title: The Technique Analysis of CO2 in Troposphere using AIRS
Abstract: CO2 pollutants (in this study on troposphere layer) and the data used are derived AIRS which  is  The result from the analysis is CO2 profile obtained from AIRS/Aqua L3 Monthly CO2
URL: http://sunankalijaga.org/prosiding/index.php/icse/article/view/282

Title: Seven years of observations of mid-tropospheric CO2 from the Atmospheric Infrared Sounder
Abstract: 1, is a hyperspectral infrared instrument on the EOS Aqua Spacecraft, launched on May 4,   We are finding that the AIRS mid-tropospheric CO 2 is a good indicator of vertical motion in
URL: https://www.sciencedirect.com/science/article/pii/S0094576511001457

Title: Midtropospheric CO2 concentration retrieval from AIRS observations in the tropics
Abstract: Atmospheric Infrared Sounder (AIRS), launched onboard the NASA's Aqua platform in May   sensitive to CO 2 and well covering the mid-to-high troposphere. Also flying onboard Aqua,
URL: https://agupubs.onlinelib

### Notes:

#### Known Issues:
- Duplicates in the dataset are not manually removed
- Using WIKIPEDIA as external resource. NASA APIs are very specific and not generalizeable to the specifc user queries