### Setup and environment variables


In [None]:

# Import libraries
import os
import re
import json
import pandas as pd
from dotenv import load_dotenv
from bs4 import BeautifulSoup

from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_community.graphs import Neo4jGraph
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.docstore.document import Document
from langchain_core.messages import HumanMessage


# --- Configuration ---

# Pull settings from a local .env file into the process environment.
load_dotenv()

# Neo4j credentials (each one is None when the variable is unset).
NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD = (
    os.environ.get(name)
    for name in ("NEO4J_URI", "NEO4J_USERNAME", "NEO4J_PASSWORD")
)

# Google API key: mandatory, so stop immediately when it is missing or empty.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if GOOGLE_API_KEY is None or GOOGLE_API_KEY == "":
    raise ValueError("GOOGLE_API_KEY not found in environment variables.")
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY


# Dataset layout: everything lives under DATA_DIR.
DATA_DIR = "./Research Data set"
TEXT_FOLDER, TABLES_FOLDER, IMAGES_FILE, CSV_FILE = (
    os.path.join(DATA_DIR, leaf)
    for leaf in ("text", "tables_data", "images_data.json", "SB_publication_PMC.csv")
)

print("Setup complete. Ensure your .env file and data paths are correct.")

### Initializing connections

In [None]:
# --- Initialize Connections ---

# Neo4j graph handle, built from the credentials read during configuration.
graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD)

# Both Gemini clients are created with identical settings.
# NOTE(review): confirm "gemini-2.5-flash-latest" is a model id the Gemini API accepts.
_gemini_settings = {"model": "gemini-2.5-flash-latest", "temperature": 0}

# LLM that drives the graph transformation (a powerful model is key here).
llm = ChatGoogleGenerativeAI(**_gemini_settings)

# Multimodal LLM used for image analysis.
llm_vision = ChatGoogleGenerativeAI(**_gemini_settings)


# --- Define Graph Schema ---
# This is the prompt that instructs the LLMGraphTransformer
# It's based on our stakeholder-aware schema discussion.

# Imported here so this cell is self-contained; langchain_core is already a
# dependency of the notebook (see the HumanMessage import).
from langchain_core.prompts import ChatPromptTemplate

graph_creation_prompt = """
You are a brilliant NASA biologist and data scientist. Your task is to extract a knowledge graph from the provided research paper text.

1.  **Nodes**: Identify all relevant entities and classify them into one of the following categories:
    * `Paper`: The research paper itself.
    * `BioEntity`: Biological components like Genes, Proteins, Cell Types, Molecules.
    * `Concept`: Abstract ideas or processes like "Bone Loss", "Oxidative Stress", Health Risks, or Diseases.
    * `Stressor`: Environmental factors unique to space like "Microgravity", "Galactic Cosmic Rays".
    * `Organism`: The subject of the study (e.g., "Mus musculus", "Homo sapiens").
    * `MissionContext`: Missions, hardware, or facilities like "ISS Expedition 41", "Rodent Research-1".
    * `Application`: Potential real-world benefits like "Osteoporosis Treatment", "Cancer Therapy".
    * `Institution`: Organizations involved.

2.  **Relationships**: Identify the relationships between these entities. Use the following relationship types:
    * `AFFECTS`: The primary relationship for scientific findings. **Crucially, add an `effect` property** to describe the nature of the effect (e.g., 'upregulates', 'inhibits', 'causes', 'correlates_with'). Also add an `evidence` property with the text snippet that supports the finding.
    * `INVESTIGATES`: Connects a `Paper` to what it studies.
    * `STUDIED_IN`: Connects a finding or entity to the `Organism`.
    * `PART_OF`: Links research to a `MissionContext`.
    * `HAS_POTENTIAL`: Links a finding to an `Application`.
    * `AFFILIATED_WITH`: Connects a `Paper` to an `Institution`.

Provide the output as a list of graph nodes and a list of graph relationships. Do not add any nodes or relationships that are not explicitly mentioned in the text.
"""

# LLMGraphTransformer pipes the prompt into the LLM (`prompt | llm`), so it
# needs a prompt template object rather than a raw string, and it invokes the
# chain with an "input" variable carrying the document text.
# The system text above contains no curly braces; if any are added later they
# must be doubled ("{{" / "}}") to survive template formatting.
graph_extraction_template = ChatPromptTemplate.from_messages(
    [
        ("system", graph_creation_prompt),
        ("human", "Extract the knowledge graph from the following input: {input}"),
    ]
)

# Initialize the LLMGraphTransformer
# NOTE(review): the prompt asks for `effect` / `evidence` relationship
# properties; the transformer may need `relationship_properties` configured
# for those to appear in the output - confirm against the installed version.
transformer = LLMGraphTransformer(
    llm=llm,
    prompt=graph_extraction_template
)

print("Graph schema defined and connections initialized.")

### Unified context processing functions

In [None]:


def process_text_chunk(text_chunk, paper_node):
    """Extract a graph from a plain-text chunk and attach it to the paper.

    Blank or whitespace-only chunks are ignored. Otherwise the chunk is sent
    through the module-level ``transformer``, the resulting graph documents
    are written with ``graph.add_graph_documents``, and every extracted
    non-Paper node is linked to the paper with a ``MENTIONS`` relationship.

    Args:
        text_chunk: Raw text taken from the paper.
        paper_node: Dict describing the paper; ``paper_node['properties']['id']``
            is used as the Paper node id.
    """
    if not text_chunk.strip():
        return
    print(f"  Processing text chunk of {len(text_chunk)} chars...")
    document = Document(page_content=text_chunk)
    graph_documents = transformer.convert_to_graph_documents([document])
    # add_graph_documents has no `base` keyword; the link to the paper is
    # created explicitly below instead.
    graph.add_graph_documents(graph_documents)

    paper_id = paper_node['properties']['id']
    for doc in graph_documents:
        for node in doc.nodes:
            if node.type == 'Paper':
                continue
            # Labels cannot be query parameters, so quote the LLM-produced
            # label with backticks (and strip any it contains) before
            # interpolating it into the Cypher text.
            label = str(node.type).replace("`", "")
            graph.query(
                f"MERGE (p:Paper {{id: $paper_id}}) "
                f"MERGE (n:`{label}` {{id: $node_id}}) "
                "MERGE (p)-[:MENTIONS]->(n)",
                params={"paper_id": paper_id, "node_id": node.id},
            )

def process_table(table_id, context_text, paper_node):
    """Extract a graph from a table plus its context and record VisualEvidence.

    Reads ``<table_id>.html`` from ``TABLES_FOLDER``, combines it with the
    surrounding text into one prompt, runs the module-level ``transformer``
    and stores the result in ``graph``. For every extracted non-Paper node, a
    ``VisualEvidence`` node for the table is linked to it with ``ILLUSTRATES``
    and to the paper with ``HAS_EVIDENCE``.

    Args:
        table_id: Placeholder id of the table (also the HTML file stem).
        context_text: Text surrounding the table reference; stored as the
            caption of the VisualEvidence node.
        paper_node: Dict describing the paper; ``paper_node['properties']['id']``
            is used as the Paper node id.
    """
    print(f"  Processing table: {table_id}...")
    table_html_path = os.path.join(TABLES_FOLDER, f"{table_id}.html")
    if not os.path.exists(table_html_path):
        print(f"    [WARN] Table HTML file not found: {table_id}.html")
        return

    # Explicit encoding, matching how the paper text files are read.
    with open(table_html_path, 'r', encoding='utf-8') as f:
        table_html = f.read()

    # Create a rich prompt for the LLM
    table_prompt = f"""
    The following text is a snippet from a research paper that references a table:
    ---
    CONTEXT: "{context_text}"
    ---
    The full data for {table_id} is provided here in HTML format:
    ---
    TABLE DATA: "{table_html}"
    ---
    Based on BOTH the text context and the table data, extract all relevant scientific entities and their relationships.
    """
    document = Document(page_content=table_prompt)
    graph_documents = transformer.convert_to_graph_documents([document])
    # add_graph_documents has no `base` keyword; the paper is linked through
    # the VisualEvidence node created below.
    graph.add_graph_documents(graph_documents)

    paper_id = paper_node['properties']['id']
    for doc in graph_documents:
        for node in doc.nodes:
            # We only want to link to non-Paper nodes
            if node.type == 'Paper':
                continue
            # Labels cannot be query parameters, so quote the LLM-produced
            # label with backticks (and strip any it contains).
            label = str(node.type).replace("`", "")
            # The VisualEvidence node is merged on its id within the scope of
            # the paper, so repeated references to the same table (with a
            # different context window) reuse one node instead of creating a
            # duplicate per caption. The first caption seen is kept.
            cypher_query = f"""
            MERGE (p:Paper {{id: $paper_id}})
            MERGE (p)-[:HAS_EVIDENCE]->(v:VisualEvidence {{id: $table_id}})
            ON CREATE SET v.type = 'Table', v.content = $content, v.caption = $caption
            MERGE (c:`{label}` {{id: $concept_id}})
            MERGE (v)-[:ILLUSTRATES]->(c)
            """
            graph.query(cypher_query, params={
                "paper_id": paper_id,
                "table_id": table_id,
                "content": f"{table_id}.html",
                "caption": context_text,
                "concept_id": node.id
            })

def process_image(image_id, context_text, paper_node, image_url_map):
    """Describe an image with the vision model and record VisualEvidence.

    Looks up the image URL, asks the module-level ``llm_vision`` for a
    one-sentence finding based on the image and its caption context, then
    runs that sentence through ``transformer`` and stores the result in
    ``graph``. For every extracted non-Paper node, a ``VisualEvidence`` node
    for the image is linked to it with ``ILLUSTRATES`` and to the paper with
    ``HAS_EVIDENCE``.

    Args:
        image_id: Placeholder id of the image.
        context_text: Text surrounding the image reference; stored as the
            caption of the VisualEvidence node.
        paper_node: Dict describing the paper; ``paper_node['properties']['id']``
            is used as the Paper node id.
        image_url_map: Mapping from image id to image URL. Ids that are
            missing (or map to an empty value) are skipped with a warning.
    """
    print(f"  Processing image: {image_id}...")
    image_url = image_url_map.get(image_id)
    if not image_url:
        print(f"    [WARN] Image URL not found for: {image_id}")
        return

    # Construct the multimodal message for Gemini
    vision_prompt_text = f"""
    Analyze the following scientific image in the context of its caption from a research paper.
    ---
    CAPTION CONTEXT: "{context_text}"
    ---
    Based on the image at the provided URL and its caption, describe the primary scientific finding in one clear sentence.
    This sentence will be used to create knowledge graph relationships, so be precise and factual.
    """

    # Create the message payload with both text and image
    message = HumanMessage(
        content=[
            {"type": "text", "text": vision_prompt_text},
            {"type": "image_url", "image_url": image_url},
        ]
    )

    # Invoke the vision model
    response = llm_vision.invoke([message])
    finding_text = response.content
    # Message content may be a list of parts instead of a plain string;
    # flatten it so it can be used as Document.page_content.
    if isinstance(finding_text, list):
        finding_text = " ".join(
            part if isinstance(part, str) else str(part.get("text", ""))
            for part in finding_text
        ).strip()
    if not finding_text:
        return

    # Now, process this description to extract graph elements
    document = Document(page_content=finding_text)
    graph_documents = transformer.convert_to_graph_documents([document])
    # add_graph_documents has no `base` keyword; the paper is linked through
    # the VisualEvidence node created below.
    graph.add_graph_documents(graph_documents)

    paper_id = paper_node['properties']['id']
    for doc in graph_documents:
        for node in doc.nodes:
            # We only want to link to non-Paper nodes
            if node.type == 'Paper':
                continue
            # Labels cannot be query parameters, so quote the LLM-produced
            # label with backticks (and strip any it contains).
            label = str(node.type).replace("`", "")
            # The VisualEvidence node is merged on its id within the scope of
            # the paper, so repeated references to the same image reuse one
            # node instead of creating a duplicate per caption. The first
            # caption seen is kept.
            cypher_query = f"""
            MERGE (p:Paper {{id: $paper_id}})
            MERGE (p)-[:HAS_EVIDENCE]->(v:VisualEvidence {{id: $image_id}})
            ON CREATE SET v.type = 'Image', v.content = $content, v.caption = $caption
            MERGE (c:`{label}` {{id: $concept_id}})
            MERGE (v)-[:ILLUSTRATES]->(c)
            """
            graph.query(cypher_query, params={
                "paper_id": paper_id,
                "image_id": image_id,
                "content": image_url,
                "caption": context_text,
                "concept_id": node.id
            })

# Status message printed once the cell defining the processing functions has run.
print("Processing functions defined with actual vision model implementation.")

### Main Ingestion Loop

In [None]:
# --- Main Processing Loop ---

# 1. Load and pre-process supporting data for robust matching
try:
    with open(IMAGES_FILE, 'r', encoding='utf-8') as f:
        image_url_map = json.load(f)

    # Load the CSV
    papers_df = pd.read_csv(CSV_FILE)

    # Extract the PMC ID from the URL link. Assumes format like ".../PMC12345/"
    papers_df['pmc_id'] = papers_df['Link'].str.extract(r'(PMC\d+)', expand=False)

    # Drop rows with no extractable PMC ID or no title: they cannot provide
    # usable metadata, and a NaN title would otherwise reach the graph.
    papers_df = papers_df.dropna(subset=['pmc_id', 'Title'])

    # Dictionaries for fast lookup keyed by PMC ID. Both are built from the
    # same rows, so they always share the same key set.
    id_to_title_map = dict(zip(papers_df['pmc_id'], papers_df['Title']))
    id_to_url_map = dict(zip(papers_df['pmc_id'], papers_df['Link']))

    print("Supporting data loaded and pre-processed for direct PMC ID matching.")

except Exception as e:
    print(f"[ERROR] Could not load or process supporting data files: {e}")
    # The loop below cannot run without these maps, so stop here with the
    # real cause instead of failing later with a NameError.
    raise

# 2. Get list of text files to process (sorted so runs are reproducible)
text_files = sorted(f for f in os.listdir(TEXT_FOLDER) if f.endswith('.txt'))

# Regex to find all our placeholders for tables and images
media_pattern = re.compile(r'(img-[a-zA-Z0-9]+|table\d+)')

# 3. Iterate through each paper file
for filename in text_files:
    paper_id = os.path.splitext(filename)[0] # e.g., "PMC12345"
    print(f"\n--- Processing Paper: {paper_id} ---")

    # Look the metadata up by PMC ID. A missing entry is detected with None
    # rather than a sentinel string, so a real title can never be mistaken
    # for "not found".
    paper_title = id_to_title_map.get(paper_id)
    if paper_title is None:
        print(f"  [WARN] Metadata for {paper_id} not found in {CSV_FILE}. Skipping this paper.")
        continue
    paper_url = id_to_url_map[paper_id]

    # Create the central Paper node for this document
    graph.query(
        "MERGE (p:Paper {id: $id}) SET p.title = $title, p.url = $url",
        params={"id": paper_id, "title": paper_title, "url": paper_url}
    )
    paper_node = {"type": "Paper", "properties": {"id": paper_id}}

    file_path = os.path.join(TEXT_FOLDER, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        full_text = f.read()

    # Walk the text placeholder by placeholder: each stretch of text between
    # two placeholders is processed as a chunk, then the media item itself.
    last_end = 0
    for match in media_pattern.finditer(full_text):
        start, end = match.span()
        media_id = match.group(0)

        # Process the text chunk before this media item
        text_chunk = full_text[last_end:start]
        process_text_chunk(text_chunk, paper_node)

        # Process the media item itself
        # Grab ~300 chars of context around the ID for the caption
        context_start = max(0, start - 150)
        context_end = min(len(full_text), end + 150)
        context_text = full_text[context_start:context_end]

        if media_id.startswith('table'):
            process_table(media_id, context_text, paper_node)
        elif media_id.startswith('img-'):
            process_image(media_id, context_text, paper_node, image_url_map)

        last_end = end

    # Process any remaining text after the last media item
    remaining_text = full_text[last_end:]
    process_text_chunk(remaining_text, paper_node)

print("\n--- Ingestion Complete! ---")
print("Your Neo4j database is now populated with the knowledge graph.")