### Set up and environment variables


In [1]:
# Install necessary libraries
!pip install langchain langchain_experimental langchain_community langchain-google-genai neo4j python-dotenv beautifulsoup4 pandas

# Import libraries
import json
import os
import re
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from langchain.docstore.document import Document
from langchain_community.graphs import Neo4jGraph
from langchain_core.messages import HumanMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_google_genai import ChatGoogleGenerativeAI
# NOTE: `Neo4jGraph` is imported twice; the last import wins, so the name is
# bound to the `langchain_neo4j` implementation.
from langchain_neo4j import Neo4jGraph

# --- Configuration ---

# The project root is taken to be the parent of the kernel's working directory,
# i.e. the notebook is expected to run from a subfolder such as 'Model Tunning'.
ROOT_DIR = Path.cwd().parent
print(f"Project Root Directory identified as: {ROOT_DIR}")

# Construct absolute paths from the root directory
dotenv_path = ROOT_DIR / "Backend" / ".env"
DATA_DIR = ROOT_DIR / "data"

# Load environment variables from the .env file (announce the path first so the
# message is visible even if loading fails).
print(f"Attempting to load .env file from: {dotenv_path}")
if not dotenv_path.is_file():
    # Not fatal: the variables may already be present in the process environment.
    print(f"[WARN] .env file not found at: {dotenv_path}")
load_dotenv(dotenv_path=dotenv_path)

# Neo4j Credentials
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

# Google API Key
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Verification Step: fail fast, naming exactly which settings are missing.
if not GOOGLE_API_KEY:
    raise ValueError("GOOGLE_API_KEY not found. Check the path to your .env file and the variable name.")

_missing_neo4j = [
    var_name
    for var_name, var_value in (
        ("NEO4J_URI", NEO4J_URI),
        ("NEO4J_USERNAME", NEO4J_USERNAME),
        ("NEO4J_PASSWORD", NEO4J_PASSWORD),
    )
    if not var_value
]
if _missing_neo4j:
    raise ValueError(
        "NEO4J credentials not found. Check the path to your .env file. "
        f"Missing: {', '.join(_missing_neo4j)}"
    )

os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
print("Successfully loaded environment variables.")

# --- File Paths (constructed from the absolute DATA_DIR) ---
TEXT_FOLDER = DATA_DIR / "text"
TABLES_FOLDER = DATA_DIR / "tables_data"
IMAGES_FILE = DATA_DIR / "images_data.json"
CSV_FILE = DATA_DIR / "SB_publication_PMC.csv"

print("Setup complete. All paths are now absolute and robust.")

Project Root Directory identified as: d:\Astro-NOTS\Space-Biology-Knowledge-Engine
Attempting to load .env file from: d:\Astro-NOTS\Space-Biology-Knowledge-Engine\Backend\.env
Successfully loaded environment variables.
Setup complete. All paths are now absolute and robust.


### Initializing connections

In [2]:
# --- Initialize Connections ---
graph = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD
)

# NOTE(review): the Gemini API lists `gemini-2.5-flash` as the stable model id;
# `gemini-2.5-flash-latest` does not appear to be a documented alias — confirm
# against the current model list before changing this.
GEMINI_MODEL = "gemini-2.5-flash"

llm = ChatGoogleGenerativeAI(model=GEMINI_MODEL, temperature=0)
llm_vision = ChatGoogleGenerativeAI(model=GEMINI_MODEL, temperature=0)

# --- Define Graph Schema ---
graph_creation_prompt = """
You are a brilliant NASA biologist and data scientist. Your task is to extract a knowledge graph from the provided research paper text.

1.  **Nodes**: Identify all relevant entities and classify them into one of the following categories:
    * `Paper`: The research paper itself.
    * `BioEntity`: Biological components like Genes, Proteins, Cell Types, Molecules.
    * `Concept`: Abstract ideas or processes like "Bone Loss", "Oxidative Stress", Health Risks, or Diseases.
    * `Stressor`: Environmental factors unique to space like "Microgravity", "Galactic Cosmic Rays".
    * `Organism`: The subject of the study (e.g., "Mus musculus", "Homo sapiens").
    * `MissionContext`: Missions, hardware, or facilities like "ISS Expedition 41", "Rodent Research-1".
    * `Application`: Potential real-world benefits like "Osteoporosis Treatment", "Cancer Therapy".
    * `Institution`: Organizations involved.

2.  **Relationships**: Identify the relationships between these entities. Use the following relationship types:
    * `AFFECTS`: The primary relationship for scientific findings. **Crucially, add an `effect` property** to describe the nature of the effect (e.g., 'upregulates', 'inhibits', 'causes', 'correlates_with'). Also add an `evidence` property with the text snippet that supports the finding.
    * `INVESTIGATES`: Connects a `Paper` to what it studies.
    * `STUDIED_IN`: Connects a finding or entity to the `Organism`.
    * `PART_OF`: Links research to a `MissionContext`.
    * `HAS_POTENTIAL`: Links a finding to an `Application`.
    * `AFFILIATED_WITH`: Connects a `Paper` to an `Institution`.

Provide the output as a list of graph nodes and a list of graph relationships. Do not add any nodes or relationships that are not explicitly mentioned in the text.
"""

# LLMGraphTransformer composes `prompt | llm`, so `prompt` must be a prompt
# template (a plain string cannot be piped into a runnable), and the transformer
# feeds the document text through the `input` variable.
# Literal braces in the instructions are escaped so they are never parsed as
# template variables.
graph_prompt_template = ChatPromptTemplate.from_messages([
    ("system", graph_creation_prompt.replace("{", "{{").replace("}", "}}")),
    ("human", "Extract the knowledge graph from the following input:\n\n{input}"),
])

# NOTE(review): the prompt asks for `effect` / `evidence` relationship
# properties; the transformer presumably only returns properties when
# `relationship_properties` is configured — confirm and enable if required.
transformer = LLMGraphTransformer(
    llm=llm,
    prompt=graph_prompt_template
)

print("Graph schema defined and connections initialized.")

  graph = Neo4jGraph(
Unable to retrieve routing information


ValueError: Could not connect to Neo4j database. Please ensure that the url is correct

### Unified context processing functions

In [None]:
def process_text_chunk(text_chunk, paper_node):
    """Extract a knowledge graph from a plain-text chunk and store it in Neo4j.

    Args:
        text_chunk: Text taken from the paper. Empty or whitespace-only chunks
            are skipped without calling the LLM.
        paper_node: Descriptor of the paper the chunk belongs to. Kept for
            interface compatibility; it is not forwarded to Neo4j because
            `add_graph_documents` accepts no such argument.

    Returns:
        None. The extracted nodes and relationships are written to `graph`.
    """
    if not text_chunk.strip():
        return
    print(f"  Processing text chunk of {len(text_chunk)} chars...")
    document = Document(page_content=text_chunk)
    graph_documents = transformer.convert_to_graph_documents([document])
    # `add_graph_documents` only accepts `include_source` and `baseEntityLabel`
    # besides the documents; the former `base=` keyword raised a TypeError.
    graph.add_graph_documents(graph_documents)

def process_table(pmc_id, table_id_local, context_text, paper_node):
    """Extract graph data from a CSV table plus its surrounding text and store it.

    The table `<pmc_id>_<table_id_local>.csv` is read from `TABLES_FOLDER`,
    rendered as Markdown and sent to the graph transformer together with
    `context_text`. Every extracted non-`Paper` node is then linked to a
    `VisualEvidence` node for the table, which in turn is linked to the paper.

    Args:
        pmc_id: Identifier of the paper; also the `Paper` node id.
        table_id_local: Table placeholder found in the paper text (e.g. 'table001').
        context_text: Text surrounding the placeholder; stored as the caption.
        paper_node: Descriptor of the paper. Kept for interface compatibility;
            it is not forwarded to Neo4j because `add_graph_documents` accepts
            no such argument.

    Returns:
        None. Missing or unparsable CSV files are reported and skipped.
    """
    table_filename = f"{pmc_id}_{table_id_local}.csv"
    print(f"  Processing table: {table_filename}...")
    table_csv_path = os.path.join(TABLES_FOLDER, table_filename)
    if not os.path.exists(table_csv_path):
        print(f"    [WARN] Table CSV file not found: {table_filename}")
        return

    # Convert CSV to a string format for the LLM
    try:
        df = pd.read_csv(table_csv_path)
        table_string = df.to_markdown(index=False)
    except Exception as e:
        print(f"    [ERROR] Could not parse CSV {table_filename}: {e}")
        return

    table_prompt = f"""
    CONTEXT: "{context_text}"
    ---
    TABLE DATA for {table_id_local} (in Markdown format):
    {table_string}
    ---
    Based on BOTH the text context and the table data, extract all relevant scientific entities and their relationships.
    """
    document = Document(page_content=table_prompt)
    graph_documents = transformer.convert_to_graph_documents([document])
    # `add_graph_documents` has no `base` keyword; passing it raised a TypeError.
    graph.add_graph_documents(graph_documents)

    # Create the globally unique VisualEvidence node for the table
    unique_table_id = f"{pmc_id}_{table_id_local}"
    for doc in graph_documents:
        for node in doc.nodes:
            # Graph-document nodes expose their label as `.type`.
            if node.type == 'Paper':
                continue
            # Labels cannot be Cypher parameters, so the LLM-provided label is
            # backtick-quoted (embedded backticks doubled). This keeps labels
            # with spaces valid and prevents injection through the label.
            quoted_label = str(node.type).replace("`", "``")
            # MERGE on the id only, so one table maps to exactly one
            # VisualEvidence node even when it is referenced with different
            # surrounding context; the first caption seen is kept.
            graph.query("""
            MERGE (p:Paper {id: $pmc_id})
            MERGE (v:VisualEvidence {id: $unique_id})
            ON CREATE SET v.type = 'Table', v.content = $filename, v.caption = $caption
            MERGE (c:`%s` {id: $concept_id})
            MERGE (v)-[:ILLUSTRATES]->(c)
            MERGE (p)-[:HAS_EVIDENCE]->(v)
            """ % quoted_label, params={
                "pmc_id": pmc_id, "unique_id": unique_table_id,
                "filename": table_filename, "caption": context_text,
                "concept_id": node.id
            })

def process_image(pmc_id, image_id_local, context_text, paper_node, image_url_map):
    """Summarise an image with the vision model and store the finding in the graph.

    The image URL is looked up as `image_url_map[pmc_id][image_id_local]`. The
    vision model turns image plus caption context into a one-sentence finding,
    which is passed through the graph transformer. Every extracted non-`Paper`
    node is linked to a `VisualEvidence` node for the image, which in turn is
    linked to the paper.

    Args:
        pmc_id: Identifier of the paper; also the `Paper` node id.
        image_id_local: Image placeholder found in the paper text (e.g. 'Img001').
        context_text: Text surrounding the placeholder; stored as the caption.
        paper_node: Descriptor of the paper. Kept for interface compatibility;
            it is not forwarded to Neo4j because `add_graph_documents` accepts
            no such argument.
        image_url_map: Nested mapping `{pmc_id: {image_id: url}}`.

    Returns:
        None. Images without a known URL or without a textual finding are skipped.
    """
    print(f"  Processing image: {image_id_local} for paper {pmc_id}...")
    try:
        image_url = image_url_map[pmc_id][image_id_local]
    except KeyError:
        print(f"    [WARN] Image URL not found for {pmc_id} -> {image_id_local}")
        return

    vision_prompt_text = f"""
    CAPTION CONTEXT: "{context_text}"
    ---
    Based on the image at the provided URL and its caption, describe the primary scientific finding in one clear sentence. This sentence will be used to create knowledge graph relationships, so be precise and factual.
    """
    message = HumanMessage(content=[
        {"type": "text", "text": vision_prompt_text},
        {"type": "image_url", "image_url": image_url},
    ])

    response = llm_vision.invoke([message])
    finding_text = response.content
    if isinstance(finding_text, list):
        # Message content may be a list of parts (strings or dicts) instead of
        # a single string; keep only the textual parts.
        finding_text = " ".join(
            part if isinstance(part, str) else str(part.get("text", ""))
            for part in finding_text
            if isinstance(part, (str, dict))
        ).strip()

    if not finding_text:
        return

    document = Document(page_content=finding_text)
    graph_documents = transformer.convert_to_graph_documents([document])
    # `add_graph_documents` has no `base` keyword; passing it raised a TypeError.
    graph.add_graph_documents(graph_documents)

    unique_image_id = f"{pmc_id}_{image_id_local}"
    for doc in graph_documents:
        for node in doc.nodes:
            # Graph-document nodes expose their label as `.type`.
            if node.type == 'Paper':
                continue
            # Labels cannot be Cypher parameters, so the LLM-provided label is
            # backtick-quoted (embedded backticks doubled). This keeps labels
            # with spaces valid and prevents injection through the label.
            quoted_label = str(node.type).replace("`", "``")
            # MERGE on the id only, so one image maps to exactly one
            # VisualEvidence node; the first caption seen is kept.
            graph.query("""
            MERGE (p:Paper {id: $pmc_id})
            MERGE (v:VisualEvidence {id: $unique_id})
            ON CREATE SET v.type = 'Image', v.content = $url, v.caption = $caption
            MERGE (c:`%s` {id: $concept_id})
            MERGE (v)-[:ILLUSTRATES]->(c)
            MERGE (p)-[:HAS_EVIDENCE]->(v)
            """ % quoted_label, params={
                "pmc_id": pmc_id, "unique_id": unique_image_id,
                "url": image_url, "caption": context_text,
                "concept_id": node.id
            })

# Status message printed once the cell defining the processing helpers has run.
print("Processing functions updated for new data structure.")

### Main Ingestion Loop

In [None]:
# --- Main Processing Loop ---

# Number of characters taken on each side of a table/image placeholder as context.
CONTEXT_WINDOW_CHARS = 250

# 1. Load and Pre-process Supporting Data
try:
    # JSON is read as UTF-8 explicitly so the platform's default encoding
    # cannot change how the file is decoded.
    with open(IMAGES_FILE, 'r', encoding='utf-8') as f:
        image_url_map = json.load(f)
    papers_df = pd.read_csv(CSV_FILE)
    papers_df['pmc_id'] = papers_df['Link'].str.extract(r'(PMC\d+)', expand=False)
    papers_df = papers_df.dropna(subset=['pmc_id'])
    id_to_title_map = pd.Series(papers_df.Title.values, index=papers_df.pmc_id).to_dict()
    id_to_url_map = pd.Series(papers_df.Link.values, index=papers_df.pmc_id).to_dict()
    print("Supporting data loaded and pre-processed.")
except Exception as e:
    print(f"[ERROR] Could not load or process supporting data files: {e}")
    # Re-raise: everything below depends on these maps, so continuing would
    # only fail later with an unrelated NameError.
    raise

# 2. Get list of text files to process (sorted so runs are reproducible;
#    os.listdir returns entries in arbitrary order)
text_files = sorted(f for f in os.listdir(TEXT_FOLDER) if f.endswith('.txt'))

# Regex to find all placeholders. Matches 'table001' and 'Img001' formats.
media_pattern = re.compile(r'(table\d+|Img\d+)')

# 3. Iterate through each paper file
for filename in text_files:
    pmc_id = os.path.splitext(filename)[0]
    print(f"\n--- Processing Paper: {pmc_id} ---")

    # Test membership directly instead of searching the title for a sentinel
    # string: a real title may contain "Not Found", and a missing (NaN) title
    # is not a string at all.
    if pmc_id not in id_to_title_map:
        print(f"  [WARN] Metadata for {pmc_id} not found in CSV. Skipping.")
        continue

    paper_title = id_to_title_map[pmc_id]
    paper_url = id_to_url_map.get(pmc_id, "URL Not Found")

    graph.query(
        "MERGE (p:Paper {id: $id}) SET p.title = $title, p.url = $url",
        params={"id": pmc_id, "title": paper_title, "url": paper_url}
    )
    paper_node = {"type": "Paper", "properties": {"id": pmc_id}}

    file_path = os.path.join(TEXT_FOLDER, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        full_text = f.read()

    # Walk the placeholders in order: the text between two placeholders is
    # processed as plain text, then the placeholder itself as a table or image.
    last_end = 0
    for match in media_pattern.finditer(full_text):
        start, end = match.span()
        media_id_local = match.group(0)

        text_chunk = full_text[last_end:start]
        process_text_chunk(text_chunk, paper_node)

        context_start = max(0, start - CONTEXT_WINDOW_CHARS)
        context_end = min(len(full_text), end + CONTEXT_WINDOW_CHARS)
        context_text = full_text[context_start:context_end]

        if media_id_local.startswith('table'):
            process_table(pmc_id, media_id_local, context_text, paper_node)
        elif media_id_local.startswith('Img'):
            process_image(pmc_id, media_id_local, context_text, paper_node, image_url_map)

        last_end = end

    # Text after the last placeholder (or the whole paper if it has none).
    remaining_text = full_text[last_end:]
    process_text_chunk(remaining_text, paper_node)

print("\n--- Ingestion Complete! ---")