### Setup and environment variables


In [1]:
# Install necessary libraries
!pip install langchain langchain_experimental langchain_community langchain-google-genai neo4j python-dotenv beautifulsoup4 pandas tabulate
# Import libraries
import os
import re
import json
import pandas as pd
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from pathlib import Path
from langchain_core.messages import HumanMessage
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_community.graphs import Neo4jGraph
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# --- Configuration ---

# Define the project's absolute root directory.
# Assumes the notebook's working directory is a direct subfolder of the project
# root (e.g. 'Model Tunning'), so the parent of the working directory is the root.
ROOT_DIR = Path.cwd().parent
print(f"Project Root Directory identified as: {ROOT_DIR}")

# Construct absolute paths from the root directory
dotenv_path = ROOT_DIR / "Backend" / ".env"
DATA_DIR = ROOT_DIR / "Research Data set"

# Load environment variables from the correct, absolute path
print(f"Attempting to load .env file from: {dotenv_path}")
if not dotenv_path.is_file():
    # Not fatal: the variables may already be present in the process environment.
    print(f"[WARN] .env file does not exist at: {dotenv_path}")
load_dotenv(dotenv_path=dotenv_path)

# Neo4j Credentials
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

# Google API Key
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Verification Step: fail fast, and name exactly which variables are missing,
# instead of failing later when the connections are created.
if not GOOGLE_API_KEY:
    raise ValueError("GOOGLE_API_KEY not found. Check the path to your .env file and the variable name.")
_missing_neo4j_vars = [
    var_name
    for var_name, var_value in (
        ("NEO4J_URI", NEO4J_URI),
        ("NEO4J_USERNAME", NEO4J_USERNAME),
        ("NEO4J_PASSWORD", NEO4J_PASSWORD),
    )
    if not var_value
]
if _missing_neo4j_vars:
    raise ValueError(
        "NEO4J credentials not found (missing: "
        + ", ".join(_missing_neo4j_vars)
        + "). Check the path to your .env file."
    )

os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
print("Successfully loaded environment variables.")

# --- File Paths (now constructed from the absolute DATA_DIR) ---
TEXT_FOLDER = DATA_DIR / "text"
TABLES_FOLDER = DATA_DIR / "tables_data"
IMAGES_FILE = DATA_DIR / "images_data.json"
CSV_FILE = DATA_DIR / "SB_publication_PMC.csv"

print("Setup complete. All paths are now absolute and robust.")

Project Root Directory identified as: c:\Users\Akshit Aggarwal\Space-Biology-Knowledge-Engine
Attempting to load .env file from: c:\Users\Akshit Aggarwal\Space-Biology-Knowledge-Engine\Backend\.env
Successfully loaded environment variables.
Setup complete. All paths are now absolute and robust.


### Initializing connections

In [2]:
from langchain_core.prompts import ChatPromptTemplate

# --- Initialize Connections ---
# Connection to the Neo4j database, using the credentials read from the environment.
graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD)

# Both chat models share one configuration; the explicit request timeout keeps a
# stalled API call from blocking the run indefinitely.
_GEMINI_SETTINGS = {
    "model": "gemini-2.5-flash",
    "temperature": 0,
    "request_timeout": 120,
}
llm = ChatGoogleGenerativeAI(**_GEMINI_SETTINGS)
llm_vision = ChatGoogleGenerativeAI(**_GEMINI_SETTINGS)

# --- Define Graph Schema ---
# NOTE(review): the node/relationship vocabulary is communicated through this prompt
# only; the transformer below receives just the LLM and the prompt. Confirm that this
# is sufficient for the 'effect' / 'evidence' properties to be captured.
system_prompt = """
You are a brilliant NASA biologist and data scientist. Your task is to extract a knowledge graph from the provided research paper text.

1.  **Nodes**: Identify all relevant entities. For each entity, provide a unique 'id' (its name) and a 'type' from the following categories:
    * `Paper`, `BioEntity`, `Concept`, `Stressor`, `Organism`, `MissionContext`, `Application`, `Institution`.

2.  **Relationships**: Identify the relationships between these entities. Use the following relationship types:
    * `AFFECTS`: Primary relationship for findings. Add an 'effect' property (e.g., 'upregulates', 'inhibits') and an 'evidence' property with the supporting text snippet.
    * `INVESTIGATES`, `STUDIED_IN`, `PART_OF`, `HAS_POTENTIAL`, `AFFILIATED_WITH`.

Provide the output as a list of graph nodes and a list of graph relationships. Do not add any nodes or relationships that are not explicitly mentioned in the text.
"""

# System instructions first, then the text to analyse as the human turn.
_prompt_messages = [
    ("system", system_prompt),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(_prompt_messages)

transformer = LLMGraphTransformer(llm=llm, prompt=prompt)

print("Graph schema defined and connections initialized.")

  graph = Neo4jGraph(


Graph schema defined and connections initialized.


### Unified context processing functions

In [3]:
def process_and_enrich_graph(graph_documents, base_node):
    """Store extracted graph documents and link their nodes to the base Paper node.

    Args:
        graph_documents: Sequence of extracted graph documents. Each item must
            expose a ``nodes`` iterable whose entries have ``id`` and ``type``.
        base_node: Mapping describing the paper; ``base_node['properties']['id']``
            is used as the Paper id.

    Returns:
        None. Does nothing when ``graph_documents`` is empty.
    """
    if not graph_documents:
        return

    # Add the extracted nodes and relationships to the graph
    graph.add_graph_documents(graph_documents)

    # Batch-link all new non-Paper nodes to the base Paper node.
    paper_id = base_node['properties']['id']
    # dict.fromkeys drops repeated ids while keeping first-seen order, so each id
    # is matched once by the query instead of once per occurrence.
    node_ids_to_link = list(dict.fromkeys(
        node.id
        for doc in graph_documents
        for node in doc.nodes
        if node.type != 'Paper'
    ))

    if node_ids_to_link:
        graph.query("""
        MATCH (p:Paper {id: $paper_id})
        UNWIND $node_ids AS node_id
        MATCH (n) WHERE n.id = node_id
        MERGE (p)-[:MENTIONS]->(n)
        """, params={"paper_id": paper_id, "node_ids": node_ids_to_link})

def process_text_chunk(text_chunk, paper_node):
    """Extract a graph from one chunk of text and attach it to the given paper.

    Chunks that are empty or contain only whitespace are ignored.
    """
    if text_chunk.strip():
        print(f"  Processing text chunk of {len(text_chunk)} chars...")
        extracted = transformer.convert_to_graph_documents(
            [Document(page_content=text_chunk)]
        )
        process_and_enrich_graph(extracted, paper_node)

def process_table(pmc_id, table_id_local, context_text, paper_node):
    """Extract a graph from one table of a paper and register it as visual evidence.

    The table is read from ``TABLES_FOLDER/<pmc_id>_<table_id_local>.csv``, rendered
    as markdown, and passed to the graph transformer together with the surrounding
    text. The extracted nodes are linked to the paper, and a ``VisualEvidence`` node
    for the table is linked to every extracted non-Paper node.

    Args:
        pmc_id: Paper identifier; also the ``id`` of the Paper node.
        table_id_local: Table identifier within the paper (e.g. ``table1``).
        context_text: Text surrounding the table reference; stored as the caption.
        paper_node: Mapping describing the paper, forwarded to
            ``process_and_enrich_graph``.

    Returns:
        None. Returns early when the CSV file is missing or cannot be parsed.
    """
    table_filename = f"{pmc_id}_{table_id_local}.csv"
    print(f"  Processing table: {table_filename}...")
    table_csv_path = os.path.join(TABLES_FOLDER, table_filename)
    if not os.path.exists(table_csv_path):
        # Report the skip, mirroring the warning emitted for missing image URLs.
        print(f"    [WARN] Table file not found: {table_filename}")
        return

    try:
        table_string = pd.read_csv(table_csv_path).to_markdown(index=False)
    except Exception as e:
        print(f"    [ERROR] Could not parse CSV {table_filename}: {e}")
        return

    document = Document(page_content=f"CONTEXT: \"{context_text}\"\n---\nTABLE DATA:\n{table_string}")
    graph_documents = transformer.convert_to_graph_documents([document])
    process_and_enrich_graph(graph_documents, paper_node)

    # Link Visual Evidence in a single query
    unique_table_id = f"{pmc_id}_{table_id_local}"
    # Ordered de-duplication: one ILLUSTRATES lookup per distinct id.
    concept_ids = list(dict.fromkeys(
        node.id
        for doc in graph_documents
        for node in doc.nodes
        if node.type != 'Paper'
    ))
    if concept_ids:
        # MERGE on the id alone. Merging on the full property map (including the
        # caption) would create a second VisualEvidence node with the same id
        # whenever the table is referenced again with different surrounding text.
        # The caption of the first reference is kept.
        graph.query("""
        MATCH (p:Paper {id: $pmc_id})
        MERGE (v:VisualEvidence {id: $unique_id})
        ON CREATE SET v.type = 'Table', v.content = $filename, v.caption = $caption
        MERGE (p)-[:HAS_EVIDENCE]->(v)
        WITH v, $concept_ids AS concept_ids
        UNWIND concept_ids AS concept_id
        MATCH (c) WHERE c.id = concept_id
        MERGE (v)-[:ILLUSTRATES]->(c)
        """, params={
            "pmc_id": pmc_id, "unique_id": unique_table_id,
            "filename": table_filename, "caption": context_text,
            "concept_ids": concept_ids
        })

def process_image(pmc_id, image_id_local, context_text, paper_node, image_url_map):
    """Describe one image with the vision model and add the finding to the graph.

    The image URL is looked up in ``image_url_map[pmc_id][image_id_local]``. The
    vision model is asked for the primary finding; that description is passed to the
    graph transformer, the extracted nodes are linked to the paper, and a
    ``VisualEvidence`` node for the image is linked to every extracted non-Paper node.

    Args:
        pmc_id: Paper identifier; also the ``id`` of the Paper node.
        image_id_local: Image identifier within the paper (e.g. ``Img001``).
        context_text: Text surrounding the image reference; sent to the model as
            caption context and stored as the caption.
        paper_node: Mapping describing the paper, forwarded to
            ``process_and_enrich_graph``.
        image_url_map: Nested mapping ``{pmc_id: {image_id: url}}``.

    Returns:
        None. Returns early when no URL is known or the model returns no text.
    """
    print(f"  Processing image: {image_id_local} for paper {pmc_id}...")
    try:
        image_url = image_url_map[pmc_id][image_id_local]
    except KeyError:
        print(f"    [WARN] Image URL not found for {pmc_id} -> {image_id_local}")
        return

    message = HumanMessage(content=[
        {"type": "text", "text": f"CAPTION CONTEXT: \"{context_text}\"\n---\nDescribe the primary scientific finding from the image."},
        {"type": "image_url", "image_url": image_url},
    ])
    response = llm_vision.invoke([message])
    finding_text = response.content

    # Message content can be a plain string or a list of parts (strings or dicts
    # carrying a "text" entry); flatten the list form into one string.
    if isinstance(finding_text, list):
        finding_text = "".join(
            part if isinstance(part, str) else str(part.get("text", ""))
            for part in finding_text
            if isinstance(part, (str, dict))
        )

    # Skip empty or whitespace-only findings, as process_text_chunk does for text.
    if not finding_text or not finding_text.strip():
        return

    document = Document(page_content=finding_text)
    graph_documents = transformer.convert_to_graph_documents([document])
    process_and_enrich_graph(graph_documents, paper_node)

    unique_image_id = f"{pmc_id}_{image_id_local}"
    # Ordered de-duplication: one ILLUSTRATES lookup per distinct id.
    concept_ids = list(dict.fromkeys(
        node.id
        for doc in graph_documents
        for node in doc.nodes
        if node.type != 'Paper'
    ))
    if concept_ids:
        # MERGE on the id alone. Merging on the full property map (including the
        # caption) would create a second VisualEvidence node with the same id
        # whenever the image is referenced again with different surrounding text.
        # The caption of the first reference is kept.
        graph.query("""
        MATCH (p:Paper {id: $pmc_id})
        MERGE (v:VisualEvidence {id: $unique_id})
        ON CREATE SET v.type = 'Image', v.content = $url, v.caption = $caption
        MERGE (p)-[:HAS_EVIDENCE]->(v)
        WITH v, $concept_ids AS concept_ids
        UNWIND concept_ids AS concept_id
        MATCH (c) WHERE c.id = concept_id
        MERGE (v)-[:ILLUSTRATES]->(c)
        """, params={
            "pmc_id": pmc_id, "unique_id": unique_image_id,
            "url": image_url, "caption": context_text,
            "concept_ids": concept_ids
        })

# Status line confirming that this cell ran and the helper functions above are defined.
print("Processing functions optimized for performance and correct syntax.")

Processing functions optimized for performance and correct syntax.


### Main Ingestion Loop

In [None]:
# --- Main Processing Loop ---

# Upper bound on the number of papers ingested in this (sprint) run.
MAX_PAPERS = 20

try:
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(IMAGES_FILE, 'r', encoding='utf-8') as f:
        image_url_map = json.load(f)
    papers_df = pd.read_csv(CSV_FILE)
    papers_df['pmc_id'] = papers_df['Link'].str.extract(r'(PMC\d+)', expand=False)
    papers_df = papers_df.dropna(subset=['pmc_id'])
    id_to_title_map = pd.Series(papers_df.Title.values, index=papers_df.pmc_id).to_dict()
    id_to_url_map = pd.Series(papers_df.Link.values, index=papers_df.pmc_id).to_dict()
    print("Supporting data loaded and pre-processed.")
except Exception as e:
    print(f"[ERROR] Could not load or process supporting data files: {e}")
    # The loop below needs these lookups; continuing would only fail later with a
    # less helpful NameError.
    raise

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=200
)

# os.listdir returns entries in arbitrary order; sort so that the MAX_PAPERS subset
# is the same on every run.
text_files = sorted(f for f in os.listdir(TEXT_FOLDER) if f.endswith('.txt'))
# Placeholders in the text that mark a table or image reference.
media_pattern = re.compile(r'(table\d+|Img\d+)')


def _ingest_text_span(text, paper_node):
    """Split a stretch of plain text into chunks and ingest each chunk."""
    for chunk in text_splitter.split_text(text):
        process_text_chunk(chunk, paper_node)


# Processing a subset of MAX_PAPERS papers for the sprint run.
for filename in text_files[:MAX_PAPERS]:
    pmc_id = os.path.splitext(filename)[0]
    print(f"\n--- Processing Paper: {pmc_id} ---")

    # --- Cleanup of this paper's previous contributions ---
    # Removes only the VisualEvidence nodes and MENTIONS links attached to this
    # paper, leaving shared knowledge nodes intact. The two queries run separately
    # (not as one transaction); a failure is reported and ingestion continues.
    try:
        print(f"  Ensuring a clean slate for {pmc_id}...")
        graph.query("""
        MATCH (p:Paper {id: $pmc_id})
        OPTIONAL MATCH (p)-[:HAS_EVIDENCE]->(v:VisualEvidence)
        DETACH DELETE v
        """, params={"pmc_id": pmc_id})
        graph.query("""
        MATCH (p:Paper {id: $pmc_id})-[r:MENTIONS]->(n)
        DELETE r
        """, params={"pmc_id": pmc_id})
    except Exception as e:
        print(f"  [WARN] Could not perform cleanup for {pmc_id}. Error: {e}")

    paper_title = id_to_title_map.get(pmc_id)
    paper_url = id_to_url_map.get(pmc_id, "URL Not Found")

    # A missing id yields None and an empty CSV cell yields NaN; neither is a usable
    # title. Checking the type (rather than searching the title for "Not Found")
    # also keeps papers whose real title happens to contain that phrase.
    if not isinstance(paper_title, str):
        print(f"  [WARN] Metadata for {pmc_id} not found in CSV. Skipping.")
        continue

    graph.query(
        "MERGE (p:Paper {id: $id}) SET p.title = $title, p.url = $url",
        params={"id": pmc_id, "title": paper_title, "url": paper_url}
    )
    paper_node = {"type": "Paper", "properties": {"id": pmc_id}}

    file_path = os.path.join(TEXT_FOLDER, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        full_text = f.read()

    # Walk the text in order: ingest the prose before each media placeholder, then
    # the referenced table/image itself.
    last_end = 0
    for match in media_pattern.finditer(full_text):
        start, end = match.span()
        media_id_local = match.group(0)

        _ingest_text_span(full_text[last_end:start], paper_node)

        # Up to 250 characters on either side of the placeholder serve as context.
        context_start = max(0, start - 250)
        context_end = min(len(full_text), end + 250)
        context_text = full_text[context_start:context_end]

        if media_id_local.startswith('table'):
            process_table(pmc_id, media_id_local, context_text, paper_node)
        elif media_id_local.startswith('Img'):
            process_image(pmc_id, media_id_local, context_text, paper_node, image_url_map)

        last_end = end

    # Prose after the last placeholder (or the whole text when there is none).
    _ingest_text_span(full_text[last_end:], paper_node)

print("\n--- Sprint Ingestion Complete! ---")

Supporting data loaded and pre-processed.

--- Processing Paper: PMC10020673 ---
  Ensuring a clean slate for PMC10020673...
  Processing text chunk of 8 chars...
  Processing text chunk of 1494 chars...
  Processing text chunk of 1031 chars...
  Processing text chunk of 760 chars...
  Processing text chunk of 1498 chars...
  Processing text chunk of 289 chars...
  Processing text chunk of 9 chars...
  Processing image: Img001 for paper PMC10020673...
  Processing text chunk of 151 chars...
  Processing text chunk of 1436 chars...
  Processing text chunk of 750 chars...
  Processing text chunk of 1460 chars...
  Processing text chunk of 1363 chars...
  Processing text chunk of 1494 chars...
  Processing text chunk of 207 chars...
  Processing text chunk of 842 chars...
  Processing text chunk of 1493 chars...
  Processing text chunk of 456 chars...
  Processing text chunk of 947 chars...
  Processing text chunk of 732 chars...
  Processing text chunk of 1240 chars...
  Processing text 