In [44]:
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.output_parsers import StrOutputParser
import os
from langchain_community.graphs import Neo4jGraph
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama.chat_models import ChatOllama
from langchain_experimental.graph_transformers import LLMGraphTransformer
from neo4j import GraphDatabase
from langchain_community.vectorstores import Neo4jVector
from langchain_ollama import OllamaEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import TextLoader, PyMuPDFLoader
from langchain_core.runnables import RunnablePassthrough, RunnableConfig
from pprint import pprint
from langchain_neo4j.vectorstores.neo4j_vector import remove_lucene_chars

In [2]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
print(os.getenv("NEO4J_DATABASE"))

None


In [4]:
# Connect to Neo4j with the credentials loaded from the environment (.env).
# The database name is read from NEO4J_DATABASE and falls back to "neo4j"
# when the variable is unset or empty, so the target database can be changed
# without editing the notebook.
graph = Neo4jGraph(
    url=os.getenv("NEO4J_URI"),
    username=os.getenv("NEO4J_USERNAME"),
    password=os.getenv("NEO4J_PASSWORD"),
    database=os.getenv("NEO4J_DATABASE") or "neo4j",
)

  graph = Neo4jGraph(


In [6]:
# Load the source paper from the local data/ directory.
loader = PyMuPDFLoader("data/2019_ApJ_Telloni_et_al._DetectionofCoronalMassEjectionsatL1andForecastofTheirGeoeffectiveness.pdf")
docs = loader.load()

# Split the loaded documents into overlapping chunks (chunk_size=600,
# chunk_overlap=200); `documents` is what the graph extraction step consumes.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=200)
documents = text_splitter.split_documents(docs)

In [9]:
print(len(documents))

92


In [13]:
# Chat model used both for graph extraction and for answering questions.
# NOTE(review): "max_retries" does not look like a standard RunnableConfig key,
# so this with_config call may not actually enable retries — confirm against
# the LangChain docs before relying on it.
llm = ChatOllama(
    model="llama3.1:8b",
    temperature=0,
).with_config({"max_retries": 3})
# Convert every text chunk into a graph document (nodes + relationships).
# This calls the LLM once per chunk at least, so it is the slow cell of the notebook.
llm_transformer = LLMGraphTransformer(llm=llm)
graph_docs = llm_transformer.convert_to_graph_documents(documents)

In [14]:
graph_docs[0]

GraphDocument(nodes=[Node(id='Detection Of Coronal Mass Ejections At L1 And Forecast Of Their Geoeffectiveness', type='Publication', properties={}), Node(id='The Astrophysical Journal', type='Journal', properties={}), Node(id='E. Antonucci', type='Person', properties={}), Node(id='Alessandro Bemporad', type='Person', properties={}), Node(id='Tiziano Bianchi', type='Person', properties={}), Node(id='National Institute Of Astrophysics', type='Organization', properties={}), Node(id='Polytechnic University Of Turin', type='Organization', properties={})], relationships=[Relationship(source=Node(id='E. Antonucci', type='Person', properties={}), target=Node(id='National Institute Of Astrophysics', type='Organization', properties={}), type='AFFILIATION', properties={}), Relationship(source=Node(id='Alessandro Bemporad', type='Person', properties={}), target=Node(id='National Institute Of Astrophysics', type='Organization', properties={}), type='AFFILIATION', properties={}), Relationship(source

In [16]:
# Persist the extracted graph documents into the Neo4j database behind `graph`.
# NOTE(review): baseEntityLabel=True presumably adds a shared entity label to
# every node and include_source=True presumably links nodes to their source
# chunk — confirm against the Neo4jGraph documentation.
graph.add_graph_documents(
    graph_documents=graph_docs,
    baseEntityLabel=True,
    include_source=True
)

In [20]:
# Embedding model served by Ollama; passed to the Neo4j vector index below.
embedding_model = OllamaEmbeddings(
    model="nomic-embed-text:latest"
)

In [22]:
# Build a hybrid-search vector index on top of nodes labelled `Document`
# that already exist in the graph.
# NOTE(review): no connection arguments are passed here — presumably the
# NEO4J_* environment variables are picked up implicitly; confirm.
# NOTE(review): presumably the listed text properties are embedded and the
# vector is stored in the `embedding` property — verify against the docs.
vector_index = Neo4jVector.from_existing_graph(
    embedding=embedding_model,
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text", "source", "title"],
    embedding_node_property="embedding"
)
# Retriever wrapper used by full_retriever further down.
vector_retriever = vector_index.as_retriever()

In [None]:
# Quick sanity check of the vector search.
# The result count is requested through the vector store's own `k` argument.
# NOTE(review): the previous keyword arguments (top_k, similarity_threshold,
# include_embeddings) do not appear to be recognised by the Neo4j vector
# search, so they were dropped — confirm against the installed version and
# re-add a score filter here if a threshold is really wanted.
res = vector_index.similarity_search("How do i calculate cme?", k=5)

# Guard against an empty result so the cell does not die with an IndexError.
if res:
    pprint(res[0].page_content)
else:
    print("No documents retrieved.")

In [None]:
# Structured-output schema for entity extraction.
# Every field defaults to an empty string so that a response in which the LLM
# omits some of the keys still validates; unfilled fields simply stay "".
class Entities(BaseModel):
    """Identifying information about entities in the text."""

    address: str = Field(default="", description="An address, such as a street address or a location.")
    attribute: str = Field(default="", description="An attribute or characteristic of an entity.")
    author: str = Field(default="", description="A single author of the paper or a cited work.")
    authors: str = Field(default="", description="List of authors associated with the paper.")
    chunk: str = Field(default="", description="A section or meaningful unit of the text.")
    company: str = Field(default="", description="A company or corporate entity mentioned in the text.")
    concept: str = Field(default="", description="An abstract idea, principle, or domain-specific notion.")
    curve: str = Field(default="", description="A graphical representation or mathematical curve described in the paper.")
    cycle: str = Field(default="", description="A repeated process or time cycle referenced in the study.")
    data: str = Field(default="", description="Specific data values, raw or processed, mentioned in the paper.")
    dataset: str = Field(default="", description="A collection of data used, referenced, or produced in the study.")
    department: str = Field(default="", description="A specific department within an organization or institution.")
    distribution: str = Field(default="", description="The statistical or spatial distribution of data or variables.")
    document: str = Field(default="", description="A referenced or related document, article, or paper.")
    electric_current: str = Field(default="", description="Mentions of electric current or related electrical measurements.")
    entity: str = Field(default="", description="A general named entity not otherwise categorized.")
    environment: str = Field(default="", description="The environmental context or conditions mentioned.")
    equation: str = Field(default="", description="A mathematical equation or formula stated in the text.")
    event: str = Field(default="", description="An occurrence or happening described in the context of the study.")
    figure: str = Field(default="", description="A referenced figure, chart, or diagram within the paper.")
    function: str = Field(default="", description="A mathematical or computational function described or used.")
    group: str = Field(default="", description="A collection of people, items, or elements considered together.")
    index: str = Field(default="", description="An index value or indexing term used in the paper.")
    instrument: str = Field(default="", description="A scientific or technical instrument used in data collection.")
    journal: str = Field(default="", description="The journal where the paper or referenced articles are published.")
    location: str = Field(default="", description="A geographic location or place mentioned.")
    magnetometer: str = Field(default="", description="A specific instrument measuring magnetic fields.")
    measurement: str = Field(default="", description="The act or result of measuring a quantity.")
    number: str = Field(default="", description="A numerical value mentioned in the text.")
    orcid_id: str = Field(default="", description="The ORCID identifier for an author.")
    organization: str = Field(default="", description="An organization or institution involved or referenced.")
    person: str = Field(default="", description="A named individual mentioned in the text.")
    phenomenon: str = Field(default="", description="A scientific or observable phenomenon discussed.")
    planet: str = Field(default="", description="A planet mentioned in the astronomical or environmental context.")
    publication: str = Field(default="", description="A published work referenced or discussed.")
    quantity: str = Field(default="", description="A measurable amount or value.")
    reference: str = Field(default="", description="A citation or bibliographic reference.")
    region: str = Field(default="", description="A specific area or region, geographic or conceptual.")
    resource: str = Field(default="", description="Any resource—material, computational, or informational—mentioned.")
    satellite: str = Field(default="", description="A satellite referenced in context to data collection or observation.")
    spacecraft: str = Field(default="", description="A spacecraft or probe mentioned in the study.")
    state: str = Field(default="", description="A physical or logical state or condition of a system or material.")
    structure: str = Field(default="", description="A physical, logical, or organizational structure.")
    system: str = Field(default="", description="A system—technical, natural, or conceptual—described in the paper.")
    thresholds: str = Field(default="", description="Threshold values or limits defined or measured.")
    time_unit: str = Field(default="", description="Units of time used in measurements or descriptions.")
    tool: str = Field(default="", description="A software or hardware tool used in the analysis or study.")
    unit_of_measurement: str = Field(default="", description="A standardized unit for measuring variables.")
    value: str = Field(default="", description="A specific value, usually numerical or categorical, relevant to the context.")
    variable: str = Field(default="", description="A changing or measured quantity in the study.")
    website: str = Field(default="", description="A URL or web-based resource referenced.")
    year: str = Field(default="", description="A specific year mentioned in the paper.")


In [28]:
# Prompt for the entity-extraction chain. The human message is built from two
# adjacent string literals, so the first one must end with a space; otherwise
# the model receives "followinginput:".
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert in scientific papers. Your task is to extract entities from the text. "
        ),
        (
            "human",
            "Use the given format to extract information from the following "
            "input: {question}"
        )
    ]
)

# Pipe the prompt into the LLM constrained to return an `Entities` instance.
entity_chain = prompt | llm.with_structured_output(Entities)

In [52]:
# Smoke-test the entity-extraction chain on a short question.
entities = entity_chain.invoke(
    {"question": "What is cme?"}
)

# Show a single field; it is an empty string when the model found no author.
entities.author

''

In [53]:
entities

Entities(address='', attribute='cme', author='', authors='', chunk='What is cme?', company='', concept='cme', curve='', cycle='', data='', dataset='', department='', distribution='', document='', electric_current='', entity='cme', environment='', equation='', event='', figure='', function='', group='', index='', instrument='', journal='', location='', magnetometer='', measurement='', number='', orcid_id='', organization='', person='', phenomenon='', planet='', publication='', quantity='', reference='', region='', resource='', satellite='', spacecraft='', state='', structure='', system='', thresholds='', time_unit='', tool='', unit_of_measurement='', value='', variable='', website='', year='')

In [None]:
def generate_full_text_query(input: str) -> str:
    """Turn free text into a fuzzy full-text query string.

    Lucene special characters are stripped from ``input``, the remainder is
    split on whitespace, every token gets a ``~2`` suffix and the tokens are
    combined with ``AND``.

    Returns an empty string when no token survives the clean-up.
    """
    cleaned = remove_lucene_chars(input)
    # str.split() without arguments never yields empty tokens, and joining an
    # empty list gives "", so no extra filtering or guard is needed.
    fuzzy_terms = [f"{token}~2" for token in cleaned.split()]
    return " AND ".join(fuzzy_terms)

def graph_retriever(question: str) -> str:
    """Retrieves relevant documents and related entities based on the question.

    The question is run through ``entity_chain``; every distinct, non-empty
    extracted value is turned into a fuzzy full-text query and looked up in
    the ``entityFullTextIndex`` index. For each hit, the titles/abstracts of
    ``Document`` nodes reachable within two hops are collected.

    NOTE(review): no visible cell creates ``entityFullTextIndex`` — it is
    assumed to already exist in the database; confirm.

    Args:
        question: Natural-language question to extract entities from.

    Returns:
        One text block per matched record (field, entity, title, abstract,
        score), separated by blank lines; an empty string when nothing matched.
    """
    entities = entity_chain.invoke({"question": question})
    if entities is None:
        # The structured-output step produced nothing usable.
        return ""

    blocks = []
    seen = set()
    for entity_type, value in entities.dict().items():
        value = value.strip()
        if not value or value in seen:
            continue  # Skip empty fields and values that were already queried
        seen.add(value)

        fuzzy_query = generate_full_text_query(value)
        if not fuzzy_query:
            # The value consisted only of Lucene special characters; an empty
            # query string cannot be parsed by the full-text index.
            continue

        response = graph.query(
            """
            CALL db.index.fulltext.queryNodes('entityFullTextIndex', $query, {limit: 5})
            YIELD node, score
            CALL {
                WITH node
                OPTIONAL MATCH (node)-[:MENTIONS|RELATED_TO|CITED_IN|GENERATED_BY*1..2]-(doc:Document)
                RETURN DISTINCT doc.title AS title, doc.abstract AS abstract
            }
            RETURN title, abstract, score
            ORDER BY score DESC
            LIMIT 3
            """,
            {"query": fuzzy_query}
        )

        for record in response:
            blocks.append(
                f"Matched Field: {entity_type}\n"
                f"Entity: {value}\n"
                f"Title: {record['title']}\n"
                f"Abstract: {record['abstract']}\n"
                f"Score: {record['score']:.2f}"
            )

    # Collect blocks in a list and join once instead of growing a string.
    return "\n\n".join(blocks)



# generate_full_text_query("What the contents of this paper say?")
# graph_retriever("how to detect cme?")



'Matched Field: attribute\nEntity: cme\nTitle: Detection of Coronal Mass Ejections at L1 and Forecast of Their Geoeffectiveness\nAbstract: None\nScore: 2.08\n\nMatched Field: attribute\nEntity: cme\nTitle: Detection of Coronal Mass Ejections at L1 and Forecast of Their Geoeffectiveness\nAbstract: None\nScore: 1.63\n\nMatched Field: attribute\nEntity: cme\nTitle: Detection of Coronal Mass Ejections at L1 and Forecast of Their Geoeffectiveness\nAbstract: None\nScore: 1.63'

In [69]:
def full_retriever(query: str):
    """Retrieves relevant documents and entities based on the query.

    Combines the graph lookup (``graph_retriever``) with the vector search
    (``vector_retriever``) into one context string for the answer prompt.

    Args:
        query: The user question.

    Returns:
        A string with a "Graph Data:" section followed by a "Vector Data:"
        section in which each retrieved chunk is labelled "Document N:".
    """
    graph_data = graph_retriever(question=query)
    vector_docs = vector_retriever.invoke(query)

    # Format the chunks outside the f-string: a backslash inside an f-string
    # expression is a SyntaxError before Python 3.12. Every chunk, including
    # the first, gets its own numbered label on a separate line.
    vector_data = "\n\n".join(
        f"Document {position}:\n{doc.page_content}"
        for position, doc in enumerate(vector_docs, start=1)
    )

    final_data = f"Graph Data:\n{graph_data}\n\nVector Data:\n{vector_data}"
    return final_data.strip()

In [71]:
op = full_retriever("how to detect cme?")



In [73]:
# Answer prompt: {context} is filled by full_retriever, {question} by the raw
# chain input.
template = """Answer the question based on the provided data.
Context: 
{context}

Question: {question}

Use the context to answer the question as accurately as possible. Be concise and to the point.

Answer:
"""

# Note: this rebinds the notebook-level name `prompt`, which earlier held the
# entity-extraction prompt. `entity_chain` was built before this cell and keeps
# its own reference, but re-running cells out of order can mix the two up.
prompt = ChatPromptTemplate.from_template(template)

# Question-answering pipeline: the input string is handed to full_retriever
# (for the context) and passed through unchanged (for the question), then the
# filled prompt goes to the LLM and the reply is parsed to a plain string.
chain = (
    {
        "context": full_retriever,
        "question": RunnablePassthrough(),
    } 
    | prompt
    | llm
    | StrOutputParser()
)

In [77]:
output = chain.invoke(input="how to detect cme from particle activity in l1, in detail?")



In [78]:
from IPython.display import Markdown as md
md(output)

The document discusses a method for detecting Coronal Mass Ejections (CMEs) at L1 using an algorithm that detects images containing single or multiple CMEs and reduces data volume. The detection efficiency is higher for bright CMEs (>90%) and lower for faint CMEs (<10%). To detect CMEs from particle activity in L1, the following steps can be taken:

1. Use a coronagraph with an FOV close to the solar limb, such as LASCO C1 or VELC on board ADITYA-L1.
2. Obtain continuous data from the coronagraph using high-resolution images and spectroscopy/spectropolarimetric capabilities.
3. Apply the algorithm to detect images containing single or multiple CMEs.
4. Measure properties of CMEs, such as apparent central position angle, average angular width, and height as a function of time.

Note: The document does not provide detailed information on how to detect CMEs from particle activity in L1, but rather discusses the detection method using images and spectroscopy/spectropolarimetric capabilities.