In [1]:
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.output_parsers import StrOutputParser
import os
from langchain_community.graphs import Neo4jGraph
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama.chat_models import ChatOllama
from langchain_experimental.graph_transformers import LLMGraphTransformer
from neo4j import GraphDatabase
from langchain_community.vectorstores import Neo4jVector
from langchain_ollama import OllamaEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import TextLoader, PyMuPDFLoader
from langchain_core.runnables import RunnablePassthrough, RunnableConfig
from pprint import pprint
from langchain_neo4j.vectorstores.neo4j_vector import remove_lucene_chars


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [2]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
print(os.getenv("NEO4J_DATABASE"))

None


In [4]:
# Connect to Neo4j with the credentials loaded from the environment (.env).
# The database name honours NEO4J_DATABASE when it is set and non-empty and
# otherwise falls back to "neo4j", so the target database can be switched
# without editing the notebook.
# NOTE(review): Neo4jVector.from_existing_graph later in this notebook is not
# given any connection arguments; presumably it resolves them from these same
# environment variables — confirm, so both clients point at the same database.
graph = Neo4jGraph(
    url=os.getenv("NEO4J_URI"),
    username=os.getenv("NEO4J_USERNAME"),
    password=os.getenv("NEO4J_PASSWORD"),
    database=os.getenv("NEO4J_DATABASE") or "neo4j",
)

  graph = Neo4jGraph(


In [11]:
# Load the source PDF and cut it into overlapping chunks for the later
# graph-extraction and embedding steps.
PDF_PATH = "data/AllSem23.pdf"
CHUNK_SIZE = 3000      # maximum characters per chunk
CHUNK_OVERLAP = 200    # characters shared between neighbouring chunks

loader = PyMuPDFLoader(PDF_PATH)
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
documents = text_splitter.split_documents(docs)

In [15]:
# Clean every chunk in place: pass the text through remove_lucene_chars, then
# flatten newlines into single spaces.
for doc in documents:
    doc.page_content = remove_lucene_chars(doc.page_content).replace("\n", " ")

In [17]:
pprint(documents[56].page_content)

('Maulana Abul Kalam Azad University of Technology  Formerly West Bengal '
 'University of Technology  Syllabus and Curricular Mapping for B. Tech. in '
 'Computer Science and Engineering Effective from Academic Session 2023 24 '
 'Page 43   81 of inherited attributes. 5 Type checking  4L  Type systems, '
 'Specification of a simple type checker, Equivalence of type expressions, '
 'Type conversions 4 6 Run time environments  5L  Source language issues  '
 'Activation trees, Control stack, scope of declaration, Binding of names , '
 'Storage organization  Subdivision of run time memory, Activation records , '
 'Storage allocation strategies, Parameter passing  call by value, call by '
 'reference, copy restore, call by name , Symbol tables, dynamic storage '
 'allocation techniques 5 7 Intermediate code generation  4L  Intermediate '
 'languages, Graphical representation, Three  address code, Implementation of '
 'three address statements  Quadruples, Triples, Indirect triples . 4 8 Co

In [57]:
print(len(documents))

105


In [54]:
from typing import List

def _string_list_field(description: str):
    """Return a Field for a list of strings that defaults to an empty list."""
    return Field(default_factory=list, description=description)


class Entities(BaseModel):
    """A generalized model for identifying key entities in various documents,
    such as research papers and academic syllabi."""

    # Every attribute is a list of strings that defaults to []; the field
    # order below is preserved because other cells derive node types from it.
    person: List[str] = _string_list_field(
        "Names of individuals, such as authors, professors, or cited researchers."
    )
    organization: List[str] = _string_list_field(
        "Organizations, including universities, departments, research institutes, or companies."
    )
    topic: List[str] = _string_list_field(
        "Key subjects, concepts, or topics discussed in the text."
    )
    publication: List[str] = _string_list_field(
        "Cited or mentioned publications, such as papers, books, or articles."
    )
    location: List[str] = _string_list_field(
        "Geographical or institutional locations mentioned."
    )
    date: List[str] = _string_list_field(
        "Specific dates or years relevant to the document's context."
    )
    course: List[str] = _string_list_field(
        "The names or codes of academic courses, if mentioned."
    )
    assessment: List[str] = _string_list_field(
        "Methods of evaluation, such as exams, quizzes, projects, or assignments."
    )

In [56]:
# Node types: one per field declared on the Entities model, in declaration order.
allowed_nodes = list(Entities.__fields__)

# Relationship types permitted between extracted nodes.
allowed_relationships = [
    "MENTIONS",
    "DESCRIBES",
    "CITES",
    "RELATED_TO",
    "PART_OF",
    "AUTHORED_BY",
    "USES_INSTRUMENT",
    "MEASURES",
    "LOCATED_IN",
    "STUDIES",
    "PUBLISHED_IN",
    "REFERENCES",
]

In [59]:
# Chat model served by Ollama, configured with temperature=0.
# NOTE(review): "max_retries" is supplied through with_config(); confirm that
# the runnable config actually honours this key — it may be silently ignored,
# in which case no retries happen.
llm = ChatOllama(
    model="llama3.1:8b",
    temperature=0,
).with_config({"max_retries": 3})
# Graph transformer limited to the node and relationship types listed in
# allowed_nodes / allowed_relationships.
llm_transformer = LLMGraphTransformer(llm=llm,
                                      allowed_nodes=allowed_nodes,
                                      allowed_relationships=allowed_relationships,)
# Converts every chunk in `documents` into a graph document.
# Presumably at least one LLM call per chunk, so this cell can be slow — TODO confirm.
graph_docs = llm_transformer.convert_to_graph_documents(documents)

In [60]:
graph_docs[0]

GraphDocument(nodes=[Node(id='Maulana Abul Kalam Azad University Of Technology', type='Organization', properties={}), Node(id='West Bengal University Of Technology', type='Organization', properties={})], relationships=[], source=Document(metadata={'producer': 'Microsoft: Print To PDF', 'creator': '', 'creationdate': '2023-05-23T13:52:40+05:30', 'source': 'data/AllSem23.pdf', 'file_path': 'data/AllSem23.pdf', 'total_pages': 81, 'format': 'PDF 1.7', 'title': 'Syllabus & Curriculum Mapping B Tech (CSE) (1).docx', 'author': 'MAKAUT 10', 'subject': '', 'keywords': '', 'moddate': '2023-05-23T13:52:40+05:30', 'trapped': '', 'modDate': "D:20230523135240+05'30'", 'creationDate': "D:20230523135240+05'30'", 'page': 0, 'id': 'a664efaa6d7db0d85dcc362ab6ebb376'}, page_content='Maulana Abul Kalam Azad University of Technology  Formerly West Bengal University of Technology  Syllabus and Curricular Mapping for B. Tech. in Computer Science and Engineering Effective from Academic Session 2023 24 Page 1  

In [61]:
# WARNING: destructive. Deletes every node (and, via DETACH, its relationships)
# in the connected database so the graph can be rebuilt from scratch.
graph.query("MATCH (n) DETACH DELETE n")

[]

In [62]:
# Persist the extracted graph documents to Neo4j.
# NOTE(review): baseEntityLabel / include_source are presumed to add a shared
# entity label to each node and to store the source chunk linked to its
# entities — confirm against the LangChain Neo4jGraph documentation.
graph.add_graph_documents(
    graph_documents=graph_docs,
    baseEntityLabel=True,
    include_source=True
)

In [65]:
from yfiles_jupyter_graphs import GraphWidget

def show_graph():
    """Render every relationship in the Neo4j database as an interactive widget.

    Opens a short-lived driver using the NEO4J_URI, NEO4J_USERNAME and
    NEO4J_PASSWORD environment variables, fetches all ``(n)-[r]->(m)``
    patterns and wraps the resulting graph in a ``GraphWidget``.

    Returns:
        GraphWidget: widget built from the graph view of the query result.
    """
    # Using the driver as a context manager guarantees it is closed when the
    # function returns, so repeated calls do not leak connection pools.
    with GraphDatabase.driver(
        uri=os.getenv("NEO4J_URI"),
        auth=(os.getenv("NEO4J_USERNAME"), os.getenv("NEO4J_PASSWORD")),
    ) as driver:
        with driver.session() as session:
            result = session.run("""
                MATCH (n)-[r]->(m)
                RETURN n, r, m
            """)
            # The result must be turned into a graph while the session is
            # still open; the widget is built from that materialised graph.
            widget = GraphWidget(graph=result.graph())
    return widget

show_graph()

show_graph()

GraphWidget(layout=Layout(height='800px', width='100%'))

In [66]:
# Ollama embedding model; passed as `embedding` when the vector index is built.
embedding_model = OllamaEmbeddings(
    model="nomic-embed-text:latest"
)

In [45]:
# graph.query("DROP INDEX vector IF EXISTS")
# graph.query(
#     "CREATE FULLTEXT INDEX documentFullTextIndex IF NOT EXISTS "
#     "FOR (n:Document) ON EACH [n.text, n.source, n.title]"
# )

[]

In [47]:
# Build a vector store over the existing `Document` nodes, using hybrid search
# with the keyword index named "documentFullTextIndex". The text, source and
# title properties are the node properties handed to the store; embeddings are
# kept in the `embedding` node property.
# NOTE(review): no url/username/password is passed here, so the connection is
# presumably resolved from the NEO4J_* environment variables — confirm.
vector_index = Neo4jVector.from_existing_graph(
    embedding=embedding_model,
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text", "source", "title"],
    embedding_node_property="embedding",
    keyword_index_name="documentFullTextIndex" 
)
# Retriever interface over the vector store, used by the retrieval functions.
vector_retriever = vector_index.as_retriever()

In [67]:
# Smoke-test the retriever and print the text of the first hit.
# NOTE(review): top_k / similarity_threshold / include_embeddings are passed
# as extra invoke() keyword arguments — confirm the retriever honours them;
# they may be ignored. Also note res[0] raises IndexError if nothing is found.
res = vector_retriever.invoke(
    "Automata syllabus?",
    top_k=5,
    similarity_threshold=0.7,
    include_embeddings=True
)
pprint(res[0].model_dump()["page_content"])



('\n'
 'text: Maulana Abul Kalam Azad University of Technology  Formerly West Bengal '
 'University of Technology  Syllabus and Curricular Mapping for B. Tech. in '
 'Computer Science and Engineering Effective from Academic Session 2023 24 '
 'Page 33   81 COURSE OUTCOMES  COs  On completion of the course students will '
 'be able to Course Outcomes Details Action Verb Knowledge Level PCC CS403.CO1 '
 'Write a formal notation for strings, languages and machines. Write K6 PCC '
 'CS403.CO2 Design finite automata to accept a set of strings of a language. '
 'Design K6 PCC CS403.CO3 For a given language determine whether the given '
 'language is regular or not Determine K5 PCC CS403.CO4 Design context free '
 'grammars to generate strings of context free language. Design K6 PCC '
 'CS403.CO5 Determine equivalence of languages accepted by Push Down Automata '
 'and languages generated by context free grammars Determine K5 PCC CS403.CO6 '
 'Write the hierarchy of formal languages, grammars

In [68]:
# Prompt that asks the model to extract structured entities from a piece of text.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert in scientific papers. Your task is to extract entities from the text. "
        ),
        (
            "human",
            # Adjacent string literals are concatenated as-is, so the first
            # one must end with a space; without it the model receives
            # "...from the followinginput: ...".
            "Use the given format to extract information from the following "
            "input: {question}"
        )
    ]
)

# Chain: fill the prompt, then have the LLM return an Entities instance.
entity_chain = prompt | llm.with_structured_output(Entities)

In [70]:
entities = entity_chain.invoke(
    {"question": "What is the syllabus of automata?"}
)

entities

Entities(person=[], organization=[], topic=['automata'], publication=[], location=[], date=[], course=['syllabus'], assessment=[])

In [78]:
def generate_full_text_query(input: str) -> str:
    """Build a fuzzy Lucene full-text query from free text.

    The text is passed through ``remove_lucene_chars`` and split on
    whitespace. Each remaining word gets a ``~2`` fuzzy suffix and the words
    are combined with ``AND``.

    Args:
        input: Free text, e.g. an entity name.

    Returns:
        The query string, or ``""`` when no searchable word remains.
    """
    # A bare AND / OR / NOT is a boolean operator in Lucene query syntax.
    # Left upper-case, an input such as "Research AND Development" would yield
    # "Research~2 AND AND~2 AND Development~2", which does not parse.
    # Lower-casing these words makes them ordinary search terms.
    lucene_operators = {"AND", "OR", "NOT"}
    words = [
        word.lower() if word in lucene_operators else word
        for word in remove_lucene_chars(input).split()
    ]
    if not words:
        return ""
    return " AND ".join(f"{word}~2" for word in words)

def graph_retriever(question: str) -> str:
    """Retrieve documents linked to the entities mentioned in the question.

    Entities are extracted from ``question`` with ``entity_chain``. Every
    distinct, non-empty entity value is turned into a fuzzy full-text query
    and looked up in the ``entityFullTextIndex`` full-text index; documents
    reachable from each matching node within two hops are reported.

    NOTE(review): ``entityFullTextIndex`` is not created anywhere in this
    notebook, so it must already exist in the database and cover the entity
    nodes of the current graph — confirm, otherwise this returns nothing.

    Args:
        question: The user's natural-language question.

    Returns:
        A text block with one section per matching record (matched field,
        entity, title, abstract, score), or ``""`` when nothing matched.
    """
    # Document nodes in this notebook are indexed on text/source/title, so the
    # query falls back to doc.text when a node has no `abstract` property.
    entity_lookup_query = """
        CALL db.index.fulltext.queryNodes('entityFullTextIndex', $query, {limit: 5})
        YIELD node, score
        CALL {
            WITH node
            OPTIONAL MATCH (node)-[:MENTIONS|RELATED_TO|CITED_IN|GENERATED_BY*1..2]-(doc:Document)
            RETURN DISTINCT doc.title AS title, coalesce(doc.abstract, doc.text) AS abstract
        }
        RETURN title, abstract, score
        ORDER BY score DESC
        LIMIT 3
    """

    entities = entity_chain.invoke({"question": question})
    if entities is None:
        # Defensive: the structured-output chain may yield nothing when the
        # model's answer cannot be parsed; treat that as "no entities found".
        return ""

    sections = []
    seen = set()
    for entity_type, values in entities.dict().items():
        # Search every extracted value of this field, not only the first one.
        for raw_value in values or []:
            value = raw_value.strip()
            if not value or value in seen:
                continue  # Skip blanks and values that were already searched
            seen.add(value)

            fuzzy_query = generate_full_text_query(value)
            if not fuzzy_query:
                # Nothing searchable is left after sanitising; an empty query
                # string must not be sent to the full-text index.
                continue

            response = graph.query(entity_lookup_query, {"query": fuzzy_query})

            for record in response:
                sections.append(
                    f"Matched Field: {entity_type}\n"
                    f"Entity: {value}\n"
                    f"Title: {record['title']}\n"
                    f"Abstract: {record['abstract']}\n"
                    f"Score: {record['score']:.2f}\n\n"
                )

    return "".join(sections).strip()



# generate_full_text_query("What the contents of this paper say?")
graph_retriever("What is the syllabus of automata?")



''

In [79]:
def full_retriever(query: str):
    """Combine graph-based and vector-based retrieval into one context string.

    Args:
        query: The user's natural-language question.

    Returns:
        str: A "Graph Data" section holding the output of ``graph_retriever``
        followed by a "Vector Data" section listing the text of each document
        returned by ``vector_retriever``, numbered from 1.
    """
    graph_data = graph_retriever(question=query)
    vector_docs = vector_retriever.invoke(query)

    # The join is done outside the f-string: a backslash inside an f-string
    # expression is a SyntaxError before Python 3.12. Every document gets its
    # own numbered header so the boundaries between chunks are unambiguous.
    vector_data = "\n\n".join(
        f"Document {index}:\n{doc.page_content}"
        for index, doc in enumerate(vector_docs, start=1)
    )

    final_data = f"Graph Data:\n{graph_data}\n\nVector Data:\n{vector_data}"
    return final_data.strip()

In [80]:
op = full_retriever("What is the syllabus of automata?")



In [81]:
# Answer prompt: the retrieved context and the user's question are substituted
# into the {context} and {question} placeholders.
template = """Answer the question based on the provided data.
Context: 
{context}

Question: {question}

Use the context to answer the question as accurately as possible. Be concise and to the point.

Answer:
"""

prompt = ChatPromptTemplate.from_template(template)

# The incoming question is routed twice: through full_retriever to build the
# context, and unchanged as the question itself.
chain_inputs = {
    "context": full_retriever,
    "question": RunnablePassthrough(),
}

# Pipeline: gather inputs -> fill the prompt -> call the LLM -> plain string.
chain = chain_inputs | prompt | llm | StrOutputParser()

In [82]:
output = chain.invoke(input="What is the syllabus of automata?")



In [83]:
from IPython.display import Markdown as md
md(output)

The syllabus for Automata Theory (PCC CS403) includes:

* Unit 1: Introduction - Alphabet, languages, and grammars; Chomsky hierarchy of languages
* Unit 2: Regular Languages and Finite Automata - Regular expressions and languages, deterministic and nondeterministic finite automata, equivalence with regular expressions, properties of regular languages, pumping lemma for regular languages
* Unit 3: Context-Free Languages and Pushdown Automata - Context-free grammars and languages, Chomsky and Greibach normal forms, nondeterministic pushdown automata, parse trees, ambiguity in context-free grammars, closure properties of CFLs
* Unit 4: Context-Sensitive Languages - Context-sensitive grammars and languages, linear bounded automata
* Unit 5: Turing Machines - Basic model for Turing machines, Turing recognizable, recursively enumerable, and decidable languages, variants of Turing machines

The course has a total of 70 marks, with 25 marks allocated to continuous assessment and 45 marks to the final exam.

In [84]:
md(chain.invoke(input="what is this doc about?"))



This document appears to be a syllabus for the Bachelor of Technology in Computer Science and Engineering program at Maulana Abul Kalam Azad University of Technology, effective from Academic Session 2023-24. It outlines the course structure, content, and outcomes for various subjects, including Big Data Analytics, Workshop Practice, and others.