In [None]:
import asyncio
import os
import time

import requests
from dotenv import load_dotenv

from typing_extensions import List, TypedDict
from langchain_core.documents import Document
from langgraph.graph import StateGraph, MessagesState, END
from langgraph.checkpoint.memory import MemorySaver

from langchain.document_loaders import PyPDFLoader
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter, TokenTextSplitter
from langchain_chroma import Chroma

from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnableMap
from langchain.schema.output_parser import StrOutputParser

import langchain_core

In [None]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the OpenAI API key used by the
# embedding and chat clients created later — confirm against the .env file.
load_dotenv()

## Load Documents 

In [None]:
# Folder with the general knowledge-base PDFs:
# <parent of the current working directory>/docs/knowledge_base
file_path_1 = os.path.join(os.path.split(os.getcwd())[0], 'docs', 'knowledge_base')
file_path_1

In [None]:
# Folder with the MPEP PDFs:
# <parent of the current working directory>/docs/knowledge_base/MPEP
file_path_2 = os.path.join(os.path.split(os.getcwd())[0], 'docs', 'knowledge_base', 'MPEP')
file_path_2

In [None]:
# PDF file names directly inside file_path_1, leaving out any whose name
# starts with 'mpep'.
pdf_list_1 = [
    file_name
    for file_name in os.listdir(file_path_1)
    if file_name.endswith('.pdf') and not file_name.startswith('mpep')
]
pdf_list_1

In [None]:
# PDF file names directly inside file_path_2.
pdf_list_2 = [
    file_name
    for file_name in os.listdir(file_path_2)
    if file_name.endswith('.pdf')
]
pdf_list_2

In [None]:
# Number of PDF file names found in each of the two source folders.
len(pdf_list_1), len(pdf_list_2)

In [None]:
# Load every listed PDF with PyPDFLoader and store the loaded documents in
# pdf_docs, keyed by file name. Both source folders go through one loop
# instead of two copy-pasted ones.
pdf_docs = {}
for folder, file_names in ((file_path_1, pdf_list_1), (file_path_2, pdf_list_2)):
    for doc in file_names:
        if doc in pdf_docs:
            # Keys are bare file names, so a file with the same name in both
            # folders replaces the earlier entry; make that visible.
            print(f"Warning: {doc} was already loaded; replacing it with the copy from {folder}")
        loader = PyPDFLoader(os.path.join(folder, doc))
        pages = loader.load()
        pdf_docs[doc] = pages

# Show a compact per-file count rather than dumping every loaded page.
{name: len(loaded) for name, loaded in pdf_docs.items()}

In [None]:
# Number of entries in pdf_docs (one per distinct file name).
len(pdf_docs)

## Split documents into chunks

In [None]:
# Token-based splitter configured with chunk_size=1000 and chunk_overlap=200.
# add_start_index=True asks the splitter to record where each chunk starts in
# its source document.
text_splitter = TokenTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)

In [None]:
# Split each file's loaded documents into chunks, keeping the same file-name keys.
pdf_splits = {
    file_name: text_splitter.split_documents(loaded_docs)
    for file_name, loaded_docs in pdf_docs.items()
}

pdf_splits

In [None]:
# Spot check for one file: number of loaded documents vs. number of chunks
# after splitting. Raises KeyError if 'consolidated_laws.pdf' was not loaded.
len(pdf_docs['consolidated_laws.pdf']), len(pdf_splits['consolidated_laws.pdf'])

## Embed chunks and save in vector store

In [None]:
# On-disk location of the vector store:
# <parent of the current working directory>/vector_store
persist_directory = os.path.join(os.path.split(os.getcwd())[0], 'vector_store')
persist_directory

In [None]:
# Embedding client using OpenAI's "text-embedding-3-large" model.
# NOTE(review): presumably picks up its API key from the environment — confirm.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [None]:
# Chroma vector store that embeds with `embeddings` and uses persist_directory
# as its storage location.
vector_store = Chroma(embedding_function=embeddings, persist_directory=persist_directory)

In [None]:
# Inspect the chunks produced for one file before embedding them.
pdf_splits['consolidated_laws.pdf']

In [None]:
# Add all chunks to the vector store, at most BATCH_SIZE documents per
# add_documents call, and collect the ids returned for them.
#
# The earlier version split a file only once (first 500 / everything else), so
# a file with more than 1000 chunks still sent an unbounded second batch. This
# loop caps every call and skips files that produced no chunks.
#
# NOTE(review): no explicit ids are passed, so re-running this cell against the
# same persist_directory presumably stores the chunks a second time — confirm
# before re-executing.
BATCH_SIZE = 500

id_list = []
for s, chunks in pdf_splits.items():
    num_chunks = len(chunks)

    for start in range(0, num_chunks, BATCH_SIZE):
        batch = chunks[start:start + BATCH_SIZE]
        print(f"Adding chunks {start + 1}-{start + len(batch)} out of {num_chunks} from {s}")
        ids = vector_store.add_documents(batch)
        id_list.extend(ids)

In [None]:
# Number of records in the underlying Chroma collection.
# NOTE(review): _collection is a private attribute of the wrapper and may
# change between library versions.
vector_store._collection.count()

In [None]:
# Look up the stored record for the first id collected during ingestion.
# Raises IndexError if id_list is empty.
vector_store.get(id_list[0])

## Querying the database

In [None]:
# Top-5 similarity search with scores for a sample question.
results = vector_store.similarity_search_with_score(
    query="What are the 3 most important things to know when submitting a patent application?",
    k=5,
)

results

In [None]:
# Each result is indexed as [document, score]; print the text of the first one.
print(results[0][0].page_content)

In [None]:
# Open a second Chroma handle on the same persist_directory.
# NOTE(review): presumably a check that the stored data can be read back from
# disk independently of `vector_store` — confirm intent.
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

In [None]:
# Repeat the same top-5 search through the second handle and print the text of
# the first hit.
results = vectordb.similarity_search_with_score(
    query="What are the 3 most important things to know when submitting a patent application?",
    k=5,
)

print(results[0][0].page_content)

## Retrieval

In [None]:
# Retriever over the vector store: similarity search, k=20 per query.
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs=dict(k=20))

retriever

In [None]:
# Try the retriever on the same sample question used for the searches above.
retriever.invoke("What are the 3 most important things to know when submitting a patent application?")

## Q&A chat model

In [None]:
# Chat model: gpt-4o with temperature=0 and streaming enabled.
model = ChatOpenAI(model="gpt-4o", temperature=0, streaming=True)

In [None]:
# Prompt text with two placeholders: {query} (the user's question) and
# {knowledge_base} (the retrieved context). A trailing backslash joins a line
# with the next one inside the string.
# Fixed: the typo "knoeledge" and the first Persona sentence, which was cut in
# two by a stray period before the line continuation.
prompt_template = """
    # Persona
    You are an expert patent lawyer. You have deep knowledge of the patent laws and regulations in the United States, \
    as well as the procedures and requirements for submitting a patent application to the United States Patent and Trademark Office.

    # Task
    Answer the query of the user using your knowledge base. Always refer to your knowledge base when answering the user's query. \
    If the answer to the user's query is not in your knowledge base, say you don't know, do not make something up. \
    You can ask the user for more information if you need it to answer the query. Answer the user's query in a clear, \
    complete and concise manner, make sure you have given a full answer.

    'query': {query}
    'knowledge_base': {knowledge_base}
    """

In [None]:
# Turn the prompt text into a PromptTemplate. The name `prompt_template` is
# rebound from str to PromptTemplate here, so the type check keeps this cell
# safe to re-run: once converted, it leaves the object untouched instead of
# passing a PromptTemplate back into from_template.
if isinstance(prompt_template, str):
    prompt_template = PromptTemplate.from_template(prompt_template)

In [None]:
def _format_docs(docs):
    """Render retrieved documents as one plain-text block for the prompt.

    Each document becomes a "[source, page N]" header line followed by its
    page_content; documents are separated by a blank line. The header falls
    back to "unknown source" and omits the page when that metadata is absent.
    """
    sections = []
    for doc in docs:
        source = doc.metadata.get("source", "unknown source")
        page = doc.metadata.get("page")
        header = f"[{source}, page {page}]" if page is not None else f"[{source}]"
        sections.append(f"{header}\n{doc.page_content}")
    return "\n\n".join(sections)


# Build the prompt variables from {"query": ...}. The retrieved documents are
# formatted as text first; previously the raw list of Document objects was
# interpolated into the prompt as a Python repr.
inputs = RunnableMap({
    "knowledge_base": lambda x: _format_docs(retriever.invoke(x["query"])),
    "query": lambda x: x["query"]
})

In [None]:
# Sample office-action text used as the query for the RAG chain.
# Fixed the typo "clain" -> "claim"; this text is both embedded for retrieval
# and sent to the model.
query = """Respond to this claim rejection:
The following is a quotation of 35 U.S.C. 112(b):
(b) CONCLUSION.-The specification shall conclude with one or more claims particularly pointing out and distinctly claiming the subject matter which the inventor or a joint inventor regards as the invention.
The following is a quotation of 35 U.S.C. 112 (pre-AIA), second paragraph: The specification shall conclude with one or more claims particularly pointing out and distinctly claiming the subject matter which the applicant regards as his invention.
Claim 4 is rejected under 35 U.S.C. 112(b) or 35 U.S.C. 112 (pre-AIA), second paragraph, as being indefinite for failing to particularly point out and distinctly claim the subject matter which the inventor or a joint inventor (or for applications subject to pre- AIA 35 U.S.C. 112, the applicant), regards as the invention.
Claim 4 contains the trademark/trade name Carbopol. Where a trademark or trade name is used in a claim as a limitation to identify or describe a particular material or product, the claim does not comply with the requirements of 35 U.S.C. 112(b) or 35 U.S.C. 112 (pre-AIA), second paragraph. See Ex parte Simpson, 218 USPQ 1020 (Bd. App. 1982). The claim scope is uncertain since the trademark or trade name cannot be used properly to identify any particular material or product. A trademark or trade name is used to identify a source of goods, and not the goods themselves. Thus, a trademark or trade name does not identify or describe the goods associated with the trademark or trade name. In the present case, the trademark/trade name is used to
identify/describe specific polyacrylic polymers and, accordingly, the identification/description is indefinite.
"""

In [None]:
# Run only the input-mapping step to inspect the values that will fill the
# prompt's {query} and {knowledge_base} placeholders.
inputs.invoke({"query": query})

In [None]:
# Parser placed at the end of the chain to turn the model output into a string.
output_parser = StrOutputParser()

In [None]:
# Full pipeline: build prompt variables -> fill the prompt -> call the chat
# model -> parse the output.
chain = inputs | prompt_template | model | output_parser

In [None]:
async for t in chain.astream({"query": query}):
    print(t, end="", flush=True)
    time.sleep(0.1)