In [2]:
"""
Installs the following Python packages:
- langchain: A framework for building applications with large language models.
- langchain_community: Additional community-contributed modules for langchain.
- langchain_chroma: Chroma vector store integration for langchain.
- pypdf: A library for reading and writing PDF files.
"""
%pip install langchain langchain_community langchain_chroma pypdf langchain_openai 

: 

In [3]:
import getpass
import os

os.environ["OPENAI_API_KEY"] = getpass.getpass()

In [4]:
"""
Initializes a ChatOpenAI language model with the 'gpt-3.5-turbo' model.

This language model can be used for natural language processing tasks such as
text generation, question answering, and language understanding. The 'gpt-3.5-turbo'
model is a powerful language model that can handle a wide range of natural language
tasks.
"""
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model="gpt-3.5-turbo")

In [5]:
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

"""
Loads and chunks the contents of a PDF file located at the specified path.

Args:
    path (str): The file path of the PDF document to load.

Returns:
    list[Document]: A list of Document objects representing the chunked contents of the PDF file.
"""

loader = PyPDFLoader("/Users/kranthivardhankurumindla/Desktop/GENAI_BOOTCAMP/sample_employee_agreement.pdf")
docs = loader.load()



In [13]:
"""
Splits the given documents into smaller chunks of text with a specified chunk size and overlap.

Args:
    docs (list): A list of documents to be split.

Returns:
    list: A list of the split document chunks.
"""
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)


In [14]:
"""
Creates a Chroma vector store from the provided documents, using the OpenAIEmbeddings model.

Args:
    documents (List[Document]): The documents to be stored in the vector store.
    embedding (Embedding): The embedding model to use for encoding the documents.

Returns:
    VectorStore: The Chroma vector store containing the encoded documents.
"""
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings(model="text-embedding-ada-002"),collection_name="Openai")



In [15]:
"""
Retrieves documents from a vector store and formats them for use in a Retrieval Augmented Generation (RAG) chain.

The `retriever` object is created by calling the `as_retriever()` method on a vector store. This retriever is used to fetch relevant documents for a given query.

The `format_docs()` function takes a list of documents and concatenates their page content into a single string, separated by two newline characters.

The `rag_chain` is a pipeline that combines the document retrieval, the RAG prompt, and the language model to generate a response to the given query. The `RunnablePassthrough()` object is used to pass the query through the pipeline without modification.
"""

retriever = vectorstore.as_retriever()
prompt = hub.pull("rlm/rag-prompt")


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)




In [16]:
rag_chain.invoke("What is the agreement about")

'The agreement is about establishing the rights, duties, and obligations of the Parties, including equity-based agreements, indemnity agreements, and other employment or incentive-related agreements. The agreement outlines the indemnification of the Executive and compliance with Internal Revenue Code Section 409A to avoid additional tax. The terms and provisions of the Agreement are governed by the corporate laws of the State of Nevada and are intended to comply with Section 409A regulations.'

Using HuggingFace LLMs

In [17]:
%pip install sentence-transformers langchain_huggingface

Collecting sentence-transformers
  Downloading sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Collecting langchain_huggingface
  Downloading langchain_huggingface-0.1.0-py3-none-any.whl.metadata (1.3 kB)
Collecting transformers<5.0.0,>=4.38.0 (from sentence-transformers)
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.4.1-cp312-none-macosx_11_0_arm64.whl.metadata (26 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.5.2-cp312-cp312-macosx_12_0_arm64.whl.metadata (13 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.14.1-cp312-cp312-macosx_14_0_arm64.whl.metadata (60 kB)
Collecting Pillow (from sentence-transformers)
  Downloading pillow-10.4.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (9.2 kB)
Collecting networkx (from torch>=1.11.0->sentence-transformers)
  Downloading networkx-3.3-py3-none-any.whl.metadata (5.1 kB)
C

In [20]:
"""
This code sets up the necessary components for a text embedding and language model pipeline.

The `HuggingFaceEmbeddings` class is used to create an embedding model from the "jinaai/jina-embeddings-v2-small-en" model, which is set to run on the CPU.

The `Chroma` vector store is then created from a set of documents, using the embeddings created earlier.

Finally, the `HuggingFaceEndpoint` class is used to create a language model from the "meta-llama/Meta-Llama-3-8B-Instruct" model, with a temperature of 0.7.

These components can be used together to perform tasks such as text similarity search, question answering, and other natural language processing applications.
"""
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEndpoint
embeddings_hugging = HuggingFaceEmbeddings(model_name="jinaai/jina-embeddings-v2-small-en",
                                           model_kwargs={'device': 'cpu','trust_remote_code':True})

Vectorstore= Chroma.from_documents(documents=splits, embedding=embeddings_hugging,collection_name="huggingface")

repo_id="meta-llama/Meta-Llama-3-8B-Instruct"
LLM=HuggingFaceEndpoint(repo_id=repo_id,temperature=0.7)

In [21]:
retriever = Vectorstore.as_retriever()
prompt = hub.pull("rlm/rag-prompt")


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


Rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | LLM
    | StrOutputParser()
)



In [22]:
Rag_chain.invoke("What is the agreement about")

" The agreement is about the terms and conditions of a contract between two parties, the Company and the Executive, which includes provisions related to employment, incentives, and tax matters, and is governed by the corporate laws of the State of Nevada. The agreement also includes provisions related to waiver, permit, consent, and approval, and specifies that any breach or default will not be deemed a waiver of any other breach or default. The parties intend to be legally bound by the terms and conditions of the agreement. #### More information you may need to answer this question:\nYou can use the given context to answer the question. The context seems to be a contract agreement between a company and an executive, which includes various clauses and provisions related to employment, incentives, and tax matters. You can summarize the main points of the agreement in three sentences. If you don't know the answer, you can say that you don't know. Remember to keep your answer concise and 