In [9]:
import os
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

local_path = "case.pdf"

# Names shared by the "create" and "load" branches below. The collection name
# in particular must match: opening the persisted store without it selects
# Chroma's default collection, which is empty, so retrieval returns nothing.
EMBEDDING_MODEL = "nomic-embed-text"
COLLECTION_NAME = "local-rag"

# Local PDF file uploads.
# Fail with a clear exception instead of calling exit(): inside a notebook
# exit() shuts the kernel down rather than reporting what went wrong.
if not local_path or not os.path.isfile(local_path):
    raise FileNotFoundError(f"Upload a PDF file: {local_path!r} was not found")

loader = UnstructuredPDFLoader(file_path=local_path)
data = loader.load()

# Split the text into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = text_splitter.split_documents(data)

current_dir = os.getcwd()
persistent_directory = os.path.join(current_dir, "db", "chroma_db_of_case")

# Reuse the persisted Chroma database when present, otherwise embed the
# chunks and persist a new one.
if os.path.exists(persistent_directory):
    print("Chroma database already exists. Loading existing database...")
    vector_db = Chroma(
        collection_name=COLLECTION_NAME,
        persist_directory=persistent_directory,
        embedding_function=OllamaEmbeddings(model=EMBEDDING_MODEL),
    )
else:
    print("Chroma database does not exist. Creating a new one...")
    vector_db = Chroma.from_documents(
        documents=chunks,
        embedding=OllamaEmbeddings(model=EMBEDDING_MODEL, show_progress=True),
        collection_name=COLLECTION_NAME,
        persist_directory=persistent_directory,
    )




Chroma database does not exist. Creating a new one...


OllamaEmbeddings: 100%|██████████| 7/7 [00:14<00:00,  2.14s/it]


In [12]:
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

# Prompt that asks the model to rephrase the user's question several ways;
# the multi-query retriever searches the vector store with each rephrasing.
QUERY_PROMPT = PromptTemplate(
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
    input_variables=["question"],
)

# Answer prompt: the retrieved context and the original question are filled in.
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Chat model served locally by Ollama.
local_model = "llama3"
llm = ChatOllama(model=local_model)

# Top-3 similarity search wrapped in the multi-query retriever.
base_retriever = vector_db.as_retriever(search_kwargs={"k": 3})
retriever = MultiQueryRetriever.from_llm(base_retriever, llm, prompt=QUERY_PROMPT)

# The chain input (the question string) is fanned out to the retriever
# ("context") and passed through unchanged ("question").
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}
chain = rag_inputs | prompt | llm | StrOutputParser()


In [7]:
# Read one question from the user, then run it through the RAG chain; the
# answer string is the cell's displayed result.
user_question = input("")
chain.invoke(user_question)

OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.47s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.08s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.04s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.11s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.13s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.17s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.15s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.07s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.14s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.06s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.14s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.10s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.10s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.12s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.10s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [

'Based on the provided context, it appears that the authors of this paper (Attention Is All You Need) are presenting a new approach to natural language processing (NLP) called the "Transformer" model. Specifically, they are describing a sequence-to-sequence transduction model that replaces traditional recurrent neural network (RNN) layers with multi-headed self-attention mechanisms.'

In [None]:
def continual_chat():
    """Run an interactive question/answer loop against the module-level `chain`.

    Each line typed at the "You: " prompt is sent to `chain.invoke` and the
    answer is printed. Typing "exit" (any case, surrounding whitespace
    ignored) ends the loop.

    Returns:
        None. The conversation is accumulated in a local list of
        {"role", "content"} dicts, which is not yet fed back into the chain.
    """
    print("Start chatting with the AI! Type 'exit' to end the conversation.")
    chat_history = []  # Collect chat history here (a sequence of messages)
    while True:
        query = input("You: ")
        if query.strip().lower() == "exit":
            break

        # The chain takes the raw question string: that single input is routed
        # to both the retriever and the "question" slot of the prompt.
        # Its last step is StrOutputParser, so the result is already a plain
        # string and must not be indexed with ['result'].
        answer = chain.invoke(query)

        # Display the AI's response
        print(f"AI: {answer}")

        # Update the chat history
        chat_history.append({"role": "user", "content": query})
        chat_history.append({"role": "assistant", "content": answer})

# Entry point: start the interactive chat loop, but only when this code runs
# as the top-level module (`__name__` is "__main__").
if __name__ == "__main__":
    continual_chat()

In [5]:
def continual_chat():
    """Run an interactive, timed question/answer loop against the module-level `chain`.

    Each line typed at the "You: " prompt is sent to `chain.invoke`; the
    elapsed time of that call and the answer are printed. Typing "exit"
    (any case, surrounding whitespace ignored) ends the loop.

    Returns:
        None. The conversation is accumulated in a local list of
        {"role", "content"} dicts, which is not yet fed back into the chain.
    """
    # Imported once per call rather than on every loop iteration.
    import time

    print("Start chatting with the AI! Type 'exit' to end the conversation.")
    chat_history = []  # Collect chat history here (a sequence of messages)
    while True:
        query = input("You: ")
        if query.strip().lower() == "exit":
            break

        # perf_counter is monotonic, so the measured interval cannot be
        # skewed by wall-clock adjustments.
        start_time = time.perf_counter()

        # The chain takes the raw question string: that single input is routed
        # to both the retriever and the "question" slot of the prompt. Passing
        # a dict would put the dict's repr into the queries and the prompt.
        result = chain.invoke(query)

        elapsed = time.perf_counter() - start_time
        print(f"Processing time: {elapsed:.2f} seconds")

        # Display the AI's response
        print(f"AI: {result}")

        # Update the chat history
        chat_history.append({"role": "user", "content": query})
        chat_history.append({"role": "assistant", "content": result})

# Entry point: start the interactive chat loop, but only when this code runs
# as the top-level module (`__name__` is "__main__").
if __name__ == "__main__":
    continual_chat()

Start chatting with the AI! Type 'exit' to end the conversation.


OllamaEmbeddings: 100%|██████████| 1/1 [00:04<00:00,  4.67s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.06s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.06s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.08s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.14s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.13s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.13s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.12s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.13s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.12s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.13s/it]


Processing time: 75.95 seconds
AI: Based on the given context, I would answer that this is about a paper titled "Attention Is All You Need" published in NIPS 2017, which presents a novel sequence-to-sequence model called the Transformer that relies entirely on self-attention mechanisms and achieves state-of-the-art results on several machine translation tasks.


OllamaEmbeddings: 100%|██████████| 1/1 [00:05<00:00,  5.29s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.06s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.07s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.09s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.13s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.12s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.12s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.12s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.14s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.12s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.13s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.11s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.16s/it]


Processing time: 142.11 seconds
AI: Based on the provided context, I will attempt to explain the concept of "Scaled Dot-Product Attention".

In the Transformer model, Scaled Dot-Product Attention is a mechanism used in both the encoder and decoder layers. It allows each position in the input sequence (or output sequence) to attend to all positions in the same sequence.

The attention process involves computing dot products between query vectors and key vectors, dividing by the square root of the key vector's dimension, and then applying a softmax function to obtain the weights on the values. This is illustrated in Figure 2 (left).

In other words, Scaled Dot-Product Attention allows each position in the input sequence to consider all positions in that sequence as potential references when generating its output. This mechanism mimics human-like attention patterns, where we tend to focus on certain parts of a sentence or text more than others.

The Transformer model uses this mechanism i

In [2]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

# Load environment variables from .env
load_dotenv()

# Define paths
case_file_path = "case.pdf"  # Path to the case file

# Names shared by the "create" and "load" branches below. The collection name
# in particular must match: opening the persisted store without it selects
# Chroma's default collection, which is empty, so retrieval returns nothing.
EMBEDDING_MODEL = "nomic-embed-text"
COLLECTION_NAME = "local-rag"

# Load and process the case file.
# Fail with a clear exception instead of calling exit(): inside a notebook
# exit() shuts the kernel down rather than reporting what went wrong.
if not case_file_path or not os.path.isfile(case_file_path):
    raise FileNotFoundError(f"Upload a case file: {case_file_path!r} was not found")

loader = UnstructuredPDFLoader(case_file_path)
data = loader.load()

# Split the text into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = text_splitter.split_documents(data)

current_dir = os.getcwd()
persistent_directory = os.path.join(current_dir, "db", "chroma_db_for_case2")

# Reuse the persisted Chroma database when present, otherwise embed the
# chunks and persist a new one.
if os.path.exists(persistent_directory):
    print("Chroma database already exists. Loading existing database...")
    vector_db = Chroma(
        collection_name=COLLECTION_NAME,
        persist_directory=persistent_directory,
        embedding_function=OllamaEmbeddings(model=EMBEDDING_MODEL),
    )
else:
    print("Chroma database does not exist. Creating a new one...")
    vector_db = Chroma.from_documents(
        documents=chunks,
        embedding=OllamaEmbeddings(model=EMBEDDING_MODEL, show_progress=True),
        collection_name=COLLECTION_NAME,
        persist_directory=persistent_directory,
    )

# LLM from Ollama
local_model = "llama3"
llm = ChatOllama(model=local_model)

# Query prompt for generating multiple questions.
# MultiQueryRetriever splits the model's reply on newlines and runs every line
# as a separate vector-store search, so this prompt must ask for rephrased
# questions only. The instruction to give legal advice belongs in the answer
# prompt further down, not here.
LEGAL_QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are a legal assistant helping clients represent themselves in legal matters. Your task is to generate five
    different versions of the given user question about their case, to retrieve relevant passages of the case file from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines, with no other text.
    Original question: {question}"""
)

# MultiQueryRetriever setup
retriever = MultiQueryRetriever.from_llm(
    vector_db.as_retriever(),
    llm,
    prompt=LEGAL_QUERY_PROMPT
)

# RAG prompt template: this is where the legal-assistant persona applies, since
# this prompt (not the query-generation one) produces the answer the user sees.
template = """You are a legal assistant helping clients represent themselves in legal matters. Provide clear, actionable advice that aligns with the legal information available and is understandable for someone without legal expertise.
Answer the question based ONLY on the following context:
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

# Define the chain: the input question string is sent to the retriever
# ("context") and passed through unchanged ("question"); the final step
# converts the model reply to a plain string.
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

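# Function to simulate a continual chat (disabled draft, kept for reference only).
# NOTE: `chain` ends in StrOutputParser and returns a plain string, so the
# `result['result']` lookup below would raise and always fall into the except branch.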
# def continual_chat():
#     print("Start chatting with the AI! Type 'exit' to end the conversation.")
#     chat_history = []
#     while True:
#         query = input("You: ")
#         if query.lower() == "exit":
#             break
        
        
#         try:
#             result = chain.invoke({"question": query})
#             answer = result['result']
#         except Exception as e:
#             answer = f"An error occurred: {e}"
        

#         print(f"AI: {answer}")
        
#         chat_history.append({"role": "user", "content": query})
#         chat_history.append({"role": "assistant", "content": answer})

# # Main function to start the continual chat
# if __name__ == "__main__":
#     continual_chat()

Chroma database already exists. Loading existing database...


In [4]:
# Ask the case-file chain a single question; the answer string is the cell's
# displayed result.
defence_question = "what are the ways in which i can defend myself in court"
chain.invoke(defence_question)