# Development of the LlamaIndex model
<hr>

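- This model is built with LangChain (Chroma vector store plus OpenAI embedding and chat models); despite the title, LlamaIndex itself is not used in this notebook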

### Import the libraries

In [29]:
from langchain_chroma  import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader  # Text loader
from langchain.document_loaders import PyPDFLoader  # PDF loader
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

### Step 1. Set up the environment

- Load the environment variables from the `.env` file using `dotenv`

In [1]:
# Standard library: used by the later cells for path handling.
import os

# Load the variables defined in the local .env file (presumably the OpenAI
# API key used by the OpenAI clients further down — confirm against your .env).
# The value displayed under this cell is load_dotenv()'s return value.
from dotenv import load_dotenv
load_dotenv()

True

### Step 2. Set up the directory

### 1. Define the directory

In Jupyter, `__file__` is not defined, so I use the current working directory:

```os.getcwd()```

In a Python file, replace it with:

```current_dir = os.path.dirname(os.path.abspath(__file__))```

In [6]:
# Anchor every path on the directory the notebook kernel is running in.
current_dir = os.getcwd()

# The Chroma store is persisted under <current_dir>/db/chroma_db.
db_root = os.path.join(current_dir, "db")
persistent_directory = os.path.join(db_root, "chroma_db")

### Step 3. Chunk the data, embed it, and store it in the database

### 1 Find the data source

- Here we use the file from ./data/Haida bracelet.pdf

In [5]:
# Source document to index: <current_dir>/data/Haida bracelet.pdf
data_dir = os.path.join(current_dir, "data")
file_path = os.path.join(data_dir, "Haida bracelet.pdf")

### Chunk the data and save to the vector database

- the ```chunk_size``` and ```chunk_overlap``` can be modified
- the model here can also be changed later ```model="text-embedding-3-small"```

In [12]:
# Build the Chroma vector store only when no persisted copy exists on disk;
# otherwise leave the existing store untouched.
if not os.path.exists(persistent_directory):
    print("Persistent directory does not exist. Initializing vector store...")

    # Fail early with a clear message if the source PDF is missing
    if not os.path.exists(file_path):
        raise FileNotFoundError(
            f"The file {file_path} does not exist. Please check the path."
        )

    # Load the PDF into LangChain documents
    loader = PyPDFLoader(file_path)
    documents = loader.load()

    # Split the documents into chunks (chunk_size / chunk_overlap are tunable)
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs = text_splitter.split_documents(documents)

    # Stop here if nothing was extracted (e.g. an image-only PDF): the sample
    # print below indexes docs[0] and would otherwise fail with a bare
    # IndexError, and there would be nothing to index anyway.
    if not docs:
        raise ValueError(
            f"No text chunks were produced from {file_path}; nothing to index."
        )

    # Display information about the split documents
    print("\n--- Document Chunks Information ---")
    print(f"Number of document chunks: {len(docs)}")
    print(f"Sample chunk:\n{docs[0].page_content}\n")

    # Configure the embedding model.
    # NOTE(review): this only constructs the embedding client; the vectors
    # themselves are presumably computed inside Chroma.from_documents below.
    print("\n--- Creating embeddings ---")
    embeddings = OpenAIEmbeddings(
        model="text-embedding-3-small"
    )  # Update to a valid embedding model if needed
    print("\n--- Finished creating embeddings ---")

    # Create the vector store from the chunks, persisted under persistent_directory
    print("\n--- Creating vector store ---")
    db = Chroma.from_documents(
        docs, embeddings, persist_directory=persistent_directory)
    print("\n--- Finished creating vector store ---")

else:
    # A store directory is already on disk; nothing is re-embedded.
    print("Vector store already exists. No need to initialize.")

Persistent directory does not exist. Initializing vector store...

--- Document Chunks Information ---
Number of document chunks: 2
Sample chunk:
Visit
Plan Your Visit (http://royalbcmuseum.bc.ca/visit)
About (http://royalbcmuseum.bc.ca/about)
Support Us (http://royalbcmuseum.bc.ca/support)
Contact Us (http://royalbcmuseum.bc.ca/contact)
Tickets (http://royalbcmuseum.bc.ca/tickets)
What's On
Exhibitions (http://royalbcmuseum.bc.ca/exhibitions)
Imax® (http://royalbcmuseum.bc.ca/imax)
Calendar (http://royalbcmuseum.bc.ca/calendar)
Members Events (http://royalbcmuseum.bc.ca/members-events)
RBCM Channel (http://royalbcmuseum.bc.ca/rbcm-channel)
Natural History
Collections (http://royalbcmuseum.bc.ca/nh-collections)
Curators (http://royalbcmuseum.bc.ca/nh-curators)
Search Collection (http://search-collections.royalbcmuseum.bc.ca/KeywordNaturalHistory)
Collections Care (http://royalbcmuseum.bc.ca/conservation)
Research (http://royalbcmuseum.bc.ca/research)
Human History
Collections (http://r

### Step 4 Retrieve the data and give response

### 1 Define the embedding model

- We use the OpenAI embedding model by default

In [13]:

# Query-time embedding client; uses the same model name as the indexing cell
# so that query vectors are comparable with the stored ones.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
)

### 2 Load the existing vector store

In [16]:
# Re-open the persisted Chroma store, using `embeddings` to embed incoming queries.
db = Chroma(
    embedding_function=embeddings,
    persist_directory=persistent_directory,
)

### 3 Define the query question

In [17]:
# Three test questions. query_1 appears to be an out-of-scope control question
# (the indexed PDF is about the Haida bracelet, not an Airbnb).
query_1, query_2, query_3 = (
    "What is the check-in time for the Airbnb?",
    "Who created the Haida Bracelet?",
    "Where's the Haida Bracelet?",
)

### 4 Retrieve the documents based on the query

- the ```search_type``` can be modified
- the ```search_kwargs``` can be modified

In [21]:
# Retriever that asks for up to 3 chunks and keeps only those whose relevance
# score reaches the 0.1 threshold.
retriever = db.as_retriever(
    search_kwargs=dict(k=3, score_threshold=0.1),
    search_type="similarity_score_threshold",
)

### 5 Get the relevant doc

In [22]:
# Run each question through the retriever, one call per query, in order.
fetch_docs = retriever.invoke

relavant_docs_1 = fetch_docs(query_1)
relavant_docs_2 = fetch_docs(query_2)
relavant_docs_3 = fetch_docs(query_3)

Number of requested results 3 is greater than number of elements in index 2, updating n_results = 2
  self.vectorstore.similarity_search_with_relevance_scores(
No relevant docs were retrieved using the relevance score threshold 0.1
Number of requested results 3 is greater than number of elements in index 2, updating n_results = 2
Number of requested results 3 is greater than number of elements in index 2, updating n_results = 2


### 6 Print the result

In [23]:
# Show what was retrieved for query_1: chunk text, then its source if metadata is present.
print("\n--- Relevant Documents ---")
for rank, hit in enumerate(relavant_docs_1, start=1):
    print(f"Document {rank}:\n{hit.page_content}\n")
    if not hit.metadata:
        continue
    print(f"Source: {hit.metadata.get('source', 'Unknown')}\n")


--- Relevant Documents ---


In [24]:
# Show what was retrieved for query_2: chunk text, then its source if metadata is present.
print("\n--- Relevant Documents ---")
for rank, hit in enumerate(relavant_docs_2, start=1):
    print(f"Document {rank}:\n{hit.page_content}\n")
    if not hit.metadata:
        continue
    print(f"Source: {hit.metadata.get('source', 'Unknown')}\n")


--- Relevant Documents ---
Document 1:
Visit
Plan Your Visit (http://royalbcmuseum.bc.ca/visit)
About (http://royalbcmuseum.bc.ca/about)
Support Us (http://royalbcmuseum.bc.ca/support)
Contact Us (http://royalbcmuseum.bc.ca/contact)
Tickets (http://royalbcmuseum.bc.ca/tickets)
What's On
Exhibitions (http://royalbcmuseum.bc.ca/exhibitions)
Imax® (http://royalbcmuseum.bc.ca/imax)
Calendar (http://royalbcmuseum.bc.ca/calendar)
Members Events (http://royalbcmuseum.bc.ca/members-events)
RBCM Channel (http://royalbcmuseum.bc.ca/rbcm-channel)
Natural History
Collections (http://royalbcmuseum.bc.ca/nh-collections)
Curators (http://royalbcmuseum.bc.ca/nh-curators)
Search Collection (http://search-collections.royalbcmuseum.bc.ca/KeywordNaturalHistory)
Collections Care (http://royalbcmuseum.bc.ca/conservation)
Research (http://royalbcmuseum.bc.ca/research)
Human History
Collections (http://royalbcmuseum.bc.ca/hh-collections)
Curators (http://royalbcmuseum.bc.ca/hh-curators)
Search Collection (ht

In [25]:
# Show what was retrieved for query_3: chunk text, then its source if metadata is present.
print("\n--- Relevant Documents ---")
for rank, hit in enumerate(relavant_docs_3, start=1):
    print(f"Document {rank}:\n{hit.page_content}\n")
    if not hit.metadata:
        continue
    print(f"Source: {hit.metadata.get('source', 'Unknown')}\n")


--- Relevant Documents ---
Document 1:
Visit
Plan Your Visit (http://royalbcmuseum.bc.ca/visit)
About (http://royalbcmuseum.bc.ca/about)
Support Us (http://royalbcmuseum.bc.ca/support)
Contact Us (http://royalbcmuseum.bc.ca/contact)
Tickets (http://royalbcmuseum.bc.ca/tickets)
What's On
Exhibitions (http://royalbcmuseum.bc.ca/exhibitions)
Imax® (http://royalbcmuseum.bc.ca/imax)
Calendar (http://royalbcmuseum.bc.ca/calendar)
Members Events (http://royalbcmuseum.bc.ca/members-events)
RBCM Channel (http://royalbcmuseum.bc.ca/rbcm-channel)
Natural History
Collections (http://royalbcmuseum.bc.ca/nh-collections)
Curators (http://royalbcmuseum.bc.ca/nh-curators)
Search Collection (http://search-collections.royalbcmuseum.bc.ca/KeywordNaturalHistory)
Collections Care (http://royalbcmuseum.bc.ca/conservation)
Research (http://royalbcmuseum.bc.ca/research)
Human History
Collections (http://royalbcmuseum.bc.ca/hh-collections)
Curators (http://royalbcmuseum.bc.ca/hh-curators)
Search Collection (ht

### Step 5 Get the response from the LLM

### 1 Combine the query and the relevant document content

- This acts as the prompt: the question plus the relevant document content

In [26]:
# Assemble the prompt for query_1: intro + question, the retrieved chunk texts
# separated by blank lines, then the grounding instruction.
context_1 = "\n\n".join(doc.page_content for doc in relavant_docs_1)
combined_input_1 = "".join(
    [
        "Here are some documents that might help answer the question: ",
        query_1,
        "\n\nRelevant Documents:\n",
        context_1,
        "\n\nPlease provide an answer based only on the provided documents. If the answer is not found in the documents, respond with 'I'm not sure'.",
    ]
)

In [27]:
# Assemble the prompt for query_2: intro + question, the retrieved chunk texts
# separated by blank lines, then the grounding instruction.
context_2 = "\n\n".join(doc.page_content for doc in relavant_docs_2)
combined_input_2 = "".join(
    [
        "Here are some documents that might help answer the question: ",
        query_2,
        "\n\nRelevant Documents:\n",
        context_2,
        "\n\nPlease provide an answer based only on the provided documents. If the answer is not found in the documents, respond with 'I'm not sure'.",
    ]
)

In [28]:
# Assemble the prompt for query_3: intro + question, the retrieved chunk texts
# separated by blank lines, then the grounding instruction.
context_3 = "\n\n".join(doc.page_content for doc in relavant_docs_3)
combined_input_3 = "".join(
    [
        "Here are some documents that might help answer the question: ",
        query_3,
        "\n\nRelevant Documents:\n",
        context_3,
        "\n\nPlease provide an answer based only on the provided documents. If the answer is not found in the documents, respond with 'I'm not sure'.",
    ]
)

### 2 Create a ChatOpenAI model

- ```gpt-4o``` can be changed

In [30]:
# Chat model that answers the questions; change the model name here to try another one.
model = ChatOpenAI(
    model="gpt-4o",
)

### 3 Define the messages for the model

In [31]:
# Conversation for query_1: a generic system instruction followed by the assembled prompt.
system_message_1 = SystemMessage(content="You are a helpful assistant.")
human_message_1 = HumanMessage(content=combined_input_1)
messages_1 = [system_message_1, human_message_1]

In [32]:
# Conversation for query_2: a generic system instruction followed by the assembled prompt.
system_message_2 = SystemMessage(content="You are a helpful assistant.")
human_message_2 = HumanMessage(content=combined_input_2)
messages_2 = [system_message_2, human_message_2]

In [33]:
# Conversation for query_3: a generic system instruction followed by the assembled prompt.
system_message_3 = SystemMessage(content="You are a helpful assistant.")
human_message_3 = HumanMessage(content=combined_input_3)
messages_3 = [system_message_3, human_message_3]

### 4 Invoke the model with the messages

In [34]:
# Send each conversation to the chat model, one call per question, in order.
ask_model = model.invoke

result_1 = ask_model(messages_1)
result_2 = ask_model(messages_2)
result_3 = ask_model(messages_3)

<hr>

### The output of the responses.

- system answer

In [35]:
# Print the full message object returned for each question (content plus metadata).
for reply in (result_1, result_2, result_3):
    print("\n--- Chat Response ---")
    print(reply)


--- Chat Response ---
content="I'm not sure." additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 4, 'prompt_tokens': 69, 'total_tokens': 73, 'completion_tokens_details': {'reasoning_tokens': 0}}, 'model_name': 'gpt-4o-2024-05-13', 'system_fingerprint': 'fp_057232b607', 'finish_reason': 'stop', 'logprobs': None} id='run-b793ebaa-bbdd-4daa-9818-76004c7c76dc-0' usage_metadata={'input_tokens': 69, 'output_tokens': 4, 'total_tokens': 73}

--- Chat Response ---
content='The Haida Bracelet was created by Robert Davidson in 1980.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 14, 'prompt_tokens': 1209, 'total_tokens': 1223, 'completion_tokens_details': {'reasoning_tokens': 0}}, 'model_name': 'gpt-4o-2024-05-13', 'system_fingerprint': 'fp_a5d11b2ef2', 'finish_reason': 'stop', 'logprobs': None} id='run-d3a1304b-b639-4005-84fc-d23a7c3d9c0e-0' usage_metadata={'input_tokens': 1209, 'output_tokens': 14, 'total_to

In [None]:
# Print only the answer text of each response.
for reply in (result_1, result_2, result_3):
    print("\n--- Chat Response ---")
    print(reply.content)