# Installing packages

In [None]:
!pip install -q langchain langchain-community langchain-openai chromadb python-dotenv


## Importing packages


In [None]:
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate
import openai
import os
import shutil
import argparse

## Connect to the OpenAI GPT API
To connect to the OpenAI GPT API, we used [AvalAI](https://avalai.ir/), an OpenAI-compatible gateway. After signing in, we generated an API key specifically for this project.

In [None]:
from langchain_openai import ChatOpenAI
from langchain_community.callbacks import get_openai_callback

# Minimal conversation used only to check that the API connection works.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello world!"},
]

model_name = "gpt-4o-mini"  # chat model requested from the gateway

# Never embed the API key in the notebook: read it from the environment so it
# cannot leak when the notebook is shared or committed.
avalai_api_key = os.environ.get("AVALAI_API_KEY")
if not avalai_api_key:
    raise RuntimeError(
        "Set the AVALAI_API_KEY environment variable to your AvalAI API key "
        "before running this cell."
    )

# `llm` is reused later by query() to answer questions.
llm = ChatOpenAI(
    model=model_name,
    base_url="https://api.avalai.ir/v1",
    temperature=0,
    max_tokens=None,
    max_retries=0,
    api_key=avalai_api_key,
)

# Send one request to test the connection and report the token usage.
with get_openai_callback() as cb:
    response = llm.invoke(messages)
    print(response)
    print(cb)

content='Hello! How can I assist you today?' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 10, 'prompt_tokens': 20, 'total_tokens': 30, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0, 'text_tokens': None, 'image_tokens': None}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_efad92c60b', 'id': 'chatcmpl-BzxNhmuRhKuawWZ8IuKwpCE1chqrT', 'service_tier': None, 'finish_reason': 'stop', 'logprobs': None} id='run--8e971a33-f561-4f92-a786-7ca5450810c5-0' usage_metadata={'input_tokens': 20, 'output_tokens': 10, 'total_tokens': 30, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}
Tokens Used: 30
	Prompt Tokens: 20
		Prompt Tokens Cached: 0
	Completion Tokens: 10
		Reasoning Tokens: 0
Successful Requests: 1
Total Cost (US

In [None]:
# Directory reserved for the Chroma database; save_to_chroma() deletes and
# recreates this directory on every call.
CHROMA_PATH = "/content/chroma"

# Data preprocessing

In [None]:
def load_documents(Data):
    """Load the text file at ``Data`` with ``TextLoader``.

    Args:
        Data: Path of the text/markdown file to read.

    Returns:
        Whatever ``TextLoader.load()`` produces for that file; the result is
        passed to ``split_text`` as a list of ``Document`` objects.
    """
    return TextLoader(Data).load()


In [None]:
def split_text(documents: list[Document], chunk_size: int = 500, chunk_overlap: int = 100):
    """Split documents into smaller, overlapping chunks.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length in characters (measured with ``len``).
        chunk_overlap: Number of characters each chunk shares with the
            previous one.

    Returns:
        The list of chunks. ``add_start_index=True`` makes the splitter record
        each chunk's offset in its metadata under ``start_index``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    # Print one chunk as a sanity check. Index 10 is used when it exists;
    # shorter inputs fall back to the last chunk instead of raising IndexError,
    # and an empty result prints no preview at all.
    if chunks:
        preview = chunks[min(10, len(chunks) - 1)]
        print(preview.page_content)
        print(preview.metadata)

    return chunks


## Create and save the Chroma database





In [None]:
from chromadb.config import Settings
import time # Import time

def save_to_chroma(chunks: list[Document]):
    """Embed ``chunks`` and index them in a fresh Chroma collection.

    The API key is read from the ``AVALAI_API_KEY`` environment variable so it
    never has to be written into the notebook.

    Args:
        chunks: Document chunks to embed and index.

    Returns:
        The ``Chroma`` vector store built from ``chunks``.

    Raises:
        RuntimeError: If ``AVALAI_API_KEY`` is not set.
    """
    # Fail before touching the file system if the credentials are missing.
    api_key = os.environ.get("AVALAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Set the AVALAI_API_KEY environment variable to your AvalAI API key "
            "before building the index."
        )

    # Start from an empty directory: remove it if it already exists...
    if os.path.exists(CHROMA_PATH):
        print(f"Removing existing directory: {CHROMA_PATH}")
        shutil.rmtree(CHROMA_PATH)
        time.sleep(1)  # small delay after removal

    # ...and (re)create it.
    print(f"Creating directory: {CHROMA_PATH}")
    os.makedirs(CHROMA_PATH, exist_ok=True)

    embeddings = OpenAIEmbeddings(
        api_key=api_key,
        base_url="https://api.avalai.ir/v1",
    )

    # Without a persist directory the collection lives in the process memory,
    # so wiping CHROMA_PATH does not empty it. Drop any collection left over
    # from an earlier call in this kernel; otherwise re-running the cell adds
    # the same chunks again and searches return duplicated results.
    Chroma(embedding_function=embeddings).delete_collection()

    # Create a new Chroma collection from the documents.
    db = Chroma.from_documents(
        chunks,
        embeddings,
        # persist_directory=CHROMA_PATH
    )

    # persist_directory is disabled above, so nothing is written to disk:
    # report what actually happened instead of claiming the chunks were saved.
    print(
        f"Indexed {len(chunks)} chunks in an in-memory Chroma collection "
        f"(not persisted to {CHROMA_PATH})."
    )
    return db

In [None]:
documents = load_documents("your_document.md")
chunks = split_text(documents)
# Keep the returned vector store: query() reads the module-level name `db`,
# so discarding the return value makes query() fail with NameError.
db = save_to_chroma(chunks)
db

Split 1 documents into 435 chunks.
All the animals were now present except Moses, the tame raven, who slept
on a perch behind the back door. When Major saw that they had all made
themselves comfortable and were waiting attentively, he cleared his throat and
began:
`Comrades, you have heard already about the strange dream that I had last
night. But I will come to the dream later. I have something else to say rst. I
do not think, comrades, that I shall be with you for many months longer, and
{'source': 'animal farm.md', 'start_index': 3947}
Removing existing directory: /content/chroma
Creating directory: /content/chroma
Saved 435 chunks to /content/chroma.


<langchain_community.vectorstores.chroma.Chroma at 0x794dd3bc9790>

# How to use the model

In [None]:
# Prompt template sent to the chat model: query() fills `{context}` with the
# retrieved chunks and `{question}` with the user's question.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

In [None]:
def query(question=None, k: int = 2, min_relevance: float = 0.7):
    """Answer a question from the indexed chunks and print the result.

    Reads the module-level vector store ``db`` (the object returned by
    ``save_to_chroma``) and the chat model ``llm``.

    Args:
        question: Question to answer. When ``None`` the user is prompted for
            one on standard input.
        k: Number of chunks to retrieve for the question.
        min_relevance: Minimum relevance score the best match must reach;
            below it no answer is generated.

    Returns:
        None. The prompt, the answer and its sources are printed.
    """
    query_text = question if question is not None else input("Ask your question: ")
    if not query_text.strip():
        # Nothing to search for: avoid sending an empty string to the
        # embedding service.
        print("Please enter a question.")
        return

    # Similarity search with relevance scores using the user's question.
    results = db.similarity_search_with_relevance_scores(query_text, k=k)

    if len(results) == 0 or results[0][1] < min_relevance:
        print("Unable to find matching results.")
        return

    # The same chunk can be returned more than once (for example when the
    # index was populated twice); keep only its first occurrence so the
    # context is not padded with repeated text.
    unique_results = []
    seen = set()
    for doc, score in results:
        key = (doc.metadata.get("source"), doc.metadata.get("start_index"), doc.page_content)
        if key not in seen:
            seen.add(key)
            unique_results.append((doc, score))

    context_text = "\n\n---\n\n".join(doc.page_content for doc, _ in unique_results)
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)
    print("Prompt:\n", prompt)

    # Use the existing llm object and its invoke method.
    response = llm.invoke(prompt)
    response_text = response.content

    sources = [doc.metadata.get("source", None) for doc, _ in unique_results]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(formatted_response)


In [None]:
# Prompts for a question on standard input, then prints the prompt sent to
# the model, the model's answer and the sources of the retrieved chunks.
query()

Ask your question: is there any horses ?
Prompt:
 Human: 
Answer the question based only on the following context:

hairy hoofs with great care lest there should be some small animal concealed in
the straw. Clover was a stout motherly mare approaching middle life, who had
never quite got her gure back after her fourth foal. Boxer was an enormous
beast, nearly eighteen hands high, and as strong as any two ordinary horses put
together. A white stripe down his nose gave him a somewhat stupid appearance,
and in fact he was not of rst-rate intelligence, but he was universally respected
for his steadiness of character and tremendous powers of work. After the horses
came Muriel, the white goat, and Benjamin, the donkey. Benjamin was the
oldest animal on the farm, and the worst tempered. He seldom talked, and
when he did, it was usually to make some cynical remark | for instance, he

---

hairy hoofs with great care lest there should be some small animal concealed in
the straw. Clover was a 