A RAG setup is best suited for Q&A, and possibly for summarization.

Needs some training data for post creation

# Import Libraries

In [2]:
from dataclasses import dataclass
import ollama
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
import os
import shutil

# Creating a Vector Database with Chroma

In [18]:
def load_documents(data_path):
    """Load every ``*.txt`` file found directly under ``data_path``.

    Returns whatever ``DirectoryLoader.load()`` produces for that directory.
    """
    return DirectoryLoader(data_path, glob="*.txt").load()

def save_to_chroma(
    chunks: list[Document],
    chroma_path,
    model_name="sentence-transformers/all-MiniLM-L6-v2",
):
    """Rebuild the Chroma store at ``chroma_path`` from ``chunks``.

    Any existing directory at ``chroma_path`` is deleted first, so every call
    starts from an empty store instead of appending to a previous run.

    Args:
        chunks: Documents to embed; each one becomes one stored vector.
        chroma_path: Directory in which the store is persisted.
        model_name: HuggingFace embedding model used to embed ``chunks``.
            The default matches the model that was previously hard-coded, so
            existing two-argument calls behave exactly as before.

    Returns:
        None. A one-line summary is printed instead.
    """
    # Clear out the database first.
    if os.path.exists(chroma_path):
        shutil.rmtree(chroma_path)

    # Create a new DB from the documents.
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    db = Chroma.from_documents(chunks, embeddings, persist_directory=chroma_path)
    db.persist()
    print(f"Saved {len(chunks)} chunks to {chroma_path}.")

In [19]:
# Document-level store: each vector is one whole document/text file.
# The loaded documents are passed to save_to_chroma without any splitting.
documents = load_documents("data/gen-ai-topic1-corpus")
save_to_chroma(documents, "chroma/document-level")

Saved 20 chunks to chroma/document-level.


  warn_deprecated(


In [None]:
# Dump every loaded document together with its metadata for manual inspection.
for document in documents:
    print(
        "CONTENT:",
        document.page_content,
        "META DATA:",
        document.metadata,
        "----------",
        sep="\n",
    )

In [21]:
def split_documents_into_chunks_text_splitter(documents, chunk_size=300, chunk_overlap=100):
    """Split ``documents`` into overlapping chunks with a recursive character splitter.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len``. Defaults to
            the previously hard-coded value of 300.
        chunk_overlap: Overlap between consecutive chunks. Defaults to the
            previously hard-coded value of 100.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
        ``add_start_index=True`` is passed so the splitter records a
        ``start_index`` entry in each chunk's metadata.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,
    )
    return text_splitter.split_documents(documents)

In [22]:
# Text-splitter store: each vector is one chunk of a document,
# as produced by split_documents_into_chunks_text_splitter.
chunks = split_documents_into_chunks_text_splitter(documents)
save_to_chroma(chunks, "chroma/text-splitter-chunks")

Saved 413 chunks to chroma/text-splitter-chunks.


In [None]:
# Print each text-splitter chunk (numbered from 1) with its metadata.
for number, chunk in enumerate(chunks, start=1):
    print(f"Chunk {number}:")
    for label, value in (("Text:", chunk.page_content), ("Metadata:", chunk.metadata)):
        print(label, value)
    print("----------")

Each vector is one line/paragraph of a document,
with lines separated by line breaks.

Ex. 
"Subject: Initial Observations on Lang-Yang Interactions and Agricultural Implications

Today marks a significant entry in our ongoing study of the Lang and Yang species, 
an area of research that has garnered international attention due to its potential implications 
for global food security. Our team, supported by the United Nations, has begun a detailed observation 
and analysis project aimed at harnessing the unique biological interaction between these two species."

Subject to Implications is one vector

Today to species is another vector

In [23]:
def split_documents_into_chunks_line_breaks(documents):
    """Turn every non-blank line of each document into its own ``Document``.

    The page content is stripped of surrounding whitespace and split on
    ``"\\n"``. Lines that are empty or whitespace-only are skipped. Each
    remaining line keeps a copy of the parent document's metadata.

    Note that ``start_index`` in the resulting metadata is a *line position*:
    the parent's ``start_index`` (0 when absent) plus the line's position in
    the split content. Blank lines are counted even though they produce no
    chunk.

    Returns:
        A flat list of line-level ``Document`` objects, in document order.
    """
    line_chunks = []
    for source_doc in documents:
        lines = source_doc.page_content.strip().split("\n")
        first_position = source_doc.metadata.get("start_index", 0)
        for offset, text in enumerate(lines):
            if not text.strip():
                # Blank line: nothing to embed, but it still advances the position.
                continue
            line_metadata = {**source_doc.metadata, "start_index": first_position + offset}
            line_chunks.append(Document(page_content=text, metadata=line_metadata))
    return line_chunks

In [24]:
# Line-break store: each vector is one non-blank line of a document,
# as produced by split_documents_into_chunks_line_breaks.
chunks2 = split_documents_into_chunks_line_breaks(documents)
save_to_chroma(chunks2, "chroma/line-breaks-chunks")

Saved 285 chunks to chroma/line-breaks-chunks.


In [None]:
# Print each line-break chunk (numbered from 1) with its metadata.
for number, chunk in enumerate(chunks2, start=1):
    print(f"Chunk {number}:")
    for label, value in (("Text:", chunk.page_content), ("Metadata:", chunk.metadata)):
        print(label, value)
    print("----------")

# Retrieval/ Search function

In [26]:
# Load the embeddings model (same model name that save_to_chroma uses when
# the stores are built).
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Open one persisted Chroma store per chunking strategy so their retrieval
# results can be compared side by side:
#   chroma/document-level       - one vector per whole document
#   chroma/text-splitter-chunks - one vector per text-splitter chunk
#   chroma/line-breaks-chunks   - one vector per non-blank line
db_document = Chroma(persist_directory="chroma/document-level", embedding_function=embeddings)
db_text_split = Chroma(persist_directory="chroma/text-splitter-chunks", embedding_function=embeddings)
db_line_break = Chroma(persist_directory="chroma/line-breaks-chunks", embedding_function=embeddings)

query_text = "How do Yangs help Langs"

# Embed the query with the query-specific API rather than wrapping it in a
# one-item document batch and indexing the result.
query_embedding = embeddings.embed_query(query_text)

print(query_text)

print(query_embedding)

How do Yangs help Langs
[-0.002012963406741619, -0.022931192070245743, 0.06480619311332703, 0.05140354111790657, -0.02116168662905693, 0.052078500390052795, 0.14356935024261475, -0.08193929493427277, 0.06342720985412598, 0.03167521208524704, 0.06066142022609711, 0.036513540893793106, -0.09282173961400986, 0.029660219326615334, 0.07330431789159775, 0.04293633624911308, -0.04948484152555466, 0.0734863430261612, -0.13842685520648956, -0.14102692902088165, -0.0240960530936718, -0.0658884197473526, 0.018534842878580093, -0.02528982423245907, -0.03334425017237663, -0.038661837577819824, -0.024774208664894104, 0.018557555973529816, 0.17964418232440948, 0.015585948713123798, -0.06541679054498672, 0.0734589695930481, -0.010754200629889965, 0.06605029106140137, -0.13237835466861725, 0.13045279681682587, 0.07058287411928177, -0.00813194178044796, -0.06092175096273422, -0.02867024764418602, -0.025022150948643684, 0.047541894018650055, 0.0033314062748104334, -0.005977778695523739, 0.005569484550505

In [30]:
# Document-level store: fetch the 5 best matches with their relevance scores.
results_docs = db_document.similarity_search_with_relevance_scores(query_text, k=5)

# Report each hit, numbered from 1.
for rank, (result, score) in enumerate(results_docs, start=1):
    print(f"Result {rank}:")
    for label, value in (
        ("Text:", result.page_content),
        ("Metadata:", result.metadata),
        ("Relevance Score:", score),
    ):
        print(label, value)
    print("----------")

Result 1:
Text: Journal Entry: March 5, 2021 Researcher: Dr. Elena M. Voss Location: Global Ecology Research Center, Geneva

Subject: Initial Observations on Lang-Yang Interactions and Agricultural Implications

Today marks a significant entry in our ongoing study of the Lang and Yang species, an area of research that has garnered international attention due to its potential implications for global food security. Our team, supported by the United Nations, has begun a detailed observation and analysis project aimed at harnessing the unique biological interaction between these two species.

During preliminary observations, we confirmed the long-suspected toxic effect of Yang flesh on Langs, a poignant reminder of nature’s complexities. Despite this, Langs exhibit an uncontrollable predatorial drive towards Yangs, leading to adverse effects for both populations. Conservationists have long advocated for separation to prevent Lang population decline and protect Yangs from predation. However

In [31]:
# Text-splitter store: fetch the 5 best matches with their relevance scores.
results_text_splits = db_text_split.similarity_search_with_relevance_scores(query_text, k=5)

# Report each hit, numbered from 1.
for rank, (result, score) in enumerate(results_text_splits, start=1):
    print(f"Result {rank}:")
    for label, value in (
        ("Text:", result.page_content),
        ("Metadata:", result.metadata),
        ("Relevance Score:", score),
    ):
        print(label, value)
    print("----------")

Result 1:
Text: of our forests. The Yang plays a crucial role in our ecosystem, particularly in its symbiotic relationship with the Lang and the Lamu plant, which is vital for our agricultural practices."
Metadata: {'source': 'data\\gen-ai-topic1-corpus\\h.txt', 'start_index': 1026}
Relevance Score: 0.5415165509485491
----------
Result 2:
Text: The ethical treatment of Langs and Yangs, especially in the context of using them for agricultural purposes, must be carefully considered to ensure their welfare and dignity.
Metadata: {'source': 'data\\gen-ai-topic1-corpus\\r.txt', 'start_index': 2116}
Relevance Score: 0.5104000384551381
----------
Result 3:
Text: Habitat and Distribution: Yangs are versatile and can thrive in a variety of environments but are predominantly found in mountainous regions and grassy plains where there is ample food supply. While originally native to Central Asia, their range has expanded due to their adaptability and the
Metadata: {'source': 'data\\gen-ai-topic1-c

In [33]:
# Line-break store: fetch the 5 best matches with their relevance scores.
results_line_breaks = db_line_break.similarity_search_with_relevance_scores(query_text, k=5)

# Report each hit, numbered from 1.
for rank, (result, score) in enumerate(results_line_breaks, start=1):
    print(f"Result {rank}:")
    for label, value in (
        ("Text:", result.page_content),
        ("Metadata:", result.metadata),
        ("Relevance Score:", score),
    ):
        print(label, value)
    print("----------")

Result 1:
Text: Subject: Initial Observations on Lang-Yang Interactions and Agricultural Implications
Metadata: {'source': 'data\\gen-ai-topic1-corpus\\a.txt', 'start_index': 2}
Relevance Score: 0.5376671949803522
----------
Result 2:
Text: Habitat and Distribution: Yangs are versatile and can thrive in a variety of environments but are predominantly found in mountainous regions and grassy plains where there is ample food supply. While originally native to Central Asia, their range has expanded due to their adaptability and the shifting environmental conditions.
Metadata: {'source': 'data\\gen-ai-topic1-corpus\\f.txt', 'start_index': 26}
Relevance Score: 0.5073289661915434
----------
Result 3:
Text: Conservation Status: Yangs are classified as a species of least concern but are monitored due to their unique ecological role and the effects of their interactions with the Lang. Conservation efforts are mainly directed towards habitat preservation and understanding the dynamics of their re

# Augment

In [34]:
# Prompt skeleton sent to the LLM. {context} is filled with the retrieved
# text and {question} with the user's query; the wording instructs the model
# to answer from the supplied context only.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

# Question inserted into the prompt (same text as the retrieval query).
query_text = "How do Yangs help Langs"

In [35]:
# Build the prompt from the document-level hits: retrieved texts are joined
# with a "---" separator and substituted into the template with the question.
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in results_docs)
prompt_docs = prompt_template.format(context=context_text, question=query_text)
print(prompt_docs)

Human: 
Answer the question based only on the following context:

Journal Entry: March 5, 2021 Researcher: Dr. Elena M. Voss Location: Global Ecology Research Center, Geneva

Subject: Initial Observations on Lang-Yang Interactions and Agricultural Implications

Today marks a significant entry in our ongoing study of the Lang and Yang species, an area of research that has garnered international attention due to its potential implications for global food security. Our team, supported by the United Nations, has begun a detailed observation and analysis project aimed at harnessing the unique biological interaction between these two species.

During preliminary observations, we confirmed the long-suspected toxic effect of Yang flesh on Langs, a poignant reminder of nature’s complexities. Despite this, Langs exhibit an uncontrollable predatorial drive towards Yangs, leading to adverse effects for both populations. Conservationists have long advocated for separation to prevent Lang population

In [36]:
# Build the prompt from the text-splitter hits: retrieved texts are joined
# with a "---" separator and substituted into the template with the question.
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in results_text_splits)
prompt_text_splits = prompt_template.format(context=context_text, question=query_text)
print(prompt_text_splits)

Human: 
Answer the question based only on the following context:

of our forests. The Yang plays a crucial role in our ecosystem, particularly in its symbiotic relationship with the Lang and the Lamu plant, which is vital for our agricultural practices."

---

The ethical treatment of Langs and Yangs, especially in the context of using them for agricultural purposes, must be carefully considered to ensure their welfare and dignity.

---

Habitat and Distribution: Yangs are versatile and can thrive in a variety of environments but are predominantly found in mountainous regions and grassy plains where there is ample food supply. While originally native to Central Asia, their range has expanded due to their adaptability and the

---

Cultural Significance: In cultural narratives, Yangs symbolize innocence and purity, often depicted as gentle and serene beings. They hold a significant place in local folklore and are sometimes believed to possess mystical properties due to their unique biol

In [37]:
# Build the prompt from the line-break hits: retrieved texts are joined
# with a "---" separator and substituted into the template with the question.
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in results_line_breaks)
prompt_line_breaks = prompt_template.format(context=context_text, question=query_text)
print(prompt_line_breaks)

Human: 
Answer the question based only on the following context:

Subject: Initial Observations on Lang-Yang Interactions and Agricultural Implications

---

Habitat and Distribution: Yangs are versatile and can thrive in a variety of environments but are predominantly found in mountainous regions and grassy plains where there is ample food supply. While originally native to Central Asia, their range has expanded due to their adaptability and the shifting environmental conditions.

---

Conservation Status: Yangs are classified as a species of least concern but are monitored due to their unique ecological role and the effects of their interactions with the Lang. Conservation efforts are mainly directed towards habitat preservation and understanding the dynamics of their relationship with Langs to ensure both species' sustainability.

---

Cultural Significance: In cultural narratives, Yangs symbolize innocence and purity, often depicted as gentle and serene beings. They hold a signific

# Generate

In [40]:
# Send the document-level prompt to the 'mistral' model as a single user message.
response = ollama.chat(
    model='mistral',
    messages=[{'role': 'user', 'content': prompt_docs}],
)
response_text = response['message']['content']

# List the source of every retrieved hit next to the answer (None if a hit has no source).
sources = [doc.metadata.get("source") for doc, _score in results_docs]
formatted_response = f"Response: {response_text}\nSources: {sources}"
print(formatted_response)

Response:  In the context provided, Yangs indirectly help Langs by participating in the symbiotic relationship with Langs and the Lamu plant. The urine of Langs enhances the nutritional value of the Lamu plant, which is consumed by Yangs to produce a powerful natural fertilizer. This fertilizer enriches the soil, promoting the growth of the Lamu plant, which in turn provides sustenance for the Langs. So, while Yangs do not directly help Langs, they contribute to maintaining the balance and health of the ecosystem that is essential for Langs' survival.
Sources: ['data\\gen-ai-topic1-corpus\\a.txt', 'data\\gen-ai-topic1-corpus\\f.txt', 'data\\gen-ai-topic1-corpus\\n.txt', 'data\\gen-ai-topic1-corpus\\r.txt', 'data\\gen-ai-topic1-corpus\\s.txt']


In [41]:
# Send the text-splitter prompt to the 'mistral' model as a single user message.
response = ollama.chat(
    model='mistral',
    messages=[{'role': 'user', 'content': prompt_text_splits}],
)
response_text = response['message']['content']

# List the source of every retrieved hit next to the answer (None if a hit has no source).
sources = [doc.metadata.get("source") for doc, _score in results_text_splits]
formatted_response = f"Response: {response_text}\nSources: {sources}"
print(formatted_response)

Response:  Based on the provided context, there is no explicit mention of how Yangs help Langs. However, it can be inferred that Yangs and Langs have a symbiotic relationship within the ecosystem, as stated at the beginning of the text. This implies that both species are interdependent and their wellbeing affects each other. The specific ecological interaction between them is not detailed, but it can be assumed that one or more aspects of the Yang's role in the forest (such as pollination, nutrient cycling, or habitat creation) positively impact the Lang and its environment.
Sources: ['data\\gen-ai-topic1-corpus\\h.txt', 'data\\gen-ai-topic1-corpus\\r.txt', 'data\\gen-ai-topic1-corpus\\f.txt', 'data\\gen-ai-topic1-corpus\\f.txt', 'data\\gen-ai-topic1-corpus\\a.txt']


In [42]:
# Send the line-break prompt to the 'mistral' model as a single user message.
response = ollama.chat(
    model='mistral',
    messages=[{'role': 'user', 'content': prompt_line_breaks}],
)
response_text = response['message']['content']

# List the source of every retrieved hit next to the answer (None if a hit has no source).
sources = [doc.metadata.get("source") for doc, _score in results_line_breaks]
formatted_response = f"Response: {response_text}\nSources: {sources}"
print(formatted_response)

Response:  Based on the provided context, there is no explicit information about how Yangs help or interact with the Lang species. The text primarily discusses the habitat, conservation status, cultural significance, social characteristics, and general behavior of Yangs without mentioning any specific interactions with the Lang.
Sources: ['data\\gen-ai-topic1-corpus\\a.txt', 'data\\gen-ai-topic1-corpus\\f.txt', 'data\\gen-ai-topic1-corpus\\f.txt', 'data\\gen-ai-topic1-corpus\\f.txt', 'data\\gen-ai-topic1-corpus\\f.txt']


# Things to look into


different LLM

different embedding models

different chunk sizes

Helpful Resources:
https://www.youtube.com/watch?v=tcqEUSNCn8I
https://github.com/RamiKrispin/ollama-poc