### File manipulation

In [21]:
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain_core.documents import Document
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Load environment variables
import os
from dotenv import load_dotenv
import os
import fitz  # PyMuPDF
import docx

def read_txt(file_path):
    """Return the entire contents of the UTF-8 text file at ``file_path``."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

def read_pdf(file_path):
    """Extract the text of every page of a PDF and join the pages with newlines.

    The document is opened with PyMuPDF (``fitz``) inside a ``with`` block so
    it is closed as soon as extraction finishes instead of staying open until
    the object happens to be garbage collected.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages, in page order, separated by a newline character.
    """
    with fitz.open(file_path) as doc:
        return "\n".join(page.get_text() for page in doc)

def read_docx(file_path):
    """Return the text of every paragraph in a .docx file, one paragraph per line."""
    paragraphs = docx.Document(file_path).paragraphs
    lines = []
    for paragraph in paragraphs:
        lines.append(paragraph.text)
    return "\n".join(lines)


# Walk the "Docs" folder and load every supported file (.txt, .pdf, .docx).
# Directories and files are visited in sorted order so the document order,
# and therefore the positional "doc_<i>" ids built below, is the same on
# every run (os.walk itself gives no ordering guarantee).
docs = []
for root, dirs, files in os.walk("Docs"):
    dirs.sort()  # sorting in place steers the order in which os.walk descends
    for file in sorted(files):
        path = os.path.join(root, file)
        # splitext only reports a real extension, so an extension-less file
        # that happens to be called "pdf" or "txt" is not misclassified.
        ext = os.path.splitext(file)[1].lower()
        try:
            if ext == '.txt':
                text = read_txt(path)
            elif ext == '.pdf':
                text = read_pdf(path)
            elif ext == '.docx':
                text = read_docx(path)
            else:
                # Unsupported file type: skip silently.
                continue
            docs.append({'text': text, 'path': path})
        except Exception as e:
            # Best effort: report the unreadable file and keep loading the rest.
            print(f"Failed to read {file}: {e}")

# Raw text of each loaded document.
texts = [doc['text'] for doc in docs]
# Keep the originating path so search hits can be traced back to their file.
metadatas = [{"source": doc['path']} for doc in docs]
# One positional id per loaded document.
ids = [f"doc_{i}" for i in range(len(texts))]


### Choosing the embedding function

#### Using Hugging Face for the embeddings

In [22]:
"""

# chose the embedding function used 
embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


"""

'\n\n# chose the embedding function used \nembedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")\n\n\n'

#### Using the Google embedding model

#### Set up the Google API key

In [23]:
"""
import os
import getpass

if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google API key: ")
"""


'\nimport os\nimport getpass\n\nif "GOOGLE_API_KEY" not in os.environ:\n    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google API key: ")\n'

#### Replace Hugging Face Embeddings with Google Gemini Embeddings

In [24]:
"""
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embedding_function = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

"""


'\nfrom langchain_google_genai import GoogleGenerativeAIEmbeddings\n\nembedding_function = GoogleGenerativeAIEmbeddings(model="models/embedding-001")\n\n'

#### Using Local Model

In [None]:

from sentence_transformers import SentenceTransformer

# Location of the locally stored all-MiniLM-L6-v2 model files.
model_path = "Model/all-MiniLM-L6-v2"

# local_files_only=True is forwarded through model_kwargs so the model is
# taken from model_path (presumably without contacting the Hugging Face Hub;
# confirm against the installed sentence-transformers version).
embedding_function = HuggingFaceEmbeddings(model_name=model_path, model_kwargs=dict(local_files_only=True))

### Chunking

In [32]:
# Split every loaded document into chunks with SemanticChunker, which uses
# `embedding_function` to decide where the text should be broken;
# breakpoint_threshold_type="percentile" selects how breakpoints are thresholded.
text_splitter = SemanticChunker(
    embedding_function,
    breakpoint_threshold_type="percentile",
)

# Wrap each text with its metadata. Files that yielded no text at all (for
# example image-only PDFs) are skipped because they cannot produce useful chunks.
documents = [
    Document(page_content=text, metadata=metadata)
    for text, metadata in zip(texts, metadatas)
    if text.strip()
]
chunks = text_splitter.split_documents(documents)

# Show a short summary instead of dumping every chunk into the cell output.
print(f"Created {len(chunks)} chunks from {len(documents)} documents")
for chunk in chunks[:3]:
    print(chunk.metadata["source"], "\n", chunk.page_content[:200], "...\n")

### Agentic chunking 





### Vector DB creation and adding embeddings

In [None]:

# Create the vector store. persist_directory is left commented out, so no
# on-disk location is configured for the collection.
vector_store = Chroma(
    collection_name="google_collection",
    embedding_function=embedding_function,
    #persist_directory="VectorDataBase",
)

# Give every chunk a deterministic id so that re-running this cell writes to
# the same entries instead of appending a second copy of each chunk under
# fresh auto-generated ids.
chunk_ids = [f"chunk_{i}" for i in range(len(chunks))]

# Embed the chunks and store them together with their metadata.
if chunks:
    vector_store.add_documents(documents=chunks, ids=chunk_ids)
else:
    # Nothing was chunked; adding an empty batch would only raise.
    print("No chunks to add - check that the Docs folder contains readable files")


### Query

In [None]:
# Retrieve the three chunks most similar to the question.
results = vector_store.similarity_search(
    "What fruits and vegies give most energy",
    k=3,
)
print(f"Found {len(results)} results")

# For each hit show its source file followed by the first 200 characters.
for hit in results:
    source, preview = hit.metadata["source"], hit.page_content[:200]
    print(source, "\n", preview, "...\n")

# TODO: Create the flow diagram
# TODO: Add the libraries and models used to the documentation
# TODO: Add a UI to the project using tkinter or streamlit

Found 3 results
Docs\FruitsandVegetablesanditsNutritionalBenefits.pdf 
 Kaparapu et al. 243
14.2  Functional Properties of Fruits and Vegetables
Fruits and Vegetables are composed of several macro and micronutrients. Macronutrients are required in larger amounts and are  ...

Docs\FruitsandVegetablesanditsNutritionalBenefits.pdf 
 2007). 14.8  Conclusion
Food is a substance that we eat which provides nutrition to maintain growth and 
sustain life. Instead of having anything to consume, it will be good to take food 
which provi ...

Docs\FruitsandVegetablesanditsNutritionalBenefits.pdf 
 Ngwira, and Fanuel Lampiao
Inde������������������������������������������������������������������������������������������������������������������  629
Contents

241
© Springer Nature Switzerland AG ...



# Explaining how to chunk files

- Tutorial: https://www.youtube.com/watch?v=-knP6V1Bn3E
- Load Documents: Use document loaders to read your raw files (e.g., .txt, .pdf, .docx) into LangChain's Document objects.

- Split Documents into Chunks: Use text splitters to divide large documents into smaller, manageable chunks. This is essential because embedding models have a maximum token limit.

- Embed Chunks: Convert each chunk into a vector representation (embedding) using an embedding model such as OpenAI's text-embedding-ada-002 or a Hugging Face model.

- Store Embeddings in a Vector Store: Insert the embeddings along with their corresponding metadata into a vector store such as Chroma, FAISS, or Pinecone. (Source: "Introduction", LangChain documentation.)

- Querying: When a user poses a question, embed the query and retrieve the most similar document chunks from the vector store based on vector similarity.


### Test

In [None]:
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from sentence_transformers import SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
# Load environment variables
import os
from dotenv import load_dotenv
import os
import fitz  # PyMuPDF
import docx

class LocalEmbeddingFunction:
    """Adapter exposing a local encoder through ``embed_documents`` / ``embed_query``.

    Both methods return plain Python lists rather than NumPy arrays. The
    LangChain embedding interface is typed as lists of floats, and callers
    that truth-test the result (``if embeddings:``) raise ``ValueError`` on a
    multi-element NumPy array.
    """

    def __init__(self, model):
        """Keep a reference to the encoder.

        Args:
            model: Object providing ``encode(texts, convert_to_numpy=True)``
                that returns one vector per input text (in this notebook a
                ``SentenceTransformer``).
        """
        self.model = model

    def embed_documents(self, texts):
        """Embed several texts.

        Args:
            texts: Iterable of strings to embed.

        Returns:
            A list with one embedding (itself a list of floats) per text.
        """
        vectors = self.model.encode(list(texts), convert_to_numpy=True)
        return vectors.tolist()

    def embed_query(self, text):
        """Embed a single query string and return it as a list of floats."""
        return self.embed_documents([text])[0]

def read_txt(file_path):
    """Read the UTF-8 text file at ``file_path`` and return its contents as a string."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        data = source.read()
    return data

def read_pdf(file_path):
    """Extract the text of every page of a PDF and join the pages with newlines.

    The PyMuPDF (``fitz``) document is used as a context manager so it is
    closed once extraction is done rather than being left open.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages, in page order, separated by a newline character.
    """
    with fitz.open(file_path) as doc:
        return "\n".join(page.get_text() for page in doc)

def read_docx(file_path):
    """Join the text of all paragraphs of a .docx file with newline characters."""
    word_file = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in word_file.paragraphs)
    return "\n".join(paragraph_texts)


# Walk the "Docs" folder and load every supported file (.txt, .pdf, .docx).
# Directories and files are visited in sorted order so the resulting document
# order is the same on every run (os.walk itself gives no ordering guarantee).
docs = []
for root, dirs, files in os.walk("Docs"):
    dirs.sort()  # sorting in place steers the order in which os.walk descends
    for file in sorted(files):
        path = os.path.join(root, file)
        # splitext only reports a real extension, so an extension-less file
        # that happens to be called "pdf" or "txt" is not misclassified.
        ext = os.path.splitext(file)[1].lower()
        try:
            if ext == '.txt':
                text = read_txt(path)
            elif ext == '.pdf':
                text = read_pdf(path)
            elif ext == '.docx':
                text = read_docx(path)
            else:
                # Unsupported file type: skip silently.
                continue
            docs.append({'text': text, 'path': path})
        except Exception as e:
            # Best effort: report the unreadable file and keep loading the rest.
            print(f"Failed to read {file}: {e}")

from langchain_community.vectorstores import Chroma
from sentence_transformers import SentenceTransformer

# Load the sentence-transformers model stored under the local "Model" folder.
model_path = "Model/all-MiniLM-L6-v2"
model = SentenceTransformer(model_path)

# Build the parallel lists used for storage: raw text, source-path metadata
# and a positional id for every loaded document.
texts, metadatas, ids = [], [], []
for position, doc in enumerate(docs):
    texts.append(doc['text'])
    metadatas.append({"source": doc['path']})
    ids.append(f"doc_{position}")

# Encode all document texts in a single call (convert_to_numpy=True requests
# the vectors as a NumPy array).
embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)

#TODO: Chunking the documents
# Semantic text splitter (can only use semantinc splitter after embedding)
from langchain_experimental.text_splitter import SemanticChunker





# Initialize the Chroma store. The document vectors are supplied explicitly
# below, so the embedding function passed here is presumably only needed to
# embed queries at search time.
vector_store = Chroma(
    collection_name="my_collection",
    embedding_function=LocalEmbeddingFunction(model),
)

# Write the precomputed vectors straight into the underlying collection.
# NOTE(review): `_collection` is a private attribute of the LangChain wrapper
# and may change between releases.
if texts:
    vector_store._collection.upsert(
        documents=texts,
        # Pass plain nested lists: NumPy input is presumably not handled
        # uniformly across chromadb versions, while lists of floats are the
        # documented embedding format.
        embeddings=embeddings.tolist(),
        metadatas=metadatas,
        ids=ids,
    )
else:
    # Upserting an empty batch would only raise; report the situation instead.
    print("No documents were loaded - nothing to add to the vector store")


# Look up the three stored documents closest to the query text.
results = vector_store.similarity_search("maintenance procedures", k=3)
print(f"Found {len(results)} results")

# Print each hit's source path and the first 200 characters of its text.
for match in results:
    origin = match.metadata["source"]
    snippet = match.page_content[:200]
    print(origin, "\n", snippet, "...\n")



