1. Setup and Dependencies
First, let's install all the necessary libraries:

In [1]:
# Install the packages this notebook relies on: LangChain (core, community and
# Groq integrations), the FAISS and Chroma vector stores, sentence-transformers
# for embeddings, plus OCR / HTML / data-handling helpers.
# NOTE(review): no versions are pinned, so a fresh install may resolve to
# different releases than the ones recorded in the output below -- consider pinning.
# NOTE(review): AsyncChromiumLoader is imported later but `playwright` is not
# installed here; presumably it is required -- verify.
%pip install langchain langchain-community langchain-groq faiss-cpu sentence-transformers pdf2image pytesseract beautifulsoup4 unstructured pandas python-dotenv chromadb redis Pillow

# Extra packages for PDF processing: the PDF extras of `unstructured`, and PyMuPDF
%pip install "unstructured[pdf]"
%pip install pymupdf

Note: you may need to restart the kernel to use updated packages.
Collecting langchain
  Downloading langchain-0.3.23-py3-none-any.whl (1.0 MB)
Collecting langchain-community
  Downloading langchain_community-0.3.21-py3-none-any.whl (2.5 MB)
Collecting langchain-groq
  Downloading langchain_groq-0.3.2-py3-none-any.whl (15 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp310-cp310-win_amd64.whl (13.7 MB)
Collecting sentence-transformers
  Downloading sentence_transformers-4.0.2-py3-none-any.whl (340 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Collecting beautifulsoup4
  Downloading beautifulsoup4-4.13.3-py3-none-any.whl (186 kB)
Collecting unstructured
  Downloading unstructured-0.17.2-py3-none-any.whl (1.8 MB)
Collecting pandas
  Downloading pandas-2.2.3-cp310-cp310-win_amd64.whl (11.6 MB)
Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.w

You should consider upgrading via the 'c:\Users\zeesh\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.



Collecting onnx>=1.17.0
  Downloading onnx-1.17.0-cp310-cp310-win_amd64.whl (14.5 MB)
Collecting unstructured-inference>=0.8.10
  Downloading unstructured_inference-0.8.10-py3-none-any.whl (48 kB)
Collecting pdfminer.six
  Downloading pdfminer_six-20250327-py3-none-any.whl (5.6 MB)
Collecting unstructured.pytesseract>=0.3.12
  Downloading unstructured.pytesseract-0.3.15-py3-none-any.whl (14 kB)
Collecting pikepdf
  Downloading pikepdf-9.6.0-cp310-cp310-win_amd64.whl (3.5 MB)
Collecting pi-heif
  Downloading pi_heif-0.22.0-cp310-cp310-win_amd64.whl (1.8 MB)
Collecting google-cloud-vision
  Downloading google_cloud_vision-3.10.1-py3-none-any.whl (526 kB)
Collecting effdet
  Downloading effdet-0.4.1-py3-none-any.whl (112 kB)
Collecting pypdfium2
  Downloading pypdfium2-4.30.1-py3-none-win_amd64.whl (3.0 MB)
Collecting opencv-python!=4.7.0.68
  Downloading opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl (39.5 MB)
Collecting matplotlib
  Downloading matplotlib-3.10.1-cp310-cp310-win_amd64.

You should consider upgrading via the 'c:\Users\zeesh\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


Collecting pymupdf
  Downloading pymupdf-1.25.5-cp39-abi3-win_amd64.whl (16.6 MB)
Installing collected packages: pymupdf
Successfully installed pymupdf-1.25.5
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\zeesh\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


Now let's import all the required libraries:


In [4]:
import os
import pandas as pd
import numpy as np
from dotenv import load_dotenv
from typing import List, Dict, Any, Optional

# LangChain imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS, Chroma
from langchain_groq import ChatGroq

# Document loaders
from langchain_community.document_loaders import (
    PyPDFLoader, 
    CSVLoader, 
    WebBaseLoader, 
    DirectoryLoader, 
    TextLoader, 
    UnstructuredMarkdownLoader,
    DataFrameLoader,
    AsyncChromiumLoader
)

# For web scraping enhancement
from langchain_community.document_transformers import BeautifulSoupTransformer

# Load environment variables from a local .env file, if one exists.
# NOTE(review): the recorded output of this cell is False, which suggests no
# .env values were loaded -- confirm that any required API keys are set.
load_dotenv()

False

2. Create Helper Functions for Document Loading
Let's create helper functions for loading different types of documents:

In [None]:
def load_pdf(file_path: str) -> List[Dict[str, Any]]:
    """
    Read a PDF from disk through ``PyPDFLoader``.

    Args:
        file_path: Location of the PDF file to read.

    Returns:
        Whatever ``PyPDFLoader(file_path).load()`` produces.
        NOTE(review): the annotation says ``List[Dict[str, Any]]``, but LangChain
        loaders normally return ``Document`` objects -- confirm and tighten.
    """
    return PyPDFLoader(file_path).load()

def load_text(
    file_path: str,
    encoding: Optional[str] = None,
    autodetect_encoding: bool = False,
) -> List[Dict[str, Any]]:
    """
    Load a text file and return its contents as a document.

    Args:
        file_path: Path of the text file to read.
        encoding: Explicit text encoding (for example ``"utf-8"``). ``None``
            keeps the loader's own default, which is what this function used
            before the parameter existed.
        autodetect_encoding: When True, ask ``TextLoader`` to try detecting the
            encoding if decoding with ``encoding`` fails.

    Returns:
        Whatever ``TextLoader.load()`` produces for the file.
    """
    # Both options are forwarded unchanged; with their defaults the call is
    # equivalent to the previous ``TextLoader(file_path)``.
    loader = TextLoader(
        file_path,
        encoding=encoding,
        autodetect_encoding=autodetect_encoding,
    )
    return loader.load()

def load_csv(file_path: str) -> List[Dict[str, Any]]:
    """
    Read a CSV file through ``CSVLoader``.

    Args:
        file_path: Location of the CSV file to read.

    Returns:
        Whatever ``CSVLoader(file_path).load()`` produces.
    """
    return CSVLoader(file_path).load()

def load_website(url: str) -> List[Dict[str, Any]]:
    """
    Fetch a web page through ``WebBaseLoader``.

    Args:
        url: Address of the page to load.

    Returns:
        Whatever ``WebBaseLoader(url).load()`` produces.
    """
    return WebBaseLoader(url).load()

def load_javascript_website(url: str) -> List[Dict[str, Any]]:
    """
    Fetch a JavaScript-heavy page with a headless browser and clean up its HTML.

    The page is loaded with ``AsyncChromiumLoader`` and the resulting HTML
    documents are passed through ``BeautifulSoupTransformer`` using its default
    settings. Requires Chrome/Chromium to be installed.

    Args:
        url: Address of the page to render.

    Returns:
        The output of ``BeautifulSoupTransformer.transform_documents``.
    """
    rendered_pages = AsyncChromiumLoader([url]).load()
    return BeautifulSoupTransformer().transform_documents(rendered_pages)

def load_directory(directory_path: str, glob_pattern: str = "**/*") -> List[Dict[str, Any]]:
    """
    Load all supported files matching the glob pattern from a directory.

    Supported extensions are ``.pdf``, ``.txt``, ``.csv`` and ``.md``; each one is
    loaded through ``DirectoryLoader`` with the matching loader class.

    Args:
        directory_path: Directory to search.
        glob_pattern: Glob that selects candidate files. If it does not end in
            one of the supported extensions, each extension is appended in
            turn (so the default ``"**/*"`` searches ``"**/*.pdf"``,
            ``"**/*.txt"``, ...). If it already ends in a supported extension,
            it is used as-is and only that extension's loader runs.

    Returns:
        A single list with the documents gathered from every extension that
        loaded successfully.
        NOTE(review): the annotation says ``List[Dict[str, Any]]``, but LangChain
        loaders normally return ``Document`` objects -- confirm and tighten.
    """
    # Loader class to use for each supported file extension.
    loaders = {
        ".pdf": PyPDFLoader,
        ".txt": TextLoader,
        ".csv": CSVLoader,
        ".md": UnstructuredMarkdownLoader
    }

    # Previously glob_pattern was accepted but never used. Work out whether the
    # caller's pattern already targets one specific supported extension.
    pattern_lower = glob_pattern.lower()
    pinned_ext = next((ext for ext in loaders if pattern_lower.endswith(ext)), None)

    documents = []
    for ext, loader_cls in loaders.items():
        if pinned_ext is None:
            ext_glob = f"{glob_pattern}{ext}"
        elif ext == pinned_ext:
            ext_glob = glob_pattern
        else:
            # The pattern names a different file type; this loader does not apply.
            continue

        # Best effort by design: a failure for one file type is reported and
        # the remaining types are still loaded.
        try:
            dir_loader = DirectoryLoader(
                directory_path,
                glob=ext_glob,
                loader_cls=loader_cls
            )
            documents.extend(dir_loader.load())
            print(f"Loaded {ext} files")
        except Exception as e:
            print(f"Error loading {ext} files: {e}")

    return documents

def load_dataframe(df: pd.DataFrame, page_content_column: str) -> List[Dict[str, Any]]:
    """
    Turn the rows of a pandas DataFrame into documents via ``DataFrameLoader``.

    Args:
        df: Source DataFrame.
        page_content_column: Name of the column passed to ``DataFrameLoader``
            as ``page_content_column``.

    Returns:
        Whatever ``DataFrameLoader.load()`` produces.
    """
    return DataFrameLoader(df, page_content_column=page_content_column).load()

3. Implement Text Splitting

In [None]:
def split_documents(documents, chunk_size=1000, chunk_overlap=100):
    """
    Break documents into smaller, overlapping chunks.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks, passed to the splitter.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``; the
        splitter is created with ``add_start_index=True``.
    """
    splitter_options = dict(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,
    )
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)

4. Set Up Embeddings Model

In [None]:
def get_embeddings_model(model_name: str = "nomic-ai/nomic-embed-text-v1", device: str = "cpu"):
    """
    Initialize and return the embeddings model.

    By default this uses Nomic's free embedding model, which can run locally.

    Args:
        model_name: Hugging Face model id to load. For very limited hardware a
            smaller model such as ``"sentence-transformers/all-MiniLM-L6-v2"``
            can be used instead.
        device: Device the model runs on; use ``"cuda"`` if you have a GPU.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured to normalize embeddings.
    """
    model_kwargs = {'device': device}

    # The Nomic models ship custom model code in their Hugging Face repository
    # and do not load unless remote code is explicitly trusted.
    # SECURITY: this executes code downloaded from the model repository, so it
    # is enabled only for the nomic-ai models and not for arbitrary model names.
    if model_name.startswith("nomic-ai/"):
        model_kwargs['trust_remote_code'] = True

    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs={'normalize_embeddings': True}
    )
    return embeddings

5. Create Vector Store

In [None]:
def create_vector_store(documents, embeddings, vector_store_type="faiss", persist_directory=None):
    """
    Build a vector store over the given documents.

    Args:
        documents: Documents to index.
        embeddings: Embedding model used to vectorize the documents.
        vector_store_type: ``"faiss"`` or ``"chroma"`` (case-insensitive).
        persist_directory: Where Chroma should store its data. Ignored for
            FAISS; when empty or None, Chroma is created without it.

    Returns:
        The newly created FAISS or Chroma vector store.

    Raises:
        ValueError: If ``vector_store_type`` is neither "faiss" nor "chroma".
    """
    backend = vector_store_type.lower()

    if backend == "faiss":
        return FAISS.from_documents(documents, embeddings)

    if backend != "chroma":
        raise ValueError("Vector store type must be either 'faiss' or 'chroma'")

    # Chroma without a target directory: nothing to persist explicitly.
    if not persist_directory:
        return Chroma.from_documents(documents, embeddings)

    store = Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        persist_directory=persist_directory,
    )
    # NOTE(review): newer Chroma releases reportedly persist automatically and
    # LangChain marks persist() as deprecated -- confirm this call is still needed.
    store.persist()  # Save to disk
    return store