In [None]:
from typing import Union
from langchain_community.document_loaders import TextLoader, PyMuPDFLoader, DirectoryLoader

def load_docs(path: str, doc_type: str) -> Union[str, list]:
    """
    Load documents from specified path based on document type.

    Args:
        path (str): Path to the document(s) or folder containing documents.
        doc_type (str): Type of document(s) to load. Supported types are 'text', 'pdf', or 'folder'.

    Returns:
        Union[str, list]: Loaded document(s) as a string for single documents ('text', 'pdf'),
                          or a list of strings for multiple text files ('folder').
    
    Raises:
        ValueError: If an unsupported document type is provided.
    """
    if doc_type == 'text':
        loader = TextLoader(path)
    elif doc_type == 'pdf':
        loader = PyMuPDFLoader(path)
    elif doc_type == 'folder':
        loader = DirectoryLoader(path, glob="**/*.txt", loader_cls=TextLoader)
    else:
        raise ValueError(f"Unsupported document type: {doc_type}")

    loaded = loader.load()
    
    return loaded


In [1]:
from langchain_community.document_loaders import TextLoader, PyMuPDFLoader, DirectoryLoader
loader = PyMuPDFLoader('data/Boston Scientific.RhythmCare POC.SOW 20231226 - SF.pdf')

In [2]:
from langchain_text_splitters import CharacterTextSplitter

text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)

In [11]:
loader.load()[0]

Document(metadata={'source': 'data/Boston Scientific.RhythmCare POC.SOW 20231226 - SF.pdf', 'file_path': 'data/Boston Scientific.RhythmCare POC.SOW 20231226 - SF.pdf', 'page': 0, 'total_pages': 8, 'format': 'PDF 1.7', 'title': '', 'author': 'Jaccarino, Marcus', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word for Microsoft 365', 'producer': 'Microsoft® Word for Microsoft 365', 'creationDate': "D:20231228181808-05'00'", 'modDate': "D:20240104133558+05'30'", 'trapped': ''}, page_content=' \n \n \nBSC – RhythmCare POC SOW \nSTATEMENT OF WORK  \nStatement of Work No. 01 \nRhythmCare POC \nBY AND BETWEEN \nBOSTON SCIENTIFIC CORPORATION \nAND \nVIRTUSA CORPORATION \n \nThis Statement of Work No 01 (“Statement of Work” or “SOW”), is entered into as of December 26, 2023 (“SOW \nEffective Date”) pursuant to the Master Services Agreement between Boston Scientific Corporation (the “Client” \nor “BSC”) and Virtusa Corporation (the “Vendor” or Service Provider”), with an effective date of

In [15]:
text_splitter.split_documents(loader.load())

[Document(metadata={'source': 'data/Boston Scientific.RhythmCare POC.SOW 20231226 - SF.pdf', 'file_path': 'data/Boston Scientific.RhythmCare POC.SOW 20231226 - SF.pdf', 'page': 0, 'total_pages': 8, 'format': 'PDF 1.7', 'title': '', 'author': 'Jaccarino, Marcus', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word for Microsoft 365', 'producer': 'Microsoft® Word for Microsoft 365', 'creationDate': "D:20231228181808-05'00'", 'modDate': "D:20240104133558+05'30'", 'trapped': ''}, page_content='BSC – RhythmCare POC SOW \nSTATEMENT OF WORK  \nStatement of Work No. 01 \nRhythmCare POC \nBY AND BETWEEN \nBOSTON SCIENTIFIC CORPORATION \nAND \nVIRTUSA CORPORATION \n \nThis Statement of Work No 01 (“Statement of Work” or “SOW”), is entered into as of December 26, 2023 (“SOW \nEffective Date”) pursuant to the Master Services Agreement between Boston Scientific Corporation (the “Client” \nor “BSC”) and Virtusa Corporation (the “Vendor” or Service Provider”), with an effective date of October

In [20]:
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=100, chunk_overlap=0)
# texts = text_splitter.split_documents(docs)

In [31]:
split_text = text_splitter.split_documents(loader.load())

In [29]:
import os
from dotenv import load_dotenv
load_dotenv()

API_KEY = os.getenv('GOOGLE_API_KEY')

In [27]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

  from .autonotebook import tqdm as notebook_tqdm


In [30]:
gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=API_KEY)

I0000 00:00:1721902384.949532   22812 config.cc:230] gRPC experiments enabled: call_status_override_on_cancellation, event_engine_dns, event_engine_listener, http2_stats_fix, monitoring_experiment, pick_first_new, trace_record_callops, work_serializer_clears_time_cache


In [34]:
split_text

[Document(metadata={'source': 'data/Boston Scientific.RhythmCare POC.SOW 20231226 - SF.pdf', 'file_path': 'data/Boston Scientific.RhythmCare POC.SOW 20231226 - SF.pdf', 'page': 0, 'total_pages': 8, 'format': 'PDF 1.7', 'title': '', 'author': 'Jaccarino, Marcus', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word for Microsoft 365', 'producer': 'Microsoft® Word for Microsoft 365', 'creationDate': "D:20231228181808-05'00'", 'modDate': "D:20240104133558+05'30'", 'trapped': ''}, page_content='BSC – RhythmCare POC SOW \nSTATEMENT OF WORK  \nStatement of Work No. 01 \nRhythmCare POC \nBY AND BETWEEN \nBOSTON SCIENTIFIC CORPORATION \nAND \nVIRTUSA CORPORATION \n \nThis Statement of Work No 01 (“Statement of Work” or “SOW”), is entered into as of December 26, 2023 (“SOW \nEffective Date”) pursuant to the Master Services Agreement between Boston Scientific Corporation (the “Client” \nor “BSC”) and Virtusa Corporation (the “Vendor” or Service Provider”), with an effective date of October

In [33]:
gemini_embeddings.embed_documents()

TypeError: object of type 'Document' has no len()