In [41]:
import os
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment.
load_dotenv()

# Fail fast with a readable message when a required key is absent.
# The previous `os.environ[k] = os.getenv(k)` form was a no-op when the key was
# set and raised an opaque "str expected, not NoneType" TypeError when it was not.
_missing_keys = [
    name for name in ("GROQ_API_KEY", "OPENAI_API_KEY") if os.getenv(name) is None
]
if _missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Define them in your shell or in a .env file."
    )


In [40]:
from langchain_groq import ChatGroq

# Chat model that the RAG chain later in this notebook pipes the prompt into.
# NOTE(review): presumably authenticates via GROQ_API_KEY from the environment — confirm.
llm=ChatGroq(model="deepseek-r1-distill-llama-70b")

from langchain_openai import OpenAIEmbeddings

# Embedding model passed to FAISS.from_documents when the vector store is built.
# NOTE(review): presumably authenticates via OPENAI_API_KEY from the environment — confirm.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")


In [46]:
from typing import AsyncIterator, Iterator

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document


class CustomDocumentLoader(BaseLoader):
    """An example document loader that reads a text file line by line.

    Every line of the file becomes one ``Document`` whose metadata holds the
    zero-based ``line_number`` and the ``source`` path. The file is decoded as
    text, so pointing the loader at binary content (for example an image)
    raises ``UnicodeDecodeError``.
    """

    def __init__(self, file_path: str, encoding: str = "utf-8") -> None:
        """Initialize the loader with a file path.

        Args:
            file_path: The path to the text file to load.
            encoding: Text encoding used to decode the file. Defaults to
                "utf-8", which is what the loader previously hard-coded.
        """
        self.file_path = file_path
        self.encoding = encoding

    def lazy_load(self) -> Iterator[Document]:  # <-- Does not take any arguments
        """Lazily read the file and yield one ``Document`` per line.

        Implemented as a generator so documents are produced one by one
        instead of being collected in memory first.

        Yields:
            A ``Document`` whose ``page_content`` is the raw line (including
            its trailing newline, if any).

        Raises:
            UnicodeDecodeError: If the file cannot be decoded with the
                configured encoding.
        """
        with open(self.file_path, encoding=self.encoding) as f:
            for line_number, line in enumerate(f):
                yield Document(
                    page_content=line,
                    metadata={"line_number": line_number, "source": self.file_path},
                )

    # alazy_load is OPTIONAL.
    # If you leave out the implementation, a default implementation which delegates to lazy_load will be used!
    async def alazy_load(
        self,
    ) -> AsyncIterator[Document]:  # <-- Does not take any arguments
        """Async counterpart of ``lazy_load``: yield one ``Document`` per line.

        Raises:
            UnicodeDecodeError: If the file cannot be decoded with the
                configured encoding.
        """
        # Requires aiofiles (install with pip)
        # https://github.com/Tinche/aiofiles
        import aiofiles

        async with aiofiles.open(self.file_path, encoding=self.encoding) as f:
            # enumerate() does not accept async iterators, so count manually.
            line_number = 0
            async for line in f:
                yield Document(
                    page_content=line,
                    metadata={"line_number": line_number, "source": self.file_path},
                )
                line_number += 1

In [48]:


# NOTE(review): CustomDocumentLoader opens its file as UTF-8 text, but this path
# points at a PNG image; iterating the loader is what produces the
# UnicodeDecodeError shown in the next cell's output. Point it at a text file instead.
loader = CustomDocumentLoader(r"C:\Users\DELL\Documents\RAG\Images\image.png")

In [51]:
## Test out the async implementation
# Consumes the loader's async generator and prints each document's type and
# repr, separated by a blank line.
# NOTE(review): a top-level `async for` presumably relies on the notebook
# kernel's top-level await support; it is not valid in a plain module — confirm.
async for doc in loader.alazy_load():
    print()
    print(type(doc))
    print(doc)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte

In [14]:
from langchain.document_loaders import DirectoryLoader, UnstructuredFileLoader, YoutubeLoader, WebBaseLoader

USER_AGENT environment variable not set, consider setting it to identify your requests.



1. Local Files:
   - Text files (.txt)
   - PDF files (.pdf)
   - Microsoft Word documents (.doc, .docx)
   - Microsoft Excel spreadsheets (.xls, .xlsx)
   - Microsoft PowerPoint presentations (.ppt, .pptx)
   - Rich Text Format files (.rtf)
   - CSV files (.csv)
   - JSON files (.json)
   - XML files (.xml)
   - Markdown files (.md)
   - HTML files (.html, .htm)
   - Images (.jpg, .jpeg, .png, .gif, .bmp)
   - Email files (.eml, .msg)

2. Local Directories:
   - Any directory containing one or more of the above file types

3. Web URLs:
   - Any valid website URL (e.g., https://www.example.com)

The script uses different loaders based on the input type:

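- For a single local file, it uses UnstructuredFileLoader, which can handle a wide variety of file formats. For a local directory, it uses DirectoryLoader, which matches every file under the directory (glob `**/*`) and loads each one with UnstructuredFileLoader.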
- For web URLs, it uses WebBaseLoader, which can fetch and process web content.

It's important to note that while the UnstructuredFileLoader can theoretically handle many file types, the actual ability to process these files depends on the installed dependencies and the specific implementation of the UnstructuredFileLoader in your LangChain version.

For optimal performance and compatibility, ensure you have the necessary dependencies installed for the file types you intend to process. Some file types may require additional libraries or tools for proper extraction and processing.





In [34]:
import os
from urllib.parse import urlparse
from langchain_community.document_loaders import DirectoryLoader, UnstructuredFileLoader, WebBaseLoader
import bs4

def load_dynamic_sources(source):
    """Load documents from a local file, a local directory, or a web URL.

    Args:
        source: A filesystem path (file or directory) or an http(s) URL.

    Returns:
        A new list with the documents produced by the matching loader:
        ``DirectoryLoader`` (using ``UnstructuredFileLoader`` per file) for a
        directory, ``UnstructuredFileLoader`` for a single file, and
        ``WebBaseLoader`` for a URL.

    Raises:
        ValueError: If ``source`` is neither an existing local path nor an
            http(s) URL.
    """
    if os.path.exists(source):
        # Local directory or file
        if os.path.isdir(source):
            loader = DirectoryLoader(source, glob="**/*", loader_cls=UnstructuredFileLoader)
        else:
            loader = UnstructuredFileLoader(source)
        return list(loader.load())

    # Only treat the source as a website when it is an http(s) URL with a host.
    # Checking for "any scheme" misclassifies strings such as a non-existent
    # Windows path ("C:\\missing.txt" parses with scheme "c") or "mailto:..."
    # as URLs instead of reporting them as invalid input.
    parsed = urlparse(source)
    if parsed.scheme in ("http", "https") and parsed.netloc:
        return list(WebBaseLoader(source).load())

    raise ValueError("Invalid source provided. Must be a valid local path or URL.")

# Example usage:
# docs = load_dynamic_sources('/path/to/local/directory')
# docs = load_dynamic_sources('/path/to/local/file.txt')
# docs = load_dynamic_sources('https://www.example.com')


In [35]:
# Load the LangChain "Document loaders" documentation page; the source is not a
# local path, so load_dynamic_sources takes its URL branch (WebBaseLoader).
docs=load_dynamic_sources('https://python.langchain.com/v0.1/docs/modules/data_connection/document_loaders/')

In [36]:
# Bare expression: displays the full list of loaded documents as the cell output.
docs

[Document(metadata={'source': 'https://python.langchain.com/v0.1/docs/modules/data_connection/document_loaders/', 'title': 'Document loaders | 🦜️🔗 LangChain', 'description': 'Head to Integrations for documentation on built-in document loader integrations with 3rd-party tools.', 'language': 'en'}, page_content='\n\n\n\n\nDocument loaders | 🦜️🔗 LangChain\n\n\n\n\n\n\n\nSkip to main contentThis is documentation for LangChain v0.1, which is no longer actively maintained. Check out the docs for the latest version here.ComponentsIntegrationsGuidesAPI ReferenceMorePeopleVersioningContributingTemplatesCookbooksTutorialsYouTubev0.1Latestv0.2v0.1🦜️🔗LangSmithLangSmith DocsLangServe GitHubTemplates GitHubTemplates HubLangChain HubJS/TS Docs💬SearchModel I/OPromptsChat modelsLLMsOutput parsersRetrievalDocument loadersDocument loadersCustom Document LoaderCSVFile DirectoryHTMLJSONMarkdownMicrosoft OfficePDFText splittersEmbedding modelsVector storesRetrieversIndexingCompositionToolsAgentsChainsMoreCo

In [43]:
from langchain_community.vectorstores import FAISS

# Build a FAISS vector store from the loaded documents using the OpenAI
# embedding model configured earlier in the notebook.
vectorstore=FAISS.from_documents(docs,embedding=embeddings)
# Bare expression: shows the store's repr as the cell output.
vectorstore

<langchain_community.vectorstores.faiss.FAISS at 0x1cb46e73d00>

In [44]:
from langchain_core.runnables import RunnableLambda

# Wrap the store's similarity_search in a runnable so it can be composed into a
# chain, with k=2 bound as a fixed keyword argument.
# NOTE(review): k is presumably the number of documents returned per query — confirm.
retriever=RunnableLambda(vectorstore.similarity_search).bind(k=2)

In [45]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

# Prompt template with two placeholders: {question} and {context}.
message = """
Answer this question using the provided context only.

{question}

Context:
{context}
"""
# Single human-role message built from the template above.
prompt = ChatPromptTemplate.from_messages([("human", message)])

# The input string is fed to the retriever to fill "context" and passed through
# unchanged as "question"; the resulting dict is piped into the prompt, then the LLM.
rag_chain={"context":retriever,"question":RunnablePassthrough()}|prompt|llm

# Run the chain on a sample question and print the text of the model's reply.
response=rag_chain.invoke("tell me about langchain")
print(response.content)

<think>
Okay, so I need to figure out what LangChain is based on the provided context. Let me start by reading through the context carefully. 

The context is a document from LangChain's documentation, specifically about document loaders. It mentions that document loaders are used to load data from various sources into documents. Each document consists of text and associated metadata. Examples given include loading from .txt files, web pages, and even YouTube transcripts.

The document explains that each loader has a "load" method to load documents and an optional "lazy load" method for loading data into memory as needed. There's also a mention of a "load and split" method, which uses a text splitter. 

I see that there's an example provided where they import TextLoader from langchain_community.document_loaders and use it to load a file. The API reference section shows how the TextLoader is used, which gives me an idea of how these loaders are implemented.

So, putting this together, L