In [3]:
from langchain_community.document_loaders import DirectoryLoader, UnstructuredFileLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
 
import os
import logging
from pathlib import Path
from typing import List, Dict, Any
 
# Configure root logging once for the script: INFO and above, with a
# "timestamp - level - message" line format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by the loader class and the functions in this file.
logger = logging.getLogger(__name__)
 
 
class MultiDocumentLoader:
    """Load every file under a folder (recursively) into LangChain documents.

    Files whose lower-cased suffix appears in ``supported_extensions`` are
    routed to a dedicated loader (PDF, Word); every other file is handed to
    ``UnstructuredFileLoader``. A file that fails to load is logged and
    skipped, so one bad file does not abort the whole run.
    """

    def __init__(self, folder_path: str):
        """Remember the root folder and build the extension dispatch table.

        Args:
            folder_path: Directory to scan. It is not validated here;
                ``load_documents`` validates it before scanning.
        """
        self.folder_path = Path(folder_path)
        # Keys are lower-case suffixes including the leading dot; values are
        # bound methods taking a Path and returning a list of documents.
        self.supported_extensions = {
            '.pdf': self._load_pdf,
            '.doc': self._load_word,
            '.docx': self._load_word,
        }

    def _validate_folder(self) -> bool:
        """Check that ``folder_path`` exists and is a directory.

        Returns:
            True when the folder is usable.

        Raises:
            FileNotFoundError: If the path does not exist.
            ValueError: If the path exists but is not a directory.
        """
        if not self.folder_path.exists():
            raise FileNotFoundError(f"Directory {self.folder_path} does not exist")
        if not self.folder_path.is_dir():
            raise ValueError(f"{self.folder_path} is not a directory")
        return True

    def _get_files_by_type(self) -> Dict[str, List[Path]]:
        """Group all regular files under the folder by lower-cased suffix.

        Returns:
            Mapping of suffix (e.g. ``'.pdf'``; ``''`` for files without a
            suffix) to the paths carrying that suffix.
        """
        files_by_type: Dict[str, List[Path]] = {}
        for file_path in self.folder_path.rglob('*'):
            if file_path.is_file():
                files_by_type.setdefault(file_path.suffix.lower(), []).append(file_path)
        return files_by_type

    def _load_with(self, loader_cls, file_path: Path, label: str,
                   report_pages: bool = False) -> List[Document]:
        """Run one loader class on one file, never raising.

        Args:
            loader_cls: Loader class, instantiated with the path as a string.
            file_path: File to load.
            label: Human-readable file kind used in the log messages.
            report_pages: When True, the success message also reports the
                number of returned documents as pages.

        Returns:
            The loaded documents, or an empty list if loading failed.
        """
        try:
            documents = loader_cls(str(file_path)).load()
        except Exception as exc:
            # Deliberate best-effort: log the failure and keep processing
            # the remaining files.
            logger.error("Error loading %s %s: %s", label, file_path.name, exc)
            return []
        if report_pages:
            logger.info("Loaded %s: %s (%d pages)", label, file_path.name, len(documents))
        else:
            logger.info("Loaded %s: %s", label, file_path.name)
        return documents

    def _load_pdf(self, file_path: Path) -> List[Document]:
        """Load a PDF with ``PyPDFLoader``; return ``[]`` on failure."""
        return self._load_with(PyPDFLoader, file_path, "PDF", report_pages=True)

    def _load_word(self, file_path: Path) -> List[Document]:
        """Load a Word file with ``UnstructuredWordDocumentLoader``; ``[]`` on failure."""
        return self._load_with(UnstructuredWordDocumentLoader, file_path, "Word document")

    def _load_other_files(self, file_path: Path) -> List[Document]:
        """Load any other file with ``UnstructuredFileLoader``; ``[]`` on failure."""
        return self._load_with(UnstructuredFileLoader, file_path, "file")

    def load_documents(self) -> List[Document]:
        """Validate the folder, then load every file found beneath it.

        Returns:
            All documents from all files, concatenated in sorted path order.

        Raises:
            FileNotFoundError: If the folder does not exist.
            ValueError: If the path is not a directory.
        """
        self._validate_folder()
        logger.info("Starting document loading from: %s", self.folder_path)
        files_by_type = self._get_files_by_type()
        logger.info("Files found:")
        for ext, files in files_by_type.items():
            logger.info("  %s: %d files", ext, len(files))

        # Reuse the scan above rather than walking the tree a second time, so
        # the summary just logged always matches what is actually loaded.
        # Sorting makes the document order independent of the order in which
        # the filesystem happens to enumerate entries.
        all_files = sorted(
            path for paths in files_by_type.values() for path in paths
        )

        all_documents = []
        for file_path in all_files:
            load = self.supported_extensions.get(
                file_path.suffix.lower(), self._load_other_files
            )
            all_documents.extend(load(file_path))
        logger.info("Total documents loaded: %d", len(all_documents))
        return all_documents
 
 
def create_vector_store(documents: List[Document],
                        model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
                        chunk_size: int = 1000,
                        chunk_overlap: int = 150,
                        save_path: str = "multi_format_faiss_index",
                        *,
                        device: str = "cpu",
                        normalize_embeddings: bool = False) -> FAISS:
    """Chunk documents, embed them, and persist a FAISS index to disk.

    Args:
        documents: Documents to index; must be non-empty.
        model_name: Hugging Face embedding model name.
        chunk_size: Chunk size passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks passed to the
            text splitter.
        save_path: Location handed to ``FAISS.save_local``.
        device: Device passed to the embedding model (``model_kwargs``).
        normalize_embeddings: Forwarded to the embedding model's
            ``encode_kwargs``.

    Returns:
        The in-memory vector store that was also saved to ``save_path``.

    Raises:
        ValueError: If ``documents`` is empty, or if splitting produced no
            chunks at all.
    """
    if not documents:
        raise ValueError("No documents provided for vector store creation")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    chunked_docs = text_splitter.split_documents(documents)
    logger.info("Created %d chunks from %d documents", len(chunked_docs), len(documents))

    # Documents without any text (e.g. image-only PDFs) can leave nothing to
    # embed. Fail here with a clear message, before paying for the model
    # load, instead of surfacing an opaque error from the index build.
    if not chunked_docs:
        raise ValueError("No text chunks were produced from the provided documents")

    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': normalize_embeddings}
    )

    logger.info("Creating FAISS vector store...")
    vectorstore = FAISS.from_documents(chunked_docs, embeddings)
    vectorstore.save_local(save_path)
    logger.info("Vector store saved to: %s", save_path)
    return vectorstore
 
 
def main(folder_path: str = r"D:\Muskan.Verma_OneDrive_Data\OneDrive - Course5 Intelligence Limited\Desktop\ai\dataset") -> None:
    """Load all documents from a folder and build the FAISS index.

    Any exception is caught and logged with its traceback rather than
    propagated, so the function always returns normally.

    Args:
        folder_path: Directory to ingest. Defaults to the dataset location
            this script was originally written for; pass another path to run
            it elsewhere.
    """
    try:
        doc_loader = MultiDocumentLoader(folder_path)
        documents = doc_loader.load_documents()

        if not documents:
            logger.warning("No documents were loaded.")
            return

        create_vector_store(
            documents=documents,
            chunk_size=1000,
            chunk_overlap=150,
            save_path="multi_format_faiss_index"
        )

        logger.info("Document processing completed successfully!")

    except Exception as e:
        # logger.exception records the message at ERROR level together with
        # the traceback, routing both through the logging configuration.
        logger.exception("An error occurred: %s", e)
 
 
# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

2025-07-24 15:12:30,785 - INFO - Starting document loading from: D:\Muskan.Verma_OneDrive_Data\OneDrive - Course5 Intelligence Limited\Desktop\ai\dataset
2025-07-24 15:12:30,788 - INFO - Files found:
2025-07-24 15:12:30,790 - INFO -   .pdf: 3 files
2025-07-24 15:12:30,791 - INFO -   .docx: 1 files
2025-07-24 15:12:30,823 - INFO - Loaded PDF: Case study-FCFF with basics.pdf (1 pages)
2025-07-24 15:12:30,881 - INFO - Loaded Word document: Case_Valuation in LBO transaction.docx
2025-07-24 15:12:30,951 - INFO - Loaded PDF: Lecture Intro on APV.pdf (15 pages)
2025-07-24 15:12:31,466 - INFO - Loaded PDF: Optional article_Using APV_ A Better Tool for Valuing Operations.pdf (15 pages)
2025-07-24 15:12:31,467 - INFO - Total documents loaded: 32
2025-07-24 15:12:31,469 - INFO - Created 54 chunks from 32 documents
2025-07-24 15:12:31,477 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
2025-07-24 15:12:34,643 - INFO - Creating FAISS vector store...
2025-07-24 1