# PREPROCESSING
<br>
Preprocessing the data is crucial for the performance of a RAG pipeline. The steps below are what we do as part of preprocessing our data:
<br>
- Extract text and metadata
<br>
- Data Cleaning
<br>
- Document Chunking
<br>
- Create Embedding Vectors & Vector Database
<br>
The data consists of Indian Government acts in PDF format.


In [3]:
#install dependencies
!pip install -q pypdf2 pdfplumber langchain
!pip install -q -U langchain-community
!pip install -q sentence-transformers

In [9]:
#import modules
import os
import PyPDF2.errors
import pdfplumber
from typing import Callable, List, Tuple, Dict
from PyPDF2 import PdfReader
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
import re
from langchain.embeddings import HuggingFaceBgeEmbeddings
import PyPDF2

## Extract text and metadata from the documents.
- Read metadata (specifically the title) from a PDF file.
- Extract text from each page of a PDF file.
- Combine the functionalities of extracting metadata and extracting text from a PDF file.


In [12]:
#Reads metadata, specifically the title, from a PDF file using PyPDF2.

def extract_metadata_from_pdf(file_path: str) -> dict:
    """
    Reads the document title from a PDF file's metadata using PyPDF2.

    :param file_path: The path to the PDF file.
    :return: A dictionary of the form ``{"title": <str>}``. If the PDF cannot be
        parsed, has no metadata, or has no title, the file name without its
        extension is used as the title, so a dictionary is always returned.
    """
    # Fallback title derived from the file name, e.g. "acts/it_act.pdf" -> "it_act".
    fallback_title = os.path.splitext(os.path.basename(file_path))[0]
    title = None

    try:
        with open(file_path, "rb") as pdf_file:
            # Open PDF file using PyPDF2
            reader = PdfReader(pdf_file)
            docmetadata = reader.metadata
            # The metadata object itself can be missing, so guard before
            # reading the title from it.
            if docmetadata is not None:
                title = docmetadata.title
    except PyPDF2.errors.PdfReadError as e:
        # Report the problem but still hand back a usable dictionary below,
        # instead of implicitly returning None.
        print(f"Error reading PDF file: {e}")

    return {"title": str(title) if title else fallback_title}

In [13]:
def extract_pages_from_pdf(file_path: str) -> List[Tuple[int, str]]:
    """
    Extracts the text from each page of the PDF.

    Pages that yield no text (or only whitespace) are skipped, so the returned
    page numbers may have gaps.

    :param file_path: The path to the PDF file.
    :return: A list of tuples containing the 1-based page number and the extracted text.
    :raises FileNotFoundError: If ``file_path`` does not point to an existing file.
    """
    if not os.path.isfile(file_path):
        # Raise an error if the file does not exist
        raise FileNotFoundError(f"File not found: {file_path}")

    pages = []
    with pdfplumber.open(file_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            # extract_text() may return None rather than "" for a page with no
            # text layer (seen in some pdfplumber releases); normalise to ""
            # so the emptiness check below cannot raise AttributeError.
            text = page.extract_text() or ""
            # Keep only pages that actually contain text
            if text.strip():
                pages.append((page_num, text))
    return pages

In [14]:
def parse_pdf(file_path: str) -> Tuple[List[Tuple[int, str]], Dict[str, str]]:
    """
    Reads both the page texts and the metadata of a PDF file.

    :param file_path: The path to the PDF file.
    :return: A ``(pages, metadata)`` tuple: ``pages`` is the result of
        ``extract_pages_from_pdf`` and ``metadata`` is the result of
        ``extract_metadata_from_pdf`` for the same file.
    :raises FileNotFoundError: If ``file_path`` does not point to an existing file.
    """
    if os.path.isfile(file_path):
        # Metadata is read first, then the per-page text.
        doc_metadata = extract_metadata_from_pdf(file_path)
        page_texts = extract_pages_from_pdf(file_path)
        return page_texts, doc_metadata

    # Nothing to parse: the path is missing or is not a regular file.
    raise FileNotFoundError(f"File not found: {file_path}")

## Data Cleaning
- Merge Hyphenated Words: Combines hyphenated words separated by a newline character into a single word.
- Fix Newlines: Corrects newlines in the text by replacing single newlines with spaces.
- Remove Multiple Newlines: Eliminates consecutive multiple newlines from the text.

In [16]:

def merge_hyphenated_words(text: str) -> str:
    """
    Merges hyphenated words separated by a newline character in a text.

    A hyphen followed by a newline is removed only when it sits directly
    between two word characters, e.g. ``"pre-\\nprocessing"`` becomes
    ``"preprocessing"``. Hyphens inside a line (``"well-known"``) are kept.

    :param text: The input text.
    :return: The text with hyphenated words merged.
    """
    # Lookarounds check the neighbouring word characters without consuming
    # them, so consecutive breaks that share a character ("a-\nb-\nc") are
    # all merged instead of only every other one.
    return re.sub(r"(?<=\w)-\n(?=\w)", "", text)


def fix_newlines(text: str) -> str:
    """
    Turns every isolated newline into a space.

    A newline counts as isolated when it is neither preceded nor followed by
    another newline; runs of two or more newlines are left untouched.

    :param text: The input text.
    :return: The text with isolated newlines replaced by spaces.
    """
    lone_newline = re.compile(r"(?<!\n)\n(?!\n)")
    return lone_newline.sub(" ", text)


def remove_multiple_newlines(text: str) -> str:
    """
    Collapses each run of two or more newlines into a single newline.

    :param text: The input text.
    :return: The text in which no two newlines are adjacent.
    """
    newline_run = re.compile(r"\n{2,}")
    return newline_run.sub("\n", text)
def clean_text(
    pages: List[Tuple[int, str]], cleaning_functions: List[Callable[[str], str]]
) -> List[Tuple[int, str]]:
    """
    Runs every page's text through a pipeline of cleaning functions.

    Args:
        pages (List[Tuple[int, str]]): ``(page_number, text)`` pairs to clean.
        cleaning_functions (List[Callable[[str], str]]): Functions applied in
            the given order; each one receives the output of the previous one.

    Returns:
        List[Tuple[int, str]]: A new list of ``(page_number, cleaned_text)``
        pairs in the same order as ``pages``.
    """

    def _run_pipeline(raw: str) -> str:
        # Feed the text through each cleaner in turn.
        result = raw
        for cleaner in cleaning_functions:
            result = cleaner(result)
        return result

    return [(number, _run_pipeline(body)) for number, body in pages]



## Document Chunking
- The document is split into chunks, with a specified chunk size.
- Chunk size significantly impacts search results.
- Large chunks may result in vectors that are too generalized, losing specificity.
- Small chunks may lose the context necessary for accurate understanding.
- Optimal chunk size balances specificity and context, enhancing search effectiveness.


In [15]:
def text_to_docs(
    text: List[Tuple[int, str]],
    metadata: Dict[str, str],
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
) -> List[Document]:
    """
    Splits page texts into chunks and wraps each chunk in a Document with metadata.

    Args:
        text (List[Tuple[int, str]]): List of tuples where each tuple contains a page number and the corresponding text.
        metadata (Dict[str, str]): Dictionary containing additional metadata to be added to each Document.
            ``None`` is treated as an empty dictionary. Its keys are merged last, so a key named
            ``page_number``, ``chunk`` or ``source`` overrides the per-chunk value.
        chunk_size (int): Maximum size of each chunk passed to the text splitter. Defaults to 1000.
        chunk_overlap (int): Number of overlapping characters between consecutive chunks. Defaults to 200.

    Returns:
        List[Document]: List of Document objects with chunked text and metadata.
    """
    # Tolerate a missing metadata dict instead of failing on ``**None``.
    extra_metadata = metadata or {}

    # The splitter configuration does not depend on the page, so build it once
    # rather than once per page.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,  # Maximum size of each chunk.
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],  # Characters to split the text.
        chunk_overlap=chunk_overlap,  # Number of overlapping characters between chunks.
    )

    doc_chunks = []  # Collected Document chunks across all pages.

    for page_num, page in text:
        # The chunk index restarts at 0 for every page.
        for i, chunk in enumerate(text_splitter.split_text(page)):
            doc_chunks.append(
                Document(
                    page_content=chunk,
                    metadata={
                        "page_number": page_num,  # Page the chunk came from.
                        "chunk": i,  # Index of the chunk within its page.
                        "source": f"p{page_num}-{i}",  # Source identifier for the chunk.
                        **extra_metadata,  # Additional metadata passed to the function.
                    },
                )
            )

    return doc_chunks


## Create Embedding Vectors & Vector Database

1. **Convert Texts to Embedding Vectors:**
   - BAAI/bge-base-en is used to transform each chunk of text into an embedding vector.
   - Embedding vectors are numerical representations that encapsulate the semantic meaning of the text.

2. **Store Embeddings into a Vector Database:**
   - Chromadb is used to save the embedding vectors, enabling efficient and scalable retrieval.
   - Here the database is stored locally. You can tweak the code to store it in the cloud storage provided by Chroma, or use another vector database.



In [None]:
#load the embedding model
model_name = "BAAI/bge-base-en" # try out alternative models available on huggingface
encode_kwargs = {"normalize_embeddings": True}  # Set to True to compute cosine similarity
embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name, encode_kwargs=encode_kwargs
)

# Specify the path to the data directory
root_directory = " "

# Specify the directory for storing the vector database
persist_directory = ""

# Cleaning steps applied, in this order, to every page of every PDF.
# The list does not change between files, so it is defined once up front.
cleaning_functions = [
    merge_hyphenated_words,
    fix_newlines,
    remove_multiple_newlines,
]

database = []  # Document chunks waiting to be written to the vector store

# Iterate through the files in the root directory
for root, directories, files in os.walk(root_directory):
    for filename in files:
        # Only PDF files can be parsed; skip anything else found in the tree
        # (notebook checkpoints, OS metadata files, ...).
        if not filename.lower().endswith(".pdf"):
            continue

        # Construct the full path to the current file
        file_path = os.path.join(root, filename)
        file_path = file_path.replace("\\", "/")
        print(file_path)  # Print the file path for reference

        # Step 1: Parse PDF
        raw_pages, metadata = parse_pdf(file_path)  # Parse the PDF file

        # Step 2: Create text chunks
        cleaned_text_pdf = clean_text(raw_pages, cleaning_functions)  # Clean the text
        document_chunks = text_to_docs(cleaned_text_pdf, metadata)  # Split text into document chunks
        database.extend(document_chunks)  # Add document chunks to the pending batch

        if len(database) > 2000:  # Write in batches once the pending list grows past the threshold
            # Store embeddings in Chroma vector database
            vector_store = Chroma.from_documents(
                database,
                embeddings,
                persist_directory=persist_directory,
            )
            # NOTE(review): some newer Chroma releases persist automatically and
            # may no longer need this call - confirm against the installed version.
            vector_store.persist()  # Persist the vector database

            database = []  # Reset the batch for the next documents

# Flush whatever is left after the walk. Without this, the final batch
# (or the whole corpus, if it never exceeded the threshold) would never
# be stored.
if database:
    vector_store = Chroma.from_documents(
        database,
        embeddings,
        persist_directory=persist_directory,
    )
    vector_store.persist()  # Persist the vector database
    database = []
